In [1]:
import nltk
from nltk.corpus import words
# Load the NLTK "words" corpus. A fresh environment does not ship the corpus
# data, in which case the first access raises LookupError; fetch it once and retry.
try:
    word_list = words.words()
except LookupError:
    nltk.download('words')
    word_list = words.words()

# Keep only entries longer than 2 characters.
word_list = [w for w in word_list if len(w) > 2]

len(word_list)

236506

In [4]:
# Membership tests against a list are linear scans; inside the nested loop
# below that makes the search effectively quadratic in the corpus size.
# A set gives constant-time look-ups and yields exactly the same matches.
word_set = set(word_list)

valid_splits = []

for word in word_list:
    # Both halves must be longer than 2 characters, so only cut points
    # 3 .. len(word) - 3 can ever qualify.
    for i in range(3, len(word) - 2):
        left = word[:i]
        right = word[i:]
        if left in word_set and right in word_set:
            valid_splits.append([word, left, right])
            break  # keep only the first (shortest-left) split for each word

print("Total valid splits found:", len(valid_splits))

Total valid splits found: 76591


In [6]:
import pandas as pd

# Tabulate the splits: one row per word together with its left and right halves.
df = pd.DataFrame(
    data=valid_splits,
    columns=['word', 'left', 'right'],
)
df.shape

(76591, 3)

In [7]:
# Flatten the [word, left, right] triples into one long list of strings.
flattern_lst = [token for triple in valid_splits for token in triple]

In [8]:
# Count how many times each string occurs in the flattened list.
word_counts = {}

for word in flattern_lst:
    word_counts[word] = word_counts.get(word, 0) + 1

# (string, count) pairs, most frequent first.
sorted_counts = sorted(
    word_counts.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

In [20]:
# Hand-picked fragments; any split whose left or right half is one of these
# is dropped when the filtered frame is built.
words_to_exclude = [
    'ing', 'ment', 'com', 'dis', 'con', 'est', 'ent', 'pro',
    'ted', 'les', 'ian', 'ist', 'der', 'sub', 'str', 'pre',
    'res', 'mit', 'ver', 'ext', 'tion', 'rec', 'sur', 'uni',
    'comm', 'dec', 'eng', 'mar', 'tions', 'pas', 'rep', 'wal',
    'rel', 'abs', 'acc', 'ness', 'ism', 'ally', 'tri', 'dom',
    'ess', 'ary', 'tic', 'aba', 'poly', 'pseudo', 'iso', 'par',
    'tele', 'bes', 'oxy', 'ology',
]

In [21]:
# Keep only rows where neither half is one of the excluded fragments.
df2 = df.loc[
    ~df['left'].isin(words_to_exclude) & ~df['right'].isin(words_to_exclude)
].reset_index(drop=True)

# Most common right-hand halves among the remaining splits.
df2['right'].value_counts().head(50)

less        1577
like        1169
able        1134
man          807
ship         805
ion          503
ate          487
age          414
ling         391
wise         371
let          369
proof        365
weed         282
lessness     271
wood         270
maker        262
wort         256
hood         255
head         250
ableness     247
work         238
ability      236
some         229
making       227
fully        222
ably         218
fish         204
ant          201
ward         184
berry        176
stone        167
way          143
ting         142
bird         142
house        142
ger          141
land         140
per          139
board        138
flower       134
led          133
hearted      132
monger       129
back         125
ure          123
tail         119
ean          117
root         115
woman        113
ose          109
Name: right, dtype: int64

In [22]:
# Space-separated phrase built from the two halves, e.g. "abandon able".
df2['left_right'] = df2['left'].str.cat(df2['right'], sep=' ')

In [23]:
# Inspect the filtered splits (the rendered output is a truncated view of the frame).
df2

Unnamed: 0,word,left,right,left_right
0,abactor,abac,tor,abac tor
1,abandonable,abandon,able,abandon able
2,abashless,abash,less,abash less
3,abbassi,abb,assi,abb assi
4,abbotship,abbot,ship,abbot ship
...,...,...,...,...
58595,through,thro,ugh,thro ugh
58596,together,tog,ether,tog ether
58597,tongue,ton,gue,ton gue
58598,window,win,dow,win dow


In [24]:
from sentence_transformers import SentenceTransformer
# Load the pretrained sentence-embedding model; get_sentence_embedding() below calls it.
# NOTE(review): the upstream model card reportedly marks 'bert-base-nli-mean-tokens'
# as deprecated — confirm, and consider a newer checkpoint (all distances would change).
embedder = SentenceTransformer('bert-base-nli-mean-tokens')

In [25]:
import numpy as np

def euclideanDistance(x, y):
    """Return the Euclidean (L2) distance between the vectors ``x`` and ``y``.

    Both arguments are converted with ``np.array`` before subtraction.
    """
    difference = np.array(y) - np.array(x)
    return np.linalg.norm(difference)

def calculate_distance(word, left_right):
    """Return the Euclidean distance between the embeddings of two texts.

    ``word`` and ``left_right`` are each embedded with
    ``get_sentence_embedding`` and compared with ``euclideanDistance``.
    """
    return euclideanDistance(
        get_sentence_embedding(word),
        get_sentence_embedding(left_right),
    )

def get_sentence_embedding(sentence):
    """Encode ``sentence`` with the notebook-level ``embedder`` and return the result."""
    embedding = embedder.encode(sentence)
    return embedding

In [26]:
# Encode each column in a single batched call. The previous row-wise apply made
# two separate single-sentence model calls per row, which is far slower on a
# frame of this size; batching lets the model process many texts per forward pass.
word_vecs = np.asarray(embedder.encode(df2['word'].tolist()))
split_vecs = np.asarray(embedder.encode(df2['left_right'].tolist()))

# Row-wise Euclidean distance between a word and its space-separated split.
df2['dist'] = np.linalg.norm(split_vecs - word_vecs, axis=1)

In [27]:
# Rank the splits by distance, largest first, with a fresh 0..n-1 index.
df3 = (
    df2
    .sort_values('dist', ascending=False)
    .reset_index(drop=True)
)
df3.head(5)

Unnamed: 0,word,left,right,left_right,dist
0,rapidly,rap,idly,rap idly,21.958546
1,funeral,fun,eral,fun eral,21.202032
2,beeflower,bee,flower,bee flower,21.199068
3,tenantship,ten,antship,ten antship,21.062946
4,tendance,ten,dance,ten dance,21.0431


In [28]:
# Smallest distances: the last five rows of the ranked frame.
df3.tail(5)

Unnamed: 0,word,left,right,left_right,dist
58595,freeheartedness,free,heartedness,free heartedness,1.566774
58596,meekhearted,meek,hearted,meek hearted,1.542598
58597,gentleheartedly,gentle,heartedly,gentle heartedly,1.456233
58598,gentleheartedness,gentle,heartedness,gentle heartedness,1.424568
58599,gentlehearted,gentle,hearted,gentle hearted,1.364183


In [29]:
# First 50 rows of the ranked frame (largest distances).
df3.iloc[:50]

Unnamed: 0,word,left,right,left_right,dist
0,rapidly,rap,idly,rap idly,21.958546
1,funeral,fun,eral,fun eral,21.202032
2,beeflower,bee,flower,bee flower,21.199068
3,tenantship,ten,antship,ten antship,21.062946
4,tendance,ten,dance,ten dance,21.0431
5,boycottage,boy,cottage,boy cottage,21.042885
6,warmongering,war,mongering,war mongering,20.852793
7,barbed,bar,bed,bar bed,20.750692
8,Hollywood,Holly,wood,Holly wood,20.610861
9,mandate,man,date,man date,20.53348


In [30]:
# Look up where a specific word landed in the ranking.
df3.loc[df3['word'] == 'therapist']

Unnamed: 0,word,left,right,left_right,dist
51,therapist,the,rapist,the rapist,19.429022


In [34]:
# Top 1000 rows of the ranking, reduced to the columns that get exported.
df4 = (
    df3.head(1000)
    .loc[:, ['word', 'left_right', 'dist']]
    .rename(columns={'left_right': 'split_words'})
)
df4.tail()

Unnamed: 0,word,split_words,dist
995,undershrubby,under shrubby,16.581631
996,anthem,ant hem,16.579014
997,overmultiplication,over multiplication,16.578173
998,overbarren,over barren,16.576887
999,important,import ant,16.575583


In [36]:
import os

# Output directory. It was previously fixed to one user's desktop; it can now be
# overridden with the SPLIT_WORDS_DIR environment variable, and the original
# location stays the default so existing runs behave the same.
pth = os.environ.get('SPLIT_WORDS_DIR', r'C:\Users\web3\Desktop\temp')
fName = 'split_words.csv'

# Export only the word and its split phrase, without the index column.
df4[['word','split_words']].to_csv(os.path.join(pth, fName), index=False)

In [48]:
# Check whether any split has 'rapist' as its left half.
df3.loc[df3['left'] == 'rapist']

Unnamed: 0,word,left,right,left_right,dist


In [40]:
# Browse rows 1050-1099 of the ranked frame.
df3.iloc[1050:1100]

Unnamed: 0,word,left,right,left_right,dist
1050,forgettable,for,gettable,for gettable,16.507336
1051,wishfully,wish,fully,wish fully,16.505486
1052,patroller,pat,roller,pat roller,16.503748
1053,perversion,per,version,per version,16.502993
1054,reckless,reck,less,reck less,16.502451
1055,philander,phi,lander,phi lander,16.501495
1056,starboard,star,board,star board,16.501354
1057,underdog,under,dog,under dog,16.500769
1058,stepuncle,step,uncle,step uncle,16.499102
1059,underofficial,under,official,under official,16.498589


## results:

|单词|中文含义|拆分后|中文含义|
|----|----|---|---|
| therapist | 治疗师 | the rapist | 强奸犯 |
| important | 重要的 | import ant | 进口蚂蚁 |
| tendance | 照料 | ten dance | 十支舞 |
| shortstop | 游击手 | short stop | 短暂停留 |
| therein | 其中 | the rein | 缰绳 |
| mammalogical | 哺乳动物学的 | mamma logical | 妈妈逻辑 |
| boycottage | 联合抵制 | boy cottage | 男孩小屋 |
| orchideously | 兰花般地 | orc hideously | 兽人可怕 |
| pierage | 码头费 | pie rage | 馅饼愤怒 |
| pastoral | 田园的 | past oral | 过去的口头 |


In [1]:
import os
import shutil
import subprocess
import sys

# Open the current working directory in the system file manager.
cwd = os.getcwd()
if hasattr(os, 'startfile'):
    # os.startfile is only available on Windows.
    os.startfile(cwd)
else:
    # macOS ships `open`; most Linux desktops provide `xdg-open`.
    opener = 'open' if sys.platform == 'darwin' else 'xdg-open'
    if shutil.which(opener):
        subprocess.run([opener, cwd], check=False)
    else:
        # No known opener on this system: at least show where the files are.
        print(cwd)