In [None]:
# Team members: Danny Siu, Joseph Park, Timothy Cruz
# Creating a compound word classifier model that distinguishes between "real" and "corrupt" compound words

In [1]:
import numpy as np
import pandas as pd
import itertools
import os, sys

In [2]:
# os.chdir("/Users/josephpark/SCHOOL/Data 144/Final_Proj")

In [3]:
ladec = pd.read_csv("LADECv1-2019.csv", sep = ',')

In [4]:
ladec.head()

Unnamed: 0,id_master,c1,c2,stim,obs,obsc1,obsc2,stimlen,c1len,c2len,...,Juhasz_tran,st_c1_mean,st_c2_mean,Zipfvalue,c1_SLlg10wf,c2_SLlg10wf,c1_BLPbncfrequency,c1_BLPbncfrequencymillion,c2_BLPbncfrequency,c2_BLPbncfrequencymillion
0,3237,gad,about,gadabout,1,4,3,8,3,5,...,,,,1.768955,1.4314,5.2677,28.0,0.28,197116.0,1971.16
1,4592,knock,about,knockabout,1,3,4,10,5,5,...,,,,1.592864,3.5185,5.2677,1975.0,19.75,197116.0,1971.16
2,8231,turn,about,turnabout,1,19,7,9,4,5,...,,4.452,2.813,2.333227,4.194,5.2677,21026.0,210.25999,197116.0,1971.16
3,6139,race,about,raceabout,1,4,1,9,4,5,...,,,,,3.4994,5.2677,7959.0,79.589996,197116.0,1971.16
4,8331,walk,about,walkabout,1,2,5,9,4,5,...,,4.594,2.733,2.246077,4.0418,5.2677,10441.0,104.41,197116.0,1971.16


In [5]:
ladec_df = ladec[["c1", "c2", 'stim','isCommonstim', 'isCommonC1', 'isCommonC2']]
len(ladec_df[ladec_df['isCommonstim'] == 1])

3660

In [6]:
ladec.columns.values

array(['id_master', 'c1', 'c2', 'stim', 'obs', 'obsc1', 'obsc2',
       'stimlen', 'c1len', 'c2len', 'nparses', 'correctParse',
       'ratingcmp', 'ratingC1', 'ratingC2', 'isPlural', 'nc1_cmp',
       'nc2_cmp', 'nc1_cmpnoplural', 'nc2_cmpnoplural', 'sentiment_stim',
       'sentiment_c1', 'sentiment_c2', 'sentimentprobpos_stim',
       'sentimentprobpos_c1', 'sentimentprobpos_c2',
       'sentimentprobneg_stim', 'sentimentprobneg_c1',
       'sentimentprobneg_c2', 'sentimentratioposneg_stim',
       'sentimentratioposneg_c1', 'sentimentratioposneg_c2',
       'profanity_stim', 'profanity_c1', 'profanity_c2', 'isCommonstim',
       'isCommonC1', 'isCommonC2', 'bg_boundary', 'bgJonesMewhort',
       'bgSUBTLEX', 'bgFacebook', 'inSUBTLEX', 'inBLP', 'inELP',
       'inJuhaszLaiWoodcock', 'c1_inELP', 'c1_inBrysbaert',
       'c1_inWordnet', 'c1_inMMA', 'c2_inELP', 'c2_inBrysbaert',
       'c2_inWordnet', 'c2_inMMA', 'LSAc1c2', 'LSAc1stim', 'LSAc2stim',
       'stim_SLlg10wf', 'BLPbncfrequ

**Creating dataframe of real compound words**

In [7]:
# Created elp_ladec: dataframe of real compound words found in the ELP (English Lexicon Project)

elp_ladec = ladec[ladec['inELP'] == 1][["c1", "c2", 'stim','isCommonstim', 
#                                         'isCommonC1', 'isCommonC2'
                                       ]]
elp_ladec['is_real_stim'] = np.ones((len(elp_ladec),), dtype=int)

In [8]:
elp_ladec.head()  # our correct compound words

Unnamed: 0,c1,c2,stim,isCommonstim,is_real_stim
2,turn,about,turnabout,1,1
4,walk,about,walkabout,1,1
5,run,about,runabout,1,1
6,round,about,roundabout,1,1
14,where,abouts,whereabouts,1,1


In [9]:
len(elp_ladec)

3149

**Creating dataframe of "corrupt" compound words**

In [10]:
c1_uniq = np.unique(elp_ladec['c1'])
c2_uniq = np.unique(elp_ladec['c2'])

print(len(c1_uniq), len(c2_uniq))

1168 1170


In [11]:
c1_short = c1_uniq[:5]
c2_short = c2_uniq[:5]

In [12]:
wrongWords = []
c1_wrong = []
c2_wrong = []
for c1 in c1_uniq:
    c2 = np.random.choice(c2_uniq, 1)[0]
    cpd_word = ''.join((c1, c2))
    wrongWords.append(cpd_word)
    c1_wrong.append(c1)
    c2_wrong.append(c2)
    c2 = np.random.choice(c2_uniq, 1)[0]
    cpd_word = ''.join((c1, c2))
    wrongWords.append(cpd_word)
    c1_wrong.append(c1)
    c2_wrong.append(c2)
for c2 in c2_uniq:
    c1 = np.random.choice(c1_uniq, 1)[0]
    cpd_word = ''.join((c1, c2))
    wrongWords.append(cpd_word)
    c1_wrong.append(c1)
    c2_wrong.append(c2)
    c1 = np.random.choice(c1_uniq, 1)[0]
    cpd_word = ''.join((c1, c2))
    wrongWords.append(cpd_word)
    c1_wrong.append(c1)
    c2_wrong.append(c2)


In [13]:
len(wrongWords), len(c1_wrong), len(c2_wrong)

(4676, 4676, 4676)

In [14]:
wrong_df = pd.DataFrame()
wrong_df['c1'] = c1_wrong
wrong_df['c2'] = c2_wrong
wrong_df['stim'] = wrongWords
wrong_df['isCommonstim'] = np.zeros((len(wrong_df),), dtype=int)
wrong_df['is_real_stim'] = np.zeros((len(wrong_df),), dtype=int)

In [15]:
wrong_df.head()

Unnamed: 0,c1,c2,stim,isCommonstim,is_real_stim
0,after,tack,aftertack,0,0
1,after,wear,afterwear,0,0
2,air,comer,aircomer,0,0
3,air,pond,airpond,0,0
4,airs,helves,airshelves,0,0


In [16]:
wrong_df["real_word_check"] = wrong_df.stim.isin(elp_ladec.stim)
# wrong_df.head()
new_wrong_df = wrong_df[wrong_df["real_word_check"] == False]
len(wrong_df), len(new_wrong_df)

(4676, 4661)

In [17]:
new_wrong_df = new_wrong_df.drop(['real_word_check'], axis=1)

In [18]:
new_wrong_df.head()

Unnamed: 0,c1,c2,stim,isCommonstim,is_real_stim
0,after,tack,aftertack,0,0
1,after,wear,afterwear,0,0
2,air,comer,aircomer,0,0
3,air,pond,airpond,0,0
4,airs,helves,airshelves,0,0


In [19]:
df = pd.concat([new_wrong_df, elp_ladec])
df.head()

Unnamed: 0,c1,c2,stim,isCommonstim,is_real_stim
0,after,tack,aftertack,0,0
1,after,wear,afterwear,0,0
2,air,comer,aircomer,0,0
3,air,pond,airpond,0,0
4,airs,helves,airshelves,0,0


In [20]:
len(new_wrong_df), len(elp_ladec), len(df), len(new_wrong_df) + len(elp_ladec)

(4661, 3149, 7810, 7810)

In [21]:
df = df.drop(['stim'], axis=1)

In [22]:
# The first step in vectorizing our categorical values is to create a DictVectorizer() object and then 
# use fit_transform() and toarray() to get the values into a NumPy array.
from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer()

# create a unique dictionary for both c1 and c2

word_to_num = df[['c1']].to_dict('records')
one_hot_words1 = vec.fit_transform(word_to_num).toarray()
print("After one hot encoding ......")
print(one_hot_words1)

After one hot encoding ......
[[1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 1.]]


In [23]:
word_to_num = df[['c2']].to_dict('records')
# print(type(word_to_num))
one_hot_words2 = vec.fit_transform(word_to_num).toarray()
print("After one hot encoding ......")
print(one_hot_words2)

After one hot encoding ......
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 0. 1.]]


In [24]:
one_hot_words1.shape, one_hot_words2.shape

((7810, 1168), (7810, 1170))

In [25]:
c1_labels = [''.join((e,'_c1')) for e in c1_uniq]
c2_labels = [''.join((e,'_c2')) for e in c2_uniq]

df_oh_1 = pd.DataFrame(one_hot_words1, columns = c1_labels)
df_oh_2 = pd.DataFrame(one_hot_words2, columns = c2_labels)


In [26]:
df_oh_1.tail()

Unnamed: 0,after_c1,air_c1,airs_c1,alder_c1,ale_c1,all_c1,alley_c1,altar_c1,anchor_c1,ant_c1,...,working_c1,works_c1,worm_c1,wrist_c1,wrong_c1,yachts_c1,yard_c1,yards_c1,year_c1,zig_c1
7805,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7806,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7807,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7808,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7809,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [27]:
df.shape, df_oh_1.shape, df_oh_2.shape

((7810, 4), (7810, 1168), (7810, 1170))

In [28]:
df_oh_1 = df_oh_1.loc[:,~df_oh_1.columns.duplicated()]
df_oh_2 = df_oh_2.loc[:,~df_oh_2.columns.duplicated()]

In [29]:
final_df = df.reset_index().join(df_oh_1)
final_df = final_df.join(df_oh_2)
final_df.head()

Unnamed: 0,index,c1,c2,isCommonstim,is_real_stim,after_c1,air_c1,airs_c1,alder_c1,ale_c1,...,wreck_c2,wright_c2,wrights_c2,writer_c2,writers_c2,writing_c2,wurst_c2,yard_c2,yards_c2,zag_c2
0,0,after,tack,0,0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,after,wear,0,0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,air,comer,0,0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,air,pond,0,0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,airs,helves,0,0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
final_df.shape

(7810, 2343)

In [31]:
final_df = final_df.drop(['c1', 'c2'], axis=1)
final_df.head()

Unnamed: 0,index,isCommonstim,is_real_stim,after_c1,air_c1,airs_c1,alder_c1,ale_c1,all_c1,alley_c1,...,wreck_c2,wright_c2,wrights_c2,writer_c2,writers_c2,writing_c2,wurst_c2,yard_c2,yards_c2,zag_c2
0,0,0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,0,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,0,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
final_df.tail()

Unnamed: 0,index,isCommonstim,is_real_stim,after_c1,air_c1,airs_c1,alder_c1,ale_c1,all_c1,alley_c1,...,wreck_c2,wright_c2,wrights_c2,writer_c2,writers_c2,writing_c2,wurst_c2,yard_c2,yards_c2,zag_c2
7805,8943,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
7806,8944,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
7807,8947,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
7808,8952,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
7809,8955,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


**Just a note: we are trying to predict "is_real_stim" in our model.**

In [33]:
train_df, validate_df, test_df = np.split(final_df.sample(frac=1), [int(.6*len(df)), int(.8*len(df))]) 
# df.sample shuffles the dataframe to randomize the sample
# we chose to split into 60% - training set, 20% - validation set, 20% - test set

In [34]:
train_df['is_real_stim'].value_counts()

0    2817
1    1869
Name: is_real_stim, dtype: int64

In [35]:
validate_df['is_real_stim'].value_counts()

0    921
1    641
Name: is_real_stim, dtype: int64

In [36]:
test_df['is_real_stim'].value_counts()

0    923
1    639
Name: is_real_stim, dtype: int64

In [37]:
X_train = train_df.drop(columns = ['is_real_stim'])
X_validation = validate_df.drop(columns = ['is_real_stim'])
X_test = test_df.drop(columns = ['is_real_stim'])

y_train = train_df['is_real_stim']
y_validation = validate_df['is_real_stim']
y_test = test_df['is_real_stim']


****Create Classifier Models****

**Logistic Regression Model**

In [38]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [39]:
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)

print('Accuracy on training: ')
y_pred_train=lr_model.predict(X_train)
print(accuracy_score(y_train,y_pred_train))

training_accuracy = lr_model.score(X_train, y_train)
print('Accuracy on training: ', training_accuracy)



Accuracy on training: 
0.953478446436193
Accuracy on training:  0.953478446436193


In [40]:
validation_accuracy = lr_model.score(X_validation, y_validation)
print('Accuracy on validation: ', validation_accuracy)

Accuracy on validation:  0.93854033290653


In [41]:
test_accuracy = lr_model.score(X_test, y_test)
print('Accuracy on test: ', test_accuracy)

Accuracy on test:  0.9436619718309859


**MLP Classifier Model**

In [42]:
from sklearn.neural_network import MLPClassifier


In [43]:
mlp_model = MLPClassifier(hidden_layer_sizes=(10), random_state=42, activation='logistic', solver='lbfgs')
mlp_model.fit(X_train,y_train)
training_accuracy = mlp_model.score(X_train, y_train)
print('Accuracy on training: ', training_accuracy)

# print('Accuracy on training: ')
# y_pred_train=clf.predict(X_train)
# print(accuracy_score(Y_train,y_pred_train))

Accuracy on training:  0.9462227912932138


In [44]:
validation_accuracy = mlp_model.score(X_validation, y_validation)
print('Accuracy on validation: ', validation_accuracy)

Accuracy on validation:  0.9353393085787451


In [45]:
test_accuracy = mlp_model.score(X_test, y_test)
print('Accuracy on test: ', test_accuracy)

Accuracy on test:  0.9436619718309859


**Decision Tree Classifier**

In [46]:
from sklearn.tree import DecisionTreeClassifier


In [47]:
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train, y_train)
training_accuracy = dt_model.score(X_train, y_train)
print('Accuracy on training: ', training_accuracy)

# accuracy_score(clf.predict(X_train), y_train)

Accuracy on training:  1.0


In [48]:
validation_accuracy = dt_model.score(X_validation, y_validation)
print('Accuracy on validation: ', validation_accuracy)

Accuracy on validation:  0.9340588988476313


In [49]:
test_accuracy = dt_model.score(X_test, y_test)
print('Accuracy on test: ', test_accuracy)

Accuracy on test:  0.9314980793854033
