In [1]:
import pandas as pd
import numpy as np

In [75]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, train_test_split

In [3]:
from sklearn.metrics import balanced_accuracy_score, accuracy_score

In [4]:
import tensorflow as tf
from tensorflow.keras import layers, models

In [5]:
def map_feature_importance(alg, data):
    try:
        imp_list = list(alg.feature_importances_)
    except AttributeError:
        imp_list = list(alg.coef_)
        imp_list = imp_list[0]
    feat_list = data.columns.to_list()
    to_sort = list(zip(feat_list, imp_list))
    sorted_list = sorted(to_sort, key=lambda x: x[1], reverse=True)
    return sorted_list

In [6]:
def pick_overthreshold(seq, thresh):
    if len(seq) == 1:
        seq = seq[0]
    picked = [i[0] for i in seq if i[1] > thresh]
    return picked

In [7]:
def choose_best_feats(algo, X_train, X_test, y_train, y_test):
    best_feats = []
    best_ba = 0
    best_acc = 0
    algo.fit(X_train, y_train)
    #preds = random_forest_simples.predict(X_test)
    mapped = map_feature_importance(algo, X_train)
    for i in np.linspace(0.01, 0.1, 10):
        feats = pick_overthreshold(mapped, i)
        algo.fit(X_train[feats], y_train)
        pr = algo.predict(X_test[feats])
        ba = balanced_accuracy_score(y_test, pr)
        acc = accuracy_score(y_test, pr)
        if ba > best_ba:
            best_feats = feats
            best_ba = ba
            best_acc = acc
    print(f'features are: {best_feats}')
    print(f'balanced accuracy is {best_ba}')
    print(f'accuracy is {best_acc}')
    print()

In [8]:
def map_coefs(dataset, logit):
    for pair in list(zip(dataset.columns, logit.coef_[0])):
        print(pair)

In [9]:
data = pd.read_csv('dataset_exp.csv', encoding = 'mac_cyrillic', sep=';')

In [10]:
data

Unnamed: 0,Word,Transcription,Syllables,frequency,IPM,TF,PoS,isPr,Type,is_compound,Reduced
0,вижу,v'iZu,2,2817,216.670000,1,2,0,2,0,0
1,все,fs'e:,1,49301,3792.010000,7,9,0,2,0,0
2,руки,ruk'@,2,2599,199.900000,1,1,0,2,0,0
3,подняты,podn'itQ,3,11,0.850000,1,5,0,2,0,0
4,давайте,d@vaIt'i,3,7185,552.640000,2,2,0,2,0,1
...,...,...,...,...,...,...,...,...,...,...,...
15356,всего,fs'ivo,2,8146,626.550000,1,9,0,2,0,0
15357,повезло,pQv'izlo,3,540,41.534391,1,2,0,2,0,0
15358,Григоровичу,gr'igeroiCu:,5,0,0.000000,1,1,0,2,0,1
15359,и,Q,1,332602,25582.270000,49,8,1,2,0,0


In [11]:
data['PoS'] = data['PoS'].astype('category')
data['Type'] = data['Type'].astype('category')

In [12]:
data['IPM'] = data['IPM'].astype('float')

In [13]:
parts_of_speech = {1: 'is_noun', 2: 'is_verb', 3: 'is_adj', 4: 'is_adv', 5: 'is_part', 6: 'is_pronoun', 7: 'is_prep',
                  8: 'is_conj', 9: 'is_pr_adj', 10: 'is_deepr', 11: 'is_particle', 12: 'is_pr_adv', 13: 'is_stat', 14: 'is_numer',
                  15: 'is_inter', 16: 'is_vvod', 17: 'is_pr_noun', 18: 'is_pr_num'}

In [14]:
for i in parts_of_speech:
    data[parts_of_speech[i]] = np.where(data['PoS'] == i, 1, 0)

In [15]:
data = data.drop('PoS', axis=1)

In [16]:
data = data.drop('Word', axis=1)
data = data.drop('Transcription', axis=1)

In [17]:
simples = data[data['is_compound']==0]

In [18]:
simples_y = simples['Reduced']

In [19]:
simples_X = simples.drop('Reduced', axis=1)

In [20]:
simples_X = simples_X.drop('is_compound', axis=1)

In [21]:
simples_X = simples_X.drop('frequency', axis=1)

In [22]:
compounds = data[data['is_compound']==1]

In [23]:
compounds_y = compounds['Reduced']

In [24]:
compounds_X = compounds.drop('Reduced', axis=1)

In [25]:
compounds_X = compounds_X.drop('is_compound', axis=1)

In [26]:
compounds_X = compounds_X.drop('frequency', axis=1)

In [27]:
X_simples_train, X_simples_test, y_simples_train, y_simples_test = train_test_split(simples_X, 
                                                                                    simples_y, test_size = 0.2, 
                                                                                    random_state = 0)

In [28]:
X_compound_train, X_compound_test, y_compound_train, y_compound_test = train_test_split(compounds_X, 
                                                                                        compounds_y, test_size = 0.2, 
                                                                                        random_state = 0)

In [29]:
data_normalized = data.copy()

In [30]:
mean_IPM = data_normalized['IPM'].mean()

In [31]:
std_IPM = data_normalized['IPM'].std()

In [32]:
data_normalized['IPM'] -= mean_IPM
data_normalized['IPM'] /= std_IPM

In [33]:
mean_TF = data_normalized['TF'].mean()
std_TF = data_normalized['TF'].std()

In [34]:
data_normalized['TF'] -= mean_TF
data_normalized['TF'] /= std_TF

In [35]:
data_normalized = data_normalized.drop('frequency', axis=1)

In [36]:
simple_norm = data_normalized[data_normalized['is_compound']==0]

In [37]:
simple_norm_y = simple_norm['Reduced']

In [38]:
simple_norm = simple_norm.drop('Reduced', axis=1)

In [39]:
simple_norm = simple_norm.drop('is_compound', axis=1)

In [40]:
compounds_norm = data_normalized[data_normalized['is_compound']==1]

In [41]:
compounds_norm_y = compounds_norm['Reduced']

In [42]:
compounds_norm = compounds_norm.drop('Reduced', axis=1)

In [43]:
compounds_norm = compounds_norm.drop('is_compound', axis=1)

In [44]:
X_simples_norm_train, X_simples_norm_test, y_simples_norm_train, y_simples_norm_test = train_test_split(simple_norm, 
                                                                                    simple_norm_y, test_size = 0.2, 
                                                                                    random_state = 0)

In [45]:
X_compound_norm_train, X_compound_norm_test, y_compound_norm_train, y_compound_norm_test = train_test_split(
    compounds_norm, compounds_norm_y, test_size = 0.2, random_state = 0)

In [46]:
data_log = data.copy()

In [47]:
data_log['IPM'] = np.log(data_log['IPM'])
data_log['TF'] = np.log(data_log['TF'])

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [49]:
data_log = data_log.drop('frequency', axis=1)

In [50]:
simple_log = data_log[data_log['is_compound']==0]

In [51]:
simple_log_y = simple_log['Reduced']

In [52]:
simple_log = simple_log.drop('Reduced', axis=1)

In [53]:
simple_log = simple_log.drop('is_compound', axis=1)

In [54]:
simple_log['IPM'] = np.where(simple_log['IPM'] == -np.inf, 0, simple_log['IPM'])

In [55]:
X_simples_log_train, X_simples_log_test, y_simples_log_train, y_simples_log_test = train_test_split(simple_log, 
                                                                                    simple_log_y, test_size = 0.2, 
                                                                                    random_state = 0)

In [56]:
compound_log = data_log[data_log['is_compound']==1]

In [57]:
compound_log_y = compound_log['Reduced']

In [58]:
compound_log = compound_log.drop('Reduced', axis=1)

In [59]:
compound_log = compound_log.drop('is_compound', axis=1)

In [60]:
compound_log['IPM'] = np.where(compound_log['IPM'] == -np.inf, 0, compound_log['IPM'])

In [61]:
X_compound_log_train, X_compound_log_test, y_compound_log_train, y_compound_log_test = train_test_split(compound_log, 
                                                                                    compound_log_y, test_size = 0.2, 
                                                                                    random_state = 0)

In [63]:
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

In [70]:
c = [0.0001, 0.001, 0.01, 0.1, 1, 10]

# compound

In [64]:
random_forest_compounds = RandomForestClassifier(n_estimators=300, random_state=0)

In [1089]:
for i in nn_compound[1]:
    print(i, i in nn_simple[1])

is_vvod False
isPr True
Type True
is_adj True
Syllables True
TF True
is_prep True
IPM True
is_conj True
is_verb False
is_pr_adv False
is_pronoun True
is_adv False
is_particle True


In [66]:
random_forest_compounds.fit(X_compound_train, y_compound_train)

RandomForestClassifier(n_estimators=300, random_state=0)

In [67]:
random_forest_compounds_preds = random_forest_compounds.predict(X_compound_test)

In [68]:
choose_best_feats(random_forest_compounds, X_compound_train, X_compound_test, 
                  y_compound_train, y_compound_test)

features are: ['IPM', 'Syllables', 'TF', 'is_vvod']
balanced accuracy is 0.8364130434782608
accuracy is 0.8372093023255814



In [71]:
list_of_lists = []
for i in c:
    per_sn = Perceptron(penalty='l1', alpha=i)
    per_sn.fit(X_compound_log_train, y_compound_log_train)
    preds = per_sn.predict(X_compound_log_test)
    list_feats = sorted(map_feature_importance(per_sn, X_compound_log_train), key=lambda x: x[1], reverse=True)
    feats = [i[0] for i in list_feats if i[1] != 0]
    if feats:
        list_of_lists.append(feats)


In [72]:
max_ba = 0
for feats in list_of_lists:
    for i in c:
        per = Perceptron(penalty='l1', alpha=i)
        per.fit(X_compound_log_train[feats], y_compound_log_train)
        preds = per.predict(X_compound_log_test[feats])
        ba = balanced_accuracy_score(y_compound_log_test, preds)
        if ba > max_ba:
            max_ba = ba
            print(f'C is {i}')
            print(f'Features are {feats}')
            print(f'ba is {ba}')
            print(accuracy_score(y_compound_log_test, preds))
            print()

C is 0.0001
Features are ['is_adj', 'isPr', 'is_vvod', 'TF', 'is_prep', 'is_numer', 'is_inter', 'is_pr_noun', 'Type', 'Syllables', 'is_pr_adj', 'is_noun', 'IPM', 'is_conj', 'is_verb', 'is_adv', 'is_pronoun', 'is_pr_adv', 'is_particle']
ba is 0.5836956521739131
0.5930232558139535

C is 0.001
Features are ['is_adj', 'isPr', 'is_vvod', 'TF', 'is_prep', 'is_numer', 'is_inter', 'is_pr_noun', 'Type', 'Syllables', 'is_pr_adj', 'is_noun', 'IPM', 'is_conj', 'is_verb', 'is_adv', 'is_pronoun', 'is_pr_adv', 'is_particle']
ba is 0.6967391304347825
0.686046511627907



In [76]:
gb_compound = GradientBoostingClassifier(random_state=0, n_estimators=300)

In [77]:
gb_compound.fit(X_compound_train, y_compound_train)

GradientBoostingClassifier(n_estimators=300, random_state=0)

In [78]:
choose_best_feats(gb_compound, X_compound_train, X_compound_test, y_compound_train, y_compound_test)

features are: ['IPM', 'Syllables', 'TF']
balanced accuracy is 0.8130434782608695
accuracy is 0.813953488372093



In [79]:
nn_compound = [['IPM', 'Syllables', 'TF', 'is_vvod'],
['is_adj', 'isPr', 'is_vvod', 'TF', 'is_prep', 'is_numer', 
 'is_inter', 'is_pr_noun', 'Type', 'Syllables', 'is_pr_adj', 
 'is_noun', 'IPM', 'is_conj', 'is_verb', 'is_adv', 'is_pronoun', 'is_pr_adv', 'is_particle'],
['IPM', 'Syllables', 'TF']]

In [145]:
parameters_grid = {
    'fit_intercept': [True, False],
    'max_iter': np.linspace(100, 900, num=17, dtype=int),
    'penalty': ['l1', 'l2'],   
    'solver': ['liblinear', 'saga'], 
    'C': c,
}

In [146]:
log_compound = LogisticRegression(random_state=0)

In [147]:
grid_compound4 = RandomizedSearchCV(log_compound, parameters_grid, scoring = 'balanced_accuracy', cv = kf, random_state=0)

In [148]:
X_compound_4_train = X_compound_log_train[nn_compound[0]]
X_compound_4_test = X_compound_log_test[nn_compound[0]]

In [149]:
grid_compound4.fit(X_compound_4_train, y_compound_log_train)

RandomizedSearchCV(cv=StratifiedKFold(n_splits=10, random_state=0, shuffle=True),
                   estimator=LogisticRegression(random_state=0),
                   param_distributions={'C': [0.0001, 0.001, 0.01, 0.1, 1, 10],
                                        'fit_intercept': [True, False],
                                        'max_iter': array([100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600, 650, 700,
       750, 800, 850, 900]),
                                        'penalty': ['l1', 'l2'],
                                        'solver': ['liblinear', 'saga']},
                   random_state=0, scoring='balanced_accuracy')

In [150]:
grid_compound4.best_params_

{'solver': 'saga',
 'penalty': 'l2',
 'max_iter': 600,
 'fit_intercept': True,
 'C': 0.1}

In [151]:
log_compound4 = LogisticRegression(random_state=0, solver='saga', penalty='l2', max_iter=600, fit_intercept=True, C=0.1)

In [152]:
log_compound4.fit(X_compound_4_train, y_compound_log_train)

LogisticRegression(C=0.1, max_iter=600, random_state=0, solver='saga')

In [153]:
preds_compound_4 = log_compound4.predict(X_compound_4_test)

In [154]:
map_coefs(X_compound_4_train, log_compound4)

('IPM', 0.03369022515099606)
('Syllables', 0.4000754109048525)
('TF', 0.19743211913736905)
('is_vvod', 0.7813460957216112)


In [155]:
balanced_accuracy_score(y_compound_log_test, preds_compound_4)

0.6777173913043478

In [156]:
accuracy_score(y_compound_log_test, preds_compound_4)

0.6744186046511628

In [157]:
X_compound_19_train = X_compound_log_train[nn_compound[1]]
X_compound_19_test = X_compound_log_test[nn_compound[1]]

In [158]:
grid_compound19 = RandomizedSearchCV(log_compound, parameters_grid, scoring = 'balanced_accuracy', cv = kf, random_state=0)

In [159]:
grid_compound19.fit(X_compound_19_train, y_compound_log_train)

RandomizedSearchCV(cv=StratifiedKFold(n_splits=10, random_state=0, shuffle=True),
                   estimator=LogisticRegression(random_state=0),
                   param_distributions={'C': [0.0001, 0.001, 0.01, 0.1, 1, 10],
                                        'fit_intercept': [True, False],
                                        'max_iter': array([100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600, 650, 700,
       750, 800, 850, 900]),
                                        'penalty': ['l1', 'l2'],
                                        'solver': ['liblinear', 'saga']},
                   random_state=0, scoring='balanced_accuracy')

In [160]:
grid_compound19.best_params_

{'solver': 'saga',
 'penalty': 'l2',
 'max_iter': 600,
 'fit_intercept': True,
 'C': 0.1}

In [161]:
log_compound19 = LogisticRegression(random_state=0, solver='saga', penalty='l2', max_iter=350, fit_intercept=True, C=0.1)

In [162]:
log_compound19.fit(X_compound_19_train, y_compound_log_train)

LogisticRegression(C=0.1, max_iter=350, random_state=0, solver='saga')

In [163]:
preds_compound_19 = log_compound19.predict(X_compound_19_test)

In [164]:
map_coefs(X_compound_19_train, log_compound19)

('is_adj', 0.1298912484926274)
('isPr', 0.2938933155793754)
('is_vvod', 0.6707690967582922)
('TF', 0.24989368507586013)
('is_prep', 0.1503735881755562)
('is_numer', 0.055563756519695515)
('is_inter', 0.0934286794975457)
('is_pr_noun', 0.06431368522252282)
('Type', 0.39380368373171576)
('Syllables', 0.390571910336091)
('is_pr_adj', 0.007068752560193474)
('is_noun', 0.07236935243378098)
('IPM', 0.03500017913213787)
('is_conj', -0.1656317488391209)
('is_verb', -0.12435543257569677)
('is_adv', -0.38350720624915785)
('is_pronoun', -0.10779481322572254)
('is_pr_adv', -0.11100096828632966)
('is_particle', -0.3547966592201097)


In [194]:
balanced_accuracy_score(y_compound_log_test, preds_compound_19)

0.7027173913043478

In [166]:
accuracy_score(y_compound_log_test, preds_compound_19)

0.6976744186046512

In [167]:
X_compound_3_train = X_compound_log_train[nn_compound[-1]]
X_compound_3_test = X_compound_log_test[nn_compound[-1]]

In [170]:
grid_compound3 = RandomizedSearchCV(log_compound, parameters_grid, scoring = 'balanced_accuracy', cv = kf, random_state=0)

In [171]:
grid_compound3.fit(X_compound_3_train, y_compound_log_train)

RandomizedSearchCV(cv=StratifiedKFold(n_splits=10, random_state=0, shuffle=True),
                   estimator=LogisticRegression(random_state=0),
                   param_distributions={'C': [0.0001, 0.001, 0.01, 0.1, 1, 10],
                                        'fit_intercept': [True, False],
                                        'max_iter': array([100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600, 650, 700,
       750, 800, 850, 900]),
                                        'penalty': ['l1', 'l2'],
                                        'solver': ['liblinear', 'saga']},
                   random_state=0, scoring='balanced_accuracy')

In [172]:
grid_compound3.best_params_

{'solver': 'saga',
 'penalty': 'l2',
 'max_iter': 600,
 'fit_intercept': True,
 'C': 0.1}

In [173]:
log_compound3 = LogisticRegression(random_state=0, solver='saga', penalty='l2', max_iter=600, fit_intercept=True, C=0.1)

In [174]:
log_compound3.fit(X_compound_3_train, y_compound_log_train)

LogisticRegression(C=0.1, max_iter=600, random_state=0, solver='saga')

In [175]:
preds_compound_3 = log_compound3.predict(X_compound_3_test)

In [176]:
map_coefs(X_compound_3_train, log_compound3)

('IPM', 0.051363718609410663)
('Syllables', 0.4174419739068904)
('TF', 0.2491663261968389)


In [177]:
balanced_accuracy_score(y_compound_log_test, preds_compound_3)

0.6309782608695653

In [178]:
accuracy_score(y_compound_log_test, preds_compound_3)

0.627906976744186

In [179]:
from numpy.random import seed
seed(1)
tf.random.set_seed(1)

In [180]:
keras_compound_4_train = np.array(X_compound_log_train[nn_compound[0]]) 
keras_compound_4_test = np.array(X_compound_log_test[nn_compound[0]])

In [181]:
nn_compound_4 = models.Sequential()

In [182]:
nn_compound_4.add(layers.Dense(256, activation='relu'))
nn_compound_4.add(layers.Dense(256, activation='relu'))
nn_compound_4.add(layers.Dense(256, activation='relu'))
nn_compound_4.add(layers.Dense(128, activation='relu'))
nn_compound_4.add(layers.Dense(128, activation='relu'))
nn_compound_4.add(layers.Dense(128, activation='relu'))
nn_compound_4.add(layers.Dense(64, activation='relu'))
nn_compound_4.add(layers.Dense(1, activation='sigmoid'))

In [183]:
nn_compound_4.compile(optimizer='rmsprop', loss = 'binary_crossentropy', metrics=['binary_accuracy'])

In [184]:
from numpy.random import seed
seed(1)
tf.random.set_seed(1)
nn_compound_4.fit(keras_compound_4_train, y_compound_log_train, epochs=50, batch_size=512)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x1491175e0>

In [185]:
results_compound4 = nn_compound_4.evaluate(keras_compound_4_test, y_compound_log_test)



In [195]:
keras_compound_19_train = np.array(X_compound_log_train[nn_compound[1]]) 
keras_compound_19_test = np.array(X_compound_log_test[nn_compound[1]])

In [196]:
nn_compound_19 = models.Sequential()

In [197]:
nn_compound_19.add(layers.Dense(128, activation='relu'))
nn_compound_19.add(layers.Dense(128, activation='relu'))
nn_compound_19.add(layers.Dense(128, activation='relu'))
nn_compound_19.add(layers.Dense(128, activation='relu'))
nn_compound_19.add(layers.Dense(128, activation='relu'))
nn_compound_19.add(layers.Dense(128, activation='relu'))
nn_compound_19.add(layers.Dense(64, activation='relu'))
nn_compound_19.add(layers.Dense(1, activation='sigmoid'))

In [198]:
nn_compound_19.compile(optimizer='rmsprop', loss = 'binary_crossentropy', metrics=['binary_accuracy'])

In [199]:
from numpy.random import seed
seed(1)
tf.random.set_seed(1)
nn_compound_19.fit(keras_compound_19_train, y_compound_log_train, epochs=50, batch_size=512)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x14acc45b0>

In [200]:
results_compound19 = nn_compound_19.evaluate(keras_compound_19_test, y_compound_log_test)



In [201]:
keras_compound_3_train = np.array(X_compound_log_train[nn_compound[-1]]) 
keras_compound_3_test = np.array(X_compound_log_test[nn_compound[-1]])

In [202]:
nn_compound_3 = models.Sequential()

In [203]:
nn_compound_3.add(layers.Dense(256, activation='relu'))
nn_compound_3.add(layers.Dense(256, activation='relu'))
nn_compound_3.add(layers.Dense(256, activation='relu'))
nn_compound_3.add(layers.Dense(256, activation='relu'))
nn_compound_3.add(layers.Dense(128, activation='relu'))
nn_compound_3.add(layers.Dense(128, activation='relu'))
nn_compound_3.add(layers.Dense(128, activation='relu'))
nn_compound_3.add(layers.Dense(128, activation='relu'))
nn_compound_3.add(layers.Dense(128, activation='relu'))
nn_compound_3.add(layers.Dense(64, activation='relu'))
nn_compound_3.add(layers.Dense(1, activation='sigmoid'))

In [204]:
nn_compound_3.compile(optimizer='rmsprop', loss = 'binary_crossentropy', metrics=['binary_accuracy'])

In [205]:
from numpy.random import seed
seed(1)
tf.random.set_seed(1)
nn_compound_3.fit(keras_compound_3_train, y_compound_log_train, epochs=50, batch_size=512)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x14adb5310>

In [206]:
results_compound3 = nn_compound_3.evaluate(keras_compound_3_test, y_compound_log_test)




# Simple

In [207]:
random_forest_simples = RandomForestClassifier(n_estimators=300, random_state=0)

In [208]:
random_forest_simples.fit(X_simples_train, y_simples_train)

RandomForestClassifier(n_estimators=300, random_state=0)

In [209]:
random_forest_simples_preds = random_forest_simples.predict(X_simples_test)

In [210]:
choose_best_feats(random_forest_simples, X_simples_train, X_simples_test, y_simples_train, y_simples_test)

features are: ['IPM', 'Syllables', 'TF', 'is_adj']
balanced accuracy is 0.7606779515941013
accuracy is 0.7984599933043187



In [213]:
list_of_lists = []
for i in c:
    per_sn = Perceptron(penalty='l1', alpha=i)
    per_sn.fit(X_simples_log_train, y_simples_log_train)
    preds = per_sn.predict(X_simples_log_test)
    list_feats = sorted(map_feature_importance(per_sn, X_simples_log_train), key=lambda x: x[1], reverse=True)
    feats = [i[0] for i in list_feats if i[1] != 0]
    if feats:
        list_of_lists.append(feats)

In [214]:
max_ba = 0
for feats in list_of_lists:
    for i in c:
        per_sn = Perceptron(penalty='l1', alpha=i)
        per_sn.fit(X_simples_log_train[feats], y_simples_log_train)
        preds = per_sn.predict(X_simples_log_test[feats])
        ba = balanced_accuracy_score(y_simples_log_test, preds)
        if ba > max_ba:
            max_ba = ba
            print(f'Alpha is {i}')
            print(f'Features are {feats}')
            print(f'ba is {ba}')
            print(accuracy_score(y_simples_log_test, preds))
            print()

Alpha is 0.0001
Features are ['is_pr_adj', 'Syllables', 'is_adj', 'isPr', 'is_numer', 'is_pronoun', 'is_deepr', 'is_pr_adv', 'is_conj', 'is_noun', 'is_particle', 'is_pr_noun', 'is_inter', 'is_prep']
ba is 0.7290221405222098
0.7870773351188484

Alpha is 0.0001
Features are ['Syllables', 'is_pronoun', 'is_adj', 'is_pr_adj', 'is_noun', 'is_prep']
ba is 0.7476576065247695
0.771342484097757



In [215]:
gb_simples = GradientBoostingClassifier(random_state=0, n_estimators=300)

In [216]:
gb_simples.fit(X_simples_train, y_simples_train)

GradientBoostingClassifier(n_estimators=300, random_state=0)

In [218]:
choose_best_feats(gb_simples, X_simples_train, X_simples_test, y_simples_train, y_simples_test)

features are: ['Syllables', 'IPM', 'is_adj', 'is_noun', 'TF', 'is_pr_adj', 'is_adv']
balanced accuracy is 0.7620044162842073
accuracy is 0.8041513223970539



In [219]:
nn_simple = [['IPM', 'Syllables', 'TF', 'is_adj'],
             ['Syllables', 'is_pronoun', 'is_adj', 'is_pr_adj', 'is_noun', 'is_prep'],
             ['Syllables', 'IPM', 'is_adj', 'is_noun', 'TF', 'is_pr_adj', 'is_adv'],
            ]

In [220]:
keras_simple_4_train = np.array(X_simples_train[nn_simple[0]]) 
keras_simple_4_test = np.array(X_simples_test[nn_simple[0]])

In [221]:
log_simple = LogisticRegression(random_state=0)

In [222]:
grid_simple = RandomizedSearchCV(log_simple, parameters_grid, scoring = 'balanced_accuracy', cv = kf, random_state=0)

In [223]:
X_simples_4_train = X_simples_log_train[nn_simple[0]]
X_simples_4_test = X_simples_log_test[nn_simple[0]]

In [224]:
grid_simple.fit(X_simples_4_train, y_simples_log_train)

RandomizedSearchCV(cv=StratifiedKFold(n_splits=10, random_state=0, shuffle=True),
                   estimator=LogisticRegression(random_state=0),
                   param_distributions={'C': [0.0001, 0.001, 0.01, 0.1, 1, 10],
                                        'fit_intercept': [True, False],
                                        'max_iter': array([100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600, 650, 700,
       750, 800, 850, 900]),
                                        'penalty': ['l1', 'l2'],
                                        'solver': ['liblinear', 'saga']},
                   random_state=0, scoring='balanced_accuracy')

In [225]:
grid_simple.best_params_

{'solver': 'saga',
 'penalty': 'l2',
 'max_iter': 600,
 'fit_intercept': True,
 'C': 0.1}

In [226]:
log_simple4 = LogisticRegression(random_state=0, solver='liblinear', penalty='l2', max_iter=600, fit_intercept=True, C=0.1)

In [227]:
log_simple4.fit(X_simples_4_train, y_simples_log_train)

LogisticRegression(C=0.1, max_iter=600, random_state=0, solver='liblinear')

In [228]:
preds_simple_4 = log_compound4.predict(X_simples_4_test)

In [229]:
balanced_accuracy_score(y_simples_log_test, preds_simple_4)

0.666703774730784

In [230]:
accuracy_score(y_simples_log_test, preds_simple_4)

0.7539337127552729

In [231]:
map_coefs(X_simples_4_train, log_simple4)

('IPM', 0.02549209325596713)
('Syllables', 0.9386286942695315)
('TF', -0.06824970470519526)
('is_adj', 1.2363856134869164)


In [234]:
X_simples_6_train = X_simples_log_train[nn_simple[1]]
X_simples_6_test = X_simples_log_test[nn_simple[1]]

In [235]:
grid_simple.fit(X_simples_6_train, y_simples_log_train)

RandomizedSearchCV(cv=StratifiedKFold(n_splits=10, random_state=0, shuffle=True),
                   estimator=LogisticRegression(random_state=0),
                   param_distributions={'C': [0.0001, 0.001, 0.01, 0.1, 1, 10],
                                        'fit_intercept': [True, False],
                                        'max_iter': array([100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600, 650, 700,
       750, 800, 850, 900]),
                                        'penalty': ['l1', 'l2'],
                                        'solver': ['liblinear', 'saga']},
                   random_state=0, scoring='balanced_accuracy')

In [236]:
grid_simple.best_params_

{'solver': 'saga',
 'penalty': 'l2',
 'max_iter': 600,
 'fit_intercept': True,
 'C': 0.1}

In [237]:
log_simple6 = LogisticRegression(random_state=0, solver='saga', penalty='l2', max_iter=600, fit_intercept=True, C=0.1)

In [238]:
log_simple6.fit(X_simples_13_train, y_simples_log_train)

LogisticRegression(C=0.1, max_iter=600, random_state=0, solver='saga')

In [239]:
preds_simple_6 = log_simple6.predict(X_simples_13_test)

In [241]:
balanced_accuracy_score(y_simples_log_test, preds_simple_6)

0.7212926241736026

In [242]:
accuracy_score(y_simples_log_test, preds_simple_6)

0.7830599263475059

In [245]:
map_coefs(X_simples_13_train, log_simple6)

('Syllables', 1.0411168655321794)
('is_pronoun', 0.29600925091435426)
('is_adj', 1.1049702405999002)
('is_pr_adj', 0.5381738548541988)
('is_noun', -0.549106113277044)
('is_prep', -0.9969025823403015)


In [246]:
X_simples_7_train = X_simples_log_train[nn_simple[-1]]
X_simples_7_test = X_simples_log_test[nn_simple[-1]]

In [247]:
grid_simple.fit(X_simples_7_train, y_simples_log_train)

RandomizedSearchCV(cv=StratifiedKFold(n_splits=10, random_state=0, shuffle=True),
                   estimator=LogisticRegression(random_state=0),
                   param_distributions={'C': [0.0001, 0.001, 0.01, 0.1, 1, 10],
                                        'fit_intercept': [True, False],
                                        'max_iter': array([100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600, 650, 700,
       750, 800, 850, 900]),
                                        'penalty': ['l1', 'l2'],
                                        'solver': ['liblinear', 'saga']},
                   random_state=0, scoring='balanced_accuracy')

In [248]:
grid_simple.best_params_

{'solver': 'saga',
 'penalty': 'l2',
 'max_iter': 600,
 'fit_intercept': True,
 'C': 0.1}

In [249]:
log_simple7 = LogisticRegression(random_state=0, solver='liblinear', penalty='l2', max_iter=600, fit_intercept=True, C=0.1)

In [250]:
log_simple7.fit(X_simples_7_train, y_simples_log_train)

LogisticRegression(C=0.1, max_iter=600, random_state=0, solver='liblinear')

In [251]:
preds_simple_7 = log_simple7.predict(X_simples_7_test)

In [252]:
balanced_accuracy_score(y_simples_log_test, preds_simple_7)

0.7207803290916354

In [253]:
accuracy_score(y_simples_log_test, preds_simple_7)

0.7827251422832273

In [254]:
map_coefs(X_simples_7_train, log_simple7)

('Syllables', 0.9863263211753793)
('IPM', 0.005001358613569071)
('is_adj', 1.048385533297452)
('is_noun', -0.5596129289594934)
('TF', -0.08166387943609986)
('is_pr_adj', 0.5299716336785135)
('is_adv', 0.0588698768648131)


In [255]:
keras_simple_4_train = np.array(X_simples_log_train[nn_simple[0]]) 
keras_simple_4_test = np.array(X_simples_log_test[nn_simple[0]])

In [256]:
nn_simple4 = models.Sequential()

In [257]:
nn_simple4.add(layers.Dense(256, activation='relu'))
nn_simple4.add(layers.Dense(256, activation='relu'))
nn_simple4.add(layers.Dense(256, activation='relu'))
nn_simple4.add(layers.Dense(256, activation='relu'))
nn_simple4.add(layers.Dense(128, activation='relu'))
nn_simple4.add(layers.Dense(128, activation='relu'))
nn_simple4.add(layers.Dense(128, activation='relu'))
nn_simple4.add(layers.Dense(64, activation='relu'))
nn_simple4.add(layers.Dense(1, activation='sigmoid'))

In [258]:
nn_simple4.compile(optimizer='rmsprop', loss = 'binary_crossentropy', metrics=['binary_accuracy'])

In [259]:
from numpy.random import seed
seed(1)
tf.random.set_seed(1)
nn_simple4.fit(keras_simple_4_train, y_simples_log_train, epochs=30, batch_size=512)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x14b09c550>

In [260]:
results_simple4 = nn_simple4.evaluate(keras_simple_4_test, y_simples_log_test)



In [262]:
keras_simple_6_train = np.array(X_simples_log_train[nn_simple[1]]) 
keras_simple_6_test = np.array(X_simples_log_test[nn_simple[1]])

In [280]:
nn_simple6 = models.Sequential()

In [281]:
nn_simple6.add(layers.Dense(256, activation='relu'))
nn_simple6.add(layers.Dense(256, activation='relu'))
nn_simple6.add(layers.Dense(128, activation='relu'))
nn_simple6.add(layers.Dense(128, activation='relu'))
nn_simple6.add(layers.Dense(128, activation='relu'))
nn_simple6.add(layers.Dense(64, activation='relu'))
nn_simple6.add(layers.Dense(1, activation='sigmoid'))

In [282]:
nn_simple6.compile(optimizer='rmsprop', loss = 'binary_crossentropy', metrics=['binary_accuracy'])

In [283]:
from numpy.random import seed
seed(1)
tf.random.set_seed(1)
nn_simple6.fit(keras_simple_6_train, y_simples_log_train, epochs=30, batch_size=512)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x14bad7280>

In [284]:
results_simple6 = nn_simple6.evaluate(keras_simple_6_test, y_simples_log_test)



In [270]:
keras_simple_7_train = np.array(X_simples_log_train[nn_simple[-1]]) 
keras_simple_7_test = np.array(X_simples_log_test[nn_simple[-1]])

In [271]:
nn_simple7 = models.Sequential()

In [272]:
nn_simple7.add(layers.Dense(512, activation='relu'))
nn_simple7.add(layers.Dense(512, activation='relu'))
nn_simple7.add(layers.Dense(256, activation='relu'))
nn_simple7.add(layers.Dense(256, activation='relu'))
nn_simple7.add(layers.Dense(256, activation='relu'))
nn_simple7.add(layers.Dense(256, activation='relu'))
nn_simple7.add(layers.Dense(128, activation='relu'))
nn_simple7.add(layers.Dense(128, activation='relu'))
nn_simple7.add(layers.Dense(128, activation='relu'))
nn_simple7.add(layers.Dense(64, activation='relu'))
nn_simple7.add(layers.Dense(1, activation='sigmoid'))

In [273]:
nn_simple7.compile(optimizer='rmsprop', loss = 'binary_crossentropy', metrics=['binary_accuracy'])

In [274]:
from numpy.random import seed
seed(1)
tf.random.set_seed(1)
nn_simple7.fit(keras_simple_7_train, y_simples_log_train, epochs=30, batch_size=512)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x14b54f220>

In [338]:
results_simple7 = nn_simple7.evaluate(keras_simple_7_test, y_simples_log_test)

