In [1]:
import pandas as pd
import numpy as np

In [590]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, train_test_split

In [3]:
from sklearn.metrics import balanced_accuracy_score, accuracy_score

In [1265]:
import tensorflow as tf
from tensorflow.keras import layers, models

In [259]:
def map_feature_importance(alg, data):
    try:
        imp_list = list(alg.feature_importances_)
    except AttributeError:
        imp_list = list(alg.coef_)
        imp_list = imp_list[0]
    feat_list = data.columns.to_list()
    to_sort = list(zip(feat_list, imp_list))
    sorted_list = sorted(to_sort, key=lambda x: x[1], reverse=True)
    return sorted_list

In [205]:
def pick_overthreshold(seq, thresh):
    if len(seq) == 1:
        seq = seq[0]
    picked = [i[0] for i in seq if i[1] > thresh]
    return picked

In [194]:
def choose_best_feats(algo, X_train, X_test, y_train, y_test):
    best_feats = []
    best_ba = 0
    best_acc = 0
    algo.fit(X_train, y_train)
    #preds = random_forest_simples.predict(X_test)
    mapped = map_feature_importance(algo, X_train)
    for i in np.linspace(0.01, 0.1, 10):
        feats = pick_overthreshold(mapped, i)
        algo.fit(X_train[feats], y_train)
        pr = algo.predict(X_test[feats])
        ba = balanced_accuracy_score(y_test, pr)
        acc = accuracy_score(y_test, pr)
        if ba > best_ba:
            best_feats = feats
            best_ba = ba
            best_acc = acc
    print(f'features are: {best_feats}')
    print(f'balanced accuracy is {best_ba}')
    print(f'accuracy is {best_acc}')
    print()

In [888]:
def map_coefs(dataset, logit):
    for pair in list(zip(dataset.columns, logit.coef_[0])):
        print(pair)

In [4]:
data = pd.read_csv('dataset_exp.csv', encoding = 'mac_cyrillic', sep=';')

In [271]:
data

Unnamed: 0,Syllables,frequency,IPM,TF,isPr,Type,is_compound,Reduced,is_noun,is_verb,...,is_pr_adj,is_deepr,is_particle,is_pr_adv,is_stat,is_numer,is_inter,is_vvod,is_pr_noun,is_pr_num
0,2,2817,216.670000,1,0,2,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,1,49301,3792.010000,7,0,2,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,2,2599,199.900000,1,0,2,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,3,11,0.850000,1,0,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,3,7185,552.640000,2,0,2,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15356,2,8146,626.550000,1,0,2,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
15357,3,540,41.534391,1,0,2,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
15358,5,0,0.000000,1,0,2,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
15359,1,332602,25582.270000,49,1,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
data['PoS'] = data['PoS'].astype('category')
data['Type'] = data['Type'].astype('category')

In [7]:
data['IPM'] = data['IPM'].astype('float')

In [8]:
parts_of_speech = {1: 'is_noun', 2: 'is_verb', 3: 'is_adj', 4: 'is_adv', 5: 'is_part', 6: 'is_pronoun', 7: 'is_prep',
                  8: 'is_conj', 9: 'is_pr_adj', 10: 'is_deepr', 11: 'is_particle', 12: 'is_pr_adv', 13: 'is_stat', 14: 'is_numer',
                  15: 'is_inter', 16: 'is_vvod', 17: 'is_pr_noun', 18: 'is_pr_num'}

In [9]:
for i in parts_of_speech:
    data[parts_of_speech[i]] = np.where(data['PoS'] == i, 1, 0)

In [10]:
data = data.drop('PoS', axis=1)

In [11]:
data = data.drop('Word', axis=1)
data = data.drop('Transcription', axis=1)

In [14]:
simples = data[data['is_compound']==0]

In [15]:
simples_y = simples['Reduced']

In [16]:
simples_X = simples.drop('Reduced', axis=1)

In [17]:
simples_X = simples_X.drop('is_compound', axis=1)

In [18]:
simples_X = simples_X.drop('frequency', axis=1)

In [19]:
compounds = data[data['is_compound']==1]

In [20]:
compounds_y = compounds['Reduced']

In [21]:
compounds_X = compounds.drop('Reduced', axis=1)

In [22]:
compounds_X = compounds_X.drop('is_compound', axis=1)

In [23]:
compounds_X = compounds_X.drop('frequency', axis=1)

In [24]:
X_simples_train, X_simples_test, y_simples_train, y_simples_test = train_test_split(simples_X, 
                                                                                    simples_y, test_size = 0.2, 
                                                                                    random_state = 0)

In [26]:
X_compound_train, X_compound_test, y_compound_train, y_compound_test = train_test_split(compounds_X, 
                                                                                        compounds_y, test_size = 0.2, 
                                                                                        random_state = 0)

In [311]:
data_normalized = data.copy()

In [313]:
mean_IPM = data_normalized['IPM'].mean()

In [314]:
std_IPM = data_normalized['IPM'].std()

In [315]:
data_normalized['IPM'] -= mean_IPM
data_normalized['IPM'] /= std_IPM

In [316]:
mean_TF = data_normalized['TF'].mean()
std_TF = data_normalized['TF'].std()

In [317]:
data_normalized['TF'] -= mean_TF
data_normalized['TF'] /= std_TF

In [318]:
data_normalized = data_normalized.drop('frequency', axis=1)

In [319]:
simple_norm = data_normalized[data_normalized['is_compound']==0]

In [320]:
simple_norm_y = simple_norm['Reduced']

In [321]:
simple_norm = simple_norm.drop('Reduced', axis=1)

In [322]:
simple_norm = simple_norm.drop('is_compound', axis=1)

In [323]:
compounds_norm = data_normalized[data_normalized['is_compound']==1]

In [324]:
compounds_norm_y = compounds_norm['Reduced']

In [325]:
compounds_norm = compounds_norm.drop('Reduced', axis=1)

In [326]:
compounds_norm = compounds_norm.drop('is_compound', axis=1)

In [327]:
X_simples_norm_train, X_simples_norm_test, y_simples_norm_train, y_simples_norm_test = train_test_split(simple_norm, 
                                                                                    simple_norm_y, test_size = 0.2, 
                                                                                    random_state = 0)

In [328]:
X_compound_norm_train, X_compound_norm_test, y_compound_norm_train, y_compound_norm_test = train_test_split(
    compounds_norm, compounds_norm_y, test_size = 0.2, random_state = 0)

In [832]:
data_log = data.copy()

In [833]:
data_log['IPM'] = np.log(data_log['IPM'])
data_log['TF'] = np.log(data_log['TF'])

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [834]:
data['TF'].max()

211

In [835]:
data_log = data_log.drop('frequency', axis=1)

In [836]:
simple_log = data_log[data_log['is_compound']==0]

In [838]:
simple_log_y = simple_log['Reduced']

In [839]:
simple_log = simple_log.drop('Reduced', axis=1)

In [840]:
simple_log = simple_log.drop('is_compound', axis=1)

In [841]:
simple_log['IPM'] = np.where(simple_log['IPM'] == -np.inf, 0, simple_log['IPM'])

In [842]:
X_simples_log_train, X_simples_log_test, y_simples_log_train, y_simples_log_test = train_test_split(simple_log, 
                                                                                    simple_log_y, test_size = 0.2, 
                                                                                    random_state = 0)

In [1001]:
compound_log = data_log[data_log['is_compound']==1]

In [1002]:
compound_log_y = compound_log['Reduced']

In [1003]:
compound_log = compound_log.drop('Reduced', axis=1)

In [1004]:
compound_log = compound_log.drop('is_compound', axis=1)

In [1005]:
compound_log['IPM'] = np.where(compound_log['IPM'] == -np.inf, 0, compound_log['IPM'])

In [1006]:
X_compound_log_train, X_compound_log_test, y_compound_log_train, y_compound_log_test = train_test_split(compound_log, 
                                                                                    compound_log_y, test_size = 0.2, 
                                                                                    random_state = 0)

In [843]:
simple_log['IPM'].min()

-2.5650473526598505

In [816]:
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

# compound

In [49]:
random_forest_compounds = RandomForestClassifier(n_estimators=300, random_state=0)

In [1086]:
nn_compound[1]

['is_vvod',
 'isPr',
 'Type',
 'is_adj',
 'Syllables',
 'TF',
 'is_prep',
 'IPM',
 'is_conj',
 'is_verb',
 'is_pr_adv',
 'is_pronoun',
 'is_adv',
 'is_particle']

In [1089]:
for i in nn_compound[1]:
    print(i, i in nn_simple[1])

is_vvod False
isPr True
Type True
is_adj True
Syllables True
TF True
is_prep True
IPM True
is_conj True
is_verb False
is_pr_adv False
is_pronoun True
is_adv False
is_particle True


In [50]:
random_forest_compounds.fit(X_compound_train, y_compound_train)

RandomForestClassifier(n_estimators=300, random_state=0)

In [51]:
random_forest_compounds_preds = random_forest_compounds.predict(X_compound_test)

In [52]:
choose_best_feats(random_forest_compounds, X_compound_train, X_compound_test, 
                  y_compound_train, y_compound_test)

features are: ['IPM', 'Syllables', 'TF', 'is_vvod']
balanced accuracy is 0.8364130434782608
accuracy is 0.8372093023255814



In [1007]:
list_of_lists = []
for i in c:
    log_sn = LogisticRegression(penalty='l1', solver='liblinear', random_state=0, C=i)
    log_sn.fit(X_compound_log_train, y_compound_log_train)
    preds = log_sn.predict(X_compound_log_test)
    list_feats = sorted(map_feature_importance(log_sn, X_compound_log_train), key=lambda x: x[1], reverse=True)
    feats = [i[0] for i in list_feats if i[1] != 0]
    if feats:
        list_of_lists.append(feats)


In [1009]:
max_ba = 0
for feats in list_of_lists:
    for i in c:
        log = LogisticRegression(random_state=0, C=i, max_iter=300)
        log.fit(X_compound_log_train[feats], y_compound_log_train)
        preds = log.predict(X_compound_log_test[feats])
        ba = balanced_accuracy_score(y_compound_log_test, preds)
        if ba > max_ba:
            max_ba = ba
            print(f'C is {i}')
            print(f'Features are {feats}')
            print(f'ba is {ba}')
            print(accuracy_score(y_compound_log_test, preds))
            print()

C is 0.0001
Features are ['is_vvod', 'Syllables', 'TF', 'Type', 'IPM', 'is_adv']
ba is 0.45217391304347826
0.47674418604651164

C is 0.001
Features are ['is_vvod', 'Syllables', 'TF', 'Type', 'IPM', 'is_adv']
ba is 0.7103260869565218
0.7093023255813954

C is 1
Features are ['is_vvod', 'Syllables', 'TF', 'Type', 'IPM', 'is_adv']
ba is 0.7152173913043478
0.7093023255813954

C is 1
Features are ['is_vvod', 'isPr', 'Type', 'is_adj', 'Syllables', 'TF', 'is_prep', 'IPM', 'is_conj', 'is_verb', 'is_pr_adv', 'is_pronoun', 'is_adv', 'is_particle']
ba is 0.7369565217391305
0.7325581395348837

C is 10
Features are ['is_vvod', 'isPr', 'Type', 'is_adj', 'Syllables', 'TF', 'is_prep', 'IPM', 'is_conj', 'is_verb', 'is_pr_adv', 'is_pronoun', 'is_adv', 'is_particle']
ba is 0.7869565217391304
0.7790697674418605



In [383]:
gb_compound = GradientBoostingClassifier(random_state=0, n_estimators=300)

In [384]:
gb_compound.fit(X_compound_train, y_compound_train)

GradientBoostingClassifier(n_estimators=300, random_state=0)

In [385]:
choose_best_feats(gb_compound, X_compound_train, X_compound_test, y_compound_train, y_compound_test)

features are: ['IPM', 'Syllables', 'TF']
balanced accuracy is 0.8130434782608695
accuracy is 0.813953488372093



In [1010]:
nn_compound = [['IPM', 'Syllables', 'TF', 'is_vvod'],
['is_vvod', 'isPr', 'Type', 'is_adj', 'Syllables', 'TF', 
 'is_prep', 'IPM', 'is_conj', 'is_verb', 'is_pr_adv', 'is_pronoun', 
 'is_adv', 'is_particle'],
['IPM', 'Syllables', 'TF']]

In [1013]:
parameters_grid = {
    'fit_intercept': [True, False],
    'max_iter': np.linspace(100, 900, num=17, dtype=int),
    'penalty': ['l1', 'l2'],   
    'solver': ['liblinear', 'saga']
}

In [1014]:
log_compound = LogisticRegression(random_state=0)

In [1015]:
grid_compound = RandomizedSearchCV(log_compound, parameters_grid, scoring = 'balanced_accuracy', cv = kf, random_state=0)

In [1038]:
X_compound_4_train = X_compound_log_train[nn_compound[0]]
X_compound_4_test = X_compound_log_test[nn_compound[0]]

In [1055]:
grid_compound.fit(X_compound_4_train, y_compound_log_train)

RandomizedSearchCV(cv=StratifiedKFold(n_splits=10, random_state=0, shuffle=True),
                   estimator=LogisticRegression(random_state=0),
                   param_distributions={'fit_intercept': [True, False],
                                        'max_iter': array([100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600, 650, 700,
       750, 800, 850, 900]),
                                        'penalty': ['l1', 'l2'],
                                        'solver': ['liblinear', 'saga']},
                   random_state=0, scoring='balanced_accuracy')

In [1056]:
grid_compound.best_params_

{'solver': 'saga', 'penalty': 'l2', 'max_iter': 600, 'fit_intercept': True}

In [1057]:
log_compound4 = LogisticRegression(random_state=0, solver='saga', penalty='l2', max_iter=600, fit_intercept=True)

In [1058]:
log_compound4.fit(X_compound_4_train, y_compound_log_train)

LogisticRegression(max_iter=600, random_state=0, solver='saga')

In [1059]:
preds_compound_4 = log_compound4.predict(X_compound_4_test)

In [1082]:
map_coefs(X_compound_4_train, log_compound4)

('IPM', 0.022740270009378737)
('Syllables', 0.4391103111404314)
('TF', 0.19110694451125598)
('is_vvod', 1.3236324574467557)


In [1060]:
balanced_accuracy_score(y_compound_log_test, preds_compound_4)

0.6777173913043478

In [1061]:
accuracy_score(y_compound_log_test, preds_compound_4)

0.6744186046511628

In [1064]:
X_compound_14_train = X_compound_log_train[nn_compound[1]]
X_compound_14_test = X_compound_log_test[nn_compound[1]]

In [1065]:
grid_compound.fit(X_compound_14_train, y_compound_log_train)

RandomizedSearchCV(cv=StratifiedKFold(n_splits=10, random_state=0, shuffle=True),
                   estimator=LogisticRegression(random_state=0),
                   param_distributions={'fit_intercept': [True, False],
                                        'max_iter': array([100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600, 650, 700,
       750, 800, 850, 900]),
                                        'penalty': ['l1', 'l2'],
                                        'solver': ['liblinear', 'saga']},
                   random_state=0, scoring='balanced_accuracy')

In [1066]:
grid_compound.best_params_

{'solver': 'saga', 'penalty': 'l1', 'max_iter': 650, 'fit_intercept': True}

In [1067]:
log_compound14 = LogisticRegression(random_state=0, solver='saga', penalty='l1', max_iter=650, fit_intercept=True)

In [1068]:
log_compound14.fit(X_compound_14_train, y_compound_log_train)

LogisticRegression(max_iter=650, penalty='l1', random_state=0, solver='saga')

In [1070]:
preds_compound_14 = log_compound14.predict(X_compound_14_test)

In [1083]:
map_coefs(X_compound_14_train, log_compound14)

('is_vvod', 0.9308100252281979)
('isPr', 0.49843764452878175)
('Type', 0.7503354254223632)
('is_adj', 0.4015642253538559)
('Syllables', 0.41775720400960425)
('TF', 0.2793932094226036)
('is_prep', 0.10237928061116898)
('IPM', 0.044219717238296714)
('is_conj', -0.4380931919672399)
('is_verb', -0.2857364513390623)
('is_pr_adv', -0.2514322354202692)
('is_pronoun', -0.5368004405388007)
('is_adv', -0.643736083610058)
('is_particle', -1.6943554007665185)


In [1071]:
balanced_accuracy_score(y_compound_log_test, preds_compound_14)

0.7260869565217392

In [1072]:
accuracy_score(y_compound_log_test, preds_compound_14)

0.7209302325581395

In [1073]:
X_compound_3_train = X_compound_log_train[nn_compound[-1]]
X_compound_3_test = X_compound_log_test[nn_compound[-1]]

In [1074]:
grid_compound.fit(X_compound_3_train, y_compound_log_train)

RandomizedSearchCV(cv=StratifiedKFold(n_splits=10, random_state=0, shuffle=True),
                   estimator=LogisticRegression(random_state=0),
                   param_distributions={'fit_intercept': [True, False],
                                        'max_iter': array([100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600, 650, 700,
       750, 800, 850, 900]),
                                        'penalty': ['l1', 'l2'],
                                        'solver': ['liblinear', 'saga']},
                   random_state=0, scoring='balanced_accuracy')

In [1075]:
grid_compound.best_params_

{'solver': 'saga', 'penalty': 'l1', 'max_iter': 650, 'fit_intercept': True}

In [1076]:
log_compound3 = LogisticRegression(random_state=0, solver='saga', penalty='l1', max_iter=650, fit_intercept=True)

In [1077]:
log_compound3.fit(X_compound_3_train, y_compound_log_train)

LogisticRegression(max_iter=650, penalty='l1', random_state=0, solver='saga')

In [1079]:
preds_compound_3 = log_compound3.predict(X_compound_3_test)

In [1084]:
map_coefs(X_compound_3_train, log_compound3)

('IPM', 0.048932854581450955)
('Syllables', 0.4537047709860716)
('TF', 0.27394140753881496)


In [1080]:
balanced_accuracy_score(y_compound_log_test, preds_compound_3)

0.6309782608695653

In [1081]:
accuracy_score(y_compound_log_test, preds_compound_3)

0.627906976744186

In [1281]:
from numpy.random import seed
seed(1)
tf.random.set_seed(1)

In [1299]:
keras_compound_4_train = np.array(X_compound_log_train[nn_compound[0]]) 
keras_compound_4_test = np.array(X_compound_log_test[nn_compound[0]])

In [1306]:
nn_compound_4 = models.Sequential()

In [1307]:
nn_compound_4.add(layers.Dense(256, activation='relu'))
nn_compound_4.add(layers.Dense(256, activation='relu'))
nn_compound_4.add(layers.Dense(256, activation='relu'))
nn_compound_4.add(layers.Dense(128, activation='relu'))
nn_compound_4.add(layers.Dense(128, activation='relu'))
nn_compound_4.add(layers.Dense(128, activation='relu'))
nn_compound_4.add(layers.Dense(64, activation='relu'))
nn_compound_4.add(layers.Dense(1, activation='sigmoid'))

In [1308]:
nn_compound_4.compile(optimizer='rmsprop', loss = 'binary_crossentropy', metrics=['binary_accuracy'])

In [1309]:
from numpy.random import seed
seed(1)
tf.random.set_seed(1)
nn_compound_4.fit(keras_compound_4_train, y_compound_log_train, epochs=50, batch_size=512)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x1605fc1c0>

In [1310]:
results_compound4 = nn_compound_4.evaluate(keras_compound_4_test, y_compound_log_test)



In [1311]:
keras_compound_14_train = np.array(X_compound_log_train[nn_compound[1]]) 
keras_compound_14_test = np.array(X_compound_log_test[nn_compound[1]])

In [1337]:
nn_compound_14 = models.Sequential()

In [1338]:
nn_compound_14.add(layers.Dense(128, activation='relu'))
nn_compound_14.add(layers.Dense(128, activation='relu'))
nn_compound_14.add(layers.Dense(128, activation='relu'))
nn_compound_14.add(layers.Dense(128, activation='relu'))
nn_compound_14.add(layers.Dense(128, activation='relu'))
nn_compound_14.add(layers.Dense(128, activation='relu'))
nn_compound_14.add(layers.Dense(64, activation='relu'))
nn_compound_14.add(layers.Dense(1, activation='sigmoid'))

In [1339]:
nn_compound_14.compile(optimizer='rmsprop', loss = 'binary_crossentropy', metrics=['binary_accuracy'])

In [1340]:
from numpy.random import seed
seed(1)
tf.random.set_seed(1)
nn_compound_14.fit(keras_compound_13_train, y_compound_log_train, epochs=50, batch_size=512)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x1611a61f0>

In [1341]:
results_compound14 = nn_compound_14.evaluate(keras_compound_14_test, y_compound_log_test)



In [1348]:
keras_compound_3_train = np.array(X_compound_log_train[nn_compound[-1]]) 
keras_compound_3_test = np.array(X_compound_log_test[nn_compound[-1]])

In [1349]:
nn_compound_3 = models.Sequential()

In [1350]:
nn_compound_3.add(layers.Dense(256, activation='relu'))
nn_compound_3.add(layers.Dense(256, activation='relu'))
nn_compound_3.add(layers.Dense(256, activation='relu'))
nn_compound_3.add(layers.Dense(256, activation='relu'))
nn_compound_3.add(layers.Dense(128, activation='relu'))
nn_compound_3.add(layers.Dense(128, activation='relu'))
nn_compound_3.add(layers.Dense(128, activation='relu'))
nn_compound_3.add(layers.Dense(128, activation='relu'))
nn_compound_3.add(layers.Dense(128, activation='relu'))
nn_compound_3.add(layers.Dense(64, activation='relu'))
nn_compound_3.add(layers.Dense(1, activation='sigmoid'))

In [1351]:
nn_compound_3.compile(optimizer='rmsprop', loss = 'binary_crossentropy', metrics=['binary_accuracy'])

In [1352]:
from numpy.random import seed
seed(1)
tf.random.set_seed(1)
nn_compound_3.fit(keras_compound_3_train, y_compound_log_train, epochs=50, batch_size=512)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x1616ca370>

In [1353]:
results_compound3 = nn_compound_3.evaluate(keras_compound_3_test, y_compound_log_test)




# Simple

In [53]:
random_forest_simples = RandomForestClassifier(n_estimators=300, random_state=0)

In [54]:
random_forest_simples.fit(X_simples_train, y_simples_train)

RandomForestClassifier(n_estimators=300, random_state=0)

In [55]:
random_forest_simples_preds = random_forest_simples.predict(X_simples_test)

In [58]:
choose_best_feats(random_forest_simples, X_simples_train, X_simples_test, y_simples_train, y_simples_test)

features are: ['IPM', 'Syllables', 'TF', 'is_adj']
balanced accuracy is 0.7606779515941013
accuracy is 0.7984599933043187



In [847]:
c = [0.0001, 0.001, 0.01, 0.1, 1, 10]

In [848]:
list_of_lists = []
for i in c:
    log_sn = LogisticRegression(penalty='l1', solver='liblinear', random_state=0, C=i)
    log_sn.fit(X_simples_log_train, y_simples_log_train)
    preds = log_sn.predict(X_simples_log_test)
    list_feats = sorted(map_feature_importance(log_sn, X_simples_norm_train), key=lambda x: x[1], reverse=True)
    feats = [i[0] for i in list_feats if i[1] != 0]
    if feats:
        list_of_lists.append(feats)


In [1039]:
max_ba = 0
for feats in list_of_lists:
    for i in c:
        log = LogisticRegression(random_state=0, C=i, max_iter=300)
        log.fit(X_simples_log_train[feats], y_simples_log_train)
        preds = log.predict(X_simples_log_test[feats])
        ba = balanced_accuracy_score(y_simples_log_test, preds)
        if ba > max_ba:
            max_ba = ba
            print(f'C is {i}')
            print(f'Features are {feats}')
            print(f'ba is {ba}')
            print(accuracy_score(y_simples_log_test, preds))
            print()

C is 0.0001
Features are ['is_vvod', 'Syllables', 'TF', 'Type', 'IPM', 'is_adv']
ba is 0.5785752643249016
0.7084030800133914

C is 0.001
Features are ['is_vvod', 'Syllables', 'TF', 'Type', 'IPM', 'is_adv']
ba is 0.6797597843010981
0.7626380984265149

C is 0.01
Features are ['is_vvod', 'Syllables', 'TF', 'Type', 'IPM', 'is_adv']
ba is 0.6800234468619315
0.7626380984265149

C is 0.1
Features are ['is_vvod', 'Syllables', 'TF', 'Type', 'IPM', 'is_adv']
ba is 0.6824264699888319
0.7619685302979579

C is 0.001
Features are ['is_vvod', 'isPr', 'Type', 'is_adj', 'Syllables', 'TF', 'is_prep', 'IPM', 'is_conj', 'is_verb', 'is_pr_adv', 'is_pronoun', 'is_adv', 'is_particle']
ba is 0.6851614277213033
0.7656511550050218

C is 0.01
Features are ['is_vvod', 'isPr', 'Type', 'is_adj', 'Syllables', 'TF', 'is_prep', 'IPM', 'is_conj', 'is_verb', 'is_pr_adv', 'is_pronoun', 'is_adv', 'is_particle']
ba is 0.7071068141614563
0.7760294609976565

C is 0.1
Features are ['is_vvod', 'isPr', 'Type', 'is_adj', 'Syllab

In [387]:
gb_simples = GradientBoostingClassifier(random_state=0, n_estimators=300)

In [378]:
gb_simples.fit(X_simples_train, y_simples_train)

GradientBoostingClassifier(n_estimators=500, random_state=0)

In [366]:
px = x.predict(X_simples_test)

In [388]:
choose_best_feats(gb_simples, X_simples_train, X_simples_test, y_simples_train, y_simples_test)

features are: ['Syllables', 'IPM', 'is_adj', 'is_noun', 'TF', 'is_pr_adj', 'is_adv']
balanced accuracy is 0.7620044162842073
accuracy is 0.8041513223970539



In [850]:
nn_simple = [['IPM', 'Syllables', 'TF', 'is_adj'],
             ['is_adj', 'Syllables', 'is_pr_adj', 'isPr', 'is_pronoun', 'IPM', 'TF', 
              'is_particle', 'Type', 'is_conj', 'is_pr_noun', 'is_noun', 'is_prep'],
             ['Syllables', 'IPM', 'is_adj', 'is_noun', 'TF', 'is_pr_adj', 'is_adv'],
            ]

In [851]:
keras_simple_4_train = np.array(X_simples_train[nn_simple[0]]) 
keras_simple_4_test = np.array(X_simples_test[nn_simple[0]])

In [1016]:
log_simple = LogisticRegression(random_state=0)

In [1017]:
grid_simple = RandomizedSearchCV(log_simple, parameters_grid, scoring = 'balanced_accuracy', cv = kf, random_state=0)

In [855]:
X_simples_4_train = X_simples_log_train[nn_simple[0]]
X_simples_4_test = X_simples_log_test[nn_simple[0]]

In [1040]:
grid_simple.fit(X_simples_4_train, y_simples_log_train)

RandomizedSearchCV(cv=StratifiedKFold(n_splits=10, random_state=0, shuffle=True),
                   estimator=LogisticRegression(random_state=0),
                   param_distributions={'fit_intercept': [True, False],
                                        'max_iter': array([100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600, 650, 700,
       750, 800, 850, 900]),
                                        'penalty': ['l1', 'l2'],
                                        'solver': ['liblinear', 'saga']},
                   random_state=0, scoring='balanced_accuracy')

In [1041]:
grid_simple.best_params_

{'solver': 'liblinear',
 'penalty': 'l2',
 'max_iter': 400,
 'fit_intercept': True}

In [1042]:
log_simple4 = LogisticRegression(random_state=0, solver='liblinear', penalty='l2', max_iter=400, fit_intercept=True)

In [1043]:
log_simple4.fit(X_simples_4_train, y_simples_log_train)

LogisticRegression(max_iter=400, random_state=0, solver='liblinear')

In [1044]:
preds_simple_4 = log_compound4.predict(X_simples_4_test)

In [1045]:
balanced_accuracy_score(y_simples_log_test, preds_simple_4)

0.7053363264341206

In [1046]:
accuracy_score(y_simples_log_test, preds_simple_4)

0.7743555406762638

In [1024]:
map_coefs(X_simples_4_train, log_simple4)

('IPM', 0.039612047814128)
('Syllables', 1.003815207422286)
('TF', -0.04374507388958202)
('is_adj', 1.3650194537794238)


In [1047]:
X_simples_13_train = X_simples_log_train[nn_simple[1]]
X_simples_13_test = X_simples_log_test[nn_simple[1]]

In [1048]:
grid_simple.fit(X_simples_13_train, y_simples_log_train)

RandomizedSearchCV(cv=StratifiedKFold(n_splits=10, random_state=0, shuffle=True),
                   estimator=LogisticRegression(random_state=0),
                   param_distributions={'fit_intercept': [True, False],
                                        'max_iter': array([100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600, 650, 700,
       750, 800, 850, 900]),
                                        'penalty': ['l1', 'l2'],
                                        'solver': ['liblinear', 'saga']},
                   random_state=0, scoring='balanced_accuracy')

In [1049]:
grid_simple.best_params_

{'solver': 'saga', 'penalty': 'l1', 'max_iter': 650, 'fit_intercept': True}

In [1050]:
log_simple13 = LogisticRegression(random_state=0, solver='saga', penalty='l1', max_iter=650, fit_intercept=True)

In [1051]:
log_simple13.fit(X_simples_13_train, y_simples_log_train)

LogisticRegression(max_iter=650, penalty='l1', random_state=0, solver='saga')

In [1052]:
preds_simple_13 = log_simple13.predict(X_simples_13_test)

In [1053]:
balanced_accuracy_score(y_simples_log_test, preds_simple_13)

0.7292025009986061

In [1054]:
accuracy_score(y_simples_log_test, preds_simple_13)

0.7830599263475059

In [1030]:
map_coefs(X_simples_13_train, log_simple13)

('is_adj', 1.1130119782553187)
('Syllables', 1.0132764183741574)
('is_pr_adj', 0.47894315191003073)
('isPr', 0.3015228514456572)
('is_pronoun', 0.20203417684255437)
('IPM', 0.021423893036178275)
('TF', -0.04326815439048015)
('is_particle', -0.28858046195632364)
('Type', -0.07612787589178838)
('is_conj', -0.3282557238152318)
('is_pr_noun', -1.0411153234527935)
('is_noun', -0.640908262053397)
('is_prep', -1.480138177559162)


In [914]:
X_simples_7_train = X_simples_log_train[nn_simple[-1]]
X_simples_7_test = X_simples_log_test[nn_simple[-1]]

In [915]:
grid_simple.fit(X_simples_7_train, y_simples_log_train)

RandomizedSearchCV(cv=StratifiedKFold(n_splits=10, random_state=0, shuffle=True),
                   estimator=LogisticRegression(random_state=0),
                   param_distributions={'fit_intercept': [True, False],
                                        'max_iter': array([100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600, 650, 700,
       750, 800, 850, 900]),
                                        'penalty': ['l1', 'l2'],
                                        'solver': ['liblinear', 'saga']},
                   random_state=0, scoring='balanced_accuracy')

In [916]:
grid_simple.best_params_

{'solver': 'liblinear',
 'penalty': 'l2',
 'max_iter': 400,
 'fit_intercept': True}

In [1031]:
log_simple7 = LogisticRegression(random_state=0, solver='liblinear', penalty='l2', max_iter=400, fit_intercept=True)

In [1032]:
log_simple7.fit(X_simples_7_train, y_simples_log_train)

LogisticRegression(max_iter=400, random_state=0, solver='liblinear')

In [1033]:
preds_simple_7 = log_simple7.predict(X_simples_7_test)

In [1391]:
balanced_accuracy_score(y_simples_log_test, preds_simple_7)

0.7210590216921685

In [1392]:
accuracy_score(y_simples_log_test, preds_simple_7)

0.7823903582189488

In [1036]:
map_coefs(X_simples_7_train, log_simple7)

('Syllables', 1.0478415767755507)
('IPM', 0.019027723605323083)
('is_adj', 1.1897246495478506)
('is_noun', -0.5234576888383744)
('TF', -0.05754683319937606)
('is_pr_adj', 0.6017864645725379)
('is_adv', 0.10468109836332803)


In [937]:
keras_simple_4_train = np.array(X_simples_log_train[nn_simple[0]]) 
keras_simple_4_test = np.array(X_simples_log_test[nn_simple[0]])

In [1369]:
nn_simple4 = models.Sequential()

In [1370]:
nn_simple4.add(layers.Dense(256, activation='relu'))
nn_simple4.add(layers.Dense(256, activation='relu'))
nn_simple4.add(layers.Dense(256, activation='relu'))
nn_simple4.add(layers.Dense(256, activation='relu'))
nn_simple4.add(layers.Dense(128, activation='relu'))
nn_simple4.add(layers.Dense(128, activation='relu'))
nn_simple4.add(layers.Dense(128, activation='relu'))
nn_simple4.add(layers.Dense(64, activation='relu'))
nn_simple4.add(layers.Dense(1, activation='sigmoid'))

In [1371]:
nn_simple4.compile(optimizer='rmsprop', loss = 'binary_crossentropy', metrics=['binary_accuracy'])

In [1372]:
from numpy.random import seed
seed(1)
tf.random.set_seed(1)
nn_simple4.fit(keras_simple_4_train, y_simples_log_train, epochs=30, batch_size=512)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x16212b3d0>

In [1373]:
results_simple4 = nn_simple4.evaluate(keras_simple_4_test, y_simples_log_test)



In [1374]:
keras_simple_13_train = np.array(X_simples_log_train[nn_simple[1]]) 
keras_simple_13_test = np.array(X_simples_log_test[nn_simple[1]])

In [1380]:
nn_simple13 = models.Sequential()

In [1381]:
nn_simple13.add(layers.Dense(128, activation='relu'))
nn_simple13.add(layers.Dense(128, activation='relu'))
nn_simple13.add(layers.Dense(128, activation='relu'))
nn_simple13.add(layers.Dense(64, activation='relu'))
nn_simple13.add(layers.Dense(1, activation='sigmoid'))

In [1382]:
nn_simple13.compile(optimizer='rmsprop', loss = 'binary_crossentropy', metrics=['binary_accuracy'])

In [1383]:
from numpy.random import seed
seed(1)
tf.random.set_seed(1)
nn_simple13.fit(keras_simple_13_train, y_simples_log_train, epochs=30, batch_size=512)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x1625879d0>

In [1384]:
results_simple13 = nn_simple13.evaluate(keras_simple_13_test, y_simples_log_test)



In [1385]:
keras_simple_7_train = np.array(X_simples_log_train[nn_simple[-1]]) 
keras_simple_7_test = np.array(X_simples_log_test[nn_simple[-1]])

In [1386]:
nn_simple7 = models.Sequential()

In [1387]:
nn_simple7.add(layers.Dense(512, activation='relu'))
nn_simple7.add(layers.Dense(512, activation='relu'))
nn_simple7.add(layers.Dense(256, activation='relu'))
nn_simple7.add(layers.Dense(256, activation='relu'))
nn_simple7.add(layers.Dense(256, activation='relu'))
nn_simple7.add(layers.Dense(256, activation='relu'))
nn_simple7.add(layers.Dense(128, activation='relu'))
nn_simple7.add(layers.Dense(128, activation='relu'))
nn_simple7.add(layers.Dense(128, activation='relu'))
nn_simple7.add(layers.Dense(64, activation='relu'))
nn_simple7.add(layers.Dense(1, activation='sigmoid'))

In [1388]:
nn_simple7.compile(optimizer='rmsprop', loss = 'binary_crossentropy', metrics=['binary_accuracy'])

In [1389]:
from numpy.random import seed
seed(1)
tf.random.set_seed(1)
nn_simple7.fit(keras_simple_7_train, y_simples_log_train, epochs=30, batch_size=512)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x16275c370>

In [1390]:
results_simple7 = nn_simple7.evaluate(keras_simple_7_test, y_simples_log_test)

