In [1]:
import pandas as pd
import numpy as np


from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegressionCV

from scipy.sparse import coo_matrix, hstack

from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import Dropout

import warnings
warnings.filterwarnings('ignore')

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Pre-processing

In [2]:
train = pd.read_csv('/Users/atru/Desktop/kaggle/dataset/Phase 1 - similarityCompetition/train.csv')
test = pd.read_csv('/Users/atru/Desktop/kaggle/dataset/Phase 1 - similarityCompetition/test.csv')

In [3]:
bank = pd.read_csv('/Users/atru/Desktop/kaggle/wordbank.csv')
mapping = pd.melt(bank, id_vars=['Root word'])

In [4]:
mapping = mapping.dropna()
mapping = mapping[['Root word', 'value']]

In [5]:
map_dict = mapping.set_index('value').to_dict()
map_dict = map_dict['Root word']
map_dict = {k.lower(): v for k, v in map_dict.items()}

In [6]:
# >1 char dict:
multiple_char_dict = {k:v for (k,v) in map_dict.items() if ' ' in k}

In [7]:
multiple_char_dict

{'ampco pittsburgh': 'ampcopittsburgh',
 'bp plc': 'bpplc',
 'mid cap': 'mid-cap',
 'short term': 'Short-term',
 'sm cap': 'small-cap',
 'small cap': 'small-cap',
 't. rowe price': 't rowe price',
 'tax exempt': 'tax-exempt'}

In [9]:
X = train[['description_x', 'description_y']]
y = train[['same_security']]

for i in range(0, len(X)):
    for word in X['description_x'][i].split(' '):
        if word.lower() in map_dict:
            X['description_x'][i] = X['description_x'][i].replace(word, map_dict[word])

for i in range(0, len(X)):
    for word in X['description_y'][i].split(' '):
        if word.lower() in map_dict:
            X['description_y'][i] = X['description_y'][i].replace(word, map_dict[word])

# replace multiple words

for i in range(0, len(X)):
    #print(i)
    for (k,v) in multiple_char_dict.items():
        if k in X['description_x'][i]:
            #print(k)
            X['description_x'][i] = X['description_x'][i].replace(k, multiple_char_dict[k])
        if k in X['description_y'][i]:
            #print(k)
            X['description_x'][i] = X['description_y'][i].replace(k, multiple_char_dict[k])

In [11]:
X["description_x"] = X['description_x'].str.replace('[^\w\s]','')
X["description_y"] = X['description_y'].str.replace('[^\w\s]','')

X["description_x"] = X["description_x"].str.lower()
X["description_y"] = X["description_y"].str.lower()

In [12]:
y = y.applymap(lambda x: 1 if x == True else x)
y = y.applymap(lambda x: 0 if x == False else x)

In [13]:
vec_X = TfidfVectorizer(encoding='utf-8')
X_X = vec_X.fit_transform(X['description_x'].values.astype('U'))


vec_Y = TfidfVectorizer(encoding='utf-8')
X_Y = vec_X.fit_transform(X['description_y'].values.astype('U'))


X_concat = hstack([X_X, X_Y])

X_train_all = X_concat.tocsr()

In [14]:
X_train_all.shape

(2142, 2857)

In [15]:
X_train, X_val, y_train, y_val = train_test_split(X_train_all, y, random_state=1)

Neural Networks

In [287]:
model_dropout = Sequential([
    Dense(1024, input_shape=(2857,), activation='relu'),
    Dropout(.5),
    Dense(1024, activation='relu'),
    Dropout(.5),
    Dense(1024, activation='relu'),
    Dropout(.5),
    Dense(1, activation='sigmoid'),
])

In [288]:
model_dropout.compile("adam", "binary_crossentropy", metrics=['accuracy'])
history_dropout = model_dropout.fit(X_train, y_train, batch_size=128, epochs=20, verbose=0, validation_split=.2)

score = model_dropout.evaluate(X_val, y_val, verbose=0)
print("Test loss: {:.3f}".format(score[0]))
print("Test Accuracy: {:.3f}".format(score[1]))

# This is for without removing duplicate words

ValueError: Error when checking input: expected dense_243_input to have shape (2857,) but got array with shape (2823,)

NLP

In [259]:
# removing duplicate words
    
for k in range(0, len(X)):
    text1 = X['description_x'][k]
    text2 = X['description_y'][k]
    list1 = text1.split(' ')
    list2 = text2.split(' ')
    remove_word = []
    for i in range(0, len(list1)):
        if list1[i] in list2:
            remove_word.append(list1[i])
    remove_word = list(set(remove_word))
    if len(remove_word) > 0:
        for j in range(0, len(remove_word)):    
            list1.remove(remove_word[j])
            list2.remove(remove_word[j])

        list1=" ".join(list1)
        list2=" ".join(list2)
        X['description_x'][k] = list1
        X['description_y'][k] = list2 

In [260]:
# tfid
vec_X = TfidfVectorizer(encoding='utf-8')
X_X = vec_X.fit_transform(X['description_x'].values.astype('U'))


vec_Y = TfidfVectorizer(encoding='utf-8')
X_Y = vec_Y.fit_transform(X['description_y'].values.astype('U'))


X_concat = hstack([X_X, X_Y])

X_train_all = X_concat.tocsr()
X_train, X_val, y_train, y_val = train_test_split(X_train_all, y, random_state=0)

lr = LogisticRegressionCV().fit(X_train, y_train)
lr.score(X_val, y_test)

#with moving dup words: .899

0.8955223880597015

In [263]:
X_train.shape

model_dropout = Sequential([
    Dense(1024, input_shape=(1689,), activation='relu'),
    Dropout(.5),
    Dense(1024, activation='relu'),
    Dropout(.5),
    Dense(1024, activation='relu'),
    Dropout(.5),
    Dense(1024, activation='relu'),
    Dropout(.5),
    Dense(1, activation='sigmoid'),
])

import keras_metrics
model_dropout.compile("adam", "binary_crossentropy", metrics=['accuracy'])
history_dropout = model_dropout.fit(X_train, y_train, batch_size=128, epochs=20, verbose=1, validation_split=.1)

score = model_dropout.evaluate(X_test, y_test, verbose=0)
print("Test loss: {:.5f}".format(score[0]))
print("Test Precision: {:.5f}".format(score[1]))
print("Test Recall: {:.5f}".format(score[2]))

# moving dup words: .91045

Train on 1445 samples, validate on 161 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Test loss: 0.48241
Test Precision: 0.91045


IndexError: list index out of range

In [264]:
ngram_size = (2, 5)

# tfid
vec_X = TfidfVectorizer(encoding='utf-8', ngram_range = ngram_size)
X_X = vec_X.fit_transform(X['description_x'].values.astype('U'))


vec_Y = TfidfVectorizer(encoding='utf-8', ngram_range = ngram_size)
X_Y = vec_Y.fit_transform(X['description_y'].values.astype('U'))


X_concat = hstack([X_X, X_Y])

X_train_all = X_concat.tocsr()
X_train, X_test, y_train, y_test = train_test_split(X_train_all, y, random_state=0)

lr = LogisticRegressionCV().fit(X_train, y_train)
lr.score(X_test, y_test)

#with moving dup words: .8936

0.8992537313432836

In [266]:
X_train.shape

model_dropout = Sequential([
    Dense(1024, input_shape=(5332,), activation='relu'),
    Dropout(.5),
    Dense(1024, activation='relu'),
    Dropout(.5),
    Dense(1024, activation='relu'),
    Dropout(.5),
    Dense(1024, activation='relu'),
    Dropout(.5),
    Dense(1, activation='sigmoid'),
])

import keras_metrics
model_dropout.compile("adam", "binary_crossentropy", metrics=['accuracy'])
history_dropout = model_dropout.fit(X_train, y_train, batch_size=128, epochs=20, verbose=1, validation_split=.1)

score = model_dropout.evaluate(X_test, y_test, verbose=0)
print("Test loss: {:.5f}".format(score[0]))
print("Test Precision: {:.5f}".format(score[1]))
print("Test Recall: {:.5f}".format(score[2]))


#with moving dup words: 0.89552

Train on 1445 samples, validate on 161 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Test loss: 0.36692
Test Precision: 0.88806


IndexError: list index out of range

In [267]:
# countvec
vec_X = CountVectorizer( encoding='utf-8')
X_X = vec_X.fit_transform(X['description_x'].values.astype('U'))


vec_Y = CountVectorizer(encoding='utf-8')
X_Y = vec_Y.fit_transform(X['description_y'].values.astype('U'))


X_concat = hstack([X_X, X_Y])

X_train_all = X_concat.tocsr()

X_train, X_test, y_train, y_test = train_test_split(X_train_all, y, random_state=0)

lr = LogisticRegressionCV().fit(X_train, y_train)
lr.score(X_test, y_test)

0.9011194029850746

In [269]:
X_train.shape

model_dropout = Sequential([
    Dense(1024, input_shape=(1689,), activation='relu'),
    Dropout(.5),
    Dense(1024, activation='relu'),
    Dropout(.5),
    Dense(1024, activation='relu'),
    Dropout(.5),
    Dense(1024, activation='relu'),
    Dropout(.5),
    Dense(1, activation='sigmoid'),
])

import keras_metrics
model_dropout.compile("adam", "binary_crossentropy", metrics=['accuracy'])
history_dropout = model_dropout.fit(X_train, y_train, batch_size=128, epochs=20, verbose=1, validation_split=.1)

score = model_dropout.evaluate(X_test, y_test, verbose=0)
print("Test loss: {:.5f}".format(score[0]))
print("Test Precision: {:.5f}".format(score[1]))
print("Test Recall: {:.5f}".format(score[2]))

Train on 1445 samples, validate on 161 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Test loss: 0.48411
Test Precision: 0.91045


IndexError: list index out of range

In [270]:
# adding ngram
vec_X = CountVectorizer(ngram_range=(1, 4), encoding='utf-8')
X_X = vec_X.fit_transform(X['description_x'].values.astype('U'))


vec_Y = CountVectorizer(ngram_range=(1, 4), encoding='utf-8')
X_Y = vec_Y.fit_transform(X['description_y'].values.astype('U'))


X_concat = hstack([X_X, X_Y])

X_train_all = X_concat.tocsr()

X_train, X_test, y_train, y_test = train_test_split(X_train_all, y, random_state=0)

lr = LogisticRegressionCV().fit(X_train, y_train)
lr.score(X_test, y_test)

0.9011194029850746

In [271]:
X_train.shape

(1606, 6660)

In [272]:
model_dropout = Sequential([
    Dense(1024, input_shape=(6660,), activation='relu'),
    Dropout(.5),
    Dense(1024, activation='relu'),
    Dropout(.5),
    Dense(1024, activation='relu'),
    Dropout(.5),
    Dense(1024, activation='relu'),
    Dropout(.5),
    Dense(1, activation='sigmoid'),
])

In [273]:
import keras_metrics
model_dropout.compile("adam", "binary_crossentropy", metrics=['accuracy'])
history_dropout = model_dropout.fit(X_train, y_train, batch_size=128, epochs=20, verbose=1, validation_split=.1)

Train on 1445 samples, validate on 161 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [274]:
score = model_dropout.evaluate(X_test, y_test, verbose=0)
print("Test loss: {:.5f}".format(score[0]))
print("Test Precision: {:.5f}".format(score[1]))
print("Test Recall: {:.5f}".format(score[2]))

Test loss: 0.59065
Test Precision: 0.90858


IndexError: list index out of range

In [275]:
# adding stop words (not good)
vec_X = CountVectorizer(ngram_range=(1, 4), encoding='utf-8', stop_words = 'english')
X_X = vec_X.fit_transform(X['description_x'].values.astype('U'))


vec_Y = CountVectorizer(ngram_range=(1, 4), encoding='utf-8', stop_words = 'english')
X_Y = vec_Y.fit_transform(X['description_y'].values.astype('U'))


X_concat = hstack([X_X, X_Y])

X_train_all = X_concat.tocsr()

X_train, X_test, y_train, y_test = train_test_split(X_train_all, y, random_state=0)

lr = LogisticRegressionCV().fit(X_train, y_train)
lr.score(X_test, y_test)

0.9011194029850746

In [277]:
X_train.shape

model_dropout = Sequential([
    Dense(1024, input_shape=(6408,), activation='relu'),
    Dropout(.5),
    Dense(1024, activation='relu'),
    Dropout(.5),
    Dense(1024, activation='relu'),
    Dropout(.5),
    Dense(1024, activation='relu'),
    Dropout(.5),
    Dense(1, activation='sigmoid'),
])

import keras_metrics
model_dropout.compile("adam", "binary_crossentropy", metrics=['accuracy'])
history_dropout = model_dropout.fit(X_train, y_train, batch_size=128, epochs=20, verbose=1, validation_split=.1)

score = model_dropout.evaluate(X_test, y_test, verbose=0)
print("Test loss: {:.5f}".format(score[0]))
print("Test Precision: {:.5f}".format(score[1]))
print("Test Recall: {:.5f}".format(score[2]))

Train on 1445 samples, validate on 161 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Test loss: 0.56463
Test Precision: 0.89925


IndexError: list index out of range

In [178]:

vec_X = TfidfVectorizer(ngram_range=(1, 4), encoding='utf-8', stop_words = 'english')
X_X = vec_X.fit_transform(X['description_x'].values.astype('U'))


vec_Y = TfidfVectorizer(ngram_range=(1, 4), encoding='utf-8', stop_words = 'english')
X_Y = vec_Y.fit_transform(X['description_y'].values.astype('U'))


X_concat = hstack([X_X, X_Y])

X_train_all = X_concat.tocsr()

X_train, X_test, y_train, y_test = train_test_split(X_train_all, y, random_state=0)

lr = LogisticRegressionCV().fit(X_train, y_train)
lr.score(X_test, y_test)

0.9123134328358209

In [179]:
# THIS ONE HAS THE BEST

X_train.shape

model_dropout = Sequential([
    Dense(1024, input_shape=(7045,), activation='relu'),
    Dropout(.5),
    Dense(1024, activation='relu'),
    Dropout(.5),
    Dense(1024, activation='relu'),
    Dropout(.5),
    Dense(1024, activation='relu'),
    Dropout(.5),
    Dense(1, activation='sigmoid'),
])

import keras_metrics
model_dropout.compile("adam", "binary_crossentropy", metrics=['accuracy'])
history_dropout = model_dropout.fit(X_train, y_train, batch_size=128, epochs=20, verbose=1, validation_split=.1)

score = model_dropout.evaluate(X_test, y_test, verbose=0)
print("Test loss: {:.5f}".format(score[0]))
print("Test Precision: {:.5f}".format(score[1]))
print("Test Recall: {:.5f}".format(score[2]))

Train on 1445 samples, validate on 161 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Test loss: 0.43300
Test Precision: 0.91791


IndexError: list index out of range

In [280]:
train = pd.read_csv('/Users/atru/Desktop/kaggle/dataset/Phase 1 - similarityCompetition/train.csv')
test = pd.read_csv('/Users/atru/Desktop/kaggle/dataset/Phase 1 - similarityCompetition/test.csv')

In [282]:
X = train[['description_x', 'description_y']]
y = train[['same_security']]

vec_X = TfidfVectorizer(encoding='utf-8')
X_X = vec_X.fit_transform(X['description_x'].values.astype('U'))


vec_Y = TfidfVectorizer(encoding='utf-8')
X_Y = vec_X.fit_transform(X['description_y'].values.astype('U'))


X_concat = hstack([X_X, X_Y])

X_train_all = X_concat.tocsr()

In [283]:
X_train_all.shape

(2142, 2823)

In [284]:
X_train, X_val, y_train, y_val = train_test_split(X_train_all, y, random_state=1)

In [285]:
from sklearn import tree
clf = tree.DecisionTreeClassifier()
clf.fit(X_train, y_train)  
clf.score(X_val, y_val)

0.8992537313432836

Levechstein Distance

In [12]:
def levenshtein(s1, s2):
    if len(s1) < len(s2):
        return levenshtein(s2, s1)

    # len(s1) >= len(s2)
    if len(s2) == 0:
        return len(s1)

    previous_row = range(len(s2) + 1)
    for i, c1 in enumerate(s1):
        current_row = [i + 1]
        for j, c2 in enumerate(s2):
            insertions = previous_row[j + 1] + 1 # j+1 instead of j since previous_row and current_row are one character longer
            deletions = current_row[j] + 1       # than s2
            substitutions = previous_row[j] + (c1 != c2)
            current_row.append(min(insertions, deletions, substitutions))
        previous_row = current_row
    
    return previous_row[-1]/len(s1 + s2)

In [13]:
X['lev_distance'] = ''

In [14]:
for i in range(0, len(X)):
    X['lev_distance'][i] = levenshtein(X['description_x'][i], X['description_y'][i])

In [279]:
X.head(20)

Unnamed: 0,description_x,description_y
0,,index
1,company,strategic tr fundamental corporation
2,adm,fund institutional
3,new us4 b7jzsk0,new us26441c2044 b7jzs
4,class a,
5,new div 0600,
6,steel,stl new
7,,
8,smallcap,small cap
9,,


In [91]:
X['pred'] = ''

In [92]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [14]:
cutoff_bracket = [10]


In [15]:
X_train['lev_distance'][0]

KeyError: 'lev_distance'

In [281]:
temp = pd.merge(X_train, train, how = 'inner')

In [301]:
len(temp[(temp['lev_distance'] < 0.6) & (temp['same_security'] == True)])/len(X_train)

0.43088418430884184

In [325]:
X_train.head()

Unnamed: 0,description_x,description_y,lev_distance,pred
1217,google inc cl a,google inc class a,0.0909091,
927,enterprise prods partners l com,enterprise prods partners l unit,0.0634921,
1656,facebook inc,facebook inc class a common stock,0.466667,
5,ford motor co new div 0600,ford motor co,0.333333,
58,vereit inc com,vereit inc reit,0.137931,


Cleaning up everything

In [296]:
train = pd.read_csv('/Users/atru/Desktop/kaggle/dataset/Phase 1 - similarityCompetition/train.csv')
test = pd.read_csv('/Users/atru/Desktop/kaggle/dataset/Phase 1 - similarityCompetition/test.csv')

In [323]:
X = train[['description_x', 'description_y']].append(test[['description_x', 'description_y']])

In [325]:
len(X)

2658

In [326]:
len(train)

2142

In [327]:
len(test)

516

In [328]:
bank = pd.read_csv('/Users/atru/Desktop/kaggle/wordbank.csv')
mapping = pd.melt(bank, id_vars=['Root word'])

In [329]:
mapping = mapping.dropna()
mapping = mapping[['Root word', 'value']]

In [330]:
map_dict = mapping.set_index('value').to_dict()
map_dict = map_dict['Root word']
map_dict = {k.lower(): v for k, v in map_dict.items()}

In [331]:
# >1 char dict:
multiple_char_dict = {k:v for (k,v) in map_dict.items() if ' ' in k}

In [332]:
multiple_char_dict

{'ampco pittsburgh': 'ampcopittsburgh',
 'bp plc': 'bpplc',
 'mid cap': 'mid-cap',
 'short term': 'Short-term',
 'sm cap': 'small-cap',
 'small cap': 'small-cap',
 't. rowe price': 't rowe price',
 'tax exempt': 'tax-exempt'}

In [336]:
X = X.reset_index()

In [337]:
#X = train[['description_x', 'description_y']]
y = train[['same_security']]

for i in range(0, len(X)):
    for word in X['description_x'][i].split(' '):
        if word.lower() in map_dict:
            X['description_x'][i] = X['description_x'][i].replace(word, map_dict[word])

for i in range(0, len(X)):
    for word in X['description_y'][i].split(' '):
        if word.lower() in map_dict:
            X['description_y'][i] = X['description_y'][i].replace(word, map_dict[word])

# replace multiple words

for i in range(0, len(X)):
    #print(i)
    for (k,v) in multiple_char_dict.items():
        if k in X['description_x'][i]:
            #print(k)
            X['description_x'][i] = X['description_x'][i].replace(k, multiple_char_dict[k])
        if k in X['description_y'][i]:
            #print(k)
            X['description_x'][i] = X['description_y'][i].replace(k, multiple_char_dict[k])

In [353]:
# removing duplicate words
    
for k in range(0, len(X)):
    text1 = X['description_x'][k]
    text2 = X['description_y'][k]
    list1 = text1.split(' ')
    list2 = text2.split(' ')
    remove_word = []
    for i in range(0, len(list1)):
        if list1[i] in list2:
            remove_word.append(list1[i])
    remove_word = list(set(remove_word))
    if len(remove_word) > 0:
        for j in range(0, len(remove_word)):    
            list1.remove(remove_word[j])
            list2.remove(remove_word[j])

        list1=" ".join(list1)
        list2=" ".join(list2)
        X['description_x'][k] = list1
        X['description_y'][k] = list2 

In [355]:
X["description_x"] = X['description_x'].str.replace('[^\w\s]','')
X["description_y"] = X['description_y'].str.replace('[^\w\s]','')

X["description_x"] = X["description_x"].str.lower()
X["description_y"] = X["description_y"].str.lower()

In [None]:
y = y.applymap(lambda x: 1 if x == True else x)
y = y.applymap(lambda x: 0 if x == False else x)

In [372]:
y = y['same_security']

In [388]:
vec_X = TfidfVectorizer(ngram_range=(1, 4), encoding='utf-8', stop_words = 'english')
X_X = vec_X.fit_transform(X['description_x'].values.astype('U'))


vec_Y = TfidfVectorizer(ngram_range=(1, 4), encoding='utf-8', stop_words = 'english')
X_Y = vec_Y.fit_transform(X['description_y'].values.astype('U'))

In [392]:
X_concat = hstack([X_X, X_Y])
X_all = X_concat.tocsr()

In [394]:
X_all

<2658x7420 sparse matrix of type '<class 'numpy.float64'>'
	with 18954 stored elements in Compressed Sparse Row format>

In [395]:
X_train = X_all[0:2142]
X_test = X_all[2142:]


lr = LogisticRegressionCV().fit(X_train, y)

In [397]:
# THIS ONE HAS THE BEST

X_train.shape

model_dropout = Sequential([
    Dense(1024, input_shape=(7420,), activation='relu'),
    Dropout(.5),
    Dense(1024, activation='relu'),
    Dropout(.5),
    Dense(1024, activation='relu'),
    Dropout(.5),
    Dense(1024, activation='relu'),
    Dropout(.5),
    Dense(1, activation='sigmoid'),
])

import keras_metrics
model_dropout.compile("adam", "binary_crossentropy", metrics=['accuracy'])
history_dropout = model_dropout.fit(X_train, y, batch_size=128, epochs=20, verbose=1, validation_split=.1)

Train on 1927 samples, validate on 215 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [399]:
y_pred = model_dropout.predict(X_test)

In [427]:
for i in range(0, len(y_pred)):
    if y_pred[i] < 0.5:
        y_pred[i] =  False
    else:
        y_pred[i] = True

In [436]:
for i in range(0, 10):
    if test['same_security'][i] == 1.0:
        test['same_security'][i] = True
    else:
        test['same_security'][i] = False

In [440]:
test['same_security'].replace(0.0, False, inplace=True)

In [445]:
test = test.applymap(lambda x: True if x == float(1) else x)
test = test.applymap(lambda x: False if x == float(0) else x)

In [447]:
test.to_csv('submission_phase1_modelbehavior_team2.csv')