In [24]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from feature_engine.selection import DropCorrelatedFeatures
# possible change to pytorch  
from sklearn.neural_network import MLPClassifier
from keras.models import Sequential
from keras.layers import Dense

In [5]:
def loadData(path):
    df= pd.read_csv(path)
    return pd.DataFrame(df)


In [46]:
def dropDuplicatedRowAndColumn(train, test):
    
    train = train.drop_duplicates()
    drop = train.columns.duplicated()
    return [train.loc[:,~drop], test.loc[:,~drop[:len(drop)-1]]]


In [7]:
def quasiConstantRemoval(train, threshold, test):
    constant_filter = VarianceThreshold(threshold= threshold)
    constant_filter.fit(train)
    return [pd.DataFrame(constant_filter.transform(train)),pd.DataFrame(constant_filter.transform(test))]

In [58]:
def dropCorrelatedFeatures(train,test):
    drop_correlated = DropCorrelatedFeatures(
    variables=None, method='pearson', threshold=0.9)
    drop_correlated.fit(train)
    return [pd.DataFrame(drop_correlated.transform(train)),pd.DataFrame(drop_correlated.transform(test))]


In [59]:
def dropColumnWithName(data, name):
    column_to_drop = [c for c in data if c.startswith(name)]
    data = data.drop(columns = column_to_drop )


In [60]:
def normalise(data):
    scaler = StandardScaler()
    return pd.DataFrame(scaler.fit_transform(data))
  

In [67]:
def main():
    df_train = loadData("train.csv")
    df_test = loadData("test.csv")
    
    # dropping "ID" from both training and test data
    ID_train = df_train["ID"].copy()
    ID_test = df_test["ID"].copy()
    df_train = df_train.drop(columns = "ID")
    df_test = df_test.drop(columns = "ID")
    

    dup = dropDuplicatedRowAndColumn(df_train, df_test)
    df_train = dup[0]
    df_test = dup[1]
   
    y_train = df_train['TARGET'].copy()
    df_train_x = df_train.drop(columns="TARGET")
    
    quasi_res = quasiConstantRemoval(df_train_x, 0.01, df_test)
    df_train_x = quasi_res[0]
    df_test = quasi_res[1]
    
    correlated  = dropCorrelatedFeatures(df_train_x,df_test)
    df_train_x = correlated[0]
    df_test = correlated[1]
   

    df_train_x = normalise(df_train_x)
    df_test = normalise(df_test)
    
    model = build_model(df_train_x)
    model.fit(df_train_x, y_train)
    prediction = model.predict_proba(df_test)[:,0]
    
    return pd.DataFrame({'ID':ID_test, "Target": prediction})


In [68]:
result = main()
print(result)





           ID    Target
0           2  0.024762
1           5  0.034654
2           6  0.014696
3           7  0.040811
4           9  0.015390
...       ...       ...
75813  151831  0.066246
75814  151832  0.013987
75815  151833  0.013182
75816  151834  0.023897
75817  151837  0.013686

[75818 rows x 2 columns]


In [31]:
# 10 fold and the measure is loss
def build_model(data):
    model = Sequential()
    model.add(Dense(120,input_dim = data.shape[1], kernel_initializer='uniform', activation = 'tanh'))
    model.add(Dense(1,input_dim = 120, kernel_initializer='uniform', activation = 'sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam')   
    return model