In [110]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from feature_engine.selection import DropCorrelatedFeatures
# possible change to pytorch  
from sklearn.neural_network import MLPClassifier
from keras.models import Sequential
from keras.layers import Dense

In [34]:
def loadData(path):
    df= pd.read_csv(path)
    return pd.DataFrame(df)


In [144]:
def dropDuplicatedRowAndColumn(train, test):
    
    train = train.drop_duplicates()
    drop = train.columns.duplicated()
    return [train.loc[:,~drop], test.loc[:,~drop[:len(drop)-1]]]


In [134]:
def quasiConstantRemoval(train, threshold, test):
    constant_filter = VarianceThreshold(threshold= threshold)
    constant_filter.fit(train)
    return [pd.DataFrame(constant_filter.transform(train)),pd.DataFrame(constant_filter.transform(test))]

In [135]:
def dropCorrelatedFeatures(train,test):
    drop_correlated = DropCorrelatedFeatures(
    variables=None, method='pearson', threshold=0.9)
    drop_correlated.fit(train)
    return [pd.DataFrame(drop_correlated.transform(train)),pd.DataFrame(drop_correlated.transform(train))]


In [81]:
def dropColumnWithName(data, name):
    column_to_drop = [c for c in data if c.startswith(name)]
    data = data.drop(columns = column_to_drop )


In [56]:
def normalise(data):
    scaler = StandardScaler()
    return pd.DataFrame(scaler.fit_transform(data))
  

In [149]:
def main():
    df_train = loadData("train.csv")
    df_test = loadData("test.csv")
    
    # dropping "ID" from both training and test data
    ID_train = df_train["ID"].copy()
    ID_test = df_test["ID"].copy()
    df_train = df_train.drop(columns = "ID")
    df_test = df_test.drop(columns = "ID")
    

    dup = dropDuplicatedRowAndColumn(df_train, df_test)
    df_train = dup[0]
    df_test = dup[1]
    
    y_train = df_train['TARGET'].copy()
    df_train_x = df_train.drop(columns="TARGET")
    
    quasi_res = quasiConstantRemoval(df_train_x, 0.01, df_test)
    df_train_x = quasi_res[0]
    df_test = quasi_res[1]
    
    correlated  = dropCorrelatedFeatures(df_train_x,df_test)
    df_train_x = correlated[0]
    df_test = correlated[1]
 
    df_train_x = normalise(df_train_x)
    print(df_train_x)

In [150]:
main()

            0         1         2         3         4         5         6    \
0      0.038772 -0.805834 -0.055166 -0.220682 -0.226444 -0.039477 -0.043503   
1      0.038772  0.033327 -0.055166 -0.220682 -0.226444 -0.039477 -0.043503   
2      0.038772 -0.805834 -0.055166 -0.220682 -0.226444 -0.039477 -0.043503   
3      0.038772  0.262189 -0.055166  0.336397  0.119616 -0.039477 -0.043503   
4      0.038772  0.414763 -0.055166 -0.220682 -0.226444 -0.039477 -0.043503   
...         ...       ...       ...       ...       ...       ...       ...   
71208  0.038772  0.414763 -0.055166  0.853451  0.492882 -0.039477 -0.043503   
71209  0.038772  1.101350 -0.055166 -0.220682 -0.226444 -0.039477 -0.043503   
71210  0.038772  0.414763 -0.055166 -0.220682 -0.226444 -0.039477 -0.043503   
71211  0.038772 -0.805834 -0.055166 -0.220682 -0.226444 -0.039477 -0.043503   
71212  0.038772 -0.653259 -0.055166 -0.220682 -0.226444 -0.039477 -0.043503   

            7         8        9    ...       151  

In [114]:
# 10 fold and the measure is loss
def build_model(data):
    model = Sequential()
    model.add(Dense(input_dim = data.shape[1], output_dim=120, init='uniform', activation = 'tanh'))
    model.add(Dense(input_dim = 120, output_dim=1, init='uniform', activation = 'sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam')     