In [54]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from feature_engine.selection import DropCorrelatedFeatures
# possible change to pytorch  
from sklearn.neural_network import MLPClassifier
from keras.models import Sequential
from keras.layers import Dense

In [5]:
def loadData(path):
    df= pd.read_csv(path)
    return pd.DataFrame(df)


In [46]:
def dropDuplicatedRowAndColumn(train, test):
    
    train = train.drop_duplicates()
    drop = train.columns.duplicated()
    return [train.loc[:,~drop], test.loc[:,~drop[:len(drop)-1]]]


In [7]:
def quasiConstantRemoval(train, threshold, test):
    constant_filter = VarianceThreshold(threshold= threshold)
    constant_filter.fit(train)
    return [pd.DataFrame(constant_filter.transform(train)),pd.DataFrame(constant_filter.transform(test))]

In [58]:
def dropCorrelatedFeatures(train,test):
    drop_correlated = DropCorrelatedFeatures(
    variables=None, method='pearson', threshold=0.9)
    drop_correlated.fit(train)
    return [pd.DataFrame(drop_correlated.transform(train)),pd.DataFrame(drop_correlated.transform(test))]


In [59]:
def dropColumnWithName(data, name):
    column_to_drop = [c for c in data if c.startswith(name)]
    data = data.drop(columns = column_to_drop )


In [60]:
def normalise(data):
    scaler = StandardScaler()
    return pd.DataFrame(scaler.fit_transform(data))
  

In [42]:
def main():
    df_train = loadData("train.csv")
    df_test = loadData("test.csv")
    
    # dropping "ID" from both training and test data
    ID_train = df_train["ID"].copy()
    ID_test = df_test["ID"].copy()
    df_train = df_train.drop(columns = "ID")
    df_test = df_test.drop(columns = "ID")
    

    dup = dropDuplicatedRowAndColumn(df_train, df_test)
    df_train = dup[0]
    df_test = dup[1]
   
    y_train = df_train['TARGET'].copy()
    df_train_x = df_train.drop(columns="TARGET")
    
    quasi_res = quasiConstantRemoval(df_train_x, 0.01, df_test)
    df_train_x = quasi_res[0]
    df_test = quasi_res[1]
    
    correlated  = dropCorrelatedFeatures(df_train_x,df_test)
    df_train_x = correlated[0]
    df_test = correlated[1]
   

    df_train_x = normalise(df_train_x)
    df_test = normalise(df_test)
    
    output_dim = findBestOutputSize(df_train_x, y_train)
    
    model = build_model(df_train_x.shape[1], output_dim)
    model.fit(df_train_x, y_train)
    prediction = model.predict_proba(df_test)[:,0]
    
    return pd.DataFrame({'ID':ID_test, "Target": prediction})


In [103]:
result = main()
print(result)

[0.15231068730354308, 0.1528145268559456, 0.15518811643123626, 0.15828638821840285, 0.16213497668504714, 0.1605364739894867, 0.16455858200788498, 0.16331611275672914, 0.16333200931549072, 0.1662281036376953]
0.15231068730354308 0




           ID    Target
0           2  0.032010
1           5  0.064854
2           6  0.022014
3           7  0.061794
4           9  0.018291
...       ...       ...
75813  151831  0.097336
75814  151832  0.016857
75815  151833  0.019888
75816  151834  0.026842
75817  151837  0.016411

[75818 rows x 2 columns]


In [71]:
# 10 fold and the measure is loss
def build_model(input_dim, output_dim):
    model = Sequential()
    model.add(Dense(output_dim,input_dim = input_dim, kernel_initializer='uniform', activation = 'tanh'))
    model.add(Dense(1,input_dim = output_dim, kernel_initializer='uniform', activation = 'sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam')   
    return model

In [102]:
# use cross fold to get the accuracy of the model with different size 
def findBestOutputSize(data, y):
    data = data.to_numpy()
    y = y.to_numpy()
    scores = [0] * 10
    alpha_list = [i+1 for i in range(10)]
    kfold = KFold(n_splits=10)
    for alpha in alpha_list:
        for train, test in kfold.split(data, y):
            output_dim = getNumberOfNeurons(data, alpha)
            model = build_model(data[train].shape[1],output_dim)
            model.fit(data[train], y[train])
#             data[test]
            scores[alpha-1]+=model.evaluate(data[test], y[test], verbose=0)
        scores[alpha-1] = scores[alpha-1]/10
    print(scores)
    print(min(scores),scores.index(min(scores)))
    alpha = scores.index(min(scores)) +1
    return getNumberOfNeurons(data, alpha)
    

NameError: name 'scores' is not defined

In [36]:
def getNumberOfNeurons(data, alpha ):
    return (data.shape[0]/(alpha*(data.shape[1]+1)))