In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import sklearn
import math
import joblib
from matplotlib import pyplot as plt

In [2]:
# Load the npz file
train_1 = np.load("data/train_data.npy")
train_2 = np.load("data/train_labels.npy")
train_df = DataFrame(train_1)
del train_1
train_lb_df = DataFrame(train_2)
del train_2

# For validation
val_1 = np.load("data/eval_data.npy")
val_2 = np.load("data/eval_labels.npy")
val_df = DataFrame(val_1)
del val_1
val_lb_df = DataFrame(val_2)
del val_2

# for unit_test

uni_df = train_df.iloc[100:,:]
uni_lb_df = train_lb_df.iloc[100:,:]


In [3]:
print(val_lb_df.iloc[:,0].unique())

[1 0 2]


In [4]:
print(train_lb_df.iloc[:,0].unique()) #[1, 0, 2]
# Draw some plots to simply see the characteristics of training data
def DrawDist( ser_ft, ser_lb, ps="" ):
    ft_name = str(ser_ft.name)
    fig = plt.figure(figsize=(4,5), dpi=70)
    plt.hist(ser_ft[ser_lb==0], color='red', cumulative=False, 
             alpha=0.3,histtype='step', density=True)
    plt.hist(ser_ft[ser_lb==1], color='blue', cumulative=False, 
             alpha=0.3,histtype='step', density=True)
    plt.hist(ser_ft[ser_lb==2], color='grey', cumulative=False, 
             alpha=0.3,histtype='step', density=True)
    plt.show()
    fig.savefig("Fig/" + ft_name + ps + "_label.png")

#for ft in train_df.columns:
#   DrawDist( train_df[ft], train_lb_df.iloc[:,0] )

#train_lb_df = pd.Series(train_lb_df.iloc[:,0])
# Check if the classes are imbalance
no_0 = len(train_lb_df[train_lb_df==0])
no_1 = len(train_lb_df[train_lb_df==1])
no_2 = len(train_lb_df[train_lb_df==2])
print("class 0 : ", no_0/(no_0+no_1+no_2))
print("class 1 : ", no_1/(no_0+no_1+no_2))
print("class 2 : ", no_2/(no_0+no_1+no_2))

[1 0 2]
class 0 :  0.3333333333333333
class 1 :  0.3333333333333333
class 2 :  0.3333333333333333


In [5]:
for ft in train_df.columns:
    ft_uni_list = train_df[ft].unique()
    if len(ft_uni_list) < 100: #if len(ft_uni_list) < 10:
        print("feature:" + str(ft) + ", categories #:" + str(len(ft_uni_list)), ft_uni_list )
# deal with the categories feature => they have been already one-hot
print(type(train_df.columns[2]))

feature:56, categories #:2 [0. 1.]
feature:57, categories #:2 [0. 1.]
feature:58, categories #:2 [0. 1.]
feature:59, categories #:2 [0. 1.]
feature:60, categories #:2 [0. 1.]
<class 'int'>


In [7]:
# standardlize the datas
#to check if the features have already standardlized
"""
for ft in train_df.columns:
    print(str(ft) + "'s mean:", train_df[ft].mean())
    print(str(ft) + "'s std:", train_df[ft].std())
"""

def StandardlizeData(df, continuous_list=[]):
    if len(continuous_list) == 0:
        continuous_list = df.columns
    from sklearn import preprocessing
    scaler = preprocessing.StandardScaler().fit( df[continuous_list] )
    df[continuous_list] = scaler.transform( df[continuous_list] )
    
    return df, scaler
# Except the binary features, others have been already standardlized

In [None]:
# training for [0,1]



In [10]:
def GridSearchingBDT( X, y, _scoring="accuracy" ):
    from sklearn.tree import DecisionTreeClassifier as DTC
    from sklearn.ensemble import AdaBoostClassifier as AdaB
    from sklearn.model_selection import train_test_split
    from sklearn.model_selection import GridSearchCV as GS

    param = { "base_estimator__max_features" : ["sqrt", 0.7, None], "base_estimator__max_leaf_nodes" : [100, None], 
              "base_estimator__max_depth" : range(4,28,2), "n_estimators": [30, 50, 70] } #for f1
    tree = DTC( random_state=1 )
    ABC = AdaB( base_estimator=tree, random_state=11, learning_rate=0.1 )
    grid_search_ABC = GS(ABC, param_grid=param, scoring = _scoring, verbose=10, cv=3, n_jobs=-1)
    grid_search_ABC.fit(X, y)
    print(grid_search_ABC.best_score_, grid_search_ABC.best_estimator_, grid_search_ABC.best_params_)
    # Store the best model
    final_model = grid_search_ABC.best_estimator_
    model_name = "Model/GS_01_BDT_training_best_" + _scoring
    joblib.dump( final_model, model_name )


In [19]:
print(train_lb_df.index)
print((train_df.iloc[:1000,:].loc[(train_lb_df[0]==0)|(train_lb_df[0]==1),:]).shape)
print((train_df.loc[((train_lb_df[0]==0)|(train_lb_df[0]==1)) & (train_lb_df.index<10000),:]).shape)

RangeIndex(start=0, stop=1174461, step=1)
(803, 70)
(8028, 70)


In [20]:
GridSearchingBDT( train_df.loc[((train_lb_df[0]==0)|(train_lb_df[0]==1)) & (train_lb_df.index<10000),:],
                  train_lb_df.loc[((train_lb_df[0]==0)|(train_lb_df[0]==1)) & (train_lb_df.index<10000),0] )

Fitting 3 folds for each of 216 candidates, totalling 648 fits
0.6990533134030891 AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=12,
                                                         max_features='sqrt',
                                                         random_state=1),
                   learning_rate=0.1, n_estimators=70, random_state=11) {'base_estimator__max_depth': 12, 'base_estimator__max_features': 'sqrt', 'base_estimator__max_leaf_nodes': None, 'n_estimators': 70}


In [65]:
def BDTTraining(X, y, name=""):
    from sklearn.tree import DecisionTreeClassifier as DTC
    from sklearn.ensemble import AdaBoostClassifier as AdaB
    from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, roc_curve

    tree = DTC( random_state=1, max_depth=12, max_features="sqrt", max_leaf_nodes=None )
    ABC = AdaB( base_estimator=tree, random_state=11, learning_rate=0.1, n_estimators=70 )
    ABC.fit(X, y)
    # Store the best model
    final_model = ABC
    model_name = "Model/01_BDT_training_" + name
    joblib.dump( final_model, model_name )
    print("Accuracy : ", accuracy_score(val_lb_df.loc[(val_lb_df[0]==0)|(val_lb_df[0]==1),:],  final_model.predict(val_df.loc[(val_lb_df[0]==0)|(val_lb_df[0]==1),:])))


In [66]:
BDTTraining(train_df.loc[((train_lb_df[0]==0)|(train_lb_df[0]==1)) & (train_lb_df.index<100000),:],
                  train_lb_df.loc[((train_lb_df[0]==0)|(train_lb_df[0]==1)) & (train_lb_df.index<100000),0]
                  , "12sqrtnone70")

Accuracy :  0.7204341317365269


In [21]:
def GridSearchingRF( X, y, _scoring="accuracy" ):
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import train_test_split
    from sklearn.model_selection import GridSearchCV as GS

    param = { "n_estimators" : [100, 150, 200, 250, 300, 350, 400], "max_features":["sqrt", 0.7, None] } 
    forest = RandomForestClassifier( criterion='gini', random_state=22, n_jobs=-1)
    grid_search = GS(forest, param_grid=param, scoring = _scoring, verbose=10, cv=3, n_jobs=-1)
    grid_search.fit(X, y)
    print(grid_search.best_score_, grid_search.best_estimator_, grid_search.best_params_)
    # Store the best model
    final_model = grid_search.best_estimator_
    model_name = "Model/GS_01_RF_training_best_" + _scoring
    joblib.dump( final_model, model_name )

    return final_model

In [22]:
GridSearchingRF( train_df.loc[((train_lb_df[0]==0)|(train_lb_df[0]==1)) & (train_lb_df.index<10000),:],
                  train_lb_df.loc[((train_lb_df[0]==0)|(train_lb_df[0]==1)) & (train_lb_df.index<10000),0] )

Fitting 3 folds for each of 21 candidates, totalling 63 fits
0.6948181365221724 RandomForestClassifier(max_features='sqrt', n_estimators=350, n_jobs=-1,
                       random_state=22) {'max_features': 'sqrt', 'n_estimators': 350}


RandomForestClassifier(max_features='sqrt', n_estimators=350, n_jobs=-1,
                       random_state=22)

In [59]:
def RFTraining(X, y, name=""):
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import train_test_split
    from sklearn.model_selection import GridSearchCV as GS
    from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, roc_curve

    forest = RandomForestClassifier( criterion='gini', n_estimators=350, max_features=None, random_state=22, n_jobs=-1)
    forest.fit(X, y)
    # Store the best model
    final_model = forest
    model_name = "Model/01_RF_training_" + name
    joblib.dump( final_model, model_name )
    print("Accuracy : ", accuracy_score(val_lb_df.loc[(val_lb_df[0]==0)|(val_lb_df[0]==1),:],  final_model.predict(val_df.loc[(val_lb_df[0]==0)|(val_lb_df[0]==1),:])))


In [58]:
RFTraining(train_df.loc[((train_lb_df[0]==0)|(train_lb_df[0]==1)) & (train_lb_df.index<100000),:],
                  train_lb_df.loc[((train_lb_df[0]==0)|(train_lb_df[0]==1)) & (train_lb_df.index<100000),0]
                  , "25none")

Accuracy :  0.7287558568663278


In [56]:
RFTraining(train_df.loc[((train_lb_df[0]==0)|(train_lb_df[0]==1)) & (train_lb_df.index<100000),:],
                  train_lb_df.loc[((train_lb_df[0]==0)|(train_lb_df[0]==1)) & (train_lb_df.index<100000),0]
                  , "25sqrt")

Accuracy :  0.7300787398013641


In [53]:
RFTraining(train_df.loc[((train_lb_df[0]==0)|(train_lb_df[0]==1)) & (train_lb_df.index<100000),:],
                  train_lb_df.loc[((train_lb_df[0]==0)|(train_lb_df[0]==1)) & (train_lb_df.index<100000),0]
                  , "350sqrt")

Accuracy :  0.742245870796172


In [60]:
RFTraining(train_df.loc[((train_lb_df[0]==0)|(train_lb_df[0]==1)) & (train_lb_df.index<100000),:],
                  train_lb_df.loc[((train_lb_df[0]==0)|(train_lb_df[0]==1)) & (train_lb_df.index<100000),0]
                  , "350none")

Accuracy :  0.7440707708450169


In [50]:
RFTraining(train_df.loc[((train_lb_df[0]==0)|(train_lb_df[0]==1)) & (train_lb_df.index<100000),:],
                  train_lb_df.loc[((train_lb_df[0]==0)|(train_lb_df[0]==1)) & (train_lb_df.index<100000),0]
                  , "150sqrt")

Accuracy :  0.742245870796172


In [67]:
def GridSearchingLR( X, y, _scoring="accuracy" ):
    from sklearn import preprocessing
    from sklearn.preprocessing import StandardScaler
    from sklearn.model_selection import train_test_split
    from sklearn.model_selection import GridSearchCV as GS
    from sklearn.pipeline import make_pipeline
    from sklearn.linear_model import LogisticRegression

    grid_search = GS(LogisticRegression(), param_grid={'C': [0.0001, 0.001, 0.01, 0.1, 10., 100.]}, cv=2, refit=True, scoring = _scoring, verbose=10, n_jobs=-1)
    pipeline = make_pipeline(StandardScaler(), grid_search)
    
    pipeline.fit(X, y)
    print(grid_search.best_score_, grid_search.best_estimator_, grid_search.best_params_)
    # Store the best model
    final_model = grid_search.best_estimator_
    model_name = "Model/GS_01_LR_training_best_" + _scoring
    joblib.dump( final_model, model_name )

    return final_model

In [69]:
GridSearchingLR( train_df.loc[((train_lb_df[0]==0)|(train_lb_df[0]==1)) & (train_lb_df.index<100000),:],
                  train_lb_df.loc[((train_lb_df[0]==0)|(train_lb_df[0]==1)) & (train_lb_df.index<100000),0] )

Fitting 2 folds for each of 6 candidates, totalling 12 fits
0.7445564559440297 LogisticRegression(C=0.01) {'C': 0.01}


LogisticRegression(C=0.01)

In [96]:
def LRTraining(X, y, name=""):
    from sklearn import preprocessing
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import make_pipeline
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, roc_curve

    lr = LogisticRegression(C=0.001)
    pipeline = make_pipeline(StandardScaler(), lr)
    pipeline.fit(X, y)
    # Store the best model
    final_model = pipeline
    model_name = "Model/01_final_LR_training_" + name
    joblib.dump( final_model, model_name )
    print("Accuracy : ", accuracy_score(val_lb_df.loc[(val_lb_df[0]==0)|(val_lb_df[0]==1),:],  final_model.predict(val_df.loc[(val_lb_df[0]==0)|(val_lb_df[0]==1),:])))

    return final_model

In [74]:
LRTraining(train_df.loc[((train_lb_df[0]==0)|(train_lb_df[0]==1)) & (train_lb_df.index<300000),:],
                  train_lb_df.loc[((train_lb_df[0]==0)|(train_lb_df[0]==1)) & (train_lb_df.index<300000),0]
                  , "0pt01")

Accuracy :  0.7401993595889791


Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression', LogisticRegression(C=0.01))])

In [77]:
LRTraining(train_df.loc[((train_lb_df[0]==0)|(train_lb_df[0]==1)) & (train_lb_df.index<300000),:],
                  train_lb_df.loc[((train_lb_df[0]==0)|(train_lb_df[0]==1)) & (train_lb_df.index<300000),0]
                  , "0pt001")

Accuracy :  0.7403780053186678


Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression', LogisticRegression(C=0.001))])

In [80]:
LRTraining(train_df.loc[((train_lb_df[0]==0)|(train_lb_df[0]==1)) & (train_lb_df.index<300000),:],
                  train_lb_df.loc[((train_lb_df[0]==0)|(train_lb_df[0]==1)) & (train_lb_df.index<300000),0]
                  , "0pt1")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Accuracy :  0.7402570237169166


Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression', LogisticRegression(C=0.1))])

In [87]:
LRTraining(train_df.loc[((train_lb_df[0]==0)|(train_lb_df[0]==1)) & (train_lb_df.index<300000),:],
                  train_lb_df.loc[((train_lb_df[0]==0)|(train_lb_df[0]==1)) & (train_lb_df.index<300000),0]
                  , "1")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Accuracy :  0.7402491090326899


Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression', LogisticRegression())])

In [92]:
LRTraining(train_df.loc[((train_lb_df[0]==0)|(train_lb_df[0]==1)) & (train_lb_df.index>900000),:],
                  train_lb_df.loc[((train_lb_df[0]==0)|(train_lb_df[0]==1)) & (train_lb_df.index>900000),0]
                  , "0pt001_other")

Accuracy :  0.7398952095808383


Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression', LogisticRegression(C=0.001))])

### Do the model ensembling for [0,1]

In [94]:
def Training_EnsembleModel(X, y, name=""):
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import AdaBoostClassifier
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.ensemble import VotingClassifier
    from sklearn.preprocessing import StandardScaler
    from sklearn.linear_model import LogisticRegression
    from sklearn.pipeline import make_pipeline
    from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, roc_curve
    
    lr = LogisticRegression(C=0.001)
    pipeline = make_pipeline(StandardScaler(), lr)
    
    forest = RandomForestClassifier( n_estimators=350, criterion='gini', random_state=22, n_jobs=-1, max_features=None )
    
    tree = DecisionTreeClassifier( random_state=1, max_depth=12, max_features="sqrt", max_leaf_nodes=None )
    ABC = AdaBoostClassifier( base_estimator=tree, random_state=11, learning_rate=0.1, n_estimators=70 )
    
    model = VotingClassifier(estimators=[
            ('lr', pipeline), ('rf', forest), ('bdt', ABC)],
            voting='soft')
    model = model.fit(X, y)
    joblib.dump( model, "01_final_Ensemble_training_"+name )
    print("Accuracy : ", accuracy_score(val_lb_df.loc[(val_lb_df[0]==0)|(val_lb_df[0]==1),:],  model.predict(val_df.loc[(val_lb_df[0]==0)|(val_lb_df[0]==1),:])))

In [95]:
Training_EnsembleModel(train_df.loc[((train_lb_df[0]==0)|(train_lb_df[0]==1)) & (train_lb_df.index>1000000),:],
                  train_lb_df.loc[((train_lb_df[0]==0)|(train_lb_df[0]==1)) & (train_lb_df.index>1000000),0]
                  , "1")

Accuracy :  0.7414363116666968


In [84]:
print(len(train_lb_df))

1174461


In [74]:
"""
from sklearn.ensemble import RandomForestClassifier
forest_0_1 = RandomForestClassifier(criterion='gini',
    n_estimators=100,
    random_state=1,
    n_jobs=-1)
scores_RF_0_1_all = cross_val_score(forest_0_1, train_df.loc[(train_lb_df[0]==0)|(train_lb_df[0]==1),:],
                         train_lb_df.loc[(train_lb_df[0]==0)|(train_lb_df[0]==1),0], scoring='accuracy', cv=5)
print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores_RF_0_1_all), np.std(scores_RF_0_1_all)))
"""

CV accuracy: 0.725 +/- 0.007
