Firstly, import necessary dependencies to the project. In this first version, we can start our study on some fundamental algorithms in Scikit-learn.

In [None]:
import os, sys
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import classification_report
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
import pickle

In [None]:
!pip install imblearn

Give configuration parameters for data, model, and also algorithms

In [97]:
parameters = {
        'predict_mode': 'all', # 'all' or 'single'
        'n_sample': 1000,
        'input_path': 'data/',
        'model': 'GradientBoosting',
        'params_model': {
            'loss': 'log_loss',
            'n_estimators': 100,
            'max_depth': 3,
            'random_state': 0,
            'learning_rate': 0.7,
            'subsample': 0.7,
            'min_samples_split': 20,
            'min_samples_leaf': 20
            
        },
        're_train': False,
        're_balancing_features': True
}
    

Load raw data into dataframes

In [None]:
dfs = pd.read_csv(parameters['input_path'] + "FS1.txt", sep="\t", header = None)
dps = pd.read_csv(parameters['input_path'] + "PS2.txt", sep="\t", header = None)
dpf = pd.read_csv(parameters['input_path'] + "profile.txt", sep="\t", header = None)

The raw data is numerical type, so we can consider its distribution first.

In [54]:
import matplotlib.pyplot as plt
# Histogram for data in FS1.txt
tmp = np.concatenate([np.array(dfs[i].tolist()) for i in dfs.dtypes.index])
tmp
counts, bins = np.histogram(x)

counts, bins

#plt.stairs(counts, bins)
#plt.show() 

(array([2090248, 2665817, 4865653,  921079, 3589301,  396344,    6817,
           3184,    7109,    7448]),
 array([0. , 0.9, 1.8, 2.7, 3.6, 4.5, 5.4, 6.3, 7.2, 8.1, 9. ]))

In [56]:
# Histogram for data in PS2.txt
tmp = np.concatenate([np.array(dfs[i].tolist()) for i in dfs.dtypes.index])
tmp
counts, bins = np.histogram(x)
counts, bins

#plt.stairs(counts, bins)
#plt.show() 

(array([2090248, 2665817, 4865653,  921079, 3589301,  396344,    6817,
           3184,    7109,    7448]),
 array([0. , 0.9, 1.8, 2.7, 3.6, 4.5, 5.4, 6.3, 7.2, 8.1, 9. ]))

The distribution of data give the possibility to discretize them. In this version, we use Kmeans as a model to categorize the data into features. 

In [102]:
kmeans = dict()
for i in dfs.dtypes.index:
    tmp = np.column_stack((np.array(dfs[i].tolist()),np.array([0]*len(dfs))))
    kmeans["fs_" + str(i)] = KMeans(n_clusters=10, random_state=0).fit(tmp)



for i in dps.dtypes.index:
    tmp = np.column_stack((np.array(dps[i].tolist()),np.array([0]*len(dps))))
    kmeans["ps_" + str(i)] = KMeans(n_clusters=5, random_state=0).fit(tmp)



            
with open('models/embeddings.pkl', 'wb') as file:
    pickle.dump(kmeans, file)
    
    

In [103]:
x=[]
#Feature processing for data in FS1.txt
for i in dfs.dtypes.index:
    tmp = np.column_stack((np.array(dfs[i].tolist()),np.array([0]*len(dfs))))
    x.append(kmeans["fs_" + str(i)].predict(tmp))


#Feature processing for data in PS2.txt
for i in dps.dtypes.index:
    tmp = np.column_stack((np.array(dps[i].tolist()),np.array([0]*len(dps))))
    x.append(kmeans["ps_" + str(i)].predict(tmp))
        
x = np.column_stack(x)
#get label from profile.txt
y = np.array(dpf[4].tolist())

In [93]:
#distribution of label
np.unique(y, return_counts = True)

(array([0, 1]), array([1449,  756]))

We start the model training. Here some of our remarques

1. We user GradientBoostingClassifier with log_loss as loss to train the data.
2. Due to the imbalance of the data (e.g. label), we suggest doing a re-sampling to handle it. In this study, we simplify by using imblearn.RandomUnderSampler
3. Turn the parameters of GradientBoostingClassifier to have better performance and reduce overfitting.
4. If you want to use a deeplearning model, we suggest applying at least one-hot encoder on the features to a better performance.

In [105]:
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.1, random_state=42)
if parameters['re_balancing_features']:
    rus = RandomUnderSampler(random_state=0)
    X_train, Y_train = rus.fit_resample(X_train, Y_train) 
    
clf = GradientBoostingClassifier(**parameters['params_model']).fit(X_train, Y_train)


y_pred = clf.predict(X_test)
print(classification_report(Y_test, y_pred))


#Save the model
with open('models/model.pkl', 'wb') as file:
    pickle.dump(clf, file)

              precision    recall  f1-score   support

           0       0.95      0.98      0.96       149
           1       0.96      0.89      0.92        72

    accuracy                           0.95       221
   macro avg       0.95      0.93      0.94       221
weighted avg       0.95      0.95      0.95       221

