In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, ShuffleSplit, StratifiedShuffleSplit
from sklearn.metrics import matthews_corrcoef, accuracy_score, make_scorer
from sklearn.utils import resample
from sklearn.decomposition import PCA, KernelPCA
from sklearn.feature_selection import VarianceThreshold, SelectPercentile, SelectKBest, chi2

from imblearn.over_sampling import SMOTE

from xgboost import XGBClassifier

import matplotlib.pyplot as plt

In [2]:
VALIDATION = True

train1_df = pd.read_csv(f'PRML_Datacontest_MKN_JUL_2021/Dataset_1_Training.csv',index_col=0).T
train2_df = pd.read_csv(f'PRML_Datacontest_MKN_JUL_2021/Dataset_2_Training.csv',index_col=0).T

test1_df = pd.read_csv(f'PRML_Datacontest_MKN_JUL_2021/Dataset_1_Testing.csv',index_col=0).T
test2_df = pd.read_csv(f'PRML_Datacontest_MKN_JUL_2021/Dataset_2_Testing.csv',index_col=0).T

train1 = train1_df
train2 = train2_df
test1 = test1_df
test2 = test2_df

if(VALIDATION):
    #Shuffling the dataset
    valSize = int(train1.shape[0]*0.75)
    train1 = train1.sample(frac=1).reset_index(drop=True)
    test1 = train1[valSize:]
    train1 = train1[:valSize]
    
    valSize = int(train2.shape[0]*0.75)
    train2 = train2.sample(frac=1).reset_index(drop=True)
    test2 = train2[valSize:]
    train2 = train2[:valSize]

In [3]:
print(train1.shape)
print(test1.shape)
print(train2.shape)
print(test2.shape)

(97, 22285)
(33, 22285)
(255, 54679)
(85, 54679)


In [4]:
oversample = SMOTE()

X = train1.drop(['CO: 1','CO: 2'], axis=1, errors='ignore')
y = np.array(train1[['CO: 1']])

X.shape

(97, 22283)

In [5]:
X,y = oversample.fit_resample(X,y)
X.shape

(148, 22283)

In [6]:
np.unique(y,return_counts=True)

(array([0., 1.]), array([74, 74], dtype=int64))

In [7]:
test_cols1 = ['CO: 1','CO: 2']
test_cols2 = ['CO: 3','CO: 4','CO: 5','CO: 6']
test_cols = ['CO: 1','CO: 2','CO: 3','CO: 4','CO: 5','CO: 6']

In [8]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 2000, num = 5)]
# Number of features to consider at every split
max_features = ['auto', 'log2']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(5, 100, num = 5)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2]
# Method of selecting samples for training each tree
bootstrap = [False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
#                'min_samples_split': min_samples_split,
#                'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [9]:
random_grid

{'n_estimators': [100, 575, 1050, 1525, 2000],
 'max_features': ['auto', 'log2'],
 'max_depth': [5, 28, 52, 76, 100, None],
 'bootstrap': [False]}

In [10]:
param = random_grid
def pred(train,test,col,model=RandomForestClassifier(),params=param):
    X = train.drop(test_cols, axis=1, errors='ignore')
    y = np.array(train[[col]])
    y = y.reshape(y.shape[0],)
    Xtest = test.drop(test_cols, axis=1, errors='ignore')
    
    X,y = oversample.fit_resample(X,y)

    if(VALIDATION): 
        ytest = test[[col]]
        ytrue.extend(list(ytest[col]))

    scaler = preprocessing.MinMaxScaler()
    scaledX = scaler.fit_transform(X)
#     selector = SelectPercentile(chi2, percentile=10)
#     scaledX = selector.fit_transform(scaledX,y)
    
    scaledXtest = scaler.transform(Xtest)
#     scaledXtest = selector.transform(scaledXtest)
    
    #model = XGBClassifier()
    model = GridSearchCV(model,params,verbose=1,n_jobs=4,cv=StratifiedShuffleSplit(n_splits=3,test_size=0.25),scoring=make_scorer(matthews_corrcoef))
    model.fit(scaledX,y)
    best_params.append(model.best_params_)
    print(model.best_params_)
      
    ypred.extend(model.predict(scaledXtest))   

    if(VALIDATION): print(col,model.score(scaledXtest,ytest),accuracy_score(ytest,model.predict(scaledXtest)))
    else: print(col)
        
    return pd.DataFrame(model.cv_results_).sort_values(by=['rank_test_score']).head(10)

In [11]:
ypred = []
ytrue = []
best_params = []
random_grid = {}
pred(train1,test1,'CO: 1',params=random_grid)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   3 out of   3 | elapsed:    1.3s finished


{}
CO: 1 0.563241847975046 0.8181818181818182


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.435453,0.008197,0.010306,0.00047,{},0.897365,0.740442,0.897365,0.845057,0.073974,1


In [12]:
pred(train1,test1,'CO: 2',params=random_grid)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   3 out of   3 | elapsed:    0.9s finished


{}
CO: 2 0.35 0.7272727272727273


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.318174,0.024729,0.008976,0.000814,{},0.288675,0.511891,0.57735,0.459305,0.123578,1


In [13]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 500, stop = 2000, num = 4)]
# Number of features to consider at every split
max_features = ['auto', 'log2']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(50, 100, num = 3)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2]
# Method of selecting samples for training each tree
bootstrap = [False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
#                'min_samples_split': min_samples_split,
#                'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [14]:
random_grid

{'n_estimators': [500, 1000, 1500, 2000],
 'max_features': ['auto', 'log2'],
 'max_depth': [50, 75, 100, None],
 'bootstrap': [False]}

In [15]:
random_grid = {}
pred(train2,test2,'CO: 3',params=random_grid)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   3 out of   3 | elapsed:    2.6s finished


{}
CO: 3 0.1105077777477141 0.8117647058823529


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,2.468806,0.043581,0.023282,0.000868,{},0.871819,0.851116,0.851321,0.858085,0.009711,1


In [16]:
pred(train2,test2,'CO: 4',params=random_grid)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   3 out of   3 | elapsed:    3.1s finished


{}
CO: 4 0.0 0.9058823529411765


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,3.02448,0.019605,0.026313,0.001297,{},0.909718,0.925187,0.944911,0.926605,0.014403,1


In [17]:
pred(train2,test2,'CO: 5',params=random_grid)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   3 out of   3 | elapsed:    2.0s finished


{}
CO: 5 0.5476858553567217 0.7764705882352941


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,1.98273,0.036716,0.019616,0.000471,{},0.621622,0.654654,0.702959,0.659745,0.033401,1


In [18]:
pred(train2,test2,'CO: 6',params=random_grid)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   3 out of   3 | elapsed:    2.1s finished


{}
CO: 6 -0.025321020464017845 0.5294117647058824


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,2.037573,0.012798,0.023962,0.003289,{},0.228639,0.454902,0.306543,0.330028,0.093852,1


In [19]:
# pred(train1,test1,'CO: 1')
# pred(train1,test1,'CO: 2')

# pred(train2,test2,'CO: 3')
# pred(train2,test2,'CO: 4')
# pred(train2,test2,'CO: 5')
# pred(train2,test2,'CO: 6')

In [20]:
print(best_params)

[{}, {}, {}, {}, {}, {}]


In [21]:
if(VALIDATION):
    print(matthews_corrcoef(ytrue,ypred))
else:
#     submission = pd.DataFrame(ypred,columns=['Predicted'])
#     submission.index.name = 'Id'
    submission = pd.read_csv('dummy_submission.csv')
    submission.Predicted = np.array(ypred,dtype=int)
    submission.to_csv('smoteRFGridCV.csv',index=False)
    print(submission.shape)

0.42685505078322244
