In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [2]:
train = pd.read_csv("Data/training_data.csv", names=["sequence", "class"])
train.head()
X = train["sequence"].apply(lambda x: pd.Series(list(x))).drop(columns=[30, 31, 32])
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,f,g,i,a,d,c,f,a,b,d,...,e,f,e,e,c,i,e,d,c,b
1,b,g,h,f,c,d,b,a,e,c,...,e,i,c,i,h,d,c,c,b,e
2,a,g,i,d,f,e,d,a,d,g,...,c,d,b,a,h,g,c,d,e,h
3,g,c,h,c,d,f,c,g,h,g,...,c,f,c,d,e,c,f,g,b,g
4,h,g,b,d,g,h,a,f,c,c,...,c,a,g,c,d,f,c,b,b,e
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5452,b,e,h,a,d,e,f,g,f,h,...,b,a,a,f,a,i,g,c,g,d
5453,d,f,f,f,i,e,h,c,c,e,...,g,d,e,i,e,g,h,b,c,b
5454,h,b,b,c,d,d,a,h,i,c,...,b,i,b,b,e,g,e,g,g,e
5455,c,a,c,i,a,b,g,d,d,g,...,e,h,h,b,h,g,i,e,i,b


In [3]:
enc = OneHotEncoder()
enc.fit(X)
X = enc.transform(X).toarray()
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 1., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]])

In [4]:
test = pd.read_csv("Data/imbalanced_test_data.csv", names=["sequence", "class"])
X_test = test["sequence"].apply(lambda x: pd.Series(list(x)))
X_test = enc.transform(X_test).toarray()
X_test

array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 1., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [5]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.combine  import SMOTETomek
sm = SMOTE(random_state=42, k_neighbors=5)
# sm = SMOTETomek(random_state=42)
X_res, y_res = sm.fit_resample(X, train["class"])

X_new_train, X_new_test, y_new_train, y_new_test = train_test_split(X_res, y_res, test_size=0.20, random_state=42, stratify=y_res)

print(len(X))
print(len(X_res))

from sklearn.utils import class_weight
classes_weights = class_weight.compute_sample_weight(
    class_weight='balanced',
    y=y_new_train
)
len(classes_weights)

5457
6070


4856

In [6]:
from xgboost import XGBClassifier

smallest_class_count = train["class"].sum()
largest_class_count = len(train["class"]) - smallest_class_count
spw = largest_class_count / smallest_class_count
print(spw)

clf = XGBClassifier(booster='gbtree', 
   			objective='multi:softmax',
			num_class=5, 
			learning_rate=0.1, reg_alpha=0.005,
  			n_estimators=201, 
			gamma = 0.8,
			sub_sample=0.1,
			colsample_bytree=0.8,
  			scale_pos_weight=spw, 
  			random_state=101, 
            n_jobs=-1, 
			sample_weight=classes_weights,
			eval_metric="aucpr")

# clf.fit(X_new_train, y_new_train, sample_weight=classes_weights)
clf.fit(X_new_train, y_new_train)
# make predictions
preds = clf.predict(X_new_test)
target_names=["cls1", "cls2", "cls3", "cls4", "cls5"]
print(classification_report(y_new_test, preds))

-0.49109391028630045
Parameters: { "sample_weight", "scale_pos_weight", "sub_sample" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




  from pandas import MultiIndex, Int64Index


              precision    recall  f1-score   support

           0       0.96      0.96      0.96       243
           1       0.96      0.96      0.96       242
           2       0.99      0.98      0.99       243
           3       0.99      1.00      0.99       243
           4       0.99      1.00      0.99       243

    accuracy                           0.98      1214
   macro avg       0.98      0.98      0.98      1214
weighted avg       0.98      0.98      0.98      1214



In [7]:
classes_weights = class_weight.compute_sample_weight(
    class_weight='balanced',
    y=y_res
)
len(classes_weights)

6070

In [8]:
from sklearn.metrics import f1_score

def f1_eval(y_pred, dtrain):
    y_true = dtrain.get_label()
    err = 1-f1_score(y_true, np.round(y_pred))
    return 'f1_err', err

In [9]:
smallest_class_count = train["class"].sum()
largest_class_count = len(train["class"]) - smallest_class_count
spw = largest_class_count / smallest_class_count
print(spw)

bst =  XGBClassifier(booster='gbtree', 
   			objective='multi:softmax',
			num_class=5, 
			learning_rate=0.1, reg_alpha=0.005,
  			n_estimators=201, 
			gamma = 0.8,
			sub_sample=0.1,
			colsample_bytree=0.8,
  			scale_pos_weight=spw, 
  			random_state=101, 
            n_jobs=-1, 
			sample_weight=classes_weights,
			eval_metric="aucpr")


bst.fit(X_res, y_res, sample_weight=classes_weights)
# make predictions
preds = bst.predict(X_test)


-0.49109391028630045
Parameters: { "sample_weight", "scale_pos_weight", "sub_sample" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.






In [10]:
f = open("level_8_submission.csv", "a")
for i in preds.tolist():
    f.write(str(i) + "\n")
f.close()