In [1]:
import pandas as pd
import numpy as np

# Load the pre-processed splits; the "id" column is used as the row index.
train = pd.read_csv("./pre_processed_train.csv", index_col="id")
test = pd.read_csv("./pre_processed_test.csv", index_col="id")

# Separate the target ("label") from the remaining feature columns.
# `train` / `test` themselves are left untouched.
y = train["label"].copy()
X = train.drop(columns=["label"])

y_test = test["label"].copy()
X_test = test.drop(columns=["label"])

# Preview the first rows of the training frame (still includes "label").
train.head()

Unnamed: 0_level_0,label,confidence,social_timestamp,social_karma,syntax_ari,lex_liwc_WC,lex_liwc_Analytic,lex_liwc_Clout,lex_liwc_Authentic,lex_liwc_Tone,...,Tokenized_Segment_1601,Tokenized_Segment_1602,Tokenized_Segment_1603,Tokenized_Segment_1604,Tokenized_Segment_1605,Tokenized_Segment_1606,Tokenized_Segment_1607,Tokenized_Segment_1608,Tokenized_Segment_1609,Tokenized_Segment_1610
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
33181,1,0.8,1521614353,5,1.806818,116,72.64,15.04,89.26,1.0,...,0,0,0,0,0,0,0,0,0,0
2606,0,1.0,1527009817,4,9.429737,109,79.08,76.85,56.75,98.18,...,0,0,0,0,0,0,0,0,0,0
38816,1,0.8,1535935605,2,7.769821,167,33.8,76.38,86.24,25.77,...,0,0,0,0,0,0,0,0,0,0
239,1,0.6,1516429555,0,2.667798,273,2.98,15.25,95.42,79.26,...,0,0,0,0,0,0,0,0,0,0
1421,1,0.8,1539809005,24,7.554238,89,32.22,28.71,84.01,1.0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# evaluate a given model on a single 80/20 hold-out split (not k-fold cross-validation)
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
def evaluate_model(model, X, y, random_state=42, early_stopping_rounds=50, eval_metric='auc'):
    """Fit ``model`` on one 80/20 hold-out split and return its validation F1.

    This is a single train/validation split, not k-fold cross-validation.
    The validation fold is handed to ``fit`` as ``eval_set`` (so it drives
    early stopping) and is also the fold the returned F1 is computed on; the
    score is therefore likely optimistic compared with data the model never
    saw during fitting.

    Parameters
    ----------
    model : estimator
        Object whose ``fit`` accepts the ``early_stopping_rounds``,
        ``eval_metric``, ``verbose`` and ``eval_set`` keyword arguments and
        which exposes ``predict``.
    X, y : array-like
        Features and labels; split together by ``train_test_split``.
    random_state : int, default 42
        Seed forwarded to ``train_test_split``.
    early_stopping_rounds : int, default 50
        Forwarded to ``model.fit``.
    eval_metric : str, default 'auc'
        Forwarded to ``model.fit``.

    Returns
    -------
    float
        ``f1_score(..., average="binary")`` on the validation fold.
    """
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y,
        train_size=0.8, test_size=0.2,
        random_state=random_state,
    )

    # NOTE(review): recent xgboost releases expect early_stopping_rounds /
    # eval_metric on the estimator constructor rather than in fit() --
    # confirm against the installed version.
    model.fit(
        X_train, y_train,
        early_stopping_rounds=early_stopping_rounds,
        eval_metric=eval_metric,
        verbose=False,
        eval_set=[(X_valid, y_valid)],
    )

    y_hat = model.predict(X_valid)

    return f1_score(y_valid, y_hat, average="binary")


In [15]:
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from lightgbm import LGBMClassifier
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning)


In [16]:
### Optimizing hyperparameters for XGBoost, GBM and LGBM, which gave the highest base accuracy (78%)

##XGBoost

# optuna to optimize
import optuna
from sklearn.model_selection import cross_val_score

def xgb_objective(trial):
    """Optuna objective for XGBoost.

    Draws one value per tuned hyperparameter from ``trial``, builds an
    ``XGBClassifier`` from them together with the fixed GPU settings, and
    returns the score ``evaluate_model`` reports for the module-level
    ``X`` / ``y``.
    """
    # Search space; the suggest calls run in the order they are listed here.
    sampled = {
        "n_estimators": trial.suggest_int("n_estimators", 300, 5000),
        "max_depth": trial.suggest_int("max_depth", 1, 10),
        "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-1, log=True),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 7, step=2),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0, step=0.1),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0, step=0.1),
    }

    # Settings that are not tuned: AUC as eval metric, GPU histogram training
    # and GPU prediction on device 0.
    fixed = dict(
        eval_metric='auc',
        n_jobs=-1,
        tree_method='gpu_hist',
        predictor="gpu_predictor",
        gpu_id=0,
    )

    classifier = XGBClassifier(**sampled, **fixed)
    return evaluate_model(classifier, X, y)

# Seed the sampler so the sequence of suggested hyperparameters can be replayed
# on a fresh kernel instead of changing with every run.
# NOTE(review): GPU training may still add run-to-run variation to the scores -- confirm.
study_xgb = optuna.create_study(
    study_name='xgboost_optimization',
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_xgb.optimize(xgb_objective, n_trials=100)

[32m[I 2021-08-29 15:48:38,682][0m A new study created in memory with name: xgboost_optimization[0m
[32m[I 2021-08-29 15:48:42,737][0m Trial 0 finished with value: 0.7266775777414076 and parameters: {'n_estimators': 1525, 'max_depth': 2, 'learning_rate': 0.0018540438937934133, 'min_child_weight': 7, 'subsample': 0.8, 'colsample_bytree': 0.9}. Best is trial 0 with value: 0.7266775777414076.[0m
[32m[I 2021-08-29 15:48:56,623][0m Trial 1 finished with value: 0.7548387096774194 and parameters: {'n_estimators': 1996, 'max_depth': 10, 'learning_rate': 0.002221049544497063, 'min_child_weight': 3, 'subsample': 0.9, 'colsample_bytree': 0.5}. Best is trial 1 with value: 0.7548387096774194.[0m
[32m[I 2021-08-29 15:49:02,582][0m Trial 2 finished with value: 0.7798742138364779 and parameters: {'n_estimators': 4597, 'max_depth': 5, 'learning_rate': 0.008719503800237062, 'min_child_weight': 3, 'subsample': 0.7, 'colsample_bytree': 0.5}. Best is trial 2 with value: 0.7798742138364779.[0m


In [17]:
# Show the full record of the best trial, then just its hyperparameters.
best_xgb_trial = study_xgb.best_trial
print(best_xgb_trial)
print(best_xgb_trial.params)

FrozenTrial(number=61, values=[0.7954545454545454], datetime_start=datetime.datetime(2021, 8, 29, 16, 13, 37, 595397), datetime_complete=datetime.datetime(2021, 8, 29, 16, 13, 58, 175131), params={'n_estimators': 3744, 'max_depth': 2, 'learning_rate': 0.09916927539385754, 'min_child_weight': 3, 'subsample': 1.0, 'colsample_bytree': 0.5}, distributions={'n_estimators': IntUniformDistribution(high=5000, low=300, step=1), 'max_depth': IntUniformDistribution(high=10, low=1, step=1), 'learning_rate': LogUniformDistribution(high=0.1, low=1e-06), 'min_child_weight': IntUniformDistribution(high=7, low=1, step=2), 'subsample': DiscreteUniformDistribution(high=1.0, low=0.5, q=0.1), 'colsample_bytree': DiscreteUniformDistribution(high=1.0, low=0.5, q=0.1)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=61, state=TrialState.COMPLETE, value=None)
{'n_estimators': 3744, 'max_depth': 2, 'learning_rate': 0.09916927539385754, 'min_child_weight': 3, 'subsample': 1.0, 'colsample_bytree