In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier

from lightgbm import LGBMClassifier

from pathlib import Path

In [25]:
train = pd.read_csv('data/train_pre_processing_nlp_5000.csv')
test = pd.read_csv('data/test_pre_processing_nlp_5000.csv')
train.info()

submit = test['id'].to_frame()
if 'id' in train.columns:
    train.drop(labels=['id'], axis=1, inplace=True)
test.drop(labels=['id'], axis=1, inplace=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Columns: 5057 entries, keyword to target
dtypes: bool(21), float64(5000), int64(31), object(5)
memory usage: 292.7+ MB


In [26]:
noise_cols = ['location', 'len_location_cero_default', 
              'len_location_mean_default', 'total_words_location_cero_default',
             'total_words_location_mean_default', 'text']
train.drop(labels=noise_cols, axis=1, inplace=True)
test.drop(labels=noise_cols, axis=1, inplace=True)

In [27]:
def prepare_df_for_fit(df):
    columns_str = df.select_dtypes(include=['object']).columns
    
    # Encode with LabelEncoder
    encoded_cols = df[columns_str]
    encoded_cols = encoded_cols.astype('str')
    encoded_cols = encoded_cols.apply(LabelEncoder().fit_transform)
    encoded_drop = df.drop(columns_str, axis = 1)
    encoded_df = pd.concat([encoded_drop, encoded_cols], axis = 1)
    # Drop Target column
    if 'target' in encoded_df.columns:
        encoded_df.drop(axis=1, labels=['target'], inplace=True)

    return encoded_df

In [28]:
train_X = prepare_df_for_fit(train) # train.drop(axis=1, labels=['target'])
test_X = prepare_df_for_fit(test) 
train_Y = train['target']

## KNN 

In [29]:
scaler = StandardScaler()
scaler.fit(train_X)
sc_transform = scaler.transform(train_X)
X = pd.DataFrame(sc_transform)

In [30]:
seed = 7
test_size = 0.20
X_train, X_test, y_train, y_test = train_test_split(X, train_Y, test_size=0.20, random_state=42)

In [31]:
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)

KNeighborsClassifier()

In [32]:
%%time
preds = knn_model.predict(X_test)
print(roc_auc_score(y_test, preds))

0.6119315757740301
CPU times: user 1min 26s, sys: 502 ms, total: 1min 27s
Wall time: 2min 31s


## LGBM

In [33]:
seed = 7
test_size = 0.20
X_train, X_test, y_train, y_test = train_test_split(train_X, train_Y, test_size=0.20, random_state=42)

In [34]:
light_model = LGBMClassifier(random_state=1, colsample_bytree = 0.5, 
                             gamma = 0.1, learning_rate = 0.06, max_depth = 5, min_child_weight = 2, 
                             n_estimators = 300, reg_alpha = 0.1, seed = 123, subsample = 0.9)
light_model.fit(X_train, y_train)

LGBMClassifier(colsample_bytree=0.5, gamma=0.1, learning_rate=0.06, max_depth=5,
               min_child_weight=2, n_estimators=300, random_state=1,
               reg_alpha=0.1, seed=123, subsample=0.9)

In [35]:
%%time
preds = light_model.predict(X_test)
print(roc_auc_score(y_test, preds))

0.7843302316889563
CPU times: user 904 ms, sys: 196 ms, total: 1.1 s
Wall time: 870 ms


## Voting Classifier

In [36]:
%%time
eclf1 = VotingClassifier(estimators=[
        ('knn', knn_model), ('lgbm', light_model)], voting='hard', weights=[1,3])
eclf1 = eclf1.fit(X_train, y_train)
preds = eclf1.predict(X_test)
print(roc_auc_score(y_test, preds))

0.7843302316889563
CPU times: user 14.9 s, sys: 2.19 s, total: 17.1 s
Wall time: 28.9 s
