In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier

import xgboost as xgb

from pathlib import Path

In [2]:
train = pd.read_csv('data/train_pre_processing_nlp_5000.csv')
test = pd.read_csv('data/test_pre_processing_nlp_5000.csv')
train.info()

submit = test['id'].to_frame()
train.drop(labels=['id'], axis=1, inplace=True)
test.drop(labels=['id'], axis=1, inplace=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Columns: 5057 entries, keyword to target
dtypes: bool(21), float64(5000), int64(31), object(5)
memory usage: 292.7+ MB


In [3]:
noise_cols = ['location', 'len_location_cero_default', 
              'len_location_mean_default', 'total_words_location_cero_default',
             'total_words_location_mean_default', 'text']
train.drop(labels=noise_cols, axis=1, inplace=True)
test.drop(labels=noise_cols, axis=1, inplace=True)

In [4]:
def prepare_df_for_fit(df):
    columns_str = ['keyword_grouped', 'keyword', 'text_clean']
    
    # Encode with LabelEncoder
    encoded_cols = df[columns_str]
    encoded_cols = encoded_cols.astype('str')
    encoded_cols = encoded_cols.apply(LabelEncoder().fit_transform)
    encoded_drop = df.drop(columns_str, axis = 1)
    encoded_df = pd.concat([encoded_drop, encoded_cols], axis = 1)
    # Drop Target column
    if 'target' in encoded_df.columns:
        encoded_df.drop(axis=1, labels=['target'], inplace=True)

    return encoded_df

In [5]:
train_X = prepare_df_for_fit(train) #train.drop(axis=1, labels=['target']) 
test_X = prepare_df_for_fit(test) #train.drop(axis=1, labels=['target'])
train_Y = train['target']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(train_X, train_Y, test_size = 0.25, random_state = 10)

In [None]:
%%time
xgb_model = xgb.XGBClassifier(objective="reg:linear", random_state=10, colsample_bytree = 0.5, 
                             gamma = 0.1, learning_rate = 0.06, max_depth = 5, min_child_weight = 2, 
                             n_estimators = 300, reg_alpha = 0.1, seed = 123, subsample = 0.9)

xgb_model.fit(X_train, y_train)

In [None]:
preds = xgb_model.predict(X_test)

In [None]:
roc_auc_score(y_test, preds)

In [None]:
acc = accuracy_score(preds,y_test)
print("ACC: %f" % (acc))

In [None]:
# 0.7875611684509691
# ACC: 0.799895

## KNN

In [None]:
scaler = StandardScaler()
scaler.fit(train_X)
sc_transform = scaler.transform(train_X)
X = pd.DataFrame(sc_transform)

In [None]:
seed = 7
test_size = 0.20
X_train, X_test, y_train, y_test = train_test_split(X, train_Y, test_size=0.20, random_state=42)

In [None]:
%%time
knn_model = KNeighborsClassifier(n_neighbors=15)
knn_model.fit(X_train, y_train)

In [None]:
preds = knn_model.predict(X_test)
roc_auc_score(y_test, preds)

In [None]:
# 0.789257720908421

#### Optimal K

## Voting Classifier

In [None]:
%%time
eclf1 = VotingClassifier(estimators=[
        ('knn', knn_model), ('xgb', xgb_model)], voting='hard', weights=[1,3])
eclf1 = eclf1.fit(X_train, y_train)
preds = eclf1.predict(X_test)
roc_auc_score(y_test, preds)

In [None]:
# 0.7900968925966017 (5000 TF IDF)

## Submit

In [None]:
%%time
eclf1 = eclf1.fit(train_X, train_Y)
preds = eclf1.predict(test_X)
submit['target'] = pd.DataFrame(preds)
submit

In [None]:
Path("result").mkdir(parents=True, exist_ok=True)
submit.to_csv('result/submit.csv', index=False)