In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier

from lightgbm import LGBMClassifier

from pathlib import Path

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
# Load the pre-processed train and test feature tables and summarise the
# training frame.
train = pd.read_csv('data/train_pre_processing_nlp_5000.csv')
test = pd.read_csv('data/test_pre_processing_nlp_5000.csv')
train.info()

# Set the test ids aside as a one-column frame, then remove the id column
# from both frames so it is not used as a feature.
submit = test[['id']].copy()
train = train.drop(columns=['id'])
test = test.drop(columns=['id'])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Columns: 5057 entries, keyword to target
dtypes: bool(21), float64(5000), int64(31), object(5)
memory usage: 292.7+ MB


In [3]:
# Keep only the float64 / int64 / bool columns of both frames; columns of any
# other dtype (e.g. object) are dropped.
train, test = (
    frame.select_dtypes(include=['float64', 'int64', 'bool'])
    for frame in (train, test)
)

In [4]:
#noise_cols = ['location', 'len_location_cero_default', 
#              'len_location_mean_default', 'total_words_location_cero_default',
#             'total_words_location_mean_default', 'text']
#train.drop(labels=noise_cols, axis=1, inplace=True)
#test.drop(labels=noise_cols, axis=1, inplace=True)

In [5]:
def prepare_df_for_fit(df):
    """Return a model-ready copy of ``df`` with text columns label-encoded.

    The columns 'keyword_grouped', 'keyword' and 'text_clean' are cast to
    ``str`` and replaced by integer codes from ``LabelEncoder``; they are
    appended after the remaining columns. A 'target' column, if present, is
    removed from the result.

    Text columns that are missing from ``df`` are skipped instead of raising
    ``KeyError``, so the function also works on frames that were already
    reduced to numeric columns. ``df`` itself is never modified.

    Caveat: the encoders are fitted on the frame passed in, so codes produced
    by separate calls (e.g. one for train, one for test) are not guaranteed
    to refer to the same strings.

    Parameters
    ----------
    df : pandas.DataFrame
        Input features, optionally including 'target'.

    Returns
    -------
    pandas.DataFrame
        New frame without 'target' and with the text columns encoded.
    """
    columns_str = ['keyword_grouped', 'keyword', 'text_clean']
    # Only encode the text columns this frame actually has.
    present = [col for col in columns_str if col in df.columns]

    features = df.drop(present, axis=1)
    if present:
        # fit_transform refits the encoder for every column it is applied to.
        encoded_cols = df[present].astype('str').apply(LabelEncoder().fit_transform)
        features = pd.concat([features, encoded_cols], axis=1)

    # errors='ignore' covers frames (such as the test set) without a target.
    return features.drop(['target'], axis=1, errors='ignore')

In [6]:
# Separate the label from the features. prepare_df_for_fit is not applied
# here; the test frame is used as-is.
train_Y = train['target']
train_X = train.drop(columns=['target'])
test_X = test

## KNN 

In [7]:
# Standardise the training features with a scaler fitted on those same rows.
# NOTE(review): the scaler is fitted before the hold-out split below, so the
# hold-out rows contribute to the scaling statistics -- confirm this is
# acceptable.
scaler = StandardScaler().fit(train_X)
sc_transform = scaler.transform(train_X)
# The scaled values are wrapped in a new frame; no column labels are passed,
# so X gets default integer labels.
X = pd.DataFrame(data=sc_transform)

  return self.partial_fit(X, y)
  This is separate from the ipykernel package so we can avoid doing imports until


In [8]:
# Hold-out split configuration. These values are now actually passed to
# train_test_split; previously they were declared but the call hard-coded its
# own numbers. seed is set to 42, the random_state the split was already
# using, so the resulting partition is unchanged.
seed = 42
test_size = 0.20
X_train, X_test, y_train, y_test = train_test_split(
    X, train_Y, test_size=test_size, random_state=seed)

In [9]:
# Fit a 3-nearest-neighbour classifier on the training split; fit() returns
# the estimator itself, so construction and fitting are chained.
knn_model = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
# Bare expression so the fitted estimator is still displayed as cell output.
knn_model

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=3, p=2,
           weights='uniform')

In [10]:
%%time
preds = knn_model.predict(X_test)
print(roc_auc_score(y_test, preds))

0.6382632319392976
CPU times: user 1min 6s, sys: 607 ms, total: 1min 7s
Wall time: 1min 12s


## LGBM

In [None]:
# Fit a LightGBM classifier with a fixed random_state on the training split;
# fit() returns the estimator itself, so construction and fitting are chained.
light_model = LGBMClassifier(random_state=1).fit(X_train, y_train)
# Bare expression so the fitted estimator is still displayed as cell output.
light_model

In [None]:
%%time
preds = light_model.predict(X_test)
print(roc_auc_score(y_test, preds))

## Voting Classifier

In [None]:
%%time
eclf1 = VotingClassifier(estimators=[
        ('knn', knn_model), ('lgbm', light_model)], voting='hard', weights=[1,2])
eclf1 = eclf1.fit(X_train, y_train)
preds = eclf1.predict(X_test)
print(roc_auc_score(y_test, preds))