In [1]:
import os
import sys
import glob
import math
import random

import numpy as np
import pandas as pd

import matplotlib.pylab as plt
import seaborn as sns

from tqdm import tqdm
from itertools import cycle

from sklearn import metrics
from sklearn import model_selection
from sklearn import preprocessing
from sklearn import linear_model
from sklearn import ensemble
from sklearn import decomposition
from sklearn import tree
import category_encoders as ce

import lightgbm as lgb
import xgboost as xgb
import catboost as cat
import optuna

import torch
import torch.nn as nn
import torch.nn.functional as F

from random import choice, choices
pd.set_option("display.max_columns", None)

plt.style.use("ggplot")
# Colour list of the active style; reused below instead of looking it up twice.
color_pal = plt.rcParams["axes.prop_cycle"].by_key()["color"]
color_cycle = cycle(color_pal)

import warnings
# NOTE: this silences every warning, including library deprecation notices.
warnings.filterwarnings('ignore')

# Seed the RNGs so that random choices made later (e.g. choice(color_pal) for
# plot colours) are the same on every Restart & Run All.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Single place to change the input location.
DATA_DIR = "../input/netflix-appetency"
train_df = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
test_df = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))
ss_df = pd.read_csv(os.path.join(DATA_DIR, "sample_submission.csv"))

train_df.shape, test_df.shape, ss_df.shape

In [2]:
# Count of rows per target class.
fig, ax = plt.subplots(figsize=(10, 5))
# Pass the vector by keyword and draw on the axes created above: a positional
# vector is deprecated in seaborn 0.11 and interpreted as `data` from 0.12 on.
sns.countplot(x=train_df["target"], ax=ax)
ax.set_title("Target distribution")
plt.show()

In [3]:
# Preview the first five training rows.
train_df.head(n=5)

In [4]:
# Number of rows that repeat an earlier row once `id` and `target` are left
# out of the comparison.
(
    train_df
    .drop(columns=['id', 'target'])
    .duplicated()
    .sum()
)

In [5]:
# Remove rows whose feature values repeat an earlier row (first occurrence is
# kept). `id` and `target` are excluded from the comparison, matching the
# duplicate count computed in the previous cell; with `id` included no two rows
# can ever compare equal. The result is assigned back (the original statement
# discarded it) and only the resulting shape is displayed.
# errors='ignore' keeps the cell re-runnable after those columns are removed.
feature_cols = train_df.columns.drop(['id', 'target'], errors='ignore').tolist()
train_df = train_df.drop_duplicates(subset=feature_cols).reset_index(drop=True)
train_df.shape

## Null Values

In [6]:
# Null count per column, computed once and reused for the table and the plot.
null_counts = train_df.isna().sum(axis=0)
display(null_counts.sort_values(ascending=False).to_frame().T)

fig, ax = plt.subplots(figsize=(25, 9))
# Draw on the axes created above; the stale 'time_id' label (unrelated to this
# data) is replaced by a descriptive x-axis label.
sns.distplot(null_counts, bins=100, color=choice(color_pal), ax=ax)
ax.set_title("Null values Distribution")
ax.set_xlabel("Null values per column")
plt.show()

In [7]:
# Null count per column as a plain array, aligned with train_df.columns.
null_per_col = train_df.isna().sum(axis=0).to_numpy()
n_rows = len(train_df)

# Positions (into train_df.columns) of columns that are entirely null.
null_all = np.flatnonzero(null_per_col == n_rows)
# Positions of columns with more than 80% null values. The threshold must be a
# fraction of the row count; `len(train_df) % 80` is a remainder (0..79), which
# flagged almost every column that had any nulls at all.
null_80 = np.flatnonzero(null_per_col > 0.8 * n_rows)
# Sorted, de-duplicated 1-D array of positions (all-null columns are in both).
null_cols = np.union1d(null_all, null_80)

## Categorical columns

In [8]:
# Number of unique categories per object-typed column, sorted ascending and
# shown as a single wide row.
display(
    train_df
    .select_dtypes('object')
    .nunique()
    .sort_values()
    .to_frame()
    .T
)

In [9]:
# Columns are collected by NAME. Positions found inside the select_dtypes()
# sub-frames do not line up with positions in train_df, so feeding them to
# train_df.iloc picks unrelated columns.

# categorical features
cat_cols = train_df.select_dtypes('object')
cat_nunique = cat_cols.nunique()
# categorical columns holding a single distinct value
single_cat = cat_nunique[cat_nunique == 1].index
# many cat: categorical columns with more than 20 distinct values
large_cat = cat_nunique[cat_nunique > 20].index

# numerical features
num_col = train_df.select_dtypes(['int', 'float'])
num_nunique = num_col.nunique()
# single num: numerical columns holding a single distinct value
single_num = num_nunique[num_nunique == 1].index
# many num
#large_num = np.where(num_col.nunique()>100)

# null_cols holds positions into train_df.columns; atleast_1d guards against a
# 0-d array when exactly one position was found.
null_names = train_df.columns[np.atleast_1d(null_cols).astype(int)]

drop_names = set(single_num) | set(single_cat) | set(large_cat) | set(null_names)
# Index of column names to drop, in train_df column order, without repeats.
drop_cols = train_df.columns[train_df.columns.isin(list(drop_names))]

Remove unused columns

In [None]:
# Columns that survive the drop list; `target` and `id` are still part of
# train_df here and are kept for the exploratory cells that follow.
cols = [col for col in train_df.columns if col not in drop_cols]
df_train_reduced = train_df[cols]
# Select only the columns test_df actually has: `cols` can contain train-only
# columns (presumably `target` - verify against test.csv), and indexing with a
# missing label raises KeyError.
df_test_reduced = test_df[[col for col in cols if col in test_df.columns]]

In [None]:
# The nine numeric columns of the reduced frame with the most distinct values.
top9_columns = (
    df_train_reduced
    .select_dtypes(['int', 'float'])
    .nunique()
    .sort_values(ascending=False)
    .index
    .values[:9]
)

print(top9_columns)

# One panel per column on a 3x3 grid, filled row by row.
fig, ax = plt.subplots(3, 3, figsize=(18, 18))
for i, col in enumerate(top9_columns):
    panel = sns.distplot(train_df[col], color=choice(color_pal), ax=ax.flat[i])
    panel.set_title(f'{col} Distribution')
fig.show()

Correlation of Target and Numerical Features

In [None]:
# Pairwise correlation between the numeric columns that were kept (the closing
# parenthesis of select_dtypes(...) was missing, which made the cell a
# SyntaxError).
corr_df = train_df[cols].select_dtypes(['int', 'float']).corr()

# Correlation with target: absolute value, strongest first, computed once.
target_corr = corr_df['target'].abs().sort_values(ascending=False)
display(target_corr.to_frame().T)

fig, ax = plt.subplots(figsize=(25, 5))
# Skip the first entry, expected to be target's correlation with itself.
target_corr.iloc[1:].plot(ax=ax)
ax.set_title("Correlation target vs features")
plt.show()

In [None]:
# Absolute correlation of each numeric column with the target, in column order.
abs(corr_df['target'])

In [12]:
# ROC AUC obtained by scoring rows with the raw feature_307 values against target.
metrics.roc_auc_score(
    y_true=train_df['target'],
    y_score=train_df['feature_307'],
)

## Preprocessing

In [21]:
# target
# Guarded so the cell can be re-run: once `target`/`id` have been removed, the
# previously extracted `target` is left untouched.
if 'target' in train_df.columns:
    target = train_df['target'].copy()
# Non-mutating drop; errors='ignore' makes a second run a no-op instead of a KeyError.
train_df = train_df.drop(columns=['target', 'id'], errors='ignore')

cols = [col for col in train_df.columns if col not in drop_cols]
# Independent copies: later cells assign encoded columns into these frames,
# which on a plain column selection triggers chained-assignment warnings.
df_train_reduced = train_df[cols].copy()
df_test_reduced = test_df[cols].copy()

In [24]:
# label encoding those cat columns
def label_encoding(train_df, test_df, columns):
    """Label-encode ``columns`` with one shared mapping for train and test.

    For every column a fresh ``LabelEncoder`` is fitted on the train and test
    values together, so each value present in either frame receives a code and
    both frames use the same value-to-code mapping.

    Args:
        train_df: frame containing every column in ``columns``; the listed
            columns are overwritten in place with their codes.
        test_df: frame containing every column in ``columns``; overwritten in
            place the same way.
        columns: iterable of column names to encode.

    Returns:
        The tuple ``(train_df, test_df)`` - the same objects that were passed in.
    """
    for col in columns:
        le = preprocessing.LabelEncoder()
        # combine value; Series.append was removed in pandas 2.0, pd.concat is
        # the equivalent available in both old and new versions.
        values = pd.concat([train_df[col], test_df[col]])
        le.fit(values)
        train_df[col] = le.transform(train_df[col])
        test_df[col] = le.transform(test_df[col])
    return train_df, test_df

# Take the categorical columns from the REDUCED frame: object columns that were
# dropped earlier no longer exist there and would raise KeyError when encoded.
cat_cols = df_train_reduced.select_dtypes(['object'])
# List of names used for the encoding call and the preview below
# (`cat_columns` was referenced without ever being defined).
cat_columns = cat_cols.columns.tolist()
train_df, test_df = label_encoding(df_train_reduced, df_test_reduced, cat_columns)
train_df[cat_columns].head()

## Modeling

In [None]:
# Label column, the columns kept out of the model inputs, and the remaining
# feature columns in their original order.
target_col = 'target'
drop_cols = [target_col, 'id']
train_cols = train_df.columns[~train_df.columns.isin(drop_cols)].tolist()

In [None]:
# Display the current drop list; the modeling cell rebinds drop_cols to the
# target and id column names, replacing the column list built during exploration.
drop_cols

Hyperparameter Tuning

In [None]:
def run_lgbm(train_df, train_cols, target_col, params, n_splits=5, random_state=42):
    """Cross-validate an LGBMClassifier and return its out-of-fold ROC AUC.

    A shuffled stratified K-fold split is made on ``train_df[target_col]``. In
    each fold a new ``lgb.LGBMClassifier(**params)`` is fitted on the training
    part with the validation part as ``eval_set``; the predicted probability of
    the second class is stored for the validation rows. The score is computed
    once over all out-of-fold predictions.

    Args:
        train_df: frame holding both the feature columns and ``target_col``.
        train_cols: feature column names fed to the model.
        target_col: name of the label column.
        params: keyword arguments for ``lgb.LGBMClassifier``.
        n_splits: number of folds (default 5, the previously fixed value).
        random_state: seed for the fold shuffling (default 42, as before).

    Returns:
        ROC AUC of the out-of-fold probabilities against ``train_df[target_col]``.
    """
    skf = model_selection.StratifiedKFold(
        n_splits=n_splits, shuffle=True, random_state=random_state
    )
    features = train_df[train_cols]
    labels = train_df[target_col]
    oof_predictions_proba = np.zeros(len(train_df))

    # Silence per-iteration evaluation logging in a version-tolerant way:
    # LightGBM 4 removed the `verbose` argument of fit() in favour of the
    # log_evaluation callback; releases without the callback keep the old flag.
    fit_kwargs = {"eval_metric": "auc"}
    if hasattr(lgb, "log_evaluation"):
        fit_kwargs["callbacks"] = [lgb.log_evaluation(period=0)]
    else:
        fit_kwargs["verbose"] = -1

    for train_idx, valid_idx in skf.split(features, labels):
        X_train, y_train = features.iloc[train_idx], labels.iloc[train_idx]
        X_valid, y_valid = features.iloc[valid_idx], labels.iloc[valid_idx]

        model = lgb.LGBMClassifier(**params)
        model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], **fit_kwargs)

        # Column 1 of predict_proba is the probability of the second class.
        oof_predictions_proba[valid_idx] = model.predict_proba(X_valid)[:, 1]

    return metrics.roc_auc_score(labels, oof_predictions_proba)

