<div class="alert alert-block alert-info">
    <p><img src="https://geekbrains.ru/apple-touch-icon-57x57.png" align="right" alt="GeekBrains"></p>
    <p style="color:DarkSlateGray"><b>Course:</b> Machine Learning Systems in Production</p>
    <p style="color:DarkSlateGray"><b>Project:</b> Churn Prediction in Game Project</p>
    <p style="color:DarkSlateGray"><b>Supervisor:</b> Fedor Erin</p>
    <p style="color:DarkSlateGray"><b>Author:</b> Dmitry Doni</p>
</div>

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from imblearn.over_sampling import SMOTE, ADASYN

from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.feature_selection import chi2, mutual_info_classif, RFECV
from sklearn.metrics import precision_score, recall_score, f1_score, precision_recall_curve, roc_curve, auc, \
                            log_loss, roc_auc_score, average_precision_score, confusion_matrix
from scipy import interp
from mpl_toolkits.axes_grid1 import make_axes_locatable
# import xgboost as xgb

import time
from datetime import datetime, timedelta

from collections import Counter

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Display charts in Jupyter Notebook
%matplotlib inline
plt.style.use('fivethirtyeight')
# Display inline plots as SVG
%config InlineBackend.figure_formats = ['svg']
# Set the limit of columns displayed in the notebook
pd.options.display.max_columns = 100

## Data Collection

Data was loaded from Apache Impala (using `dwhimpalautil` for connection). In this project, we are using ready-to-use datasets in CSV format:
* `../../datasets/gamechurn/train/`
* `../../datasets/gamechurn/test/`

## Building Datasets

In [3]:
def time_format(sec):
    return str(timedelta(seconds=sec))

In [4]:
def build_dataset_raw(churned_start_date='2019-01-01', 
                      churned_end_date='2019-02-01', 
                      inter_list=[(1,7),(8,14)],
                      raw_data_path='train/',
                      dataset_path='dataset/', 
                      mode='train'):
    
    start_t = time.time()
 
    sample = pd.read_csv('{}sample.csv'.format(raw_data_path), sep=';', na_values=['\\N', 'None'], encoding='utf-8')
    profiles = pd.read_csv('{}profiles.csv'.format(raw_data_path), sep=';', na_values=['\\N', 'None'], encoding='utf-8')
    payments = pd.read_csv('{}payments.csv'.format(raw_data_path), sep=';', na_values=['\\N', 'None'], encoding='utf-8')
    reports = pd.read_csv('{}reports.csv'.format(raw_data_path), sep=';', na_values=['\\N', 'None'], encoding='utf-8')
    abusers = pd.read_csv('{}abusers.csv'.format(raw_data_path), sep=';', na_values=['\\N', 'None'], encoding='utf-8')
    logins = pd.read_csv('{}logins.csv'.format(raw_data_path), sep=';', na_values=['\\N', 'None'], encoding='utf-8')
    pings = pd.read_csv('{}pings.csv'.format(raw_data_path), sep=';', na_values=['\\N', 'None'], encoding='utf-8')
    sessions = pd.read_csv('{}sessions.csv'.format(raw_data_path), sep=';', na_values=['\\N', 'None'], encoding='utf-8')
    shop = pd.read_csv('{}shop.csv'.format(raw_data_path), sep=';', na_values=['\\N', 'None'], encoding='utf-8')
    
    print('Run time (reading csv files): {}'.format(time_format(time.time()-start_t)))    
#-----------------------------------------------------------------------------------------------------    
    print('NO dealing with outliers, missing values and categorical features...')
#-----------------------------------------------------------------------------------------------------        
    # На основании дня отвала (last_login_dt) строим признаки, которые описывают активность игрока перед уходом
    
    print('Creating dataset...')
    # Создадим пустой датасет - в зависимости от режима построения датасета - train или test
    if mode == 'train':
        dataset = sample.copy()[['user_id', 'is_churned', 'level', 'donate_total']]
    elif mode == 'test':
        dataset = sample.copy()[['user_id', 'level', 'donate_total']]

    # Пройдемся по всем источникам, содержащим "динамичекие" данные
    for df in [payments, reports, abusers, logins, pings, sessions, shop]:

        # Получим 'day_num_before_churn' для каждого из значений в источнике для определения недели
        data = pd.merge(sample[['user_id', 'login_last_dt']], df, on='user_id')
        data['day_num_before_churn'] = 1 + (data['login_last_dt'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d')) - 
                                data['log_dt'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d'))).apply(lambda x: x.days)
        df_features = data[['user_id']].drop_duplicates().reset_index(drop=True)

        # Для каждого признака создадим признаки для каждого из времененно интервала (в нашем примере 4 интервала по 7 дней)
        features = list(set(data.columns) - set(['user_id', 'login_last_dt', 'log_dt', 'day_num_before_churn']))
        print('Processing features:', features)
        for feature in features:
            for i, inter in enumerate(inter_list):
                inter_df = data.loc[data['day_num_before_churn'].between(inter[0], inter[1], inclusive=True)].\
                                groupby('user_id')[feature].mean().reset_index().\
                                rename(index=str, columns={feature: feature+'_{}'.format(i+1)})
                df_features = pd.merge(df_features, inter_df, how='left', on='user_id')

        # Добавляем построенные признаки в датасет
        dataset = pd.merge(dataset, df_features, how='left', on='user_id')
        
        print('Run time (calculating features): {}'.format(time_format(time.time()-start_t)))

    # Добавляем "статические" признаки
    dataset = pd.merge(dataset, profiles, on='user_id')
#---------------------------------------------------------------------------------------------------------------------------
    dataset.to_csv('{}dataset_raw_{}.csv'.format(dataset_path, mode), sep=';', index=False)
    print('Dataset is successfully built and saved to {}, run time "build_dataset_raw": {}'.\
          format(dataset_path, time_format(time.time()-start_t)))

In [5]:
# Parameters for building dataset

CHURNED_START_DATE = '2019-09-01' 
CHURNED_END_DATE = '2019-10-01'

INTER_1 = (1,7)
INTER_2 = (8,14)
INTER_3 = (15,21)
INTER_4 = (22,28)
INTER_LIST = [INTER_1, INTER_2, INTER_3, INTER_4]

In [6]:
# Train dataset
build_dataset_raw(churned_start_date=CHURNED_START_DATE,
                  churned_end_date=CHURNED_END_DATE,
                  inter_list=INTER_LIST,
                  raw_data_path='../../datasets/gamechurn/train/',
                  dataset_path='../../datasets/gamechurn/dataset/', 
                  mode='train')

Run time (reading csv files): 0:01:15.483905
NO dealing with outliers, missing values and categorical features...
Creating dataset...
Processing features: ['pay_amt', 'trans_amt']
Run time (calculating features): 0:01:38.537630
Processing features: ['reports_amt']
Run time (calculating features): 0:03:02.450657
Processing features: ['sess_with_abusers_amt']
Run time (calculating features): 0:07:23.248741
Processing features: ['disconnect_amt', 'session_amt']
Run time (calculating features): 0:12:28.290808
Processing features: ['avg_min_ping']
Run time (calculating features): 0:17:17.881149
Processing features: ['leavings_rate', 'session_player', 'win_rate', 'kd']
Run time (calculating features): 0:22:27.049922
Processing features: ['silver_spent', 'gold_spent']
Run time (calculating features): 0:27:47.400263
Dataset is successfully built and saved to ../../datasets/gamechurn/dataset/, run time "build_dataset_raw": 0:28:14.388309


In [7]:
# Test dataset
build_dataset_raw(churned_start_date=CHURNED_START_DATE,
                  churned_end_date=CHURNED_END_DATE,
                  inter_list=INTER_LIST,
                  raw_data_path='../../datasets/gamechurn/test/',
                  dataset_path='../../datasets/gamechurn/dataset/', 
                  mode='test')

Run time (reading csv files): 0:00:05.367628
NO dealing with outliers, missing values and categorical features...
Creating dataset...
Processing features: ['pay_amt', 'trans_amt']
Run time (calculating features): 0:00:07.431817
Processing features: ['reports_amt']
Run time (calculating features): 0:00:14.955355
Processing features: ['sess_with_abusers_amt']
Run time (calculating features): 0:00:37.752802
Processing features: ['disconnect_amt', 'session_amt']
Run time (calculating features): 0:01:04.676554
Processing features: ['avg_min_ping']
Run time (calculating features): 0:01:32.048010
Processing features: ['leavings_rate', 'session_player', 'win_rate', 'kd']
Run time (calculating features): 0:01:59.246776
Processing features: ['silver_spent', 'gold_spent']
Run time (calculating features): 0:02:29.062937
Dataset is successfully built and saved to ../../datasets/gamechurn/dataset/, run time "build_dataset_raw": 0:02:31.879023


In [8]:
train = pd.read_csv('../../datasets/gamechurn/dataset/dataset_raw_train.csv', sep=';')
test = pd.read_csv('../../datasets/gamechurn/dataset/dataset_raw_test.csv', sep=';')
print(train.shape, test.shape)

(469475, 62) (44764, 61)


In [9]:
train.head()

Unnamed: 0,user_id,is_churned,level,donate_total,pay_amt_1,pay_amt_2,pay_amt_3,pay_amt_4,trans_amt_1,trans_amt_2,trans_amt_3,trans_amt_4,reports_amt_1,reports_amt_2,reports_amt_3,reports_amt_4,sess_with_abusers_amt_1,sess_with_abusers_amt_2,sess_with_abusers_amt_3,sess_with_abusers_amt_4,disconnect_amt_1,disconnect_amt_2,disconnect_amt_3,disconnect_amt_4,session_amt_1,session_amt_2,session_amt_3,session_amt_4,avg_min_ping_1,avg_min_ping_2,avg_min_ping_3,avg_min_ping_4,leavings_rate_1,leavings_rate_2,leavings_rate_3,leavings_rate_4,session_player_1,session_player_2,session_player_3,session_player_4,win_rate_1,win_rate_2,win_rate_3,win_rate_4,kd_1,kd_2,kd_3,kd_4,silver_spent_1,silver_spent_2,silver_spent_3,silver_spent_4,gold_spent_1,gold_spent_2,gold_spent_3,gold_spent_4,age,gender,days_between_reg_fl,days_between_fl_df,has_return_date,has_phone_number
0,1e7edd8347e3aaeedf8c494b11240851e3fa0ad231b8f8...,0,43,88730.0,,,,,,,,,,,,2.0,,6.5,1.0,6.0,0.0,0.0,0.0,0.5,1.0,2.0,1.0,0.5,54.0,31.0,42.0,28.0,0.0,0.216667,0.0,0.166667,1321.0,8592.0,1352.333333,7941.5,3.0,0.659091,1.0,0.65,2.0,2.387302,1.034877,0.952632,0.0,40900.0,300.0,25000.0,0.0,0.0,78.666667,0.0,26.0,M,0,7,1,1
1,f43cac5f14e06ca039b173e14c323ac0c1fd8492f0cf08...,0,50,44149.0,,,,,,,,,1.0,1.0,,1.0,3.6,4.8,4.666667,4.0,0.0,0.25,0.0,0.142857,1.4,1.5,2.2,1.428571,65.666666,46.0,48.333334,70.571429,0.018182,0.013333,0.0,0.035714,4685.0,5632.6,5526.833333,5444.0,1.123333,1.678571,2.125631,1.661905,1.37578,1.247101,1.262016,1.432277,0.0,41480.0,6266.666667,1400.0,0.0,0.0,0.0,0.0,27.0,M,0,37,1,1
2,cc7450e0b182947998534ef137b05e07109c100aced0b6...,0,37,44931.0,63.0,350.0,,130.0,1.0,1.0,,2.0,1.0,1.0,1.0,2.0,4.166667,8.2,8.0,8.571429,0.857143,0.857143,1.0,1.142857,1.0,1.857143,1.714286,1.857143,60.972222,49.161904,63.061508,63.773809,0.066667,0.096667,0.064935,0.077063,4710.166667,6543.4,6317.0,7550.571429,0.684722,0.653571,0.291667,0.518254,1.075181,1.36443,0.872192,0.938326,14471.428571,31400.0,5714.285714,414.285714,104.285714,0.0,1.428571,2.857143,21.0,M,0,153,1,1
3,5c583d57a1e9e53341fc239d41fb6983e667a04b1b4d94...,0,20,37538.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,22.0,M,0,156,1,1
4,9bbaa1a2501e8dc83cf6c0c54ef139c75c99de09dcf4dc...,0,10,4100.97998,66.580002,,,,1.0,,,,,,2.0,,4.25,2.0,4.0,1.4,0.8,0.25,0.5,0.25,0.6,1.5,1.5,1.5,24.9,24.5,24.833333,24.733333,0.05,0.305556,0.054286,0.111111,4124.4,2755.833333,3776.8,1949.0,1.113333,0.733333,1.316667,1.0,9.825229,10.025196,3.536477,10.131818,0.0,0.0,1666.666667,9838.0,0.0,0.0,0.0,0.0,2.0,M,0,21,1,1


In [10]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 469475 entries, 0 to 469474
Data columns (total 62 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   user_id                  469475 non-null  object 
 1   is_churned               469475 non-null  int64  
 2   level                    469475 non-null  int64  
 3   donate_total             469475 non-null  float64
 4   pay_amt_1                67485 non-null   float64
 5   pay_amt_2                60928 non-null   float64
 6   pay_amt_3                56720 non-null   float64
 7   pay_amt_4                57896 non-null   float64
 8   trans_amt_1              67485 non-null   float64
 9   trans_amt_2              60928 non-null   float64
 10  trans_amt_3              56720 non-null   float64
 11  trans_amt_4              57896 non-null   float64
 12  reports_amt_1            144916 non-null  float64
 13  reports_amt_2            145909 non-null  float64
 14  repo

## Data Selection and Preparation

### Data Cleansing

In [11]:
def prepare_dataset(dataset, 
                    dataset_type='train',
                    dataset_path='dataset/'):
    print(dataset_type)
    start_t = time.time()
    print('Dealing with missing values, outliers, categorical features...')
    
    # Профили
    dataset['age'] = dataset['age'].fillna(dataset['age'].median())
    dataset['gender'] = dataset['gender'].fillna(dataset['gender'].mode()[0])
    dataset.loc[~dataset['gender'].isin(['M', 'F']), 'gender'] = dataset['gender'].mode()[0]
    dataset['gender'] = dataset['gender'].map({'M': 1., 'F':0.})
    dataset.loc[(dataset['age'] > 80) | (dataset['age'] < 7), 'age'] = round(dataset['age'].median())
    dataset.loc[dataset['days_between_fl_df'] < -1, 'days_between_fl_df'] = -1
    # Пинги
    for period in range(1,len(INTER_LIST)+1):
        col = 'avg_min_ping_{}'.format(period)
        dataset.loc[(dataset[col] < 0) | 
                    (dataset[col].isnull()), col] = dataset.loc[dataset[col] >= 0][col].median()
    # Сессии и прочее
    dataset.fillna(0, inplace=True)
    dataset.to_csv('{}dataset_{}.csv'.format(dataset_path, dataset_type), sep=';', index=False)
         
    print('Dataset is successfully prepared and saved to {}, run time (dealing with bad values): {}'.\
          format(dataset_path, time_format(time.time()-start_t))) 

In [12]:
prepare_dataset(dataset=train, dataset_type='train', dataset_path='../../datasets/gamechurn/dataset/')
prepare_dataset(dataset=test, dataset_type='test', dataset_path='../../datasets/gamechurn/dataset/')

train
Dealing with missing values, outliers, categorical features...
Dataset is successfully prepared and saved to ../../datasets/gamechurn/dataset/, run time (dealing with bad values): 0:00:28.855000
test
Dealing with missing values, outliers, categorical features...
Dataset is successfully prepared and saved to ../../datasets/gamechurn/dataset/, run time (dealing with bad values): 0:00:03.108703


In [13]:
train_new = pd.read_csv('../../datasets/gamechurn/dataset/dataset_train.csv', sep=';')
# test_new = pd.read_csv('../../datasets/gamechurn/dataset/dataset_test.csv', sep=';')

train_new.info()
# test_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 469475 entries, 0 to 469474
Data columns (total 62 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   user_id                  469475 non-null  object 
 1   is_churned               469475 non-null  int64  
 2   level                    469475 non-null  int64  
 3   donate_total             469475 non-null  float64
 4   pay_amt_1                469475 non-null  float64
 5   pay_amt_2                469475 non-null  float64
 6   pay_amt_3                469475 non-null  float64
 7   pay_amt_4                469475 non-null  float64
 8   trans_amt_1              469475 non-null  float64
 9   trans_amt_2              469475 non-null  float64
 10  trans_amt_3              469475 non-null  float64
 11  trans_amt_4              469475 non-null  float64
 12  reports_amt_1            469475 non-null  float64
 13  reports_amt_2            469475 non-null  float64
 14  repo

In [14]:
train_new.describe()

Unnamed: 0,is_churned,level,donate_total,pay_amt_1,pay_amt_2,pay_amt_3,pay_amt_4,trans_amt_1,trans_amt_2,trans_amt_3,trans_amt_4,reports_amt_1,reports_amt_2,reports_amt_3,reports_amt_4,sess_with_abusers_amt_1,sess_with_abusers_amt_2,sess_with_abusers_amt_3,sess_with_abusers_amt_4,disconnect_amt_1,disconnect_amt_2,disconnect_amt_3,disconnect_amt_4,session_amt_1,session_amt_2,session_amt_3,session_amt_4,avg_min_ping_1,avg_min_ping_2,avg_min_ping_3,avg_min_ping_4,leavings_rate_1,leavings_rate_2,leavings_rate_3,leavings_rate_4,session_player_1,session_player_2,session_player_3,session_player_4,win_rate_1,win_rate_2,win_rate_3,win_rate_4,kd_1,kd_2,kd_3,kd_4,silver_spent_1,silver_spent_2,silver_spent_3,silver_spent_4,gold_spent_1,gold_spent_2,gold_spent_3,gold_spent_4,age,gender,days_between_reg_fl,days_between_fl_df,has_return_date,has_phone_number
count,469475.0,469475.0,469475.0,469475.0,469475.0,469475.0,469475.0,469475.0,469475.0,469475.0,469475.0,469475.0,469475.0,469475.0,469475.0,469475.0,469475.0,469475.0,469475.0,469475.0,469475.0,469475.0,469475.0,469475.0,469475.0,469475.0,469475.0,469475.0,469475.0,469475.0,469475.0,469475.0,469475.0,469475.0,469475.0,469475.0,469475.0,469475.0,469475.0,469475.0,469475.0,469475.0,469475.0,469475.0,469475.0,469475.0,469475.0,469475.0,469475.0,469475.0,469475.0,469475.0,469475.0,469475.0,469475.0,469475.0,469475.0,469475.0,469475.0,469475.0,469475.0
mean,0.029448,26.792698,48002.06,140.415895,134.262708,129.077459,133.30028,0.198513,0.182546,0.173069,0.173995,0.708455,0.707977,0.709422,0.70338,2.810628,2.938302,3.075788,3.175711,0.375617,0.385396,0.397086,0.414983,0.997169,0.997714,1.007816,1.028452,33.190376,33.453583,33.563482,33.682889,0.072856,0.071267,0.072153,0.07336,3265.955581,3379.027138,3512.25197,3622.309714,0.747296,0.723235,0.715966,0.71084,3.565562,3.389694,3.300992,3.210472,6444.131,5700.765,4550.476,4724.096,73.334719,72.931138,70.150866,69.289718,26.002226,0.933899,14.340597,218.709164,0.882946,0.830589
std,0.169058,12.680296,85767.42,597.573949,603.78524,613.631584,632.451995,0.573882,0.567848,0.562879,0.553329,1.586962,1.574771,1.567295,1.556799,3.677683,3.834196,3.993347,4.109293,1.88848,2.157656,1.811968,2.20108,1.134619,1.156297,1.168026,1.188481,23.906486,23.553533,23.591891,23.7291,1.239741,1.137892,0.638524,0.27125,4015.691844,4158.659831,4286.840526,4384.824816,3.125124,2.527616,2.412572,1.638846,60.417143,34.960178,26.118473,17.093699,25599.7,19785.9,14521.83,14757.99,380.674537,385.894325,407.242002,385.043766,8.341266,0.24846,114.81823,363.410345,0.321485,0.375115
min,0.0,10.0,0.06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,-1.0,-1.0,0.0,0.0
25%,0.0,17.0,6312.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23.642857,24.361508,24.571429,24.625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,1.0,0.0,8.0,1.0,1.0
50%,0.0,23.0,20196.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.333333,1.5,1.5,1.666667,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,28.433333,28.972222,29.0,29.0,0.0,0.0,0.0,0.0,1861.0,1899.5,2019.5,2128.5,0.5,0.491667,0.492424,0.5,0.835618,0.833333,0.852273,0.871795,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24.0,1.0,0.0,56.0,1.0,1.0
75%,0.0,36.0,55432.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,4.666667,5.0,5.166667,5.333333,0.5,0.5,0.571429,0.571429,1.6,1.6,1.666667,1.666667,34.0,33.944445,34.0,34.214286,0.075794,0.076923,0.083333,0.085242,5465.242857,5700.761905,5952.0,6136.666667,1.14,1.110254,1.094872,1.083333,1.411259,1.436786,1.49554,1.548676,4500.0,4000.0,3542.857,3800.0,0.0,0.0,0.0,0.0,31.0,1.0,0.0,266.0,1.0,1.0
max,1.0,50.0,4356043.0,56700.0,56598.857143,46400.0,71288.0,18.0,44.285714,26.857143,18.0,38.571429,36.0,35.0,48.0,174.0,208.0,267.666667,279.25,446.571429,573.0,393.285714,682.428571,39.333333,44.857143,30.5,33.833333,537.0,568.0,610.0,631.0,591.357143,586.785714,277.527779,100.0,58530.285714,58610.0,58547.428571,64368.285714,1261.0,1161.571429,1023.714286,473.47619,37791.0,14475.166667,10780.333333,2806.428571,8060050.0,1618050.0,2244387.0,1514500.0,36115.0,47376.857143,62268.0,56068.0,80.0,1.0,2534.0,2683.0,1.0,1.0


In [15]:
train_new['is_churned'].value_counts()

0    455650
1     13825
Name: is_churned, dtype: int64

### Class Balancing

In [16]:
X_train = train_new.drop(['user_id', 'is_churned'], axis=1)
y_train = train_new['is_churned']

X_train_mm = MinMaxScaler().fit_transform(X_train)

In [20]:
%%time
# SMOTE (Synthetic Minority Over-sampling Technique)
sm = SMOTE(sampling_strategy=0.3, random_state=42)
X_train_balanced, y_train_balanced = sm.fit_sample(X_train_mm, y_train.values)

CPU times: user 9.77 s, sys: 723 ms, total: 10.5 s
Wall time: 8.63 s


In [18]:
# %%time
# ADASYN (Adaptive Synthetic) algorithm
# ada = ADASYN(sampling_strategy=0.3, random_state=42)
# X_train_balanced, y_train_balanced = ada.fit_resample(X_train_mm, y_train.values)

CPU times: user 10min 55s, sys: 2.01 s, total: 10min 57s
Wall time: 10min 57s


In [21]:
print('Original dataset shape: %s' % Counter(y_train.values))
print('Resampled dataset shape: %s' % Counter(y_train_balanced))

Original dataset shape: Counter({0: 455650, 1: 13825})


NameError: name 'y_train_balanced' is not defined

### Feature Selection

In [None]:
def evaluation(y_true, y_pred, y_prob):
    precision = precision_score(y_true=y_true, y_pred=y_pred)
    recall = recall_score(y_true=y_true, y_pred=y_pred)
    f1 = f1_score(y_true=y_true, y_pred=y_pred)
    ll = log_loss(y_true=y_true, y_pred=y_prob)
    roc_auc = roc_auc_score(y_true=y_true, y_score=y_prob)
    print('Precision: {}'.format(precision))
    print('Recall: {}'.format(recall))
    print('F1: {}'.format(f1))
    print('Log Loss: {}'.format(ll)) 
    print('ROC AUC: {}'.format(roc_auc)) 
    return precision, recall, f1, ll, roc_auc

def xgb_fit_predict(X_train, y_train, X_test, y_test):
    clf = xgb.XGBClassifier(max_depth=3,
                            n_estimators=100,
                            learning_rate=0.1,
                            nthread=5,
                            subsample=1.,
                            colsample_bytree=0.5,
                            min_child_weight = 3,
                            reg_alpha=0.,
                            reg_lambda=0.,
                            seed=42,
                            missing=1e10)

    clf.fit(X_train, y_train, eval_metric='aucpr', verbose=10)
    predict_proba_test = clf.predict_proba(X_test)
    predict_test = clf.predict(X_test)
    precision_test, recall_test, f1_test, log_loss_test, roc_auc_test = \
        evaluation(y_test, predict_test, predict_proba_test[:, 1])
    return clf

def plot_importance(importance, features, name):
    fi = pd.DataFrame(list(zip(features, importance))).sort_values(by=1, ascending=False)
    plt.figure(figsize=(16,6))
    plt.bar(range(fi.shape[0]), fi[1], align='center')
    plt.xticks(range(fi.shape[0]), fi[0], rotation=90)
    plt.title(name)
    plt.show()
    return fi

### Test Data

In [None]:
# test_data_raw = pd.read_csv('../../datasets/gamechurn/test/sample.csv', delimiter=';')
# test_data_raw.head()

### Predictions

In [None]:
# test_predictions.to_csv('../../datasets/gamechurn/DDoni_predictions.csv', index=None)