In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import xgboost as xgb
from xgboost import XGBClassifier, XGBRegressor
from xgboost import plot_importance
from catboost import CatBoostRegressor, CatBoostClassifier
from matplotlib import pyplot
import shap
import random
from collections import Counter
from random import choice
from pandas.api.types import CategoricalDtype
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.base import BaseEstimator
from sklearn.base import ClassifierMixin
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GroupKFold
from pprint import pprint
from bayes_opt import BayesianOptimization
import warnings
warnings.filterwarnings("ignore")

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.
from time import time
from tqdm import tqdm_notebook as tqdm
from collections import Counter
from scipy import stats
import lightgbm as lgb
from sklearn.metrics import cohen_kappa_score, mean_squared_error
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import accuracy_score
import tensorflow as tf
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder

import gc
import json
pd.set_option('display.max_columns', 1000)


In [73]:
def eval_qwk_lgb_regr(y_true, y_pred, reduce_train, is_classifier):

    if not is_classifier:
        dist = Counter(reduce_train['accuracy_group'])
        for k in dist:
            dist[k] /= len(reduce_train)

        acum = 0
        bound = {}
        for i in range(3):
            acum += dist[i]
            bound[i] = np.percentile(y_pred, acum * 100)

        def classify(x):
            if x <= bound[0]:
                return 0
            elif x <= bound[1]:
                return 1
            elif x <= bound[2]:
                return 2
            else:
                return 3

        y_pred = np.array(list(map(classify, y_pred))).reshape(y_true.shape)

    return 'cappa', cohen_kappa_score(y_true, y_pred, weights='quadratic'), True


In [3]:
def encode_title(train, test, train_labels):
    # encode title
    train['title_event_code'] = list(map(lambda x, y: str(x) + '_' + str(y), train['title'], train['event_code']))
    test['title_event_code'] = list(map(lambda x, y: str(x) + '_' + str(y), test['title'], test['event_code']))
    all_title_event_code = list(set(train["title_event_code"].unique()).union(test["title_event_code"].unique()))
    # make a list with all the unique 'titles' from the train and test set
    list_of_user_activities = list(set(train['title'].unique()).union(set(test['title'].unique())))
    # make a list with all the unique 'event_code' from the train and test set
    list_of_event_code = list(set(train['event_code'].unique()).union(set(test['event_code'].unique())))
    list_of_event_id = list(set(train['event_id'].unique()).union(set(test['event_id'].unique())))
    # make a list with all the unique worlds from the train and test set
    list_of_worlds = list(set(train['world'].unique()).union(set(test['world'].unique())))
    # create a dictionary numerating the titles
    activities_map = dict(zip(list_of_user_activities, np.arange(len(list_of_user_activities))))
    activities_labels = dict(zip(np.arange(len(list_of_user_activities)), list_of_user_activities))
    activities_world = dict(zip(list_of_worlds, np.arange(len(list_of_worlds))))
    assess_titles = list(set(train[train['type'] == 'Assessment']['title'].value_counts().index).union(
        set(test[test['type'] == 'Assessment']['title'].value_counts().index)))
    # replace the text titles with the number titles from the dict
    train['title'] = train['title'].map(activities_map)
    test['title'] = test['title'].map(activities_map)
    train['world'] = train['world'].map(activities_world)
    test['world'] = test['world'].map(activities_world)
    train_labels['title'] = train_labels['title'].map(activities_map)
    win_code = dict(zip(activities_map.values(), (4100 * np.ones(len(activities_map))).astype('int')))
    # then, it set one element, the 'Bird Measurer (Assessment)' as 4110, 10 more than the rest
    win_code[activities_map['Bird Measurer (Assessment)']] = 4110
    # convert text into datetime
    train['timestamp'] = pd.to_datetime(train['timestamp'])
    test['timestamp'] = pd.to_datetime(test['timestamp'])

    return train, test, train_labels, win_code, list_of_user_activities, list_of_event_code, activities_labels, assess_titles, list_of_event_id, all_title_event_code



In [4]:
def get_train_and_test(train, test, assess_titles, list_of_event_code, list_of_event_id, activities_labels,
                       all_title_event_code, win_code):
    compiled_train = []
    compiled_test = []
    assessment_sessions_by_instid = {}

    # Loop through each train installation id
    for ins_id, user_sample in tqdm(train.groupby('installation_id', sort=False),
                                    total=train['installation_id'].nunique()):
        compiled_train += get_data(user_sample, assess_titles, list_of_event_code, list_of_event_id, activities_labels,
                                   all_title_event_code, win_code, test_set=False)

    reduce_train = pd.DataFrame(compiled_train)
    del compiled_train

    # Loop through each test installation id
    for ins_id, user_sample in tqdm(test.groupby('installation_id', sort=False),
                                    total=test['installation_id'].nunique()):
        test_data = get_data(user_sample, assess_titles, list_of_event_code, list_of_event_id, activities_labels,
                             all_title_event_code, win_code, test_set=True)
        compiled_test.append(test_data)

    reduce_test = pd.DataFrame(compiled_test)
    del compiled_test

    categoricals = ['session_title']

    return reduce_train, reduce_test, categoricals

In [5]:
def get_data(user_sample, assess_titles, list_of_event_code, list_of_event_id, activities_labels, all_title_event_code,
             win_code, test_set=False):
    '''
    The user_sample is a DataFrame from train or test where the only one
    installation_id is filtered
    And the test_set parameter is related with the labels processing, that is only requered
    if test_set=False
    '''
    # Constants and parameters declaration
    last_activity = 0

    user_activities_count = {'Clip': 0, 'Activity': 0, 'Assessment': 0, 'Game': 0}

    # new features: time spent in each activity
    last_session_time_sec = 0
    accuracy_groups = {0: 0, 1: 0, 2: 0, 3: 0}
    all_assessments = []
    accumulated_accuracy_group = 0
    accumulated_accuracy = 0
    accumulated_correct_attempts = 0
    accumulated_uncorrect_attempts = 0
    accumulated_actions = 0
    counter = 0
    time_first_activity = float(user_sample['timestamp'].values[0])
    durations = []
    durations_game = []
    durations_activity = []
    last_accuracy_title = {'acc_' + title: -1 for title in assess_titles}
    last_game_time_title = {'lgt_' + title: 0 for title in assess_titles}
    ac_game_time_title = {'agt_' + title: 0 for title in assess_titles}
    ac_true_attempts_title = {'ata_' + title: 0 for title in assess_titles}
    ac_false_attempts_title = {'afa_' + title: 0 for title in assess_titles}
    event_code_count: Dict[str, int] = {ev: 0 for ev in list_of_event_code}
    event_id_count: Dict[str, int] = {eve: 0 for eve in list_of_event_id}
    title_count: Dict[str, int] = {eve: 0 for eve in activities_labels.values()}
    title_event_code_count: Dict[str, int] = {t_eve: 0 for t_eve in all_title_event_code}
    session_count = 0

    # itarates through each session of one instalation_id
    for i, session in user_sample.groupby('game_session', sort=False):
        # i = game_session_id
        # session is a DataFrame that contain only one game_session

        # get some sessions information
        session_type = session['type'].iloc[0]
        session_title = session['title'].iloc[0]
        session_title_text = activities_labels[session_title]
        game_session = session['game_session'].iloc[0]

        # for each assessment, and only this kind off session, the features below are processed
        # and a register are generated
        if (session_type == 'Assessment') & (test_set or len(session) > 1):
            # search for event_code 4100, that represents the assessments trial
            all_attempts = session.query(f'event_code == {win_code[session_title]}')
            # then, check the numbers of wins and the number of losses
            true_attempts = all_attempts['event_data'].str.contains('true').sum()
            false_attempts = all_attempts['event_data'].str.contains('false').sum()
            # copy a dict to use as feature template, it's initialized with some itens:
            # {'Clip':0, 'Activity': 0, 'Assessment': 0, 'Game':0}
            features = user_activities_count.copy()
            features.update(last_accuracy_title.copy())
            features.update(event_code_count.copy())
            features.update(title_count.copy())
            features.update(event_id_count.copy())
            features.update(title_event_code_count.copy())
            features.update(last_game_time_title.copy())
            features.update(ac_game_time_title.copy())
            features.update(ac_true_attempts_title.copy())
            features.update(ac_false_attempts_title.copy())
            features['installation_session_count'] = session_count

            variety_features = [('var_event_code', event_code_count),
                                ('var_event_id', event_id_count),
                                ('var_title', title_count),
                                ('var_title_event_code', title_event_code_count)]

            for name, dict_counts in variety_features:
                arr = np.array(list(dict_counts.values()))
                features[name] = np.count_nonzero(arr)

            # get installation_id for aggregated features
            features['installation_id'] = session['installation_id'].iloc[-1]
            features['game_session'] = game_session
            # add title as feature, remembering that title represents the name of the game
            features['session_title'] = session['title'].iloc[0]
            # the 4 lines below add the feature of the history of the trials of this player
            # this is based on the all time attempts so far, at the moment of this assessment
            features['accumulated_correct_attempts'] = accumulated_correct_attempts
            features['accumulated_uncorrect_attempts'] = accumulated_uncorrect_attempts
            accumulated_correct_attempts += true_attempts
            accumulated_uncorrect_attempts += false_attempts

            # ----------------------------------------------
            ac_true_attempts_title['ata_' + session_title_text] += true_attempts
            ac_false_attempts_title['afa_' + session_title_text] += false_attempts

            last_game_time_title['lgt_' + session_title_text] = session['game_time'].iloc[-1]
            ac_game_time_title['agt_' + session_title_text] += session['game_time'].iloc[-1]
            # ----------------------------------------------

            # the time spent in the app so far
            if durations == []:
                features['duration_mean'] = 0
                features['duration_std'] = 0
                features['last_duration'] = 0
                features['duration_max'] = 0
            else:
                features['duration_mean'] = np.mean(durations)
                features['duration_std'] = np.std(durations)
                features['last_duration'] = durations[-1]
                features['duration_max'] = np.max(durations)
            durations.append((session.iloc[-1, 2] - session.iloc[0, 2]).seconds)

            if durations_game == []:
                features['duration_game_mean'] = 0
                features['duration_game_std'] = 0
                features['game_last_duration'] = 0
                features['game_max_duration'] = 0
            else:
                features['duration_game_mean'] = np.mean(durations_game)
                features['duration_game_std'] = np.std(durations_game)
                features['game_last_duration'] = durations_game[-1]
                features['game_max_duration'] = np.max(durations_game)

            if durations_activity == []:
                features['duration_activity_mean'] = 0
                features['duration_activity_std'] = 0
                features['game_activity_duration'] = 0
                features['game_activity_max'] = 0
            else:
                features['duration_activity_mean'] = np.mean(durations_activity)
                features['duration_activity_std'] = np.std(durations_activity)
                features['game_activity_duration'] = durations_activity[-1]
                features['game_activity_max'] = np.max(durations_activity)

            # the accuracy is the all time wins divided by the all time attempts
            features['accumulated_accuracy'] = accumulated_accuracy / counter if counter > 0 else 0
            accuracy = true_attempts / (true_attempts + false_attempts) if (true_attempts + false_attempts) != 0 else 0
            accumulated_accuracy += accuracy
            last_accuracy_title['acc_' + session_title_text] = accuracy
            # a feature of the current accuracy categorized
            # it is a counter of how many times this player was in each accuracy group
            if accuracy == 0:
                features['accuracy_group'] = 0
            elif accuracy == 1:
                features['accuracy_group'] = 3
            elif accuracy == 0.5:
                features['accuracy_group'] = 2
            else:
                features['accuracy_group'] = 1
            features.update(accuracy_groups)
            accuracy_groups[features['accuracy_group']] += 1
            # mean of the all accuracy groups of this player
            features['accumulated_accuracy_group'] = accumulated_accuracy_group / counter if counter > 0 else 0
            accumulated_accuracy_group += features['accuracy_group']
            # how many actions the player has done so far, it is initialized as 0 and updated some lines below
            features['accumulated_actions'] = accumulated_actions

            # there are some conditions to allow this features to be inserted in the datasets
            # if it's a test set, all sessions belong to the final dataset
            # it it's a train, needs to be passed throught this clausule: session.query(f'event_code == {win_code[session_title]}')
            # that means, must exist an event_code 4100 or 4110
            if test_set:
                all_assessments.append(features)
            elif true_attempts + false_attempts > 0:
                all_assessments.append(features)

            counter += 1

        if session_type == 'Game':
            durations_game.append((session.iloc[-1, 2] - session.iloc[0, 2]).seconds)

        if session_type == 'Activity':
            durations_activity.append((session.iloc[-1, 2] - session.iloc[0, 2]).seconds)

        session_count += 1

        # this piece counts how many actions was made in each event_code so far
        def update_counters(counter: dict, col: str):
            num_of_session_count = Counter(session[col])
            for k in num_of_session_count.keys():
                x = k
                if col == 'title':
                    x = activities_labels[k]
                counter[x] += num_of_session_count[k]
            return counter

        event_code_count = update_counters(event_code_count, "event_code")
        event_id_count = update_counters(event_id_count, "event_id")
        title_count = update_counters(title_count, 'title')
        title_event_code_count = update_counters(title_event_code_count, 'title_event_code')

        # counts how many actions the player has done so far, used in the feature of the same name
        accumulated_actions += len(session)
        if last_activity != session_type:
            user_activities_count[session_type] += 1
            last_activitiy = session_type

            # if it't the test_set, only the last assessment must be predicted, the previous are scraped
    if test_set:
        if len(all_assessments) > 0:
            return all_assessments[-1]
        else:
            return all_assessments[0]
    # in the train_set, all assessments goes to the dataset
    return all_assessments

In [6]:
def remove_dead_weight(df, train_labels, test_set=False):
    df = df[df['world'] != 'NONE']

    # filtering by ids that took assessments
    ids_w_assessments = df[df['type'] == 'Assessment']['installation_id'].drop_duplicates()
    df = df[df['installation_id'].isin(ids_w_assessments)]

    # If training set then make sure the installation ids are in the labels and remove assements not in the labels
    if test_set == False:
        # drop data whose installation does not contain any scored assessments in train_labels
        df = df[df['installation_id'].isin(train_labels['installation_id'].unique())]

        assessments = df[df.type == 'Assessment']
        assessments = assessments[~assessments.game_session.isin(train_labels.game_session)]
        df = df[~df.game_session.isin(assessments.game_session)]
        df.reset_index(drop=True, inplace=True)

    return df

In [7]:
def stract_hists(feature, train, test, adjust=False, plot=False):
    n_bins = 10
    train_data = train[feature]
    test_data = test[feature]
    if adjust:
        test_data *= train_data.mean() / test_data.mean()
    perc_90 = np.percentile(train_data, 95)
    train_data = np.clip(train_data, 0, perc_90)
    test_data = np.clip(test_data, 0, perc_90)
    train_hist = np.histogram(train_data, bins=n_bins)[0] / len(train_data)
    test_hist = np.histogram(test_data, bins=n_bins)[0] / len(test_data)
    msre = mean_squared_error(train_hist, test_hist)
    if plot:
        print(msre)
        plt.bar(range(n_bins), train_hist, color='blue', alpha=0.5)
        plt.bar(range(n_bins), test_hist, color='red', alpha=0.5)
        plt.show()
    return msre

In [8]:
# get prediction
def get_class_pred(pred, train_t):
    """
    Fast cappa eval function for lgb.
    """
    dist = Counter(train_t['accuracy_group'])
    for k in dist:
        dist[k] /= len(train_t)

    acum = 0
    bound = {}
    for i in range(3):
        acum += dist[i]
        bound[i] = np.percentile(pred, acum * 100)

    def classify(x):
        if x <= bound[0]:
            return 0
        elif x <= bound[1]:
            return 1
        elif x <= bound[2]:
            return 2
        else:
            return 3

    y_pred = np.array(list(map(classify, pred)))

    return y_pred

In [9]:
def preprocess(reduce_train, reduce_test, cols_to_drop):
    for df in [reduce_train, reduce_test]:
        df['installation_session_count'] = df.groupby(['installation_id'])['Clip'].transform('count')
        df['installation_duration_mean'] = df.groupby(['installation_id'])['duration_mean'].transform('mean')
        # df['installation_duration_std'] = df.groupby(['installation_id'])['duration_mean'].transform('std')
        df['installation_title_nunique'] = df.groupby(['installation_id'])['session_title'].transform('nunique')

        df['sum_event_code_count'] = df[
            [2050, 4100, 4230, 5000, 4235, 2060, 4110, 5010, 2070, 2075, 2080, 2081, 2083, 3110, 4010, 3120, 3121, 4020,
             4021,
             4022, 4025, 4030, 4031, 3010, 4035, 4040, 3020, 3021, 4045, 2000, 4050, 2010, 2020, 4070, 2025, 2030, 4080,
             2035,
             2040, 4090, 4220, 4095]].sum(axis=1)

        df['installation_event_code_count_mean'] = df.groupby(['installation_id'])['sum_event_code_count'].transform(
            'mean')
        # df['installation_event_code_count_std'] = df.groupby(['installation_id'])['sum_event_code_count'].transform('std')

    features = reduce_train.loc[
        (reduce_train.sum(axis=1) != 0), (reduce_train.sum(axis=0) != 0)].columns  # delete useless columns
    features = [x for x in features if x not in cols_to_drop]

    return reduce_train, reduce_test, features

In [10]:
# Create Catagoricals
def create_cats(train, test, categoricals, activities_labels):
    tmp_train = train.copy()
    tmp_test = test.copy()
    tmp_train['session_title'] = tmp_train['session_title'].astype(CategoricalDtype(categories=activities_labels))
    tmp_test['session_title'] = tmp_test['session_title'].astype(CategoricalDtype(categories=activities_labels))
    # tmp_train['world'] = tmp_train['world'].astype(CategoricalDtype(categories=[0, 1, 2]))
    # tmp_test['world'] = tmp_test['world'].astype(CategoricalDtype(categories=[0, 1, 2]))
    train_cats = pd.get_dummies(tmp_train[categoricals], prefix=categoricals)
    test_cats = pd.get_dummies(tmp_test[categoricals], prefix=categoricals)

    tmp_train = tmp_train.drop(categoricals, axis=1)
    tmp_test = tmp_test.drop(categoricals, axis=1)

    tmp_train = pd.concat([tmp_train, train_cats], axis=1, sort=False)
    tmp_test = pd.concat([tmp_test, test_cats], axis=1, sort=False)

    return tmp_train, tmp_test

In [11]:
# Min Max
def create_min_max(train, test, categoricals, cols_to_drop):
    tmp_train = train.copy()
    tmp_test = test.copy()
    scalars = [x for x in tmp_train.columns if x not in (categoricals + cols_to_drop)]
    tmp_train[scalars] = tmp_train[scalars].apply(lambda x: (x - x.min()) / (x.max() - x.min())).fillna(0)
    tmp_test[scalars] = tmp_test[scalars].apply(lambda x: (x - x.min()) / (x.max() - x.min())).fillna(0)

    return tmp_train, tmp_test

In [77]:
class Base_Model(object):

    def __init__(self, train_df, test_df, features, params, reduce_train, reduce_test, 
                 categoricals=[], n_splits=5, verbose=True, is_classifier=False, is_lgb=False):
        self.train_df = train_df
        self.test_df = test_df
        self.features = features
        self.n_splits = n_splits
        self.categoricals = categoricals
        self.target = 'accuracy_group'
        self.cv = self.get_cv()
        self.verbose = verbose
        self.params = params
        self.is_classifier = is_classifier
        self.is_lgb = is_lgb
        self.y_pred, self.score, self.model, self.oof_pred = self.fit(reduce_train, reduce_test)

    def train_model(self, train_set, val_set):
        raise NotImplementedError

    def get_cv(self):
        cv = StratifiedKFold(n_splits=self.n_splits, shuffle=True, random_state=42)
        return cv.split(self.train_df, self.train_df[self.target])

    def get_params(self):
        raise NotImplementedError

    def convert_dataset(self, x_train, y_train, x_val, y_val):
        raise NotImplementedError

    def convert_x(self, x):
        return x

    def fit(self, reduce_train, reduce_test):
        oof_pred = np.zeros((len(reduce_train),))
        y_pred = np.zeros((len(reduce_test),))
        for fold, (train_idx, val_idx) in enumerate(self.cv):
            try:
                x_train, x_val = self.train_df[self.features].iloc[train_idx], self.train_df[self.features].iloc[
                    val_idx]
                y_train, y_val = self.train_df[self.target][train_idx], self.train_df[self.target][val_idx]
                train_set, val_set = self.convert_dataset(x_train, y_train, x_val, y_val)
                model = self.train_model(train_set, val_set)
                conv_x_val = self.convert_x(x_val)
                tmp_pred = model.predict(conv_x_val)
                if self.is_lgb and self.is_classifier:
                    tmp_pred = np.argmax(tmp_pred, axis=1)
                oof_pred[val_idx] = tmp_pred.reshape(oof_pred[val_idx].shape)
                x_test = self.convert_x(self.test_df[self.features])
                tmp_y_pred = model.predict(x_test)
                if self.is_lgb and self.is_classifier:
                    tmp_y_pred = np.argmax(tmp_y_pred, axis=1)
                y_pred += tmp_y_pred.reshape(y_pred.shape) / self.n_splits
                print('Partial score of fold {} is: {}'.format(fold, eval_qwk_lgb_regr(y_val, tmp_pred, 
                                                                                       reduce_train, self.is_classifier)[1]))
                _, loss_score, _ = eval_qwk_lgb_regr(self.train_df[self.target], oof_pred, 
                                                     reduce_train, self.is_classifier)
            except Exception as e:
                print(e)
                print('Error training: val_idx = ', val_idx)
        if self.verbose:
            print('Our oof cohen kappa score is: ', loss_score)
        del self.train_df, self.test_df, self.cv
        gc.collect()
        return y_pred, loss_score, model, oof_pred

In [78]:
class Xgb_Model(Base_Model):

    def train_model(self, train_set, val_set):
        verbosity = 100 if self.verbose else 0
        return xgb.train(self.params, train_set,
                         num_boost_round=667, evals=[(train_set, 'train'), (val_set, 'val')],
                         verbose_eval=verbosity, early_stopping_rounds=100)

    def convert_dataset(self, x_train, y_train, x_val, y_val):
        train_set = xgb.DMatrix(x_train, y_train)
        val_set = xgb.DMatrix(x_val, y_val)
        return train_set, val_set

    def convert_x(self, x):
        return xgb.DMatrix(x)


class Catb_Model(Base_Model):

    def train_model(self, train_set, val_set):
        verbosity = 100 if self.verbose else 0
        clf = CatBoostRegressor(**self.params)
        clf.fit(train_set['X'],
                train_set['y'],
                eval_set=(val_set['X'], val_set['y']),
                verbose=verbosity, )
        return clf

    def convert_dataset(self, x_train, y_train, x_val, y_val):
        train_set = {'X': x_train, 'y': y_train}
        val_set = {'X': x_val, 'y': y_val}
        return train_set, val_set
    
class Catb_Class_Model(Base_Model):

    def train_model(self, train_set, val_set):
        verbosity = 100 if self.verbose else 0
        clf = CatBoostClassifier(**self.params)
        clf.fit(train_set['X'],
                train_set['y'],
                eval_set=(val_set['X'], val_set['y']),
                verbose=verbosity)
        return clf

    def convert_dataset(self, x_train, y_train, x_val, y_val):
        train_set = {'X': x_train, 'y': y_train}
        val_set = {'X': x_val, 'y': y_val}
        return train_set, val_set


class RF_Model(Base_Model):

    def train_model(self, train_set, val_set):
        verbosity = 100 if self.verbose else 0
        if self.is_classifier:
            rf = RandomForestClassifier(**self.params)
        else:
            rf = RandomForestRegressor(**self.params)
            
        rf.fit(train_set['X'], train_set['y'])
        return rf

    def convert_dataset(self, x_train, y_train, x_val, y_val):
        train_set = {'X': x_train, 'y': y_train}
        val_set = {'X': x_val, 'y': y_val}
        return train_set, val_set
    

class KNN_Model(Base_Model):

    def train_model(self, train_set, val_set):
        verbosity = 100 if self.verbose else 0
        if self.is_classifier:
            knn = KNeighborsClassifier(**self.params)
        else:
            knn = KNeighborsRegressor(**self.params)
        knn.fit(train_set['X'], train_set['y'])
        return knn

    def convert_dataset(self, x_train, y_train, x_val, y_val):
        train_set = {'X': x_train, 'y': y_train}
        val_set = {'X': x_val, 'y': y_val}
        return train_set, val_set
    

class Lgb_Model(Base_Model):

    def train_model(self, train_set, val_set):
        verbosity = 100 if self.verbose else 0
        return lgb.train(self.params, train_set, valid_sets=[train_set, val_set], verbose_eval=verbosity)

    def convert_dataset(self, x_train, y_train, x_val, y_val):
        train_set = lgb.Dataset(x_train, y_train)
        val_set = lgb.Dataset(x_val, y_val)
        return train_set, val_set

class Nn_Model(Base_Model):

    def train_model(self, train_set, val_set):
        verbosity = 100 if self.verbose else 0
        model = tf.keras.models.Sequential([
            tf.keras.layers.Input(shape=(train_set['X'].shape[1],)),
            tf.keras.layers.Dense(200, activation='relu'),
            tf.keras.layers.LayerNormalization(),
            tf.keras.layers.Dropout(0.3),
            tf.keras.layers.Dense(100, activation='relu'),
            tf.keras.layers.LayerNormalization(),
            tf.keras.layers.Dropout(0.3),
            tf.keras.layers.Dense(50, activation='relu'),
            tf.keras.layers.LayerNormalization(),
            tf.keras.layers.Dropout(0.3),
            tf.keras.layers.Dense(25, activation='relu'),
            tf.keras.layers.LayerNormalization(),
            tf.keras.layers.Dropout(0.3),
            tf.keras.layers.Dense(1, activation='relu')
        ])
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=4e-4), loss='mse')
        print(model.summary())
        save_best = tf.keras.callbacks.ModelCheckpoint('nn_model.w8', save_weights_only=True, save_best_only=True,
                                                       verbose=1)
        early_stop = tf.keras.callbacks.EarlyStopping(patience=20)
        model.fit(train_set['X'],
                  train_set['y'],
                  validation_data=(val_set['X'], val_set['y']),
                  epochs=100,
                  callbacks=[save_best, early_stop])
        model.load_weights('nn_model.w8')
        return model

    def convert_dataset(self, x_train, y_train, x_val, y_val):
        train_set = {'X': x_train, 'y': y_train}
        val_set = {'X': x_val, 'y': y_val}
        return train_set, val_set

    def get_params(self):
        return None
    
class Nn_Class_Model(Base_Model):

    def train_model(self, train_set, val_set):
        verbosity = 100 if self.verbose else 0
        model = tf.keras.models.Sequential([
            tf.keras.layers.Input(shape=(train_set['X'].shape[1],)),
            tf.keras.layers.Dense(100, activation='relu'),
            tf.keras.layers.LayerNormalization(),
            tf.keras.layers.Dropout(0.3),
            tf.keras.layers.Dense(50, activation='relu'),
            tf.keras.layers.LayerNormalization(),
            tf.keras.layers.Dropout(0.3),
            tf.keras.layers.Dense(25, activation='relu'),
            tf.keras.layers.LayerNormalization(),
            tf.keras.layers.Dropout(0.3),
            tf.keras.layers.Dense(4, activation='softmax')
        ])
        model.compile(optimizer='rmsprop',
              loss='sparse_categorical_crossentropy',
              metrics=['sparse_categorical_accuracy'])
        print(model.summary())
        save_best = tf.keras.callbacks.ModelCheckpoint('nn_model.w8', save_weights_only=True, save_best_only=True,
                                                       verbose=1)
        early_stop = tf.keras.callbacks.EarlyStopping(patience=20)
        model.fit(train_set['X'],
                  train_set['y'],
                  validation_data=(val_set['X'], val_set['y']),
                  epochs=100,
                  callbacks=[save_best, early_stop])
        model.load_weights('nn_model.w8')
        return model

    def convert_dataset(self, x_train, y_train, x_val, y_val):
        train_set = {'X': x_train, 'y': y_train}
        val_set = {'X': x_val, 'y': y_val}
        return train_set, val_set

    def get_params(self):
        return None

In [14]:
def read_data():
    print('Reading train.csv file....')
    train = pd.read_csv('./data/train.csv')
    print('Training.csv file have {} rows and {} columns'.format(train.shape[0], train.shape[1]))

    print('Reading test.csv file....')
    test = pd.read_csv('./data/test.csv')
    print('Test.csv file have {} rows and {} columns'.format(test.shape[0], test.shape[1]))

    print('Reading train_labels.csv file....')
    train_labels = pd.read_csv('./data/train_labels.csv')
    print('Train_labels.csv file have {} rows and {} columns'.format(train_labels.shape[0], train_labels.shape[1]))

    return train, test, train_labels

In [15]:
# read data
train, test, train_labels = read_data()

Reading train.csv file....
Training.csv file have 11341042 rows and 11 columns
Reading test.csv file....
Test.csv file have 1156414 rows and 11 columns
Reading train_labels.csv file....
Train_labels.csv file have 17690 rows and 7 columns


In [16]:
# remove unwanted data
train = remove_dead_weight(train, train_labels, test_set=False)
test = remove_dead_weight(test, train_labels, test_set=True)

In [17]:
# get usefull dict with maping encode
train, test, train_labels, win_code, list_of_user_activities, list_of_event_code, activities_labels, assess_titles, list_of_event_id, all_title_event_code = encode_title(train, test, train_labels)

In [18]:
# tranform function to get the train and test set
reduce_train, reduce_test, categoricals = get_train_and_test(train, test, assess_titles, list_of_event_code, list_of_event_id, activities_labels, all_title_event_code, win_code)

HBox(children=(FloatProgress(value=0.0, max=3614.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))




In [19]:
# Delete train and test to free up resources
del train, test
gc.collect()

2223

In [20]:
# Add the accuracy group vals
reduce_train = reduce_train.set_index('game_session')
train_labels = train_labels.set_index('game_session')
reduce_train.update(train_labels)

In [21]:
# Reset index
reduce_train = reduce_train.reset_index()

In [22]:
# Create the cols to drop for training
cols_to_drop = ['accuracy_group', 'game_session', 'installation_id']

In [23]:
# SMOTE
from imblearn.over_sampling import SMOTE
os = SMOTE(random_state = 0)

reduce_train_x, reduce_train_y =  os.fit_sample(reduce_train.drop(cols_to_drop, axis=1), reduce_train['accuracy_group'])

# create dataframes from SMOTE analysis
reduce_train_x = pd.DataFrame(data = reduce_train_x, columns = reduce_train.drop(cols_to_drop, axis=1).columns)
reduce_train_y = pd.DataFrame(data = reduce_train_y, columns = ['accuracy_group'])
reduce_train_x['accuracy_group'] = reduce_train_y
reduce_train = reduce_train_x.copy()

Using TensorFlow backend.


In [24]:
del reduce_train_x, reduce_train_y, train_labels
gc.collect()

20

In [25]:
reduce_train['installation_id'] = 0

In [26]:
reduce_train, reduce_test, features = preprocess(reduce_train, reduce_test, cols_to_drop)

In [27]:
# Create Cats
reduce_train, reduce_test = create_cats(reduce_train, reduce_test, categoricals, activities_labels)

In [28]:
reduce_train, reduce_test = create_min_max(reduce_train, reduce_test, categoricals, cols_to_drop)
reduce_train.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in reduce_train.columns]
reduce_test.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in reduce_test.columns]

In [29]:
features = reduce_train.loc[(reduce_train.sum(axis=1) != 0), (reduce_train.sum(axis=0) != 0)].columns # delete useless columns
features = [x for x in features if x not in cols_to_drop]

In [30]:
# Go through and find high correlations. Add to remove list if so
counter = 0
to_remove = []
for feat_a in features:
    for feat_b in features:
        if feat_a != feat_b and feat_a not in to_remove and feat_b not in to_remove:
            c = np.corrcoef(reduce_train[feat_a], reduce_train[feat_b])[0][1]
            if c > 0.995:
                counter += 1
                to_remove.append(feat_b)
                #print('{}: FEAT_A: {} FEAT_B: {} - Correlation: {}'.format(counter, feat_a, feat_b, c))

In [31]:
# Go through again a look for features to remove
to_exclude = [] 
ajusted_test = reduce_test.copy()
for feature in ajusted_test.columns:
    if feature not in (cols_to_drop + categoricals):
        try:
            data = reduce_train[feature]
            train_mean = data.mean()
            data = ajusted_test[feature] 
            test_mean = data.mean()
            error = stract_hists(feature, train=reduce_train, test=reduce_test, adjust=True)
            ajust_factor = train_mean / test_mean
            if ajust_factor > 10 or ajust_factor < 0.1:# or error > 0.01:
                to_exclude.append(feature)
                #print(feature, train_mean, test_mean, error)
            else:
                ajusted_test[feature] *= ajust_factor
        except:
            to_exclude.append(feature)

In [32]:
# Create final feature list removing the unwanted ones
features = [x for x in features if x not in (to_exclude + to_remove)]

In [33]:
# Create the cols to drop for training
cols_to_drop = ['accuracy_group', 'game_session', 'installation_id']
categoricals = ['session_title']

In [80]:
# Random Forrest Classifier

params = {'bootstrap': False, 
          'max_depth':59, 
          'max_features': 72, 
          'min_samples_leaf': 2, 
          'min_samples_split': 6, 
          'n_estimators': 100}

rf_model = RF_Model(reduce_train, ajusted_test, features, params, reduce_train, ajusted_test, 
                          categoricals=categoricals, verbose=False, is_classifier=False)

Partial score of fold 0 is: 0.6947427925381571
Partial score of fold 1 is: 0.6922555115884681
Partial score of fold 2 is: 0.6914292175486205
Partial score of fold 3 is: 0.6902204635387225
Partial score of fold 4 is: 0.6809496890898813


In [81]:
rf_train_pred = rf_model.oof_pred
print('Accuracy on training data: ', rf_model.score)
# Accuracy on training data:  0.5400765760009788 - Classifier
# Accuracy on training data:  0.586369916234434 - Regressor
# Accuracy on training data:  0.7094658767129483 - With SMOTE
# Accuracy on training data:  0.6651401764771585 - with SMOTE Classifier

Accuracy on training data:  0.6898883009994121


In [45]:
# K Nearest Classifier

weights = 0.3491139618762451
if weights >= 0 and weights < 1.0:
    weights = 'uniform'
else:
    weights = 'distance'

algorithm = 0.04441288498288465
if algorithm >= 0 and algorithm < 1.0:
    algorithm = 'ball_tree'
elif algorithm >= 1 and algorithm < 2.0:
    algorithm = 'kd_tree'
elif algorithm >= 2 and algorithm < 3.0:
    algorithm = 'brute'
else:
    algorithm = 'auto'

params = {
         'n_neighbors': int(19.544302888065488),
        'weights': weights,
        'algorithm': algorithm,
        'leaf_size': int(29.702070879545722),
        'p': int(2.986361352754792),
        'n_jobs': -1
}
knn_model = KNN_Model(reduce_train, ajusted_test, features, params, reduce_train, ajusted_test, 
                      categoricals=categoricals, verbose=False, is_classifier=False)

Partial score of fold 0 is: 0.489359227817188
Partial score of fold 1 is: 0.5059178463680669
Partial score of fold 2 is: 0.4906122448979592
Partial score of fold 3 is: 0.4889199906694658
Partial score of fold 4 is: 0.4694873881204231


In [46]:
knn_train_pred = knn_model.oof_pred
print('Accuracy on training data: ', knn_model.score)
# Accuracy on training data:  0.4820109463573031 - Classifier
# Accuracy on training data:  0.535143726742256 - Regressor
# Accuracy on training data:  0.5574548037485966 - With SMOTE
# Accuracy on training data:  0.4888640647908957 - with SMOTE Classifier

Accuracy on training data:  0.4888640647908957


In [47]:
# XG Boost
params_class = {
            'colsample_bytree': 0.2,                 
            'eta': 0.3,
            'objective':'multi:softmax',
            'num_class': 4,
            'max_depth': 6,
            'subsample': 1,
            'min_child_weight': 3,
            'gamma': 0.25,
            'eval_metric': 'mlogloss'
         }

params_reg = {
            'colsample_bytree': 0.2,                 
            'learning_rate': 0.01,
            'objective':'reg:squarederror',
            'max_depth': 6,
            'subsample': 1,
            'min_child_weight': 3,
            'gamma': 0.25,
            'n_estimators': 1400
         }

xgb_model = Xgb_Model(reduce_train, ajusted_test, features, params_reg, reduce_train, ajusted_test, 
                            categoricals=categoricals, verbose=False, is_classifier=False)

[0]	train-mlogloss:1.31967	val-mlogloss:1.32612
Multiple eval metrics have been passed: 'val-mlogloss' will be used for early stopping.

Will train until val-mlogloss hasn't improved in 100 rounds.
[100]	train-mlogloss:0.464104	val-mlogloss:0.777055
[200]	train-mlogloss:0.271738	val-mlogloss:0.694881
[300]	train-mlogloss:0.176011	val-mlogloss:0.657615
[400]	train-mlogloss:0.12543	val-mlogloss:0.639673
[500]	train-mlogloss:0.102664	val-mlogloss:0.631439
[600]	train-mlogloss:0.091826	val-mlogloss:0.628635
[666]	train-mlogloss:0.086872	val-mlogloss:0.627012
Partial score of fold 0 is: 0.6968639184397163
[0]	train-mlogloss:1.32018	val-mlogloss:1.32442
Multiple eval metrics have been passed: 'val-mlogloss' will be used for early stopping.

Will train until val-mlogloss hasn't improved in 100 rounds.
[100]	train-mlogloss:0.470686	val-mlogloss:0.782342
[200]	train-mlogloss:0.270777	val-mlogloss:0.707626
[300]	train-mlogloss:0.173312	val-mlogloss:0.6717
[400]	train-mlogloss:0.125251	val-mloglo

In [48]:
xgb_train_pred = xgb_model.oof_pred
print('Accuracy on training data: ', xgb_model.score)
# Accuracy on training data:  0.603943020944038 - with Regressor
# Accuracy on training data:  0.6287394007914076 - With SMOTE Regresor
# Accuracy on training data:  0.6870184942607847 - with SMOTE Classifier

Accuracy on training data:  0.6870184942607847


In [49]:
# Catboost
params_class = {
            'loss_function': 'MultiClass',
            'classes_count': 4,
            'task_type': "CPU",
            'iterations': 1860,
            'depth': 6,
            'early_stopping_rounds': 300,
            'l2_leaf_reg': 2,
            'rsm': 1,
            'bootstrap_type': 'Bayesian',
            'bagging_temperature': 1,
            'random_seed': 42,
            'learning_rate': 0.04,
            'eval_metric': 'MultiClass'
        }

params_reg = {
            'loss_function': 'MultiRMSE',
            'task_type': "CPU",
            'iterations': 1860,
            'depth': 6,
            'early_stopping_rounds': 300,
            'l2_leaf_reg': 2,
            'rsm': 1,
            'bootstrap_type': 'Bayesian',
            'bagging_temperature': 1,
            'random_seed': 42,
            'learning_rate': 0.04
        }

cat_model = Catb_Class_Model(reduce_train, ajusted_test, features, params_reg, reduce_train, ajusted_test, 
                             categoricals=categoricals, verbose=False, is_classifier=False)

Partial score of fold 0 is: 0.644002600498429
Partial score of fold 1 is: 0.6354516863680728
Partial score of fold 2 is: 0.6415747350205494
Partial score of fold 3 is: 0.61964517524881
Partial score of fold 4 is: 0.6210764750296625


In [50]:
cat_train_pred = cat_model.oof_pred
print('Accuracy on training data: ', cat_model.score)
# Accuracy on training data:  0.6028335173880344
# Accuracy on training data:  0.6501526286037309 - With SMOTE Regressor
# Accuracy on training data:  0.6323386206747311 - with SMOTE Classifier

Accuracy on training data:  0.6323386206747311


In [51]:
# LightGBM
import lightgbm as lgb

params_class = {
        'boosting_type': 'dart',
        'feature_fraction': 0.7766778552692686,
        'lambda_l1': 0.4958811953667753,
        'lambda_l2': 0.08799041939480234,
        'learning_rate': 0.06209127849529422,
        'min_child_samples': 336,
        'num_leaves': 39,
        'subsample': 0.519326536607012,
        'n_estimators': 1000,
        'early_stopping_rounds': 50,
        'objective': 'multiclass',
        'num_class': 4,
        'metric': 'multi_logloss'
}

params_reg = {
        'boosting_type': 'dart',
        'feature_fraction': 0.7766778552692686,
        'lambda_l1': 0.4958811953667753,
        'lambda_l2': 0.08799041939480234,
        'learning_rate': 0.06209127849529422,
        'min_child_samples': 336,
        'num_leaves': 39,
        'subsample': 0.519326536607012,
        'n_estimators': 1000,
        'early_stopping_rounds': 100
}

lgb_model = Lgb_Class_Model(reduce_train, ajusted_test, features, params_reg, reduce_train, ajusted_test, 
                      categoricals=categoricals, verbose=True, is_classifier=False, is_lgb=True)

[100]	training's multi_logloss: 1.0201	valid_1's multi_logloss: 1.06005
[200]	training's multi_logloss: 0.931783	valid_1's multi_logloss: 0.99381
[300]	training's multi_logloss: 0.852804	valid_1's multi_logloss: 0.940239
[400]	training's multi_logloss: 0.790431	valid_1's multi_logloss: 0.902415
[500]	training's multi_logloss: 0.732426	valid_1's multi_logloss: 0.870372
[600]	training's multi_logloss: 0.710666	valid_1's multi_logloss: 0.860039
[700]	training's multi_logloss: 0.672745	valid_1's multi_logloss: 0.840422
[800]	training's multi_logloss: 0.648423	valid_1's multi_logloss: 0.828825
[900]	training's multi_logloss: 0.615722	valid_1's multi_logloss: 0.812634
[1000]	training's multi_logloss: 0.593785	valid_1's multi_logloss: 0.802732
Partial score of fold 0 is: 0.6501350621285791
[100]	training's multi_logloss: 1.02125	valid_1's multi_logloss: 1.05881
[200]	training's multi_logloss: 0.933296	valid_1's multi_logloss: 0.992198
[300]	training's multi_logloss: 0.853847	valid_1's multi_l

In [52]:
lgb_train_pred = lgb_model.oof_pred
print('Accuracy on training data: ', lgb_model.score)
# Accuracy on training data:  0.6043009253169422
# Accuracy on training data:  0.6612323346523459 - With SMOTE Regressor
# Accuracy on training data:  0.6385652248688622 - with SMOTE Classifier

Accuracy on training data:  0.6385652248688622


In [62]:
rf_test_pred = rf_model.y_pred
knn_test_pred = knn_model.y_pred
xgb_test_pred = xgb_model.y_pred
cat_test_pred = cat_model.y_pred
lgb_test_pred = lgb_model.y_pred

train_pred_df = pd.DataFrame()
test_pred_df = pd.DataFrame()

train_pred_df['rf_pred'] = rf_train_pred
train_pred_df['knn_pred'] = knn_train_pred
train_pred_df['xgb_pred'] = xgb_train_pred
train_pred_df['cat_pred'] = cat_train_pred
train_pred_df['lgb_pred'] = lgb_train_pred
train_pred_df['installation_id'] = reduce_train['installation_id']
train_pred_df['accuracy_group'] = reduce_train['accuracy_group']

test_pred_df['rf_pred'] = rf_test_pred
test_pred_df['knn_pred'] = knn_test_pred
test_pred_df['xgb_pred'] = xgb_test_pred
test_pred_df['cat_pred'] = cat_test_pred
test_pred_df['lgb_pred'] = lgb_test_pred
test_pred_df['installation_id'] = ajusted_test['installation_id']
test_pred_df['accuracy_group'] = ajusted_test['accuracy_group']

new_features = ['rf_pred', 'knn_pred', 'xgb_pred', 'cat_pred', 'lgb_pred']

In [63]:
nn_model = Nn_Class_Model(train_pred_df, test_pred_df, new_features, {}, reduce_train, ajusted_test, 
                          categoricals=categoricals, verbose=True, is_classifier=True, is_lgb=True)

Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_40 (Dense)             (None, 100)               600       
_________________________________________________________________
layer_normalization_30 (Laye (None, 100)               200       
_________________________________________________________________
dropout_30 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_41 (Dense)             (None, 50)                5050      
_________________________________________________________________
layer_normalization_31 (Laye (None, 50)                100       
_________________________________________________________________
dropout_31 (Dropout)         (None, 50)                0         
_________________________________________________________________
dense_42 (Dense)             (None, 25)              

Epoch 41/100
Epoch 00041: val_loss did not improve from 0.69493
Epoch 42/100
Epoch 00042: val_loss did not improve from 0.69493
Epoch 43/100
Epoch 00043: val_loss did not improve from 0.69493
Epoch 44/100
Epoch 00044: val_loss did not improve from 0.69493
Epoch 45/100
Epoch 00045: val_loss did not improve from 0.69493
Epoch 46/100
Epoch 00046: val_loss did not improve from 0.69493
Epoch 47/100
Epoch 00047: val_loss did not improve from 0.69493
Epoch 48/100
Epoch 00048: val_loss improved from 0.69493 to 0.69447, saving model to nn_model.w8
Epoch 49/100
Epoch 00049: val_loss did not improve from 0.69447
Epoch 50/100
Epoch 00050: val_loss did not improve from 0.69447
Epoch 51/100
Epoch 00051: val_loss did not improve from 0.69447
Epoch 52/100
Epoch 00052: val_loss improved from 0.69447 to 0.69431, saving model to nn_model.w8
Epoch 53/100
Epoch 00053: val_loss did not improve from 0.69431
Epoch 54/100
Epoch 00054: val_loss improved from 0.69431 to 0.69384, saving model to nn_model.w8
Epoch

Epoch 64/100
Epoch 00064: val_loss did not improve from 0.69384
Epoch 65/100
Epoch 00065: val_loss improved from 0.69384 to 0.69325, saving model to nn_model.w8
Epoch 66/100
Epoch 00066: val_loss did not improve from 0.69325
Epoch 67/100
Epoch 00067: val_loss did not improve from 0.69325
Epoch 68/100
Epoch 00068: val_loss did not improve from 0.69325
Epoch 69/100
Epoch 00069: val_loss did not improve from 0.69325
Epoch 70/100
Epoch 00070: val_loss did not improve from 0.69325
Epoch 71/100
Epoch 00071: val_loss did not improve from 0.69325
Epoch 72/100
Epoch 00072: val_loss did not improve from 0.69325
Epoch 73/100
Epoch 00073: val_loss did not improve from 0.69325
Epoch 74/100
Epoch 00074: val_loss did not improve from 0.69325
Epoch 75/100
Epoch 00075: val_loss did not improve from 0.69325
Epoch 76/100
Epoch 00076: val_loss did not improve from 0.69325
Epoch 77/100
Epoch 00077: val_loss did not improve from 0.69325
Epoch 78/100
Epoch 00078: val_loss did not improve from 0.69325
Epoch 7

Partial score of fold 0 is: 0.6940699331360298
Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_44 (Dense)             (None, 100)               600       
_________________________________________________________________
layer_normalization_33 (Laye (None, 100)               200       
_________________________________________________________________
dropout_33 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_45 (Dense)             (None, 50)                5050      
_________________________________________________________________
layer_normalization_34 (Laye (None, 50)                100       
_________________________________________________________________
dropout_34 (Dropout)         (None, 50)                0         
_________________________________________________________________
dense_

Epoch 00018: val_loss did not improve from 0.71762
Epoch 19/100
Epoch 00019: val_loss improved from 0.71762 to 0.71725, saving model to nn_model.w8
Epoch 20/100
Epoch 00020: val_loss improved from 0.71725 to 0.71688, saving model to nn_model.w8
Epoch 21/100
Epoch 00021: val_loss improved from 0.71688 to 0.71381, saving model to nn_model.w8
Epoch 22/100
Epoch 00022: val_loss did not improve from 0.71381
Epoch 23/100
Epoch 00023: val_loss did not improve from 0.71381
Epoch 24/100
Epoch 00024: val_loss did not improve from 0.71381
Epoch 25/100
Epoch 00025: val_loss improved from 0.71381 to 0.71360, saving model to nn_model.w8
Epoch 26/100
Epoch 00026: val_loss did not improve from 0.71360
Epoch 27/100
Epoch 00027: val_loss did not improve from 0.71360
Epoch 28/100
Epoch 00028: val_loss did not improve from 0.71360
Epoch 29/100
Epoch 00029: val_loss did not improve from 0.71360
Epoch 30/100
Epoch 00030: val_loss did not improve from 0.71360
Epoch 31/100
Epoch 00031: val_loss did not improv

Epoch 00063: val_loss did not improve from 0.70966
Epoch 64/100
Epoch 00064: val_loss did not improve from 0.70966
Epoch 65/100
Epoch 00065: val_loss did not improve from 0.70966
Epoch 66/100
Epoch 00066: val_loss did not improve from 0.70966
Epoch 67/100
Epoch 00067: val_loss did not improve from 0.70966
Epoch 68/100
Epoch 00068: val_loss did not improve from 0.70966
Epoch 69/100
Epoch 00069: val_loss did not improve from 0.70966
Epoch 70/100
Epoch 00070: val_loss did not improve from 0.70966
Epoch 71/100
Epoch 00071: val_loss did not improve from 0.70966
Epoch 72/100
Epoch 00072: val_loss did not improve from 0.70966
Epoch 73/100
Epoch 00073: val_loss did not improve from 0.70966
Epoch 74/100
Epoch 00074: val_loss did not improve from 0.70966
Epoch 75/100
Epoch 00075: val_loss did not improve from 0.70966
Epoch 76/100
Epoch 00076: val_loss did not improve from 0.70966
Epoch 77/100
Epoch 00077: val_loss did not improve from 0.70966
Epoch 78/100
Epoch 00078: val_loss did not improve fr

Train on 28304 samples, validate on 7076 samples
Epoch 1/100
Epoch 00001: val_loss improved from inf to 0.77633, saving model to nn_model.w8
Epoch 2/100
Epoch 00002: val_loss improved from 0.77633 to 0.72565, saving model to nn_model.w8
Epoch 3/100
Epoch 00003: val_loss did not improve from 0.72565
Epoch 4/100
Epoch 00004: val_loss improved from 0.72565 to 0.71598, saving model to nn_model.w8
Epoch 5/100
Epoch 00005: val_loss improved from 0.71598 to 0.70938, saving model to nn_model.w8
Epoch 6/100
Epoch 00006: val_loss did not improve from 0.70938
Epoch 7/100
Epoch 00007: val_loss did not improve from 0.70938
Epoch 8/100
Epoch 00008: val_loss did not improve from 0.70938
Epoch 9/100
Epoch 00009: val_loss improved from 0.70938 to 0.70751, saving model to nn_model.w8
Epoch 10/100
Epoch 00010: val_loss improved from 0.70751 to 0.70740, saving model to nn_model.w8
Epoch 11/100
Epoch 00011: val_loss did not improve from 0.70740
Epoch 12/100
Epoch 00012: val_loss did not improve from 0.7074

Epoch 46/100
Epoch 00046: val_loss did not improve from 0.69820
Epoch 47/100
Epoch 00047: val_loss did not improve from 0.69820
Epoch 48/100
Epoch 00048: val_loss did not improve from 0.69820
Epoch 49/100
Epoch 00049: val_loss did not improve from 0.69820
Epoch 50/100
Epoch 00050: val_loss improved from 0.69820 to 0.69785, saving model to nn_model.w8
Epoch 51/100
Epoch 00051: val_loss improved from 0.69785 to 0.69717, saving model to nn_model.w8
Epoch 52/100
Epoch 00052: val_loss did not improve from 0.69717
Epoch 53/100
Epoch 00053: val_loss did not improve from 0.69717
Epoch 54/100
Epoch 00054: val_loss did not improve from 0.69717
Epoch 55/100
Epoch 00055: val_loss did not improve from 0.69717
Epoch 56/100
Epoch 00056: val_loss did not improve from 0.69717
Epoch 57/100
Epoch 00057: val_loss did not improve from 0.69717
Epoch 58/100
Epoch 00058: val_loss did not improve from 0.69717
Epoch 59/100
Epoch 00059: val_loss did not improve from 0.69717
Epoch 60/100
Epoch 00060: val_loss did

Epoch 69/100
Epoch 00069: val_loss did not improve from 0.69631
Epoch 70/100
Epoch 00070: val_loss did not improve from 0.69631
Epoch 71/100
Epoch 00071: val_loss did not improve from 0.69631
Epoch 72/100
Epoch 00072: val_loss did not improve from 0.69631
Epoch 73/100
Epoch 00073: val_loss did not improve from 0.69631
Epoch 74/100
Epoch 00074: val_loss did not improve from 0.69631
Epoch 75/100
Epoch 00075: val_loss did not improve from 0.69631
Epoch 76/100
Epoch 00076: val_loss did not improve from 0.69631
Epoch 77/100
Epoch 00077: val_loss did not improve from 0.69631
Epoch 78/100
Epoch 00078: val_loss did not improve from 0.69631
Epoch 79/100
Epoch 00079: val_loss did not improve from 0.69631
Epoch 80/100
Epoch 00080: val_loss did not improve from 0.69631
Epoch 81/100
Epoch 00081: val_loss did not improve from 0.69631
Epoch 82/100
Epoch 00082: val_loss did not improve from 0.69631
Epoch 83/100
Epoch 00083: val_loss did not improve from 0.69631
Epoch 84/100
Epoch 00084: val_loss did n

Train on 28304 samples, validate on 7076 samples
Epoch 1/100
Epoch 00001: val_loss improved from inf to 0.79037, saving model to nn_model.w8
Epoch 2/100
Epoch 00002: val_loss improved from 0.79037 to 0.72838, saving model to nn_model.w8
Epoch 3/100
Epoch 00003: val_loss improved from 0.72838 to 0.72149, saving model to nn_model.w8
Epoch 4/100
Epoch 00004: val_loss did not improve from 0.72149
Epoch 5/100
Epoch 00005: val_loss improved from 0.72149 to 0.71616, saving model to nn_model.w8
Epoch 6/100
Epoch 00006: val_loss did not improve from 0.71616
Epoch 7/100
Epoch 00007: val_loss improved from 0.71616 to 0.71385, saving model to nn_model.w8
Epoch 8/100
Epoch 00008: val_loss did not improve from 0.71385
Epoch 9/100
Epoch 00009: val_loss improved from 0.71385 to 0.71108, saving model to nn_model.w8
Epoch 10/100
Epoch 00010: val_loss did not improve from 0.71108
Epoch 11/100
Epoch 00011: val_loss did not improve from 0.71108
Epoch 12/100
Epoch 00012: val_loss did not improve from 0.7110

Epoch 23/100
Epoch 00023: val_loss improved from 0.70430 to 0.70360, saving model to nn_model.w8
Epoch 24/100
Epoch 00024: val_loss did not improve from 0.70360
Epoch 25/100
Epoch 00025: val_loss improved from 0.70360 to 0.70286, saving model to nn_model.w8
Epoch 26/100
Epoch 00026: val_loss did not improve from 0.70286
Epoch 27/100
Epoch 00027: val_loss did not improve from 0.70286
Epoch 28/100
Epoch 00028: val_loss did not improve from 0.70286
Epoch 29/100
Epoch 00029: val_loss did not improve from 0.70286
Epoch 30/100
Epoch 00030: val_loss did not improve from 0.70286
Epoch 31/100
Epoch 00031: val_loss did not improve from 0.70286
Epoch 32/100
Epoch 00032: val_loss improved from 0.70286 to 0.70185, saving model to nn_model.w8
Epoch 33/100
Epoch 00033: val_loss did not improve from 0.70185
Epoch 34/100
Epoch 00034: val_loss did not improve from 0.70185
Epoch 35/100
Epoch 00035: val_loss did not improve from 0.70185
Epoch 36/100
Epoch 00036: val_loss did not improve from 0.70185
Epoch

Epoch 46/100
Epoch 00046: val_loss did not improve from 0.70064
Epoch 47/100
Epoch 00047: val_loss improved from 0.70064 to 0.70018, saving model to nn_model.w8
Epoch 48/100
Epoch 00048: val_loss did not improve from 0.70018
Epoch 49/100
Epoch 00049: val_loss did not improve from 0.70018
Epoch 50/100
Epoch 00050: val_loss did not improve from 0.70018
Epoch 51/100
Epoch 00051: val_loss did not improve from 0.70018
Epoch 52/100
Epoch 00052: val_loss did not improve from 0.70018
Epoch 53/100
Epoch 00053: val_loss did not improve from 0.70018
Epoch 54/100
Epoch 00054: val_loss did not improve from 0.70018
Epoch 55/100
Epoch 00055: val_loss did not improve from 0.70018
Epoch 56/100
Epoch 00056: val_loss did not improve from 0.70018
Epoch 57/100
Epoch 00057: val_loss did not improve from 0.70018
Epoch 58/100
Epoch 00058: val_loss improved from 0.70018 to 0.70005, saving model to nn_model.w8
Epoch 59/100
Epoch 00059: val_loss did not improve from 0.70005
Epoch 60/100
Epoch 00060: val_loss did

Epoch 69/100
Epoch 00069: val_loss did not improve from 0.69951
Epoch 70/100
Epoch 00070: val_loss did not improve from 0.69951
Epoch 71/100
Epoch 00071: val_loss did not improve from 0.69951
Epoch 72/100
Epoch 00072: val_loss did not improve from 0.69951
Epoch 73/100
Epoch 00073: val_loss did not improve from 0.69951
Epoch 74/100
Epoch 00074: val_loss did not improve from 0.69951
Epoch 75/100
Epoch 00075: val_loss did not improve from 0.69951
Epoch 76/100
Epoch 00076: val_loss did not improve from 0.69951
Epoch 77/100
Epoch 00077: val_loss did not improve from 0.69951
Epoch 78/100
Epoch 00078: val_loss did not improve from 0.69951
Epoch 79/100
Epoch 00079: val_loss did not improve from 0.69951
Epoch 80/100
Epoch 00080: val_loss did not improve from 0.69951
Epoch 81/100
Epoch 00081: val_loss did not improve from 0.69951
Epoch 82/100
Epoch 00082: val_loss did not improve from 0.69951
Epoch 83/100
Epoch 00083: val_loss did not improve from 0.69951
Epoch 84/100
Epoch 00084: val_loss did n

Train on 28304 samples, validate on 7076 samples
Epoch 1/100
Epoch 00001: val_loss improved from inf to 0.89337, saving model to nn_model.w8
Epoch 2/100
Epoch 00002: val_loss improved from 0.89337 to 0.76631, saving model to nn_model.w8
Epoch 3/100
Epoch 00003: val_loss improved from 0.76631 to 0.75640, saving model to nn_model.w8
Epoch 4/100
Epoch 00004: val_loss improved from 0.75640 to 0.75164, saving model to nn_model.w8
Epoch 5/100
Epoch 00005: val_loss improved from 0.75164 to 0.74954, saving model to nn_model.w8
Epoch 6/100
Epoch 00006: val_loss improved from 0.74954 to 0.74778, saving model to nn_model.w8
Epoch 7/100
Epoch 00007: val_loss improved from 0.74778 to 0.74464, saving model to nn_model.w8
Epoch 8/100
Epoch 00008: val_loss did not improve from 0.74464
Epoch 9/100
Epoch 00009: val_loss improved from 0.74464 to 0.74382, saving model to nn_model.w8
Epoch 10/100
Epoch 00010: val_loss did not improve from 0.74382
Epoch 11/100
Epoch 00011: val_loss improved from 0.74382 to 

Epoch 23/100
Epoch 00023: val_loss did not improve from 0.73600
Epoch 24/100
Epoch 00024: val_loss did not improve from 0.73600
Epoch 25/100
Epoch 00025: val_loss did not improve from 0.73600
Epoch 26/100
Epoch 00026: val_loss did not improve from 0.73600
Epoch 27/100
Epoch 00027: val_loss did not improve from 0.73600
Epoch 28/100
Epoch 00028: val_loss improved from 0.73600 to 0.73525, saving model to nn_model.w8
Epoch 29/100
Epoch 00029: val_loss did not improve from 0.73525
Epoch 30/100
Epoch 00030: val_loss improved from 0.73525 to 0.73518, saving model to nn_model.w8
Epoch 31/100
Epoch 00031: val_loss did not improve from 0.73518
Epoch 32/100
Epoch 00032: val_loss did not improve from 0.73518
Epoch 33/100
Epoch 00033: val_loss did not improve from 0.73518
Epoch 34/100
Epoch 00034: val_loss improved from 0.73518 to 0.73516, saving model to nn_model.w8
Epoch 35/100
Epoch 00035: val_loss improved from 0.73516 to 0.73311, saving model to nn_model.w8
Epoch 36/100
Epoch 00036: val_loss d

Epoch 46/100
Epoch 00046: val_loss did not improve from 0.73311
Epoch 47/100
Epoch 00047: val_loss improved from 0.73311 to 0.73217, saving model to nn_model.w8
Epoch 48/100
Epoch 00048: val_loss did not improve from 0.73217
Epoch 49/100
Epoch 00049: val_loss did not improve from 0.73217
Epoch 50/100
Epoch 00050: val_loss did not improve from 0.73217
Epoch 51/100
Epoch 00051: val_loss did not improve from 0.73217
Epoch 52/100
Epoch 00052: val_loss did not improve from 0.73217
Epoch 53/100
Epoch 00053: val_loss did not improve from 0.73217
Epoch 54/100
Epoch 00054: val_loss did not improve from 0.73217
Epoch 55/100
Epoch 00055: val_loss did not improve from 0.73217
Epoch 56/100
Epoch 00056: val_loss did not improve from 0.73217
Epoch 57/100
Epoch 00057: val_loss did not improve from 0.73217
Epoch 58/100
Epoch 00058: val_loss did not improve from 0.73217
Epoch 59/100
Epoch 00059: val_loss improved from 0.73217 to 0.73073, saving model to nn_model.w8
Epoch 60/100
Epoch 00060: val_loss did

In [64]:
print('Accuracy on training data: ', nn_model.score)
# Accuracy on training data:   0.6088463108528276
# Accuracy on training data:  0.7190729225551159 With SMOTE
# Accuracy on training data:  0.6884159873437191 - with SMOTE Classifier all

Accuracy on training data:  0.6861723009814613


In [65]:
from sklearn.metrics import classification_report
print(classification_report(reduce_train['accuracy_group'], np.around(nn_model.oof_pred)))

              precision    recall  f1-score   support

           0       0.79      0.77      0.78      8845
           1       0.81      0.75      0.78      8845
           2       0.83      0.74      0.78      8845
           3       0.68      0.83      0.75      8845

    accuracy                           0.77     35380
   macro avg       0.78      0.77      0.77     35380
weighted avg       0.78      0.77      0.77     35380



In [66]:
submission = pd.DataFrame()
submission['installation_id'] = ajusted_test['installation_id']
submission['accuracy_group'] =  np.around(nn_model.y_pred)
submission.to_csv('submission.csv', index=False)
submission['accuracy_group'].value_counts(normalize=True)

3.0    0.592
0.0    0.225
1.0    0.107
2.0    0.076
Name: accuracy_group, dtype: float64

In [1]:
!git status

On branch cjs
Your branch is up to date with 'new_origin/cjs'.

Changes not staged for commit:
  (use "git add <file>..." to update what will be committed)
  (use "git checkout -- <file>..." to discard changes in working directory)

	modified:   Big Three.ipynb

Untracked files:
  (use "git add <file>..." to include in what will be committed)

	.idea/

no changes added to commit (use "git add" and/or "git commit -a")
