In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import xgboost as xgb
from xgboost import XGBClassifier, XGBRegressor
from xgboost import plot_importance
from catboost import CatBoostRegressor
from matplotlib import pyplot
import shap
import random
from pandas.api.types import CategoricalDtype
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.base import BaseEstimator
from sklearn.base import ClassifierMixin
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GroupKFold
from pprint import pprint
from bayes_opt import BayesianOptimization
import warnings
warnings.filterwarnings("ignore")

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.
from time import time
from tqdm import tqdm_notebook as tqdm
from collections import Counter
from scipy import stats
import lightgbm as lgb
from sklearn.metrics import cohen_kappa_score, mean_squared_error
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import accuracy_score
import tensorflow as tf
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder

import gc
import json
pd.set_option('display.max_columns', 1000)

/kaggle/input/data-science-bowl-2019/train.csv
/kaggle/input/data-science-bowl-2019/specs.csv
/kaggle/input/data-science-bowl-2019/sample_submission.csv
/kaggle/input/data-science-bowl-2019/train_labels.csv
/kaggle/input/data-science-bowl-2019/test.csv


In [2]:
def eval_qwk_lgb_regr(y_true, y_pred):
    """
    Fast cappa eval function for lgb.
    """
    dist = Counter(reduce_train['accuracy_group'])
    for k in dist:
        dist[k] /= len(reduce_train)
    reduce_train['accuracy_group'].hist()
    
    acum = 0
    bound = {}
    for i in range(3):
        acum += dist[i]
        bound[i] = np.percentile(y_pred, acum * 100)

    def classify(x):
        if x <= bound[0]:
            return 0
        elif x <= bound[1]:
            return 1
        elif x <= bound[2]:
            return 2
        else:
            return 3

    y_pred = np.array(list(map(classify, y_pred))).reshape(y_true.shape)

    return 'cappa', cohen_kappa_score(y_true, y_pred, weights='quadratic'), True

In [3]:
def qwk3(a1, a2, max_rat=3):
    assert(len(a1) == len(a2))
    a1 = np.asarray(a1, dtype=int)
    a2 = np.asarray(a2, dtype=int)

    hist1 = np.zeros((max_rat + 1, ))
    hist2 = np.zeros((max_rat + 1, ))

    o = 0
    for k in range(a1.shape[0]):
        i, j = a1[k], a2[k]
        hist1[i] += 1
        hist2[j] += 1
        o +=  (i - j) * (i - j)

    e = 0
    for i in range(max_rat + 1):
        for j in range(max_rat + 1):
            e += hist1[i] * hist2[j] * (i - j) * (i - j)

    e = e / a1.shape[0]

    return 1 - o / e

In [4]:
def read_data():
    print('Reading train.csv file....')
    train = pd.read_csv('/kaggle/input/data-science-bowl-2019/train.csv')
    print('Training.csv file have {} rows and {} columns'.format(train.shape[0], train.shape[1]))

    print('Reading test.csv file....')
    test = pd.read_csv('/kaggle/input/data-science-bowl-2019/test.csv')
    print('Test.csv file have {} rows and {} columns'.format(test.shape[0], test.shape[1]))

    print('Reading train_labels.csv file....')
    train_labels = pd.read_csv('/kaggle/input/data-science-bowl-2019/train_labels.csv')
    print('Train_labels.csv file have {} rows and {} columns'.format(train_labels.shape[0], train_labels.shape[1]))

    return train, test, train_labels

In [5]:
def encode_title(train, test, train_labels):
    # encode title
    train['title_event_code'] = list(map(lambda x, y: str(x) + '_' + str(y), train['title'], train['event_code']))
    test['title_event_code'] = list(map(lambda x, y: str(x) + '_' + str(y), test['title'], test['event_code']))
    all_title_event_code = list(set(train["title_event_code"].unique()).union(test["title_event_code"].unique()))
    # make a list with all the unique 'titles' from the train and test set
    list_of_user_activities = list(set(train['title'].unique()).union(set(test['title'].unique())))
    # make a list with all the unique 'event_code' from the train and test set
    list_of_event_code = list(set(train['event_code'].unique()).union(set(test['event_code'].unique())))
    list_of_event_id = list(set(train['event_id'].unique()).union(set(test['event_id'].unique())))
    # make a list with all the unique worlds from the train and test set
    list_of_worlds = list(set(train['world'].unique()).union(set(test['world'].unique())))
    # create a dictionary numerating the titles
    activities_map = dict(zip(list_of_user_activities, np.arange(len(list_of_user_activities))))
    activities_labels = dict(zip(np.arange(len(list_of_user_activities)), list_of_user_activities))
    activities_world = dict(zip(list_of_worlds, np.arange(len(list_of_worlds))))
    assess_titles = list(set(train[train['type'] == 'Assessment']['title'].value_counts().index).union(set(test[test['type'] == 'Assessment']['title'].value_counts().index)))
    # replace the text titles with the number titles from the dict
    train['title'] = train['title'].map(activities_map)
    test['title'] = test['title'].map(activities_map)
    train['world'] = train['world'].map(activities_world)
    test['world'] = test['world'].map(activities_world)
    train_labels['title'] = train_labels['title'].map(activities_map)
    win_code = dict(zip(activities_map.values(), (4100*np.ones(len(activities_map))).astype('int')))
    # then, it set one element, the 'Bird Measurer (Assessment)' as 4110, 10 more than the rest
    win_code[activities_map['Bird Measurer (Assessment)']] = 4110
    # convert text into datetime
    train['timestamp'] = pd.to_datetime(train['timestamp'])
    test['timestamp'] = pd.to_datetime(test['timestamp'])
    
    
    return train, test, train_labels, win_code, list_of_user_activities, list_of_event_code, activities_labels, assess_titles, list_of_event_id, all_title_event_code

In [6]:
def get_train_and_test(train, test):
    
    compiled_train = []
    compiled_test = []
    assessment_sessions_by_instid = {}
    
    # Loop through each train installation id
    for ins_id, user_sample in tqdm(train.groupby('installation_id', sort = False), 
                                   total = train['installation_id'].nunique()):
        compiled_train += get_data(user_sample, test_set = False)  
        
    reduce_train = pd.DataFrame(compiled_train)
    del compiled_train
    
    # Loop through each test installation id
    for ins_id, user_sample in tqdm(test.groupby('installation_id', sort = False), 
                                   total = test['installation_id'].nunique()):
        test_data = get_data(user_sample, test_set = True)
        compiled_test.append(test_data)   
        
    reduce_test = pd.DataFrame(compiled_test)
    del compiled_test
    
    categoricals = ['session_title']
    
    return reduce_train, reduce_test, categoricals

In [7]:
def get_data(user_sample, test_set=False):
    '''
    The user_sample is a DataFrame from train or test where the only one 
    installation_id is filtered
    And the test_set parameter is related with the labels processing, that is only requered
    if test_set=False
    '''
    # Constants and parameters declaration
    last_activity = 0
    
    user_activities_count = {'Clip':0, 'Activity': 0, 'Assessment': 0, 'Game':0}
    
    # new features: time spent in each activity
    last_session_time_sec = 0
    accuracy_groups = {0:0, 1:0, 2:0, 3:0}
    all_assessments = []
    accumulated_accuracy_group = 0
    accumulated_accuracy = 0
    accumulated_correct_attempts = 0 
    accumulated_uncorrect_attempts = 0
    accumulated_actions = 0
    counter = 0
    time_first_activity = float(user_sample['timestamp'].values[0])
    durations = []
    durations_game = []
    durations_activity = []
    last_accuracy_title = {'acc_' + title: -1 for title in assess_titles}
    last_game_time_title = {'lgt_' + title: 0 for title in assess_titles}
    ac_game_time_title = {'agt_' + title: 0 for title in assess_titles}
    ac_true_attempts_title = {'ata_' + title: 0 for title in assess_titles}
    ac_false_attempts_title = {'afa_' + title: 0 for title in assess_titles}
    event_code_count: Dict[str, int] = {ev: 0 for ev in list_of_event_code}
    event_id_count: Dict[str, int] = {eve: 0 for eve in list_of_event_id}
    title_count: Dict[str, int] = {eve: 0 for eve in activities_labels.values()} 
    title_event_code_count: Dict[str, int] = {t_eve: 0 for t_eve in all_title_event_code}
    session_count = 0
    
    # itarates through each session of one instalation_id
    for i, session in user_sample.groupby('game_session', sort=False):
        # i = game_session_id
        # session is a DataFrame that contain only one game_session
        
        # get some sessions information
        session_type = session['type'].iloc[0]
        session_title = session['title'].iloc[0]
        session_title_text = activities_labels[session_title]
        game_session = session['game_session'].iloc[0]          
            
        # for each assessment, and only this kind off session, the features below are processed
        # and a register are generated
        if (session_type == 'Assessment') & (test_set or len(session)>1):
            # search for event_code 4100, that represents the assessments trial
            all_attempts = session.query(f'event_code == {win_code[session_title]}')
            # then, check the numbers of wins and the number of losses
            true_attempts = all_attempts['event_data'].str.contains('true').sum()
            false_attempts = all_attempts['event_data'].str.contains('false').sum()
            # copy a dict to use as feature template, it's initialized with some itens: 
            # {'Clip':0, 'Activity': 0, 'Assessment': 0, 'Game':0}
            features = user_activities_count.copy()
            features.update(last_accuracy_title.copy())
            features.update(event_code_count.copy())
            features.update(title_count.copy())
            features.update(event_id_count.copy())
            features.update(title_event_code_count.copy())
            features.update(last_game_time_title.copy())
            features.update(ac_game_time_title.copy())
            features.update(ac_true_attempts_title.copy())
            features.update(ac_false_attempts_title.copy())
            features['installation_session_count'] = session_count
            
            variety_features = [('var_event_code', event_code_count), 
                                ('var_event_id', event_id_count), 
                                ('var_title', title_count), 
                                ('var_title_event_code', title_event_code_count)]
            
            for name, dict_counts in variety_features:
                arr = np.array(list(dict_counts.values()))
                features[name] = np.count_nonzero(arr)
                
            # get installation_id for aggregated features
            features['installation_id'] = session['installation_id'].iloc[-1]
            features['game_session'] = game_session
            # add title as feature, remembering that title represents the name of the game
            features['session_title'] = session['title'].iloc[0]
            # the 4 lines below add the feature of the history of the trials of this player
            # this is based on the all time attempts so far, at the moment of this assessment
            features['accumulated_correct_attempts'] = accumulated_correct_attempts
            features['accumulated_uncorrect_attempts'] = accumulated_uncorrect_attempts
            accumulated_correct_attempts += true_attempts 
            accumulated_uncorrect_attempts += false_attempts
            
            # ----------------------------------------------
            ac_true_attempts_title['ata_' + session_title_text] += true_attempts
            ac_false_attempts_title['afa_' + session_title_text] += false_attempts
            
            
            last_game_time_title['lgt_' + session_title_text] = session['game_time'].iloc[-1]
            ac_game_time_title['agt_' + session_title_text] += session['game_time'].iloc[-1]
            # ----------------------------------------------
            
            # the time spent in the app so far
            if durations == []:
                features['duration_mean'] = 0
                features['duration_std'] = 0
                features['last_duration'] = 0
                features['duration_max'] = 0
            else:
                features['duration_mean'] = np.mean(durations)
                features['duration_std'] = np.std(durations)
                features['last_duration'] = durations[-1]
                features['duration_max'] = np.max(durations)
            durations.append((session.iloc[-1, 2] - session.iloc[0, 2] ).seconds)
            
            if durations_game == []:
                features['duration_game_mean'] = 0
                features['duration_game_std'] = 0
                features['game_last_duration'] = 0
                features['game_max_duration'] = 0
            else:
                features['duration_game_mean'] = np.mean(durations_game)
                features['duration_game_std'] = np.std(durations_game)
                features['game_last_duration'] = durations_game[-1]
                features['game_max_duration'] = np.max(durations_game)
                
            if durations_activity == []:
                features['duration_activity_mean'] = 0
                features['duration_activity_std'] = 0
                features['game_activity_duration'] = 0
                features['game_activity_max'] = 0
            else:
                features['duration_activity_mean'] = np.mean(durations_activity)
                features['duration_activity_std'] = np.std(durations_activity)
                features['game_activity_duration'] = durations_activity[-1]
                features['game_activity_max'] = np.max(durations_activity)
            
            # the accuracy is the all time wins divided by the all time attempts
            features['accumulated_accuracy'] = accumulated_accuracy/counter if counter > 0 else 0
            accuracy = true_attempts/(true_attempts+false_attempts) if (true_attempts+false_attempts) != 0 else 0
            accumulated_accuracy += accuracy
            last_accuracy_title['acc_' + session_title_text] = accuracy
            # a feature of the current accuracy categorized
            # it is a counter of how many times this player was in each accuracy group
            if accuracy == 0:
                features['accuracy_group'] = 0
            elif accuracy == 1:
                features['accuracy_group'] = 3
            elif accuracy == 0.5:
                features['accuracy_group'] = 2
            else:
                features['accuracy_group'] = 1
            features.update(accuracy_groups)
            accuracy_groups[features['accuracy_group']] += 1
            # mean of the all accuracy groups of this player
            features['accumulated_accuracy_group'] = accumulated_accuracy_group/counter if counter > 0 else 0
            accumulated_accuracy_group += features['accuracy_group']
            # how many actions the player has done so far, it is initialized as 0 and updated some lines below
            features['accumulated_actions'] = accumulated_actions
            
            # there are some conditions to allow this features to be inserted in the datasets
            # if it's a test set, all sessions belong to the final dataset
            # it it's a train, needs to be passed throught this clausule: session.query(f'event_code == {win_code[session_title]}')
            # that means, must exist an event_code 4100 or 4110
            if test_set:
                all_assessments.append(features)
            elif true_attempts+false_attempts > 0:
                all_assessments.append(features)
                
            counter += 1
            
        if session_type == 'Game':
            durations_game.append((session.iloc[-1, 2] - session.iloc[0, 2] ).seconds)
            
        if session_type == 'Activity':
            durations_activity.append((session.iloc[-1, 2] - session.iloc[0, 2] ).seconds)
                
        
        session_count += 1
        # this piece counts how many actions was made in each event_code so far
        def update_counters(counter: dict, col: str):
                num_of_session_count = Counter(session[col])
                for k in num_of_session_count.keys():
                    x = k
                    if col == 'title':
                        x = activities_labels[k]
                    counter[x] += num_of_session_count[k]
                return counter
            
        event_code_count = update_counters(event_code_count, "event_code")
        event_id_count = update_counters(event_id_count, "event_id")
        title_count = update_counters(title_count, 'title')
        title_event_code_count = update_counters(title_event_code_count, 'title_event_code')

        # counts how many actions the player has done so far, used in the feature of the same name
        accumulated_actions += len(session)
        if last_activity != session_type:
            user_activities_count[session_type] += 1
            last_activitiy = session_type 
                        
    # if it't the test_set, only the last assessment must be predicted, the previous are scraped
    if test_set:
        return all_assessments[-1]
    # in the train_set, all assessments goes to the dataset
    return all_assessments

In [8]:
def remove_dead_weight(df, train_labels, test_set=False):
    #df = df[df['world'] != 'NONE']

    # filtering by ids that took assessments
    ids_w_assessments = df[df['type'] == 'Assessment']['installation_id'].drop_duplicates()
    df = df[df['installation_id'].isin(ids_w_assessments)]
    
    #If training set then make sure the installation ids are in the labels and remove assements not in the labels
    if test_set == False:
        # drop data whose installation does not contain any scored assessments in train_labels
        df = df[df['installation_id'].isin(train_labels['installation_id'].unique())]

        assessments = df[df.type == 'Assessment']
        assessments = assessments[~assessments.game_session.isin(train_labels.game_session)]
        df = df[~df.game_session.isin(assessments.game_session)]
        df.reset_index(drop=True, inplace=True)
        
    return df

In [9]:
def stract_hists(feature, train, test, adjust=False, plot=False):
    n_bins = 10
    train_data = train[feature]
    test_data = test[feature]
    if adjust:
        test_data *= train_data.mean() / test_data.mean()
    perc_90 = np.percentile(train_data, 95)
    train_data = np.clip(train_data, 0, perc_90)
    test_data = np.clip(test_data, 0, perc_90)
    train_hist = np.histogram(train_data, bins=n_bins)[0] / len(train_data)
    test_hist = np.histogram(test_data, bins=n_bins)[0] / len(test_data)
    msre = mean_squared_error(train_hist, test_hist)
    if plot:
        print(msre)
        plt.bar(range(n_bins), train_hist, color='blue', alpha=0.5)
        plt.bar(range(n_bins), test_hist, color='red', alpha=0.5)
        plt.show()
    return msre

In [10]:
# get prediction
def get_class_pred(pred, train_t):
    """
    Fast cappa eval function for lgb.
    """
    dist = Counter(train_t['accuracy_group'])
    for k in dist:
        dist[k] /= len(train_t)
    
    acum = 0
    bound = {}
    for i in range(3):
        acum += dist[i]
        bound[i] = np.percentile(pred, acum * 100)

    def classify(x):
        if x <= bound[0]:
            return 0
        elif x <= bound[1]:
            return 1
        elif x <= bound[2]:
            return 2
        else:
            return 3

    y_pred = np.array(list(map(classify, pred)))
    
    return y_pred


In [11]:
def preprocess(reduce_train, reduce_test):
    for df in [reduce_train, reduce_test]:
        df['installation_session_count'] = df.groupby(['installation_id'])['Clip'].transform('count')
        df['installation_duration_mean'] = df.groupby(['installation_id'])['duration_mean'].transform('mean')
        #df['installation_duration_std'] = df.groupby(['installation_id'])['duration_mean'].transform('std')
        df['installation_title_nunique'] = df.groupby(['installation_id'])['session_title'].transform('nunique')
        
        df['sum_event_code_count'] = df[[2050, 4100, 4230, 5000, 4235, 2060, 4110, 5010, 2070, 2075, 2080, 2081, 2083, 3110, 4010, 3120, 3121, 4020, 4021, 
                                        4022, 4025, 4030, 4031, 3010, 4035, 4040, 3020, 3021, 4045, 2000, 4050, 2010, 2020, 4070, 2025, 2030, 4080, 2035, 
                                        2040, 4090, 4220, 4095]].sum(axis = 1)
        
        df['installation_event_code_count_mean'] = df.groupby(['installation_id'])['sum_event_code_count'].transform('mean')
        #df['installation_event_code_count_std'] = df.groupby(['installation_id'])['sum_event_code_count'].transform('std')
        
    features = reduce_train.loc[(reduce_train.sum(axis=1) != 0), (reduce_train.sum(axis=0) != 0)].columns # delete useless columns
    features = [x for x in features if x not in cols_to_drop]
    
    return reduce_train, reduce_test, features

In [12]:
class Base_Model(object):
    
    def __init__(self, train_df, test_df, features, params, categoricals=[], n_splits=5, verbose=True):
        self.train_df = train_df
        self.test_df = test_df
        self.features = features
        self.n_splits = n_splits
        self.categoricals = categoricals
        self.target = 'accuracy_group'
        self.cv = self.get_cv()
        self.verbose = verbose
        self.params = params
        self.y_pred, self.score, self.model, self.oof_pred = self.fit()
        
    def train_model(self, train_set, val_set):
        raise NotImplementedError
        
    def get_cv(self):
        cv = StratifiedKFold(n_splits=self.n_splits, shuffle=True, random_state=42)
        return cv.split(self.train_df, self.train_df[self.target])
    
    def get_params(self):
        raise NotImplementedError
        
    def convert_dataset(self, x_train, y_train, x_val, y_val):
        raise NotImplementedError
        
    def convert_x(self, x):
        return x
        
    def fit(self):
        oof_pred = np.zeros((len(reduce_train), ))
        y_pred = np.zeros((len(reduce_test), ))
        for fold, (train_idx, val_idx) in enumerate(self.cv):
            try:
                x_train, x_val = self.train_df[self.features].iloc[train_idx], self.train_df[self.features].iloc[val_idx]
                y_train, y_val = self.train_df[self.target][train_idx], self.train_df[self.target][val_idx]
                train_set, val_set = self.convert_dataset(x_train, y_train, x_val, y_val)
                model = self.train_model(train_set, val_set)
                conv_x_val = self.convert_x(x_val)
                tmp_pred = model.predict(conv_x_val)
                oof_pred[val_idx] = tmp_pred.reshape(oof_pred[val_idx].shape)
                x_test = self.convert_x(self.test_df[self.features])
                y_pred += model.predict(x_test).reshape(y_pred.shape) / self.n_splits
                print('Partial score of fold {} is: {}'.format(fold, qwk3(y_val, tmp_pred)))
                loss_score = qwk3(self.train_df[self.target], oof_pred)
            except Exception as e:
                print(e)
                print('Error training: val_idx = ', val_idx)
        if self.verbose:
            print('Our oof cohen kappa score is: ', loss_score)
        del self.train_df, self.test_df, self.cv
        gc.collect()
        return y_pred, loss_score, model, oof_pred

In [13]:
class Xgb_Model(Base_Model):
          
    def train_model(self, train_set, val_set):
        verbosity = 100 if self.verbose else 0
        return xgb.train(self.params, train_set, 
                         num_boost_round=667, evals=[(train_set, 'train'), (val_set, 'val')], 
                         verbose_eval=verbosity, early_stopping_rounds=100)
        
    def convert_dataset(self, x_train, y_train, x_val, y_val):
        train_set = xgb.DMatrix(x_train, y_train)
        val_set = xgb.DMatrix(x_val, y_val)
        return train_set, val_set
    
    def convert_x(self, x):
        return xgb.DMatrix(x)

In [14]:
class Catb_Model(Base_Model):
    
    def train_model(self, train_set, val_set):
        verbosity = 100 if self.verbose else 0
        clf = CatBoostRegressor(**self.params)
        clf.fit(train_set['X'], 
                train_set['y'], 
                eval_set=(val_set['X'], val_set['y']),
                verbose=verbosity,)
        return clf
        
    def convert_dataset(self, x_train, y_train, x_val, y_val):
        train_set = {'X': x_train, 'y': y_train}
        val_set = {'X': x_val, 'y': y_val}
        return train_set, val_set

In [15]:
class RF_Model(Base_Model):
        
    def train_model(self, train_set, val_set):
        verbosity = 100 if self.verbose else 0
        rf = RandomForestClassifier(**self.params)
        rf.fit(train_set['X'], train_set['y'])
        return rf
        
    def convert_dataset(self, x_train, y_train, x_val, y_val):
        train_set = {'X': x_train, 'y': y_train}
        val_set = {'X': x_val, 'y': y_val}
        return train_set, val_set

In [16]:
class KNN_Model(Base_Model):
        
    def train_model(self, train_set, val_set):
        verbosity = 100 if self.verbose else 0
        knn = KNeighborsClassifier(**self.params)
        knn.fit(train_set['X'], train_set['y'])
        return knn
        
    def convert_dataset(self, x_train, y_train, x_val, y_val):
        train_set = {'X': x_train, 'y': y_train}
        val_set = {'X': x_val, 'y': y_val}
        return train_set, val_set

In [17]:
class Lgb_Model(Base_Model):
    
    def train_model(self, train_set, val_set):
        verbosity = 100 if self.verbose else 0
        return lgb.train(self.params, train_set, valid_sets=[train_set, val_set], verbose_eval=verbosity)
        
    def convert_dataset(self, x_train, y_train, x_val, y_val):
        train_set = lgb.Dataset(x_train, y_train)
        val_set = lgb.Dataset(x_val, y_val)
        return train_set, val_set

In [18]:
from random import choice

class Cnn_Model(Base_Model):
    
    def __init__(self, train_df, test_df, features, categoricals=[], n_splits=5, verbose=True):
        self.create_feat_2d(features)
        super().__init__(train_df, test_df, features, {}, categoricals, n_splits, verbose)
        
    def create_feat_2d(self, features, n_feats_repeat=50):
        self.n_feats = len(features)
        self.n_feats_repeat = n_feats_repeat
        self.mask = np.zeros((self.n_feats_repeat, self.n_feats), dtype=np.int32)
        for i in range(self.n_feats_repeat):
            l = list(range(self.n_feats))
            for j in range(self.n_feats):
                c = l.pop(choice(range(len(l))))
                self.mask[i, j] = c
        self.mask = tf.convert_to_tensor(self.mask)
        print(self.mask.shape)
       
        
    
    def train_model(self, train_set, val_set):
        verbosity = 100 if self.verbose else 0

        inp = tf.keras.layers.Input(shape=(self.n_feats))
        x = tf.keras.layers.Lambda(lambda x: tf.gather(x, self.mask, axis=1))(inp)
        x = tf.keras.layers.Reshape((self.n_feats_repeat, self.n_feats, 1))(x)
        x = tf.keras.layers.Conv2D(18, (50, 50), strides=50, activation='relu')(x)
        x = tf.keras.layers.Flatten()(x)
        #x = tf.keras.layers.Dense(200, activation='relu')(x)
        #x = tf.keras.layers.LayerNormalization()(x)
        #x = tf.keras.layers.Dropout(0.3)(x)
        x = tf.keras.layers.Dense(100, activation='relu')(x)
        x = tf.keras.layers.LayerNormalization()(x)
        x = tf.keras.layers.Dropout(0.3)(x)
        x = tf.keras.layers.Dense(50, activation='relu')(x)
        x = tf.keras.layers.LayerNormalization()(x)
        x = tf.keras.layers.Dropout(0.3)(x)
        out = tf.keras.layers.Dense(1)(x)
        
        model = tf.keras.Model(inp, out)
    
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4), loss='mse')
        print(model.summary())
        save_best = tf.keras.callbacks.ModelCheckpoint('nn_model.w8', save_weights_only=True, save_best_only=True, verbose=1)
        early_stop = tf.keras.callbacks.EarlyStopping(patience=20)
        model.fit(train_set['X'], 
                train_set['y'], 
                validation_data=(val_set['X'], val_set['y']),
                epochs=100,
                 callbacks=[save_best, early_stop])
        model.load_weights('nn_model.w8')
        return model
        
    def convert_dataset(self, x_train, y_train, x_val, y_val):
        train_set = {'X': x_train, 'y': y_train}
        val_set = {'X': x_val, 'y': y_val}
        return train_set, val_set
        
    def get_params(self):
        return None

In [19]:
class Nn_Model(Base_Model):

    def train_model(self, train_set, val_set):
        verbosity = 100 if self.verbose else 0
        model = tf.keras.models.Sequential([
            tf.keras.layers.Input(shape=(train_set['X'].shape[1],)),
            tf.keras.layers.Dense(200, activation='relu'),
            tf.keras.layers.LayerNormalization(),
            tf.keras.layers.Dropout(0.3),
            tf.keras.layers.Dense(100, activation='relu'),
            tf.keras.layers.LayerNormalization(),
            tf.keras.layers.Dropout(0.3),
            tf.keras.layers.Dense(50, activation='relu'),
            tf.keras.layers.LayerNormalization(),
            tf.keras.layers.Dropout(0.3),
            tf.keras.layers.Dense(25, activation='relu'),
            tf.keras.layers.LayerNormalization(),
            tf.keras.layers.Dropout(0.3),
            tf.keras.layers.Dense(1, activation='relu')
        ])
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=4e-4), loss='mse')
        print(model.summary())
        save_best = tf.keras.callbacks.ModelCheckpoint('nn_model.w8', save_weights_only=True, save_best_only=True, verbose=1)
        early_stop = tf.keras.callbacks.EarlyStopping(patience=20)
        model.fit(train_set['X'], 
                train_set['y'], 
                validation_data=(val_set['X'], val_set['y']),
                epochs=100,
                 callbacks=[save_best, early_stop])
        model.load_weights('nn_model.w8')
        return model
        
    def convert_dataset(self, x_train, y_train, x_val, y_val):
        train_set = {'X': x_train, 'y': y_train}
        val_set = {'X': x_val, 'y': y_val}
        return train_set, val_set
        
    def get_params(self):
        return None

In [20]:
# read data
train, test, train_labels = read_data()

Reading train.csv file....
Training.csv file have 11341042 rows and 11 columns
Reading test.csv file....
Test.csv file have 1156414 rows and 11 columns
Reading train_labels.csv file....
Train_labels.csv file have 17690 rows and 7 columns


In [21]:
# remove unwanted data
train = remove_dead_weight(train, train_labels, test_set=False)
test = remove_dead_weight(test, train_labels, test_set=True)

In [22]:
# get usefull dict with maping encode
train, test, train_labels, win_code, list_of_user_activities, list_of_event_code, activities_labels, assess_titles, list_of_event_id, all_title_event_code = encode_title(train, test, train_labels)

In [23]:
# tranform function to get the train and test set
reduce_train, reduce_test, categoricals = get_train_and_test(train, test)

HBox(children=(IntProgress(value=0, max=3614), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




In [24]:
# Delete train and test to free up resources
del train, test
gc.collect()

2264

In [25]:
# Add the accuracy group vals
reduce_train = reduce_train.set_index('game_session')
train_labels = train_labels.set_index('game_session')
reduce_train.update(train_labels)

In [26]:
# Reset index
reduce_train = reduce_train.reset_index()
train_labels = train_labels.reset_index()

In [27]:
# Create the cols to drop for training
cols_to_drop = ['accuracy_group', 'game_session', 'installation_id']

In [28]:
reduce_train, reduce_test, features = preprocess(reduce_train, reduce_test)

In [29]:
# Create Catagoricals
def create_cats(train, test, categoricals):
    tmp_train = train.copy()
    tmp_test = test.copy()
    tmp_train['session_title'] = tmp_train['session_title'].astype(CategoricalDtype(categories=activities_labels))
    tmp_test['session_title'] = tmp_test['session_title'].astype(CategoricalDtype(categories=activities_labels))
    #tmp_train['world'] = tmp_train['world'].astype(CategoricalDtype(categories=[0, 1, 2]))
    #tmp_test['world'] = tmp_test['world'].astype(CategoricalDtype(categories=[0, 1, 2]))
    train_cats = pd.get_dummies(tmp_train[categoricals], prefix=categoricals)
    test_cats = pd.get_dummies(tmp_test[categoricals], prefix=categoricals)
    
    tmp_train = tmp_train.drop(categoricals, axis=1)
    tmp_test = tmp_test.drop(categoricals, axis=1)
    
    tmp_train = pd.concat([tmp_train, train_cats], axis=1, sort=False)
    tmp_test = pd.concat([tmp_test, test_cats], axis=1, sort=False)
    
    return tmp_train, tmp_test

In [30]:
# Create Cats
reduce_train, reduce_test = create_cats(reduce_train, reduce_test, categoricals)

In [31]:
# Min Max
def create_min_max(train, test, categoricals):
    tmp_train = train.copy()
    tmp_test = test.copy()
    scalars = [x for x in tmp_train.columns if x not in (categoricals + cols_to_drop)]
    tmp_train[scalars] = tmp_train[scalars].apply(lambda x: (x - x.min()) / (x.max() - x.min())).fillna(0)
    tmp_test[scalars] = tmp_test[scalars].apply(lambda x: (x - x.min()) / (x.max() - x.min())).fillna(0)
    
    return tmp_train, tmp_test

In [32]:
reduce_train, reduce_test = create_min_max(reduce_train, reduce_test, categoricals)
reduce_train.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in reduce_train.columns]
reduce_test.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in reduce_test.columns]

In [33]:
features = reduce_train.loc[(reduce_train.sum(axis=1) != 0), (reduce_train.sum(axis=0) != 0)].columns # delete useless columns
features = [x for x in features if x not in cols_to_drop]

In [34]:
# Go through and find high correlations. Add to remove list if so
counter = 0
to_remove = []
for feat_a in features:
    for feat_b in features:
        if feat_a != feat_b and feat_a not in to_remove and feat_b not in to_remove:
            c = np.corrcoef(reduce_train[feat_a], reduce_train[feat_b])[0][1]
            if c > 0.995:
                counter += 1
                to_remove.append(feat_b)
                print('{}: FEAT_A: {} FEAT_B: {} - Correlation: {}'.format(counter, feat_a, feat_b, c))

1: FEAT_A: Clip FEAT_B: 27253bdc - Correlation: 1.0
2: FEAT_A: 2050 FEAT_B: 2040 - Correlation: 0.9965259434878112
3: FEAT_A: 2050 FEAT_B: 2b9272f4 - Correlation: 0.9999839030068793
4: FEAT_A: 2050 FEAT_B: 73757a5e - Correlation: 0.999805014671399
5: FEAT_A: 2050 FEAT_B: 26fd2d99 - Correlation: 0.9965084543995784
6: FEAT_A: 2050 FEAT_B: 08fd73f3 - Correlation: 0.9966123918733626
7: FEAT_A: 2050 FEAT_B: dcaede90 - Correlation: 0.9965259434878112
8: FEAT_A: 2050 FEAT_B: 37c53127 - Correlation: 1.0
9: FEAT_A: 2050 FEAT_B: Scrub_A_Dub_2050 - Correlation: 1.0
10: FEAT_A: 2050 FEAT_B: Scrub_A_Dub_3021 - Correlation: 0.999805014671399
11: FEAT_A: 2050 FEAT_B: Scrub_A_Dub_2040 - Correlation: 0.9965259434878112
12: FEAT_A: 2050 FEAT_B: Scrub_A_Dub_2030 - Correlation: 0.9966123918733626
13: FEAT_A: 2050 FEAT_B: Scrub_A_Dub_3121 - Correlation: 0.9999839030068793
14: FEAT_A: 2050 FEAT_B: Scrub_A_Dub_2020 - Correlation: 0.9965084543995784
15: FEAT_A: 4230 FEAT_B: 4235 - Correlation: 0.9999995197498

In [35]:
# Go through again a look for features to remove
to_exclude = [] 
ajusted_test = reduce_test.copy()
for feature in ajusted_test.columns:
    if feature not in (cols_to_drop + categoricals):
        try:
            data = reduce_train[feature]
            train_mean = data.mean()
            data = ajusted_test[feature] 
            test_mean = data.mean()
            error = stract_hists(feature, train=reduce_train, test=reduce_test, adjust=True)
            ajust_factor = train_mean / test_mean
            if ajust_factor > 10 or ajust_factor < 0.1:# or error > 0.01:
                to_exclude.append(feature)
                print(feature, train_mean, test_mean, error)
            else:
                ajusted_test[feature] *= ajust_factor
        except:
            to_exclude.append(feature)

lgt_Cauldron_Filler__Assessment_ 0.0006131019707591197 0.014890678565531248 0.01280386624911603
lgt_Bird_Measurer__Assessment_ 0.001467792461199117 0.03618532176893661 0.006967017932031494
lgt_Mushroom_Sorter__Assessment_ 0.0002720048349456899 0.004590887618656621 0.01326614870499121
agt_Mushroom_Sorter__Assessment_ 0.0004883155304924486 0.007520193269563968 0.008830469115515913


In [36]:
# Create final feature list removing the unwanted ones
features = [x for x in features if x not in (to_exclude + to_remove)]

In [37]:
# Random Forrest Classifier

params = {'bootstrap': False, 
          'max_depth':59, 
          'max_features': 72, 
          'min_samples_leaf': 2, 
          'min_samples_split': 6, 
          'n_estimators': 100}

rf_model = RF_Model(reduce_train, ajusted_test, features, params, categoricals=categoricals)
# 0.5523468488732437

Partial score of fold 0 is: 0.5573580366562838
Partial score of fold 1 is: 0.5455838806449469
Partial score of fold 2 is: 0.5345198315160676
Partial score of fold 3 is: 0.5583900879769266
Partial score of fold 4 is: 0.5413472117459542
Our oof cohen kappa score is:  0.5474315123578697


In [38]:
rf_train_pred = rf_model.oof_pred
print('Accuracy on training data: ', qwk3(reduce_train['accuracy_group'], rf_train_pred))

Accuracy on training data:  0.5474315123578697


In [39]:
# K Nearest

weights = 0.3491139618762451
if weights >= 0 and weights < 1.0:
    weights = 'uniform'
else:
    weights = 'distance'

algorithm = 0.04441288498288465
if algorithm >= 0 and algorithm < 1.0:
    algorithm = 'ball_tree'
elif algorithm >= 1 and algorithm < 2.0:
    algorithm = 'kd_tree'
elif algorithm >= 2 and algorithm < 3.0:
    algorithm = 'brute'
else:
    algorithm = 'auto'

params = {
         'n_neighbors': int(19.544302888065488),
        'weights': weights,
        'algorithm': algorithm,
        'leaf_size': int(29.702070879545722),
        'p': int(2.986361352754792),
        'n_jobs': -1
}
knn_model = KNN_Model(reduce_train, ajusted_test, features, params, categoricals=categoricals)
# 0.4942484678835759

Partial score of fold 0 is: 0.47426155198022635
Partial score of fold 1 is: 0.49343867565590827
Partial score of fold 2 is: 0.46496425799374264
Partial score of fold 3 is: 0.4763552841041663
Partial score of fold 4 is: 0.48148629329824355
Our oof cohen kappa score is:  0.4781292818058771


In [40]:
knn_train_pred = knn_model.oof_pred
print('Accuracy on training data: ', qwk3(reduce_train['accuracy_group'], knn_train_pred))

Accuracy on training data:  0.4781292818058771


In [41]:
# XG Boost
params = {
            'colsample_bytree': 0.2,                 
            'learning_rate': 0.01,
            'objective':'reg:squarederror',
            'max_depth': 6,
            'subsample': 1,
            'min_child_weight': 3,
            'gamma': 0.25,
            'n_estimators': 1400
         }

xgb_model = Xgb_Model(reduce_train, ajusted_test, features, params, categoricals=categoricals)
# 0.6098126526596694

[0]	train-rmse:1.85917	val-rmse:1.85916
Multiple eval metrics have been passed: 'val-rmse' will be used for early stopping.

Will train until val-rmse hasn't improved in 100 rounds.
[100]	train-rmse:1.19929	val-rmse:1.22708
[200]	train-rmse:1.01839	val-rmse:1.07527
[300]	train-rmse:0.948968	val-rmse:1.03067
[400]	train-rmse:0.903107	val-rmse:1.00609
[500]	train-rmse:0.868435	val-rmse:0.991004
[600]	train-rmse:0.842172	val-rmse:0.982262
[666]	train-rmse:0.827314	val-rmse:0.978281
Partial score of fold 0 is: 0.46595104187923664
[0]	train-rmse:1.85914	val-rmse:1.85946
Multiple eval metrics have been passed: 'val-rmse' will be used for early stopping.

Will train until val-rmse hasn't improved in 100 rounds.
[100]	train-rmse:1.1988	val-rmse:1.2319
[200]	train-rmse:1.01703	val-rmse:1.07849
[300]	train-rmse:0.946626	val-rmse:1.03248
[400]	train-rmse:0.899773	val-rmse:1.0091
[500]	train-rmse:0.865111	val-rmse:0.995583
[600]	train-rmse:0.838626	val-rmse:0.988273
[666]	train-rmse:0.823719	val-r

In [42]:
xgb_train_pred = xgb_model.oof_pred
print('Accuracy on training data: ', qwk3(reduce_train['accuracy_group'], xgb_train_pred))

Accuracy on training data:  0.45593095935157313


In [43]:
# Catboost
params = {
            'loss_function': 'MultiRMSE',
            'task_type': "CPU",
            'iterations': 1860,
            'depth': 6,
            'early_stopping_rounds': 300,
            'l2_leaf_reg': 2,
            'rsm': 1,
            'bootstrap_type': 'Bayesian',
            'bagging_temperature': 1,
            'random_seed': 42,
            'learning_rate': 0.04
        }

cat_model = Catb_Model(reduce_train, ajusted_test, features, params, categoricals=categoricals)
# 0.6091561423979555

0:	learn: 2.1992176	test: 2.1986935	best: 2.1986935 (0)	total: 103ms	remaining: 3m 11s
100:	learn: 1.0075605	test: 1.0155215	best: 1.0155215 (100)	total: 4.42s	remaining: 1m 17s
200:	learn: 0.9735774	test: 0.9946583	best: 0.9946583 (200)	total: 8.8s	remaining: 1m 12s
300:	learn: 0.9482295	test: 0.9834390	best: 0.9834390 (300)	total: 13.2s	remaining: 1m 8s
400:	learn: 0.9258153	test: 0.9775811	best: 0.9775811 (400)	total: 17.5s	remaining: 1m 3s
500:	learn: 0.9077978	test: 0.9741636	best: 0.9741229 (499)	total: 21.9s	remaining: 59.5s
600:	learn: 0.8923743	test: 0.9724816	best: 0.9722439 (587)	total: 26.2s	remaining: 55s
700:	learn: 0.8776046	test: 0.9709678	best: 0.9708560 (680)	total: 30.6s	remaining: 50.6s
800:	learn: 0.8645137	test: 0.9705376	best: 0.9705376 (800)	total: 35s	remaining: 46.2s
900:	learn: 0.8518941	test: 0.9698601	best: 0.9696706 (878)	total: 39.4s	remaining: 41.9s
1000:	learn: 0.8391667	test: 0.9699349	best: 0.9695947 (910)	total: 43.8s	remaining: 37.6s
1100:	learn: 0.

In [44]:
cat_train_pred = cat_model.oof_pred
print('Accuracy on training data: ', qwk3(reduce_train['accuracy_group'], cat_train_pred))

Accuracy on training data:  0.47886933893948913


In [45]:
cnn_model = Cnn_Model(reduce_train, ajusted_test, features, categoricals=categoricals)

(50, 391)
Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 391)]             0         
_________________________________________________________________
lambda (Lambda)              (None, 50, 391)           0         
_________________________________________________________________
reshape (Reshape)            (None, 50, 391, 1)        0         
_________________________________________________________________
conv2d (Conv2D)              (None, 1, 7, 18)          45018     
_________________________________________________________________
flatten (Flatten)            (None, 126)               0         
_________________________________________________________________
dense (Dense)                (None, 100)               12700     
_________________________________________________________________
layer_normalization (LayerNo (None, 100)           

In [46]:
cnn_train_pred = cnn_model.oof_pred
print('Accuracy on training data: ', qwk3(reduce_train['accuracy_group'], cnn_train_pred))

Accuracy on training data:  0.4726632103896947


In [47]:
nn_model = Nn_Model(reduce_train, ajusted_test, features, {}, categoricals=categoricals)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_15 (Dense)             (None, 200)               78400     
_________________________________________________________________
layer_normalization_10 (Laye (None, 200)               400       
_________________________________________________________________
dropout_10 (Dropout)         (None, 200)               0         
_________________________________________________________________
dense_16 (Dense)             (None, 100)               20100     
_________________________________________________________________
layer_normalization_11 (Laye (None, 100)               200       
_________________________________________________________________
dropout_11 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_17 (Dense)             (None, 50)                5

In [48]:
nn_train_pred = nn_model.oof_pred
print('Accuracy on training data: ', qwk3(reduce_train['accuracy_group'], nn_train_pred))

Accuracy on training data:  0.47542251448992634


In [49]:
# LightGBM
import lightgbm as lgb

params = {'n_estimators': int(5055.501312496299),
            'boosting_type': 'gbdt',
            'objective': 'regression',
            'metric': 'rmse',
            'subsample': 0.6367826323790282,
            'subsample_freq': int(0.2658232429370657),
            'learning_rate': 0.017195259555759495,
            'feature_fraction': 0.8494137043130827,
            'max_depth': int(19.86367634095852),
            'lambda_l1': 1.9348461159493258,  
            'lambda_l2':1.33218451609903384,
            'early_stopping_rounds': 100
            }

lgb_model = Lgb_Model(reduce_train, ajusted_test, features, params, categoricals=categoricals)
# 0.6160951388044382

Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 0.976487	valid_1's rmse: 1.00548
[200]	training's rmse: 0.919112	valid_1's rmse: 0.977682
[300]	training's rmse: 0.884238	valid_1's rmse: 0.970223
[400]	training's rmse: 0.855904	valid_1's rmse: 0.967784
[500]	training's rmse: 0.831457	valid_1's rmse: 0.966618
[600]	training's rmse: 0.809202	valid_1's rmse: 0.966159
[700]	training's rmse: 0.789133	valid_1's rmse: 0.966233
Early stopping, best iteration is:
[621]	training's rmse: 0.804807	valid_1's rmse: 0.966064
Partial score of fold 0 is: 0.48229140401280723
Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 0.974967	valid_1's rmse: 1.01076
[200]	training's rmse: 0.916861	valid_1's rmse: 0.987067
[300]	training's rmse: 0.881238	valid_1's rmse: 0.982396
[400]	training's rmse: 0.853414	valid_1's rmse: 0.980568
[500]	training's rmse: 0.829455	valid_1's rmse: 0.980077
[600]	training's rmse: 0.808354	valid_1's rmse: 0.9801

In [50]:
lgb_train_pred = lgb_model.oof_pred
print('Accuracy on training data: ', qwk3(reduce_train['accuracy_group'], lgb_train_pred))

Accuracy on training data:  0.47571855331164636


In [51]:
def find_weights(rf, knn, xgb, cat, cnn, nn, lgb):
    weights = {'rf': rf, 'knn': knn, 'xgb': xgb, 'cat': cat, 'cnn': cnn, 'nn': nn, 'lgb': lgb}

    final_pred = (rf_train_pred * weights['rf']) + \
        (knn_train_pred * weights['knn']) + \
        (xgb_train_pred * weights['xgb']) + \
        (cat_train_pred * weights['cat']) + \
        (cnn_train_pred * weights['cnn']) + \
        (nn_train_pred * weights['nn']) + \
        (lgb_train_pred * weights['lgb'])

    return cohen_kappa_score(reduce_train['accuracy_group'],final_pred.astype(int), weights='quadratic')

In [52]:
init_points = 100
n_iter = 200

bounds_weights = {'rf': (0, 1), 
           'knn': (0, 1),  
           'xgb': (0, 1), 
           'cat': (0, 1),  
           'cnn': (0, 1),
           'nn': (0, 1),
           'lgb': (0, 1)
          }

#Weights_BO = BayesianOptimization(find_weights, bounds_weights, random_state=1029)

#with warnings.catch_warnings():
#    warnings.filterwarnings('ignore')
#    Weights_BO.maximize(init_points=init_points, n_iter=n_iter, acq='ucb', xi=0.0, alpha=1e-6)

In [53]:
#print(Weights_BO.max)

In [54]:
weights = {'cat': 0.6319722842278973, 'cnn': 0.1481760339510487, 'knn': 0.0039029911977264383, 'lgb': 0.003848860524615473, 'nn': 0.26120804791868485, 'rf': 0.2107114405167529, 'xgb': 0.006934110357386136}

final_pred = (rf_train_pred * weights['rf']) + \
    (knn_train_pred * weights['knn']) + \
    (xgb_train_pred * weights['xgb']) + \
    (cat_train_pred * weights['cat']) + \
    (cnn_train_pred * weights['cnn']) + \
    (nn_train_pred * weights['nn']) + \
    (lgb_train_pred * weights['lgb'])

print('Cappa score:', cohen_kappa_score(reduce_train['accuracy_group'],final_pred.astype(int), weights='quadratic'))
print('Accuracy:', accuracy_score(reduce_train['accuracy_group'],final_pred.astype(int)))

# 0.6123981493146219 Local

Cappa score: 0.6088190350056795
Accuracy: 0.526455624646693


In [55]:
# Make final predictions
rf_test_pred = rf_model.y_pred
knn_test_pred = knn_model.y_pred
xgb_test_pred = xgb_model.y_pred
cat_test_pred = cat_model.y_pred
cnn_test_pred =cnn_model.y_pred
nn_test_pred = nn_model.y_pred
lgb_test_pred = lgb_model.y_pred

final_pred = (rf_test_pred * weights['rf']) + \
    (knn_test_pred * weights['knn']) + \
    (xgb_test_pred * weights['xgb']) + \
    (cat_test_pred * weights['cat']) + \
    (nn_test_pred * weights['cnn']) + \
    (nn_test_pred * weights['nn']) + \
    (lgb_test_pred * weights['lgb'])

In [56]:
submission = pd.DataFrame()
submission['installation_id'] = ajusted_test['installation_id']
submission['accuracy_group'] = get_class_pred(final_pred, reduce_train)
submission.to_csv('submission.csv', index=False)
submission['accuracy_group'].value_counts(normalize=True)

3    0.500
0    0.239
1    0.136
2    0.125
Name: accuracy_group, dtype: float64