In [None]:
## General libraries
import pandas as pd
import numpy as np
import _pickle as pickle
import matplotlib.pyplot as plt
import random
import sys
import time
from copy import deepcopy
import os

from sklearn.metrics import cohen_kappa_score

from keras.utils import np_utils
from sklearn.model_selection import train_test_split
import json

import dateutil.parser

In [None]:
# Input CSV file names; they are passed as bare names to pd.read_csv, so they
# are looked up in the notebook's working directory.
specs = 'specs.csv'
test = 'test.csv'
train = 'train.csv'
train_labels = 'train_labels.csv'
subs = 'sample_submission_exemple.csv'

In [None]:
# Load the training events; note that later cells rebind the name `df` to other frames.
df = pd.read_csv(train)

In [None]:
def save(file, name, folder=""):
    """Pickle ``file`` to ``<name>.pickle``, optionally inside ``./<folder>/``.

    Parameters
    ----------
    file : object
        Any picklable object to serialise.
    name : str
        Base file name, without the ``.pickle`` extension.
    folder : str, optional
        Sub-directory (relative to the working directory) to write into.
        The default empty string writes into the working directory itself.
    """
    if folder != "":
        path = './' + folder + '/' + name + '.pickle'
    else:
        path = name + '.pickle'
    # The context manager closes (and therefore flushes) the handle even when
    # pickling raises; the previous `outfile.close` lacked the call parentheses
    # and never closed anything.
    with open(path, 'wb') as outfile:
        pickle.dump(file, outfile)
    
def load(name, folder=""):
    """Read back an object written by ``save``.

    Parameters
    ----------
    name : str
        Base file name, without the ``.pickle`` extension.
    folder : str, optional
        Sub-directory (relative to the working directory) to read from.
        The default empty string reads from the working directory itself.

    Returns
    -------
    object
        The unpickled object.

    Raises
    ------
    FileNotFoundError
        If the pickle file does not exist.
    """
    if folder != "":
        path = './' + folder + '/' + name + '.pickle'
    else:
        path = name + '.pickle'
    # NOTE: unpickling can execute arbitrary code; only load files produced by
    # this notebook (or another trusted source).
    # The context manager closes the handle; the previous `outfile.close`
    # lacked the call parentheses and never closed anything.
    with open(path, 'rb') as infile:
        return pickle.load(infile)

def relative_time(x):
    """Return the offset of every element of ``x`` from ``x[0]``, divided by 1e9.

    Each difference is converted with ``.item()`` before the division.
    Assumes ``x`` holds numpy ``datetime64[ns]`` values (so the result is in
    seconds) -- TODO confirm against callers.
    """
    return [(stamp - x[0]).item() / 1000000000 for stamp in x]

def categorise(x):
    """Map each distinct element of ``x`` to an integer id.

    Ids start at 0 and are handed out in order of first appearance.
    """
    dico = {}
    for value in x:
        # setdefault only inserts unseen values, so len(dico) is the next free id.
        dico.setdefault(value, len(dico))
    return dico

def padding(dataset, n):
    """Bring every 2-D array in ``dataset`` to exactly ``n`` rows.

    Arrays longer than ``n`` keep only their last ``n`` rows; shorter ones are
    left-padded (rows prepended) with ``-1``.  The running index is printed
    every 100 items as a progress indicator.

    Returns a list with one array per input item, in input order.
    """
    padded = []
    for position, block in enumerate(dataset):
        if position % 100 == 0:
            print(position)

        rows = block.shape[0]
        if rows > n:
            padded.append(block[-n:])
        else:
            # zeros - 1 gives a float block filled with the -1 padding value
            filler = np.zeros((n - rows, block.shape[1])) - 1
            padded.append(np.concatenate([filler, block], axis=0))
    return padded

def build_line(df, installation_id, game_session):
    """Return the event history of one installation up to a given session.

    Rows of ``installation_id`` whose ``date`` is not later than the first
    ``date`` of ``game_session`` are kept.  ``game_session`` values are
    replaced by integer ids (order of first appearance) and a ``time_delta``
    column, computed by ``relative_time`` on the ``date`` values, is added.

    Only the columns game_session, time_delta, title, type, world,
    event_count, game_time and event_code are returned.
    """
    same_install = df['installation_id'] == installation_id
    target_rows = df[same_install & (df['game_session'] == game_session)]
    cutoff = target_rows['date'].iloc[0]

    history = df[same_install & (df['date'] <= cutoff)]
    session_ids = categorise(history['game_session'].unique())
    history = history.replace({'game_session': session_ids})
    history['time_delta'] = relative_time(history['date'].values)

    return history[['game_session', 'time_delta', 'title', 'type', 'world',
                    'event_count', 'game_time', 'event_code']]

def format_input(X):
    """Split a 3-D array into six model inputs along its last axis.

    The first output is the slice ``X[:, :, 4:24]``; the remaining five are
    the single channels 0, 1, 2, 3 and 24, each reshaped to keep a trailing
    axis of length 1.
    """
    batch, steps = X.shape[0], X.shape[1]
    inputs = [X[:, :, 4:24]]
    for channel in (0, 1, 2, 3, 24):
        inputs.append(X[:, :, channel].reshape((batch, steps, 1)))
    return inputs

def build_feature(i, session):
    """Summarise the events of one game session into a flat 27-value row.

    Parameters
    ----------
    i : object
        Group key handed over by the ``groupby`` loop; not used.
    session : pandas.DataFrame
        Events of a single game session.  Columns read: game_session,
        installation_id, title, type, world, timestamp, event_code,
        game_time and, for 'Game'/'Assessment' sessions, event_data
        (JSON text with a ``correct`` key on event_code 4100 rows).

    Returns
    -------
    list
        In order: game_session, installation_id, title, type, world,
        timestamp, accuracy group, accuracy, n_positive, n_negative,
        time per instruction, total time, number of actions, rounds started,
        rounds ended, actions per round, time per round, hints,
        correct feedbacks, incorrect feedbacks, feedback accuracy,
        time per correct feedback, time per incorrect feedback, movies,
        skipped movies, tutorials, skipped tutorials.
        ``-1`` marks a feature that does not apply to the session type or
        could not be computed.

    Raises
    ------
    KeyError
        If an event_code 4100 row has no ``correct`` key in its event_data.
    """
    first = session.iloc[0]
    typ = first['type']
    codes = session['event_code']
    game_time = session['game_time']
    is_scored = typ == 'Game' or typ == 'Assessment'

    def count(code):
        # Number of events logged with the given event_code.
        return session[codes == code].shape[0]

    def time_sum(code):
        # Sum of game_time over the events logged with the given event_code.
        return game_time[codes == code].sum()

    vect = [
        first['game_session'],     ## game_session
        first['installation_id'],  ## installation_id
        first['title'],            ## title
        typ,                       ## type
        first['world'],            ## world
        first['timestamp'],        ## timestamp
    ]

    ## Attempt outcomes (games and assessments only)
    if is_scored:
        pos = 0
        neg = 0
        for payload in session[codes == 4100]['event_data']:
            if json.loads(payload)['correct']:
                pos += 1
            else:
                neg += 1

        # -1 means "no attempt recorded"; it falls into class 0 below.
        acc = pos / (pos + neg) if pos + neg != 0 else -1

        if acc == 1:
            acc_class = 3
        elif acc == 0.5:
            acc_class = 2
        elif acc <= 0:
            acc_class = 0
        else:
            acc_class = 1

        vect.append(acc_class)  ## accuracy_group
        vect.append(acc)        ## accuracy
        vect.append(pos)        ## n_positive
        vect.append(neg)        ## n_negative
    else:
        vect.append(-1)  ## accuracy_group
        vect.append(-1)  ## accuracy
        vect.append(-1)  ## n_positive
        vect.append(-1)  ## n_negative

    ## Features shared by games, assessments and activities
    if typ != 'Clip':
        ## Average time per instruction: pair 3010 events with 3110 events
        st = game_time[codes == 3010]
        en = game_time[codes == 3110]
        s = min(st.shape[0], en.shape[0])
        if s != 0:
            # .iloc makes the "first s rows" intent explicit (positional slice)
            vect.append((en.iloc[:s].sum() - st.iloc[:s].sum()) / s)
        else:
            vect.append(-1)  ## time per instruction

        ## total time
        total_time = game_time.max() / 1000
        vect.append(total_time)

        ## number of actions
        n_action = session[(codes >= 4020) & (codes <= 4080)].shape[0]
        vect.append(n_action)

        ## rounds
        n_rounds_st = count(2020)
        n_rounds_en = count(2030)
        vect.append(n_rounds_st)
        # Fixed: the start count used to be appended twice, so the end-of-round
        # count never reached the feature row.
        vect.append(n_rounds_en)

        if n_rounds_st != 0:
            vect.append(n_action / n_rounds_st)    ## actions per round
            vect.append(total_time / n_rounds_st)  ## time per round
        else:
            vect.append(-1)
            vect.append(-1)

        ## hints
        vect.append(count(4090))
    else:
        vect.append(-1)  ## time per instruction
        vect.append(-1)  ## total time
        vect.append(-1)  ## n_actions
        vect.append(-1)  ## n_rounds start
        vect.append(-1)  ## n_rounds end
        vect.append(-1)  ## actions per round
        vect.append(-1)  ## time per round
        vect.append(-1)  ## n_hints

    ## Feedback features (games and assessments only)
    if is_scored:
        n_correct_f = count(3021)
        n_incorrect_f = count(3020)

        vect.append(n_correct_f)
        vect.append(n_incorrect_f)

        if n_incorrect_f + n_correct_f != 0:
            vect.append(n_correct_f / (n_correct_f + n_incorrect_f))
        else:
            vect.append(0)

        if n_correct_f != 0:
            vect.append((time_sum(3121) - time_sum(3021)) / n_correct_f)
        else:
            vect.append(-1)

        if n_incorrect_f != 0:
            vect.append((time_sum(3120) - time_sum(3020)) / n_incorrect_f)
        else:
            vect.append(-1)
    else:
        vect.append(-1)  ## correct_feed
        vect.append(-1)  ## incorrect_feed
        vect.append(-1)  ## acc_feed
        vect.append(-1)  ## time correct feedback
        vect.append(-1)  ## time incorrect feedback

    ## Movie / tutorial counters (games only)
    if typ == 'Game':
        vect.append(count(2080))  ## n movie
        vect.append(count(2081))  ## n skipped movie
        vect.append(count(2060))  ## tutorial
        vect.append(count(2075))  ## skipped tutorial
    else:
        vect.append(-1)  ## n movie
        vect.append(-1)  ## n skipped movie
        vect.append(-1)  ## tutorial
        vect.append(-1)  ## skipped tutorial

    return vect

# Distinct values of the 'title' column of the current `df`; build_set appends
# one count feature per entry of this array.
titles = df['title'].unique()
# Per-session statistics aggregated for activities, assessments and games.
_ROUND_STATS = ('time_instruction', 'total_time', 'actions', 'start_rounds',
                'end_rounds', 'action_rounds', 'time_rounds', 'hints')
# Extra statistics aggregated for assessments and games only.
_SCORE_STATS = ('correct_feed', 'incorrect_feed', 'time_corr_feed',
                'time_inc_feed', 'acc_feed', 'accuracy', 'n_positive',
                'n_negative')


def _valid_values(frame, column):
    """Return ``frame[column]`` as float32, without rows equal to the -1 sentinel."""
    return frame[frame[column] != -1][column].values.astype('float32')


def _valid_mean(frame, column):
    """Mean of the non-sentinel values of ``column``; NaN when none are left."""
    values = _valid_values(frame, column)
    # An explicit NaN gives the same result as mean() on an empty array,
    # without numpy's "mean of empty slice" RuntimeWarning.
    return values.mean() if values.size else np.nan


def _valid_sum(frame, column):
    """Sum of the non-sentinel values of ``column`` (0 when none are left)."""
    return _valid_values(frame, column).sum()


def build_set(df, installation_id, game_session, all_titles=None):
    """Build the model input row for one (installation, target session) pair.

    Only sessions of ``installation_id`` whose ``date`` is strictly earlier
    than the target session's ``date`` are aggregated.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per game session, with the per-session feature columns plus
        ``date``.
    installation_id : object
        Installation whose history is summarised.
    game_session : object
        Session to predict; must exist for this installation.
    all_titles : iterable, optional
        Titles for which a "number of earlier sessions" feature is appended.
        Defaults to the module-level ``titles``.

    Returns
    -------
    list
        The target title, the clip count, the activity block, the assessment
        block, the game block, then one session count per title.  Means over
        an empty selection are NaN.

    Raises
    ------
    IndexError
        If ``game_session`` is not found for ``installation_id``.
    """
    if all_titles is None:
        all_titles = titles

    history = df[df['installation_id'] == installation_id]
    history = history.sort_values(by=['date'], ascending=True)
    target = history[history['game_session'] == game_session].iloc[0]
    pred_title = target['title']
    history = history[history['date'] < target['date']]

    vect = [pred_title]

    ## Clip
    vect.append(history[history['type'] == 'Clip'].shape[0])

    ## Activity
    activities = history[history['type'] == 'Activity']
    vect.append(activities.shape[0])
    vect.extend(_valid_mean(activities, column) for column in _ROUND_STATS)

    ## Assessment
    assessments = history[history['type'] == 'Assessment']
    # 1 when an assessment with the target title was already played
    vect.append(1 if pred_title in assessments['title'].values else 0)
    vect.append(assessments.shape[0])
    vect.append(_valid_mean(assessments, 'accuracy_group'))
    vect.extend(_valid_mean(assessments, column) for column in _ROUND_STATS)
    vect.extend(_valid_mean(assessments, column) for column in _SCORE_STATS)

    ## Games
    games = history[history['type'] == 'Game']
    vect.append(games.shape[0])
    vect.append(_valid_mean(games, 'accuracy_group'))
    vect.extend(_valid_mean(games, column) for column in _ROUND_STATS)
    vect.extend(_valid_mean(games, column) for column in _SCORE_STATS)
    vect.append(_valid_sum(games, 'movies'))
    vect.append(_valid_sum(games, 'skipped_movie'))
    vect.append(_valid_sum(games, 'tuto'))
    # NOTE(review): the three counters above are summed but this one is
    # averaged -- kept as in the original; confirm whether a sum was intended.
    vect.append(_valid_mean(games, 'skipped_tuto'))

    ## Activity done: number of earlier sessions per title
    for title in all_titles:
        vect.append(history[history['title'] == title].shape[0])

    return vect

# Column names for the rows produced by build_feature, in emission order.
# build_feature appends total time before the action count, so 'total_time'
# must precede 'actions' (they were listed the other way round, which
# mislabelled both columns).
columns = [
    'game_session',
    'installation_id',
    'title',
    'type',
    'world',
    'timestamp',
    'accuracy_group',
    'accuracy',
    'n_positive',
    'n_negative',
    'time_instruction',
    'total_time',
    'actions',
    'start_rounds',
    'end_rounds',
    'action_rounds',
    'time_rounds',
    'hints',
    'correct_feed',
    'incorrect_feed',
    'acc_feed',
    'time_corr_feed',
    'time_inc_feed',
    'movies',
    'skipped_movie',
    'tuto',
    'skipped_tuto',
]

# Column names for the rows produced by build_set, assembled from the shared
# statistic names instead of being spelled out one by one.
_round_stats = ['time_instruction', 'total_time', 'actions', 'start_rounds',
                'end_rounds', 'action_rounds', 'time_rounds', 'hints']
_score_stats = ['correct_feed', 'incorrect_feed', 'time_corr_feed',
                'time_inc_feed', 'acc_feed', 'accuracy', 'n_positive',
                'n_negative']
_game_extras = ['movies', 'skipped_movie', 'tuto', 'skipped_tuto']

cols = ['pred_title', 'n_clip', 'n_activity']
cols += ['Activity_' + stat for stat in _round_stats]
cols += ['same_title', 'n_assessment', 'Assessment_accuracy_group']
cols += ['Assessment_' + stat for stat in _round_stats + _score_stats]
cols += ['n_games', 'Games_accuracy_group']
cols += ['Games_' + stat for stat in _round_stats + _score_stats + _game_extras]

# One trailing column per known title.
cols += ['actitivity_title_' + str(elt) for elt in titles]

#### Loading Data and grouping by game session

#### Train

In [None]:
# Works on the raw training events already held in `df`.
# Events with id '17113b36' are relabelled as code 4100 so that build_feature
# counts them together with the other 4100 events.  `.loc` writes into the
# frame directly instead of going through a chained assignment.
df.loc[df['event_id'] == '17113b36', 'event_code'] = 4100

print(df['game_session'].unique().shape[0])

# One feature row per game session.
dataset = []
for count, (i, session) in enumerate(df.groupby('game_session')):
    dataset.append(build_feature(i, session))
    if count % 1000 == 0:
        print(count)

# Build the frame straight from the row lists.  Going through np.array()
# first coerced every value (numbers included) to a string because each row
# also holds text, which made the later `!= -1` sentinel filters match nothing.
data = pd.DataFrame(dataset, columns = columns)

save(data, 'data by session')

#### Test

In [None]:
df = pd.read_csv(test)
# Events with id '17113b36' are relabelled as code 4100 so that build_feature
# counts them together with the other 4100 events.  `.loc` writes into the
# frame directly instead of going through a chained assignment.
df.loc[df['event_id'] == '17113b36', 'event_code'] = 4100

print(df['game_session'].unique().shape[0])

# One feature row per game session.
dataset = []
for count, (i, session) in enumerate(df.groupby('game_session')):
    dataset.append(build_feature(i, session))
    if count % 1000 == 0:
        print(count)

# Build the frame straight from the row lists.  Going through np.array()
# first coerced every value (numbers included) to a string because each row
# also holds text, which made the later `!= -1` sentinel filters match nothing.
data = pd.DataFrame(dataset, columns = columns)

save(data, 'data test by session')

#### Getting test labels

In [None]:
df = load('data by session')
labels = pd.read_csv(train_labels)

# Keep installations that played at least one assessment and that also
# appear in the label file.
ids = df[df['type']=='Assessment']['installation_id'].unique()
labelled_ids = labels.installation_id.unique()
df = df[df.installation_id.isin(ids) & df.installation_id.isin(labelled_ids)]

# Disabled title -> integer encoding, kept for reference:
# dtitle = categorise(df['title'])
# df = df.replace({'title' : dtitle})
# save(dtitle, 'dico_title')

# Parse the timestamp text into a 'date' column and order sessions by it.
df = (
    df.assign(date = df['timestamp'].apply(dateutil.parser.parse))
      .sort_values(by = ['date'], ascending = True)
)

# Overwrites the unfiltered pickle written by the session-building cell.
save(df, 'data by session')

In [None]:
df = load('data test by session')

# Installations that played at least one assessment.
ids = df[df['type']=='Assessment']['installation_id'].unique()

# Disabled title -> integer encoding, kept for reference:
# dtitle =load('dico_title')
# df = df.replace({'title' : dtitle})

# Parse the timestamp text into a 'date' column and order sessions by it.
df['date'] = df['timestamp'].apply(dateutil.parser.parse)
df = df.sort_values(by = ['date'], ascending = True)

# For every such installation, the game_session of its latest row by date.
game_inst = [
    df[df['installation_id'] == elt]
      .sort_values(by = ['date'], ascending = True)
      .iloc[-1]['game_session']
    for elt in ids
]

save((ids, game_inst), 'test_labels')
save(df, 'data test by session')

#### Adding test session into train

In [None]:
df = pd.read_csv(test)

test_train = []

# (installation, session) pairs of assessment sessions that contain at least
# one event with code 4100 in the raw test events.
is_attempt = (df['type'] == 'Assessment') & (df['event_code'] == 4100)
df_labels = df[is_attempt][['installation_id', 'game_session']].drop_duplicates(
    subset = ['installation_id', 'game_session'])

df = load('data test by session')

# Look up the accuracy group computed for each of those sessions.
vect = [
    df[(df['installation_id'] == row['installation_id'])
       & (df['game_session'] == row['game_session'])]['accuracy_group'].values[0]
    for _, row in df_labels.iterrows()
]

df_labels['accuracy_group'] = vect

save(df_labels, 'test labels')

#### Building train dataset


In [None]:
# Reload the per-session training frame saved earlier.
df = load('data by session')

In [None]:
# Recompute the title list from the per-session frame; build_set and the
# `cols` list below both iterate over it, so they stay aligned.
titles = df['title'].unique()

In [None]:
# Display the title array for inspection.
titles

In [None]:
# Column names for the rows produced by build_set, rebuilt here so that the
# trailing per-title columns follow the freshly recomputed `titles`.
_round_stats = ['time_instruction', 'total_time', 'actions', 'start_rounds',
                'end_rounds', 'action_rounds', 'time_rounds', 'hints']
_score_stats = ['correct_feed', 'incorrect_feed', 'time_corr_feed',
                'time_inc_feed', 'acc_feed', 'accuracy', 'n_positive',
                'n_negative']
_game_extras = ['movies', 'skipped_movie', 'tuto', 'skipped_tuto']

cols = ['pred_title', 'n_clip', 'n_activity']
cols += ['Activity_' + stat for stat in _round_stats]
cols += ['same_title', 'n_assessment', 'Assessment_accuracy_group']
cols += ['Assessment_' + stat for stat in _round_stats + _score_stats]
cols += ['n_games', 'Games_accuracy_group']
cols += ['Games_' + stat for stat in _round_stats + _score_stats + _game_extras]

# One trailing column per known title.
cols += ['actitivity_title_' + str(elt) for elt in titles]

In [None]:


labels = pd.read_csv(train_labels)
labels_test = load('test labels')

dataset = []

# Rows for the labelled training assessments.
df = load('data by session')
for i in range(labels.shape[0]):
    if i%100 == 0:
        print(i)
    row = labels.iloc[i]
    dataset.append(build_set(df, row['installation_id'], row['game_session']))

# Rows for the test-set assessments whose outcome is known, appended after
# the training rows.
df = load('data test by session')
for i in range(labels_test.shape[0]):
    if i%100 == 0:
        print(i)
    row = labels_test.iloc[i]
    dataset.append(build_set(df, row['installation_id'], row['game_session']))

df_data = pd.DataFrame(dataset, columns = cols)

save(df_data, 'train dataset')

#### Building test dataset

In [None]:
ids, game_inst = load('test_labels')

# Load the test session frame explicitly instead of relying on whatever `df`
# the previously executed cell happened to leave behind.
df = load('data test by session')

# One feature row per installation, targeting its last recorded session.
dataset = []
for i in range(ids.shape[0]):
    if i%100 == 0:
        print(i)
    dataset.append(build_set(df, ids[i], game_inst[i]))

df_data = pd.DataFrame(dataset, columns = cols)
save(df_data, 'dataset test')

#### Preparing for training

In [None]:
# Show the per-column means of the numeric columns of df_data.
df_data.describe().loc['mean'].values

In [None]:
dataset = load('train dataset')

X = dataset
labels = pd.read_csv(train_labels)
test_labels = load('test labels')
# Targets in the same order as the rows of 'train dataset': training labels
# first, then the labelled test sessions.
Y = np.concatenate([labels['accuracy_group'].values.astype('float64'), test_labels['accuracy_group'].values.astype('float64')], axis = 0)
# y = np_utils.to_categorical(Y)

from sklearn.utils import class_weight

# Keyword arguments: recent scikit-learn releases reject the positional form.
class_weights = class_weight.compute_class_weight(class_weight = 'balanced',
                                                 classes = np.unique(Y), y = Y)

# The 'dico_title' pickle is not written anywhere in this notebook (the cell
# that would save it is commented out), so rebuild and store a title -> id
# mapping when the file is missing.
# NOTE(review): the fallback only knows titles present in X['pred_title'];
# confirm that covers every title of the test set.
try:
    dtitle = load('dico_title')
except FileNotFoundError:
    dtitle = categorise(X['pred_title'])
    save(dtitle, 'dico_title')

X = X.replace({'pred_title': dtitle})
X = X.fillna(-1)
# X = dataset.iloc[17690:]
# Y = Y[17690:]
# X = X.astype(int)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=43)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=43)

In [None]:
# Optional exploration: profile whatever frame `df` currently refers to.
import pandas_profiling
df.profile_report(style={'full_width':True}, pool_size = 8)

In [None]:
# Display the feature frame for inspection.
X

In [None]:
# Y = Y[X['pred_title']==28]
# X = X[X['pred_title']==28]

# class_weights = class_weight.compute_class_weight('balanced',
#                                                  np.unique(Y),Y)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier, Pool, CatBoostRegressor

# Alternatives tried earlier and left disabled:
#   RandomForestClassifier(n_estimators=500, n_jobs=8, random_state=0)
#   CatBoostClassifier(loss_function='MultiClass', eval_metric="WKappa",
#                      task_type="GPU", learning_rate=0.01, iterations=10000,
#                      od_type="Iter", early_stopping_rounds=2000,
#                      random_seed=42, class_weights=class_weights)

# Regress the accuracy group as a continuous value; it is turned into classes
# by thresholding in a later cell.
clf = CatBoostRegressor(
    loss_function='RMSE',
    iterations=1000,
    learning_rate=0.01,
    task_type="GPU",
    # early_stopping_rounds=500,
    depth=8,
)

# Column 0 (pred_title) is treated as categorical.
clf.fit(
    X_train,
    y_train,
    cat_features=[0],
    eval_set=(X_test, y_test),
)

pred = clf.predict(X_test)

# print(accuracy_score(y_test, pred))
# print(cohen_kappa_score(y_test, pred, weights = 'quadratic'))

In [None]:
from lightgbm import LGBMRegressor

# LightGBM regression settings.
# NOTE(review): 'cappa' is passed as a metric name but no such metric or
# function is defined in this notebook -- confirm LightGBM accepts it.
params = dict(
    n_estimators=5000,
    boosting_type='gbdt',
    objective='regression',
    metric='rmse',
    subsample=0.75,
    subsample_freq=1,
    learning_rate=0.005,
    feature_fraction=0.9,
    max_depth=15,
    lambda_l1=1,
    lambda_l2=1,
    verbose=100,
    early_stopping_rounds=100,
    eval_metric='cappa',
)

# This replaces the CatBoost model bound to `clf` by the previous cell.
clf = LGBMRegressor(**params)
clf.fit(
    X_train,
    y_train,
    categorical_feature=[0],
    eval_set=(X_test, y_test),
    eval_metric='cappa',
)

In [None]:
import optuna
def tres(pred, t):
    """Bucket continuous predictions into the classes 0-3 using three cut points.

    Parameters
    ----------
    pred : numpy.ndarray
        Continuous predictions; left unmodified.
    t : sequence of 3 numbers
        Cut points ``t[0] <= t[1] <= t[2]``.

    Returns
    -------
    numpy.ndarray
        Copy of ``pred`` holding 0 where ``pred < t[0]``, 1 in
        ``[t[0], t[1])``, 2 in ``[t[1], t[2])`` and 3 where ``pred >= t[2]``.
        Values matching no interval (e.g. NaN) are left as they are.
    """
    pred1 = deepcopy(pred)
    # Every mask is evaluated on the untouched input.  Building the masks from
    # the array being overwritten let a freshly written class id fall into a
    # later interval (with t = [0.5, 0.9, 1.5], the value 0.7 ended up as 3).
    pred1[pred < t[0]] = 0
    pred1[(pred >= t[0]) & (pred < t[1])] = 1
    pred1[(pred >= t[1]) & (pred < t[2])] = 2
    pred1[pred >= t[2]] = 3

    return pred1

def objective(trial, pred = clf.predict(X_train), y_test = y_train):
# def objective(trial, pred = clf.predict(X_test), y_test = y_test):
# def objective(trial, pred = clf.predict(X_val), y_test = y_val):
    """Optuna objective: 1 - quadratic-weighted kappa for sampled cut points.

    The default arguments are evaluated once, when this cell runs, so the
    search scores the thresholds against the training predictions.  Each cut
    point is sampled no lower than the previous one, which keeps them ordered.
    """
    lowest, highest = pred.min(), pred.max()
    x0 = trial.suggest_uniform('x0', lowest, highest)
    x1 = trial.suggest_uniform('x1', x0, highest)
    x2 = trial.suggest_uniform('x2', x1, highest)

    bucketed = tres(pred, [x0, x1, x2])
    kappa = cohen_kappa_score(y_test, bucketed, weights = 'quadratic')
    print(kappa)
    # The study minimises its objective by default, hence 1 - kappa.
    return 1 - kappa

study = optuna.create_study()
study.optimize(objective, n_trials=100)

In [None]:
# Best cut points, in the order the trial suggested them (x0, x1, x2).
t = list(study.best_params.values())

# Quadratic-weighted kappa on the validation split, then on the test split.
for features, target in ((X_val, y_val), (X_test, y_test)):
    pred = clf.predict(features)
    pred1 = tres(pred, t)
    print(cohen_kappa_score(target, pred1, weights = 'quadratic'))

In [None]:
# Plot the importances of the features at positions [ind0, ind1).
ind0 = 90
ind1 = 110
shown = X.columns[ind0:ind1]
plt.plot(shown, clf.feature_importances_[ind0:ind1])
plt.xticks(shown, shown, rotation = 'vertical')
plt.savefig('features.png')

In [None]:
# Debugging aid: list the attributes available on the fitted model.
dir(clf)

In [None]:
# [importance, feature name] pairs, one per column of X.
l = [[clf.feature_importances_[i], X.columns[i]] for i in range(len(X.columns))]


In [None]:
# Most important features first (pairs compare on the importance value first).
l.sort(reverse = True)

In [None]:
# Convert to a 2-D array so columns can be sliced; because each pair mixes a
# number with a name, the importances become text here.
l = np.array(l)

In [None]:
# Show the 70 highest-ranked features.
l[:70]

In [None]:
# Names (second column) of the 30 highest-ranked features.
feats_to_keep = l[:30,1]

## Test 

In [None]:
# Reload the test installation ids / target sessions and the matching
# feature rows saved by the "Building test dataset" cell.
ids, game_inst = load('test_labels')
df_data = load('dataset test')

In [None]:
# Apply the same preprocessing as for the training features (title encoding
# through `dtitle`, NaN -> -1), then predict continuous scores.
df_data = df_data.replace({'pred_title': dtitle})
df_data =df_data.fillna(-1)

pred = clf.predict(df_data)
# pred = pred.reshape((pred.shape[0])).astype(int)

In [None]:
# Turn the continuous scores into classes 0-3 with the tuned cut points `t`.
pred = tres(pred, t)

In [None]:
# [installation_id, predicted class] pairs; note this rebinds `X`, which
# held the training features until now.
X = [[ids[i], pred[i]] for i in range(len(pred))]

In [None]:
# Wrap the pairs in a frame; `pred` now names this frame instead of the array.
pred = pd.DataFrame(X, columns = ['installation_id','accuracy_group'])

In [None]:
# Index by installation id so predictions can be looked up with .loc below.
pred.index = pred['installation_id']

In [None]:
# Fill the submission template, taking predictions in the template's
# installation_id order.
sample_sub = pd.read_csv(subs)
sample_sub['accuracy_group'] = pred.loc[sample_sub['installation_id'].values]['accuracy_group'].values

In [None]:
# Write the submission file without the frame's index column.
sample_sub.to_csv('submission.csv', index = False)