In [1]:
import pandas as pd
import numpy as np
import gc
from sklearn.metrics import roc_auc_score
from collections import defaultdict
from tqdm.notebook import tqdm
import lightgbm as lgb
import riiideducation
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from scipy import stats

import random
import os

import pickle

In [2]:
# Random seed shared by every generator seeded below
SEED = 123


def seed_everything(seed):
    """Seed the global ``random`` and NumPy generators and export PYTHONHASHSEED.

    Parameters
    ----------
    seed : int
        Value handed to ``random.seed`` and ``np.random.seed``; its string form
        is written to the ``PYTHONHASHSEED`` environment variable.
    """
    # Same order as before: stdlib RNG, NumPy RNG, then the environment variable.
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)


seed_everything(SEED)

# Prior Question Elapsed time mean

In [3]:
# Fill value for missing `prior_question_elapsed_time` entries; it is passed to
# `fillna` both while replaying the training data and inside the inference loop.
# Presumably the training-set mean of that column (per the name) -- TODO confirm
# how it was derived.
prior_question_elapsed_time_mean=15162.5927734375

# Features Dicts

In [4]:
# Function that accumulates the per-user statistics by looping over the rows
def add_features(df, answered_correctly_u_count, answered_correctly_u_sum, elapsed_time_u_sum, explanation_u_sum, timestamp_u, timestamp_u_incorrect, answered_correctly_uq,timestamp_l, nb_l_watched, part_u_count,part_u_sum, tags_u_count,tags_u_sum,first_bundle_dict,community_u_sum,community_u_count,timestamp_u_correct):
    """Replay every row of ``df`` in order and fold it into the per-user state dicts.

    Nothing is added to ``df``; the function only mutates the dicts it receives
    and returns ``None``.

    Rows whose ``content_type_id`` equals 0 (questions) update:
      * ``answered_correctly_u_count`` / ``answered_correctly_u_sum``: answers seen / sum of ``answered_correctly``
      * ``part_u_*``, ``tags_u_*``, ``community_u_*``: the same count/sum pair, nested per part / tags1 / community
      * ``elapsed_time_u_sum`` / ``explanation_u_sum``: running sums of the two ``prior_question_*`` columns
      * ``timestamp_u``: the timestamps of the user's last three question rows (oldest first)
      * ``timestamp_u_incorrect``: a one-element list with the timestamp of the last answer equal to 0
      * ``timestamp_u_correct``: timestamp of the last answer equal to 1
      * ``first_bundle_dict``: the first ``bundle_id`` stored for the user (only written while the stored value is 0)
      * ``answered_correctly_uq``: per user, how many times each ``content_id`` was seen

    All other rows (lectures) bump ``nb_l_watched`` and overwrite ``timestamp_l``.

    ``df`` must contain the eleven columns listed in ``columns`` below; the dicts
    are expected to supply defaults for unseen keys (e.g. ``defaultdict``).
    """
    columns = ['user_id', 'answered_correctly', 'content_id', 'prior_question_elapsed_time', 'prior_question_had_explanation', 'timestamp','content_type_id','part','tags1','bundle_id','community']

    for user, answer, content, elapsed, explained, ts, content_type, part, tag, bundle, community in df[columns].values:

        if content_type != 0:
            # Lecture row: only the lecture counters are touched
            nb_l_watched[user] += 1
            timestamp_l[user] = ts
            continue

        # ------------------------------------------------------------------
        # Client running totals
        answered_correctly_u_count[user] += 1
        answered_correctly_u_sum[user] += answer
        part_u_count[user][part] += 1
        part_u_sum[user][part] += answer
        tags_u_sum[user][tag] += answer
        tags_u_count[user][tag] += 1
        elapsed_time_u_sum[user] += elapsed
        explanation_u_sum[user] += int(explained)

        if answer == 1:
            timestamp_u_correct[user] = ts

        # Sliding window over the last three question timestamps
        recent = timestamp_u[user]
        if len(recent) == 3:
            recent.pop(0)
        recent.append(ts)

        # Single-slot window holding the last incorrect answer
        if answer == 0:
            last_wrong = timestamp_u_incorrect[user]
            if len(last_wrong) == 1:
                last_wrong.pop(0)
            last_wrong.append(ts)

        # ------------------------------------------------------------------
        # Community totals
        community_u_count[user][community] += 1
        community_u_sum[user][community] += answer

        # ------------------------------------------------------------------
        # First bundle: 0 doubles as the "not set yet" marker
        if first_bundle_dict[user] == 0:
            first_bundle_dict[user] = bundle

        # ------------------------------------------------------------------
        # Client x question exposure count
        answered_correctly_uq[user][content] += 1

    return None

In [5]:
def _replay_split(pickle_path, row_slice, questions_df, features_dicts):
    """Load one slice of an interactions pickle and fold it into ``features_dicts``.

    Parameters
    ----------
    pickle_path : str
        Pickled interactions DataFrame to read.
    row_slice : slice or None
        Positional row window applied with ``.iloc``; ``None`` keeps every row.
    questions_df : pandas.DataFrame
        Question metadata joined on ``content_id == question_id``.
    features_dicts : dict
        State dicts keyed by the parameter names of ``add_features``; mutated in place.
    """
    columns_needed = ['timestamp', 'user_id', 'answered_correctly', 'content_id', 'content_type_id', 'prior_question_elapsed_time', 'prior_question_had_explanation']
    interactions = pd.read_pickle(pickle_path)[columns_needed]
    if row_slice is not None:
        interactions = interactions.iloc[row_slice]

    # Merging first yields a fresh frame, so the column assignments below are
    # plain (non-chained) writes.
    interactions = pd.merge(interactions, questions_df[['question_id', 'part','tags1','bundle_id','community']], left_on = 'content_id', right_on = 'question_id', how = 'left')
    gc.collect()

    # Changing dtype to avoid lightgbm error
    interactions['prior_question_had_explanation'] = interactions['prior_question_had_explanation'].fillna(False).astype('int8')

    # Fill prior question elapsed time with the mean. Assigning the result back
    # (instead of `df[col].fillna(..., inplace=True)`) keeps working when pandas
    # Copy-on-Write is active; the chained in-place form would leave the NaNs in
    # place and they would then poison the running sums.
    interactions['prior_question_elapsed_time'] = interactions['prior_question_elapsed_time'].fillna(prior_question_elapsed_time_mean)

    print('User feature calculation started...')
    print('\n')
    add_features(interactions, **features_dicts)

    del interactions
    gc.collect()


def read_and_preprocess(feature_engineering = False):
    """Build the per-user state dicts by replaying the training and validation data.

    The last 90M rows of the training pickle are replayed in two passes (to bound
    memory), followed by the whole validation pickle. Each pass loads the rows,
    joins the question metadata, imputes the two ``prior_question_*`` columns and
    feeds the result to ``add_features``.

    The two training passes are replayed oldest slice first. ``add_features``
    keeps "last N timestamps" windows and a "first bundle" per user by consuming
    rows in order, so the slices have to arrive in the same order as the rows
    inside them. This assumes the pickle stores each user's rows oldest-to-newest
    -- TODO confirm against the file that produced the pickles.

    Parameters
    ----------
    feature_engineering : bool, optional
        Accepted for backward compatibility; the function does not read it.

    Returns
    -------
    dict
        Maps each state name (``'answered_correctly_u_count'``, ``'timestamp_u'``, ...)
        to its populated ``defaultdict``.
    """
    def nested_counter():
        # user -> key -> int counter
        return defaultdict(lambda: defaultdict(int))

    # Keys match the parameter names of add_features, which lets the helper pass
    # them by keyword instead of as seventeen positional arguments.
    features_dicts = {
        # Client dictionaries
        'answered_correctly_u_count': defaultdict(int),
        'answered_correctly_u_sum': defaultdict(int),
        'elapsed_time_u_sum': defaultdict(int),
        'explanation_u_sum': defaultdict(int),
        # Client Question dictionary
        'answered_correctly_uq': nested_counter(),
        'timestamp_u': defaultdict(list),
        'timestamp_u_incorrect': defaultdict(list),
        'timestamp_u_correct': defaultdict(int),
        # Lectures dictionaries
        'nb_l_watched': defaultdict(int),
        'timestamp_l': defaultdict(int),
        'part_u_count': nested_counter(),
        'part_u_sum': nested_counter(),
        'tags_u_count': nested_counter(),
        'tags_u_sum': nested_counter(),
        # Bundle dictionary
        'first_bundle_dict': defaultdict(int),
        # Community dictionaries
        'community_u_count': nested_counter(),
        'community_u_sum': nested_counter(),
    }

    train_pickle = '../input/riiid-cross-validation-files/cv1_train.pickle'
    valid_pickle = '../input/riiid-cross-validation-files/cv1_valid.pickle'
    question_file = '../input/import-for-gcp/communities.csv'

    # Question metadata, downcast to compact integer dtypes
    questions_df = pd.read_csv(question_file)[['question_id','part','community','bundle_id','tags1']]
    questions_df['part'] = questions_df['part'].astype(np.int8)
    questions_df['community'] = questions_df['community'].astype(np.int8)
    questions_df['bundle_id'] = questions_df['bundle_id'].astype(np.int32)
    questions_df['tags1'] = questions_df['tags1'].astype(np.int16)

    gc.collect()

    # Older slice first, then the most recent 50M rows (see docstring).
    _replay_split(train_pickle, slice(-90000000, -50000000), questions_df, features_dicts)

    print('Second Iteration')
    _replay_split(train_pickle, slice(-50000000, None), questions_df, features_dicts)

    print('valid')
    _replay_split(valid_pickle, None, questions_df, features_dicts)

    print('Creating Features Dicts')

    return features_dicts

In [6]:
%%time
# Build the per-user state dicts from the training and validation pickles
# (about 49 minutes of wall time in the recorded run below). The
# feature_engineering flag is accepted by read_and_preprocess but not read by it.
features_dicts = read_and_preprocess(feature_engineering = True)

User feature calculation started...


Second Iteration
User feature calculation started...


valid
User feature calculation started...


Creating Features Dicts
CPU times: user 46min 27s, sys: 48.1 s, total: 47min 15s
Wall time: 48min 42s


# AGG FILE AND PART FILE

In [7]:
# Run a garbage-collection pass before the lookup tables are loaded in the next cells.
gc.collect()

20

In [8]:
# Question metadata with every integer column downcast in a single astype call
questions_df = pd.read_csv('../input/import-for-gcp/communities.csv').astype({
    'part': np.int8,
    'community': np.int8,
    'bundle_id': np.int32,
    'tags1': np.int16,
    'tags_encoded': np.int16,
})

In [9]:
agg_file='../input/import-for-gcp/features.pkl'
part_file='../input/import-for-gcp/part.pkl'

# Per-question aggregates (merged on content_id during inference)
agg = pd.read_pickle(agg_file).astype({
    'question_correctly_q_count': np.int32,
    'question_correctly_q_mean': np.float32,
    'question_elapsed_time_mean': np.float32,
    'question_had_explanation_mean': np.float32,
})

# Per-part aggregates (merged on part during inference)
part = pd.read_pickle(part_file).astype({
    'part_elapsed_time_mean': np.int32,
    'part_had_explanation_mean': np.float32,
    'part_correctly_q_mean': np.float32,
})

# Target / Features / Model

In [10]:
# Name of the column that receives the predictions
TARGET = 'answered_correctly'

# Features to train and predict. The order is kept exactly as listed because the
# frame handed to model.predict is built by selecting these columns in this order.
FEATURES = [
    'prior_question_elapsed_time', 'part', 'prior_question_had_explanation',
    'answered_correctly_u_avg', 'elapsed_time_u_avg', 'explanation_u_avg', 'question_correctly_q_count',
    'question_correctly_q_mean', 'question_had_explanation_mean', 'question_elapsed_time_mean',
    'answered_correctly_uq_count', 'timestamp_u_correct_recency',
    'timestamp_u_recency_1', 'timestamp_u_recency_2', 'timestamp_u_recency_3',
    'timestamp_u_incorrect_recency', 'tags1', 'part_elapsed_time_mean', 'part_had_explanation_mean',
    'part_correctly_q_mean',
    'nb_u_lect_watched', 'timestamp_l_recency_1', 'part_u_avg', 'tags_u_avg', 'user_count', 'user_part_count',
    'first_bundle', 'community',
    'community_u_avg', 'tags_encoded',
]

# NOTE(review): pickle.load can execute arbitrary code embedded in the file, so
# this path must only ever point at a model file you produced yourself.
# The context manager closes the file handle, which the bare open() call leaked.
with open('../input/best-ever-done/lgbm_80M_30.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

# Inference

In [11]:
def update_features(df, answered_correctly_u_count, answered_correctly_u_sum, elapsed_time_u_sum, explanation_u_sum, timestamp_u, timestamp_u_incorrect, answered_correctly_uq,timestamp_l, nb_l_watched, part_u_count,part_u_sum, tags_u_count,tags_u_sum,first_bundle_dict,community_u_sum,community_u_count, timestamp_u_correct):
    """Fold the answered question rows of ``df`` into the per-user state dicts.

    Only rows whose ``content_type_id`` equals 0 are used; every other row is
    skipped. ``timestamp_l`` and ``nb_l_watched`` are accepted so the signature
    lines up with the other feature functions, but this function never touches
    them. The remaining dicts are mutated in place and ``None`` is returned.

    ``df`` must contain the eleven columns listed in ``needed`` below, including
    ``answered_correctly``.
    """
    needed = ['user_id', 'answered_correctly', 'content_id', 'prior_question_elapsed_time', 'prior_question_had_explanation', 'timestamp','content_type_id','part','tags1','bundle_id','community']

    for uid, outcome, question, elapsed, had_explanation, when, kind, part, tag, bundle, community in df[needed].values:

        if kind != 0:
            # Not a question row: nothing to update here
            continue

        # ------------------------------------------------------------------
        # Client features updates
        answered_correctly_u_count[uid] += 1
        answered_correctly_u_sum[uid] += outcome
        part_u_count[uid][part] += 1
        part_u_sum[uid][part] += outcome
        tags_u_sum[uid][tag] += outcome
        tags_u_count[uid][tag] += 1
        elapsed_time_u_sum[uid] += elapsed
        explanation_u_sum[uid] += int(had_explanation)

        if outcome == 1:
            timestamp_u_correct[uid] = when

        # Keep only the three most recent question timestamps
        window = timestamp_u[uid]
        if len(window) == 3:
            window.pop(0)
        window.append(when)

        # Keep only the most recent incorrect-answer timestamp
        if outcome == 0:
            wrong = timestamp_u_incorrect[uid]
            if len(wrong) == 1:
                wrong.pop(0)
            wrong.append(when)

        # ------------------------------------------------------------------
        # Community features updates
        community_u_count[uid][community] += 1
        community_u_sum[uid][community] += outcome

        # ------------------------------------------------------------------
        # Bundle features updates (a stored 0 means "not set yet")
        if first_bundle_dict[uid] == 0:
            first_bundle_dict[uid] = bundle

        # ------------------------------------------------------------------
        # Client Question updates
        answered_correctly_uq[uid][question] += 1

    return None

In [12]:
def add_features_inf(df, answered_correctly_u_count, answered_correctly_u_sum, elapsed_time_u_sum, explanation_u_sum, timestamp_u, timestamp_u_incorrect, answered_correctly_uq,timestamp_l, nb_l_watched, part_u_count,part_u_sum, tags_u_count,tags_u_sum,first_bundle_dict,community_u_sum,community_u_count,timestamp_u_correct):
    """Compute the user-state features for the question rows of ``df``.

    Rows are visited in order. For a row whose ``content_type_id`` equals 0 the
    current state dicts are read to fill one slot of every feature array; a
    feature is NaN whenever the state it relies on is empty (zero count, empty
    timestamp list, or a stored value of 0 for ``timestamp_u_correct``,
    ``timestamp_l`` and ``first_bundle_dict``). Any other row is treated as a
    lecture: it produces no output row but increments ``nb_l_watched`` and
    overwrites ``timestamp_l`` for that user, so later rows of the same frame
    already see it.

    The answer-related dicts are only read here (lookups on ``defaultdict``
    arguments may still insert default entries).

    Returns
    -------
    pandas.DataFrame
        The question rows of ``df`` (index reset) with seventeen feature columns
        appended on the right.
    """
    question_mask = df.content_type_id == False
    n_questions = len(df[question_mask])

    def blank(dtype):
        # One zero-initialised slot per question row
        return np.zeros(n_questions, dtype = dtype)

    # -----------------------------------------------------------------------
    # Client features
    answered_correctly_u_avg = blank(np.float32)
    elapsed_time_u_avg = blank(np.float32)
    explanation_u_avg = blank(np.float32)
    timestamp_u_recency_1 = blank(np.float32)
    timestamp_u_recency_2 = blank(np.float32)
    timestamp_u_recency_3 = blank(np.float32)
    timestamp_u_incorrect_recency = blank(np.float32)
    timestamp_u_correct_recency = blank(np.float32)
    part_u_avg = blank(np.float32)
    tags_u_avg = blank(np.float32)
    user_count = blank(np.int32)
    user_part_count = blank(np.int32)
    # Bundle features
    first_bundle = blank(np.float32)
    # Community features
    community_u_avg = blank(np.float32)
    # User Question
    answered_correctly_uq_count = blank(np.int32)
    # Lecture features
    timestamp_l_recency_1 = blank(np.float32)
    nb_u_lect_watched = blank(np.int16)

    needed = ['user_id', 'content_id', 'prior_question_elapsed_time', 'prior_question_had_explanation', 'timestamp','content_type_id','part','tags1','bundle_id','community']

    slot = 0  # position of the current question row inside the feature arrays
    for user, content, _elapsed, _explained, ts, content_type, part, tag, _bundle, community in df[needed].values:

        if content_type != 0:
            # Lecture row: update the lecture state, emit nothing
            nb_l_watched[user] += 1
            timestamp_l[user] = ts
            continue

        # ------------------------------------------------------------------
        # Client features assignation
        answers_seen = answered_correctly_u_count[user]
        if answers_seen != 0:
            answered_correctly_u_avg[slot] = answered_correctly_u_sum[user] / answers_seen
            elapsed_time_u_avg[slot] = elapsed_time_u_sum[user] / answers_seen
            explanation_u_avg[slot] = explanation_u_sum[user] / answers_seen
        else:
            answered_correctly_u_avg[slot] = np.nan
            elapsed_time_u_avg[slot] = np.nan
            explanation_u_avg[slot] = np.nan

        # Gaps between the current row and the (up to three) previous questions.
        # A history longer than three entries leaves the zero initialisation as is.
        history = timestamp_u[user]
        depth = len(history)
        if depth <= 3:
            timestamp_u_recency_1[slot] = ts - history[-1] if depth >= 1 else np.nan
            timestamp_u_recency_2[slot] = history[-1] - history[-2] if depth >= 2 else np.nan
            timestamp_u_recency_3[slot] = history[-2] - history[-3] if depth == 3 else np.nan

        last_wrong = timestamp_u_incorrect[user]
        timestamp_u_incorrect_recency[slot] = ts - last_wrong[0] if len(last_wrong) != 0 else np.nan

        part_seen = part_u_count[user][part]
        part_u_avg[slot] = part_u_sum[user][part] / part_seen if part_seen != 0 else np.nan

        tag_seen = tags_u_count[user][tag]
        tags_u_avg[slot] = tags_u_sum[user][tag] / tag_seen if tag_seen != 0 else np.nan

        user_count[slot] = answered_correctly_u_count[user]
        user_part_count[slot] = part_u_count[user][part]

        last_correct = timestamp_u_correct[user]
        timestamp_u_correct_recency[slot] = ts - last_correct if last_correct != 0 else np.nan

        # ------------------------------------------------------------------
        # Community features assignation
        community_seen = community_u_count[user][community]
        community_u_avg[slot] = community_u_sum[user][community] / community_seen if community_seen != 0 else np.nan

        # ------------------------------------------------------------------
        # Bundle features assignation
        stored_bundle = first_bundle_dict[user]
        first_bundle[slot] = stored_bundle if stored_bundle != 0 else np.nan

        # ------------------------------------------------------------------
        # Lectures features assignation
        last_lecture = timestamp_l[user]
        timestamp_l_recency_1[slot] = ts - last_lecture if last_lecture != 0 else np.nan
        nb_u_lect_watched[slot] = nb_l_watched[user]

        # ------------------------------------------------------------------
        # Client Question assignation
        answered_correctly_uq_count[slot] = answered_correctly_uq[user][content]

        slot += 1

    user_df = pd.DataFrame({
        'answered_correctly_u_avg': answered_correctly_u_avg,
        'elapsed_time_u_avg': elapsed_time_u_avg,
        'explanation_u_avg': explanation_u_avg,
        'answered_correctly_uq_count': answered_correctly_uq_count,
        'timestamp_u_recency_1': timestamp_u_recency_1,
        'timestamp_u_recency_2': timestamp_u_recency_2,
        'timestamp_u_recency_3': timestamp_u_recency_3,
        'timestamp_u_incorrect_recency': timestamp_u_incorrect_recency,
        'timestamp_l_recency_1': timestamp_l_recency_1,
        'nb_u_lect_watched': nb_u_lect_watched,
        'part_u_avg': part_u_avg,
        'tags_u_avg': tags_u_avg,
        'user_count': user_count,
        'user_part_count': user_part_count,
        'first_bundle': first_bundle,
        'community_u_avg': community_u_avg,
        'timestamp_u_correct_recency': timestamp_u_correct_recency,
    })

    questions_only = df.loc[question_mask].reset_index(drop = True)

    return pd.concat([questions_only, user_df], axis = 1)

In [13]:
def inference(TARGET, FEATURES, model, prior_question_elapsed_time_mean, features_dicts, questions_df,agg, part):
    """Serve predictions through the riiideducation API while keeping the user state current.

    For every batch yielded by the API iterator:
      1. the previous batch is labelled with the answers carried in the first
         ``prior_group_answers_correct`` cell of the new batch and folded into
         the state dicts via ``update_features``;
      2. the new batch is joined with ``questions_df``, imputed, and turned into
         features by ``add_features_inf`` (which also drops non-question rows);
      3. ``agg`` and ``part`` are merged in, the model scores ``FEATURES`` and
         the result is submitted with ``env.predict``.

    Parameters
    ----------
    TARGET : str
        Name of the label / prediction column.
    FEATURES : list of str
        Columns handed to ``model.predict``, in this order.
    model : object
        Fitted model exposing ``predict``.
    prior_question_elapsed_time_mean : float
        Fill value for missing ``prior_question_elapsed_time``.
    features_dicts : dict
        State dicts keyed by name; they are mutated as batches arrive.
    questions_df, agg, part : pandas.DataFrame
        Lookup tables merged on ``content_id``/``question_id``, ``content_id`` and ``part``.

    Returns
    -------
    None
    """
    import ast  # function-scope import: only needed to parse the answer strings below

    # update_features and add_features_inf both name their parameters after these
    # keys, so the dicts can be passed by keyword instead of by position.
    state_names = (
        'answered_correctly_u_count', 'answered_correctly_u_sum', 'elapsed_time_u_sum',
        'explanation_u_sum', 'timestamp_u', 'timestamp_u_incorrect', 'answered_correctly_uq',
        'timestamp_l', 'nb_l_watched', 'part_u_count', 'part_u_sum', 'tags_u_count',
        'tags_u_sum', 'first_bundle_dict', 'community_u_sum', 'community_u_count',
        'timestamp_u_correct',
    )
    state = {name: features_dicts[name] for name in state_names}

    # Get api iterator and predictor
    env = riiideducation.make_env()
    iter_test = env.iter_test()

    previous_test_df = None
    for (test_df, sample_prediction_df) in iter_test:
        if previous_test_df is not None:
            # The cell holds a list literal as text. literal_eval only accepts
            # Python literals, whereas eval() would execute whatever expression
            # the string contains.
            previous_test_df[TARGET] = ast.literal_eval(test_df["prior_group_answers_correct"].iloc[0])
            update_features(previous_test_df, **state)

        test_df = pd.merge(test_df, questions_df[['question_id', 'part','tags1','bundle_id','community','tags_encoded']], left_on = 'content_id', right_on = 'question_id', how = 'left')

        test_df['prior_question_had_explanation'] = test_df.prior_question_had_explanation.fillna(False).astype('int8')
        # Assign the filled column back: `df[col].fillna(..., inplace=True)` is a
        # chained assignment that stops updating the frame under pandas Copy-on-Write.
        test_df['prior_question_elapsed_time'] = test_df['prior_question_elapsed_time'].fillna(prior_question_elapsed_time_mean)

        # Keep the full batch (lectures included) so it can be labelled next round.
        previous_test_df = test_df.copy()

        # Returns the question rows only, with a fresh 0..n-1 index.
        test_df = add_features_inf(test_df, **state)

        test_df = pd.merge(test_df, agg, on = 'content_id', how = 'left')
        test_df = pd.merge(test_df, part, on = 'part', how = 'left')

        test_df[TARGET] = model.predict(test_df[FEATURES])
        env.predict(test_df[['row_id', TARGET]])

    print('Job Done')

In [14]:
%%time
# Run the prediction loop against the competition API, using the model, lookup
# tables and state dicts prepared in the cells above.
inference(TARGET, FEATURES, model, prior_question_elapsed_time_mean, features_dicts,questions_df, agg, part)

Job Done
CPU times: user 1.04 s, sys: 41 ms, total: 1.08 s
Wall time: 411 ms
