# **Riiid! Answer Correctness Prediction**

## Import libraries

In [None]:
import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import roc_auc_score
from tqdm.notebook import tqdm



#del datafram_df
#gc.collect()

## Functions

In [None]:
def print_filling_rate(data_df, a_label_abscisse, color_graphe, color_threshold):
    """Plot the filling rate (share of non-null values) of every column.

    Columns are drawn as bars sorted by increasing filling rate.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Data whose columns are inspected.
    a_label_abscisse : bool
        When true, the column names are written on the x axis and a legend
        is added.
    color_graphe : str
        Matplotlib colour of the bars; an empty value falls back to 'blue'.
    color_threshold : str
        Currently unused; kept so that existing calls keep working.
    """
    if not color_graphe:
        color_graphe = 'blue'
    # Computed once: it provides both the bar heights and the tick labels.
    filling_rates = (data_df.count() / len(data_df)).sort_values()
    ind = np.arange(len(filling_rates))
    width = 0.5
    fig, axes = plt.subplots(1, 1, figsize=(6, 3), dpi=100)
    bars = axes.bar(ind, filling_rates.values, width, color=color_graphe)
    axes.set_ylabel('Filling rates')
    if a_label_abscisse:
        axes.set_xticks(ind)
        axes.set_xticklabels(filling_rates.index, fontsize=10, rotation=90)
        axes.legend([bars], ['Filling rates'])
        

def print_pie(data_df, column, title_fig, title_legend):
    """Draw a pie chart of the relative frequency of each value of a column.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Data holding the column to summarise.
    column : str
        Name of the column whose values are counted.
    title_fig : str
        Title written above the figure.
    title_legend : str
        Title of the legend box.
    """
    fig, ax = plt.subplots(figsize=(6, 6))
    figure_title = plt.title(title_fig, fontsize=15, weight="bold")
    figure_title.set_position([0, 1.05])

    # Shares are ordered by value (not by frequency) so slices are stable.
    shares = data_df[column].value_counts(normalize=True).sort_index()
    shares.plot(
        kind='pie',
        startangle=180,
        counterclock=False,
        autopct='%1.1f%%',
        fontsize=14,
    )

    plt.axis('equal')
    plt.ylabel('')
    plt.rcParams['legend.title_fontsize'] = 'large'
    ax.legend(
        title=title_legend,
        loc="center right",
        bbox_to_anchor=(1, 0, 1, 1),
        fontsize='medium',
    )
    plt.show()
        
        

## Data loading

We will follow these tutorials: [Competition API Detailed Introduction](https://www.kaggle.com/sohier/competition-api-detailed-introduction) and 
[Tutorial on reading large datasets](https://www.kaggle.com/rohanrao/tutorial-on-reading-large-datasets/).

In [None]:
%%time

# Load the small CSV files shipped with the competition: question metadata,
# lecture metadata, and the example test / submission files.
questions_df = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/questions.csv')
lectures_df = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/lectures.csv')
example_test = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/example_test.csv')
example_sample_submission = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/example_sample_submission.csv')

## <span style="color:green">train.csv</span>
### <span style="color:green">The variables :</span>

**<span style="color:darkblue">row_id</span>**: (int64) ID code for the row.

<span style="color:darkblue">**user_id**</span>: (int32) ID code for the user.
    
<span style="color:darkblue">**timestamp**</span>: (int64) the time in milliseconds between this user interaction and the first event completion from that user. <br>
<span style="color:crimson">**Continuous variable**</span>

    
<span style="color:darkblue">**user_answer**</span>: (int8) <br />
0, 1, 2, 3 if content_type_id == 0. -1 if content_type_id == 1 (lecture).<br>
<span style="color:crimson">**Categorical variable**</span>


<span style="color:darkblue">**content_id**</span>: (int16) ID code for the user interaction

    
<span style="color:darkblue">**content_type_id**</span>: (int8) 0 if the event was a question being posed to the user, 1 if the event was the user watching a lecture.<br>
<span style="color:crimson">**Categorical variable**</span>

<span style="color:darkblue">**task_container_id**</span>: (int16) Id code for the batch of questions or lectures. For example, a user might see three questions in a row before seeing the explanations for any of them. Those three would all share a task_container_id.

<span style="color:darkblue">**prior_question_elapsed_time**</span>: (float32) The average time in milliseconds it took a user to answer each question in the previous question bundle, ignoring any lectures in between. Is null for a user's first question bundle or lecture. Note that the time is the average time a user took to solve each question in the previous bundle.<br>
<span style="color:crimson">**Continuous variable**</span>

<span style="color:darkblue">**prior_question_had_explanation**</span>: (bool) Whether or not the user saw an explanation and the correct response(s) after answering the previous question bundle, ignoring any lectures in between. The value is shared across a single question bundle, and is null for a user's first question bundle or lecture. Typically the first several questions a user sees were part of an onboarding diagnostic test where they did not get any feedback.<br>
<span style="color:crimson">**Boolean variable**</span>

<span style="color:green">**TARGET**:</span><br>

<span style="color:darkblue">**answered_correctly**</span>: (int8) if the user responded correctly. Read -1 as null, for lectures.<br>



=> We want to predict whether the user will answer correctly or not.<br>
We will exclude the value "-1", which corresponds to a lecture, not an answer.<br>
<span style="color:magenta">**So we have a binary classification problem to solve.**</span>

In [None]:
%%time

# Explicit dtypes keep the memory footprint low; the nullable "boolean"
# dtype is used because prior_question_had_explanation contains nulls.
dtypes = {
    "row_id": "int64",
    "timestamp": "int64",
    "user_id": "int32",
    "content_id": "int16",
    "content_type_id": "int8",
    "task_container_id": "int16",
    "user_answer": "int8",
    "answered_correctly": "int8",
    "prior_question_elapsed_time": "float32", 
    "prior_question_had_explanation": "boolean"
}

# Only the first 10**6 rows of train.csv are read.
train_df = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/train.csv', 
                       low_memory=False, 
                       nrows=10**6, 
                       dtype=dtypes
                      )
print("Train size:", train_df.shape)

In [None]:
train_df.memory_usage(deep=True)

In [None]:
train_df.info()

In [None]:
train_df.head(4)

In [None]:
train_df.describe()

In [None]:
# shape is (rows, columns): the column count is shape[1], not shape[0].
n_rows, n_columns = train_df.shape
print("The dataset contains {} rows and {} columns. \n".format(n_rows, n_columns))
for col in train_df:
    print("The column {} has {} unique values.".format(col, train_df[col].nunique()))

In [None]:
# Missing values per column: absolute counts first, then share of rows.
missing_per_column = train_df.isnull().sum()
print(missing_per_column)
print("****************************************")
print(missing_per_column / len(train_df))

print_filling_rate(train_df, True, "blue", "blue")

## user_id

* 3 824 unique users
* no missing value

## Categorical variables :
* content_type_id
* user_answer
* answered_correctly **(TARGET)**

### content_type_id

In [None]:
train_df.content_type_id.value_counts()

In [None]:
fig, ax = plt.subplots(figsize=(5, 5))
plt.title('Percentage of content type : questions or lectures', fontsize=15, weight="bold")
labels = ['Questions', 'Lectures']
colors = ['lightgray', 'peachpuff']
explode = (0.1, 0.2)
# value_counts() orders by frequency, so the hard-coded labels would be
# swapped if lectures ever outnumbered questions. Fix the order explicitly:
# 0 = question, 1 = lecture (a missing category gets a zero share).
content_type_share = (train_df["content_type_id"]
                      .value_counts(normalize=True)
                      .reindex([0, 1], fill_value=0))
content_type_share.plot(kind='pie', labels=labels, colors=colors, explode=explode,
                        startangle=50, autopct='%1.1f%%', fontsize=13)
plt.axis('equal')
plt.ylabel('')
plt.show()

print(train_df['content_type_id'].value_counts().sort_index().to_frame())

The majority of user interactions are <span style="color:magenta">**questions**</span>: 98% vs. 2% for lectures.

### user_answer

In [None]:
train_df.user_answer.value_counts()

In [None]:
print_pie(train_df[train_df['user_answer']>=0], 'user_answer', "Percentage of user answers (only questions) :", "User answers: ")

The user answers are 0, 1, 2 or 3. But what does 0 mean: is it a possible answer, or does it mean "no answer"? 
Let's have a look at the possible answers (in questions_df):

In [None]:
print_pie(questions_df, 'correct_answer', "Percentage of possible answers:", "Possible answers: ")

Ok, 0 is a possible answer. 

How can we tell when the user didn't know the answer? Is that even possible?

## **answered_correctly = TARGET**

In [None]:
train_df.answered_correctly.value_counts()

In [None]:
# Lecture rows carry answered_correctly == -1 and are excluded here.
answers = train_df.loc[train_df['answered_correctly'] >= 0, 'answered_correctly']

fig, ax = plt.subplots(figsize=(5, 5))
plt.title('Percentage of correct and uncorrect answers (only questions) :', fontsize=15, weight="bold")
labels = ['Correct', 'Uncorrect']
colors = ['lightgray', 'peachpuff']
explode = (0.1, 0)
# value_counts() orders by frequency; pin the order to (1, 0) so that the
# labels above always describe the right slice.
answer_share = answers.value_counts(normalize=True).reindex([1, 0], fill_value=0)
answer_share.plot(kind='pie', labels=labels, colors=colors, explode=explode,
                  startangle=50, autopct='%1.1f%%', fontsize=13)
plt.axis('equal')
plt.ylabel('')
plt.show()

print(answers.value_counts().sort_index().to_frame())

### 'prior_question_had_explanation' (boolean)

In [None]:
# Nulls are treated as "no explanation". The result is assigned back
# instead of calling fillna(inplace=True) on a column selection, which may
# act on a temporary copy and leave train_df unchanged.
# NOTE: from here on the column has no nulls left, so later filters on
# isna() for this column no longer exclude anything.
train_df['prior_question_had_explanation'] = train_df['prior_question_had_explanation'].fillna(False)
# Numeric encoding of the flag: 1.0 = had explanation, 0.0 = did not.
train_df['prior_had_explanation_le'] = train_df['prior_question_had_explanation'].astype('float64')

In [None]:
fig, ax = plt.subplots(figsize=(5, 5))
plt.title('Prior question had explanation ? (percentage):', fontsize=15, weight="bold")
labels = ['Yes', 'No']
colors = ['lightgray', 'peachpuff']
explode = (0.1, 0)
# value_counts() orders by frequency; pin the order to (1, 0) so that
# 'Yes' / 'No' always label the right slice.
explanation_share = (train_df["prior_had_explanation_le"]
                     .value_counts(normalize=True)
                     .reindex([1, 0], fill_value=0))
explanation_share.plot(kind='pie', labels=labels, colors=colors, explode=explode,
                       startangle=50, autopct='%1.1f%%', fontsize=13)
plt.axis('equal')
plt.ylabel('')
plt.show()

print(train_df['prior_had_explanation_le'].value_counts().sort_index().to_frame())

In [None]:
pd.set_option('display.max_columns', None)
pivot_table = pd.pivot_table(train_df, index=['user_answer']).style.background_gradient()
pivot_table

In [None]:
# Keep question rows only, then show the share of each value of
# prior_question_had_explanation among the rows where it is not null.
is_question = train_df['content_type_id'] == 0
train_only_questions_df = train_df[is_question]
explanation_known = ~train_only_questions_df['prior_question_had_explanation'].isna()
train_only_questions_df['prior_question_had_explanation'].value_counts() / len(train_only_questions_df[explanation_known])

In [None]:
# Mean correctness of question rows, split by whether the previous bundle
# came with an explanation (rows with a null flag are left out).
explanation_known = ~train_only_questions_df['prior_question_had_explanation'].isna()
rows_with_flag = train_only_questions_df[explanation_known]
rows_with_flag.groupby('prior_question_had_explanation')['answered_correctly'].mean()

## Continuous variables :
* timestamp (Time variables milliseconds)
* prior_question_elapsed_time (Time variables milliseconds)

### timestamp

In [None]:
# Milliseconds in a 365-day year, and in one twelfth of it (an average month).
nbMillisecByYear = 1000 * 60 * 60 * 24 * 365
nbMillisecByMonth = nbMillisecByYear / 12
# Timestamp expressed in months instead of milliseconds.
train_df['timestamp_by_month'] = train_df['timestamp']/nbMillisecByMonth
fig = plt.figure(figsize=(12,6))
train_df['timestamp_by_month'].plot.hist(bins=100)
plt.title("Histogram of timestamp converted in month")
plt.xticks(rotation=0)
plt.xlabel("Months between this user interaction and the first event completion from that user")
plt.show()

In [None]:
# Keep interactions from the first 24 months only. The explicit copy makes
# train_df an independent frame, so the columns added to it afterwards do not
# trigger SettingWithCopyWarning or write to a temporary slice.
train_df = train_df[train_df['timestamp_by_month'] <= 24].copy()

In [None]:
train_df['timestamp'].plot(kind='box', subplots=True, title='Boxplot timestamp', figsize=(20,20), layout=(4,4))
plt.show()

### prior_question_elapsed_time

In [None]:
train_df['prior_question_elapsed_time'].hist(figsize=(7,4), density=True, bins= 50)
plt.show()



In [None]:
time_limit = 5*60*1000
print(time_limit)

In [None]:
# Milliseconds -> minutes (divide by 1000, then by 60). Nulls in the source
# column stay null in this derived column.
train_df['prior_question_elapsed_time_in_min'] = train_df['prior_question_elapsed_time']/1000/60

In [None]:
train_df['prior_question_elapsed_time'].plot(kind='box', subplots=True, title='Boxplot prior_question_elapsed_time', figsize=(20,20), layout=(4,4))
plt.show()

In [None]:
# Series.mean() skips nulls by default, so no explicit filtering is needed.
prior_question_elapsed_time_mean = train_df['prior_question_elapsed_time'].mean()
# Assign the result back rather than using fillna(inplace=True) on a column
# selection, which may operate on a temporary copy.
train_df['prior_question_elapsed_time'] = train_df['prior_question_elapsed_time'].fillna(prior_question_elapsed_time_mean)

In [None]:
train_df.columns

### Correlation

In [None]:
list_columns_corr = ['timestamp', 'user_id', 'content_id', 
                     'task_container_id', 'user_answer', 'answered_correctly',
                     'prior_question_elapsed_time', 'prior_question_had_explanation']

In [None]:
# Kendall rank correlation between the selected columns, computed on the rows
# where prior_question_elapsed_time is not null.
corr = train_df[~(train_df['prior_question_elapsed_time'].isna())][list_columns_corr].corr(method="kendall")
f, ax = plt.subplots(figsize=(10, 10))
plt.rcParams['font.size'] = 11
ax.set_title('Kendall correlation')

# The mask is all False, i.e. every cell is drawn. The builtin `bool` is used
# because the `np.bool` alias was removed from NumPy (1.24+).
sns.heatmap(corr, mask=np.zeros_like(corr, dtype=bool), fmt=".2f", cmap='coolwarm', square=True, ax=ax)

## questions.csv

In [None]:
questions_df[questions_df['tags'].isna()]

In [None]:
questions_df.describe()

In [None]:
questions_df.info()

In [None]:
# Missing values per column of questions_df: counts, then share of rows.
# The share must be relative to questions_df itself, not to train_df.
missing_per_column = questions_df.isnull().sum()
print(missing_per_column)
print("****************************************")
print(missing_per_column / len(questions_df))

print_filling_rate(questions_df, True, "green", "green")

In [None]:
# Questions without tags get an empty string; assigning the result back
# avoids fillna(inplace=True) on a column selection (may hit a temporary copy).
questions_df['tags'] = questions_df['tags'].fillna("")
# Tags are whitespace separated, so the tag count is the number of tokens.
questions_df["nb_tags"] = questions_df["tags"].apply(lambda text: len(text.split()))

questions_df[questions_df['question_id'] == 10033]

In [None]:
# Drop questions that have no tag. The explicit copy keeps later column
# assignments on questions_df from targeting a slice of the original frame.
questions_df = questions_df[questions_df["nb_tags"] > 0].copy()

In [None]:
test = "".split()
len(test)

In [None]:
# Max, min and average number of tags per question.
nb_tags = questions_df['nb_tags']
print('Le nombre max de tags par question : {}'.format(nb_tags.max()))
print('Le nombre min de tag par question : {}'.format(nb_tags.min()))
print('Le nombre moyen de tags par question : %f' % (nb_tags.sum() / len(nb_tags)))

In [None]:
import matplotlib.style as style
# Matplotlib 3.6 renamed the bundled seaborn styles with a 'seaborn-v0_8-'
# prefix; use whichever name the installed version provides.
for style_name in ('seaborn-v0_8-dark-palette', 'seaborn-dark-palette'):
    if style_name in style.available:
        style.use(style_name)
        break
fig, ax = plt.subplots(figsize=(10, 5))
ax.set_title("Répartition du nombre de tags par question", fontsize=16, weight="bold")

# The data must be passed as `x=`: recent seaborn versions no longer accept
# the plotted vector as first positional argument.
sns.countplot(x=questions_df['nb_tags'], palette="Set1", ax=ax)
ax.set_ylabel("Nombre d'occurences", fontsize=14)
ax.set_xlabel("Nombre de tags par question", fontsize=14)

In [None]:
# One column per tag position. The downstream features expect exactly
# tags1..tags6, so the split result is padded (or truncated) to six columns
# instead of assuming the data happens to produce exactly six.
tag_columns = ['tags1', 'tags2', 'tags3', 'tags4', 'tags5', 'tags6']
tag = questions_df["tags"].str.split(" ", expand=True)
tag = tag.reindex(columns=range(len(tag_columns)))
tag.columns = tag_columns
# Tags are numeric identifiers; missing positions become NaN.
tag = tag.apply(pd.to_numeric, errors='coerce')

questions_df = pd.concat([questions_df, tag], axis=1)

In [None]:
# Per-question list of tags, then one flat list over all questions.
questions_df['tags_list'] = questions_df['tags'].apply(str.split)
tags_list = []
for question_tags in questions_df['tags_list'].values:
    tags_list.extend(question_tags)
print(len(tags_list))
# Distinct tags only.
tags_unique_list = list(set(tags_list))
print(len(tags_unique_list))

In [None]:
import nltk
def distribution_nb_apparition_tag_questions(nb_tags, title, with_return):
    """Plot the occurrence counts of the `nb_tags` most frequent tags.

    Reads the module-level `tags_frequence` frequency distribution.

    Parameters
    ----------
    nb_tags : int
        Number of most common tags to include.
    title : str
        Title of the plot.
    with_return : bool
        When true, the (tag, count) table is returned.

    Returns
    -------
    pandas.DataFrame or None
        Columns 'tags' and 'nb_tags' when `with_return` is true, else None.
    """
    most_common = tags_frequence.most_common(nb_tags)
    tags_df = pd.DataFrame(most_common, columns=['tags', 'nb_tags'])

    # Counts in decreasing order give the curve to draw.
    ordered_counts = tags_df.sort_values(['nb_tags'], ascending=False)['nb_tags'].values

    plt.plot(ordered_counts)
    plt.title(title)
    plt.grid()
    plt.xlabel("Nombre de tags")
    plt.ylabel("Nombre d'occurences")
    plt.show()

    return tags_df if with_return else None

In [None]:
# Frequency distribution of every tag occurrence across all questions; it is
# read as a global by distribution_nb_apparition_tag_questions().
tags_frequence= nltk.FreqDist(tags_list)



# Plot the distribution over all distinct tags and keep the (tag, count) table.
tags_df = distribution_nb_apparition_tag_questions(len(tags_unique_list), "Distribution du nombre d'apparition des tags", True)

In [None]:
distribution_nb_apparition_tag_questions(25, "Distribution du nombre d'apparition des 25 tags les plus fréquents", False)

In [None]:
# Number of most frequent tags kept for the one-hot encoding below.
NB_TAGS = 25
# f-string so the title shows the actual value instead of the literal
# text "NB_TAGS".
distribution_nb_apparition_tag_questions(NB_TAGS, f"Distribution du nombre d'apparition des {NB_TAGS} tags les plus fréquents", False)

In [None]:
words_most_common = tags_frequence.most_common(NB_TAGS)

In [None]:
# Table of the most common tags with their counts; the tag labels of the
# first NB_TAGS rows become the selected vocabulary.
fq_words_df = pd.DataFrame(words_most_common, columns=['tags', 'nb_tags'])
fq_words_df.head()
top_tags = fq_words_df['tags'].iloc[:NB_TAGS]
SELECTED_TAGS = top_tags.to_list()
print(SELECTED_TAGS)

In [None]:
def tags_column_processing(tags):
    """Return the tags of a whitespace-separated string that are selected.

    Only tags present in the module-level SELECTED_TAGS are kept; order and
    duplicates of the input are preserved.
    """
    return [tag for tag in tags.split() if tag in SELECTED_TAGS]

questions_df['new_tags'] = questions_df["tags"].apply(lambda text : tags_column_processing(text))

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer
# One indicator column per selected tag, in the order of SELECTED_TAGS.
classes_tags = tuple(SELECTED_TAGS)
one_hot = MultiLabelBinarizer(classes=classes_tags)

# fit_transform returns a bare array: give the frame the SAME index as
# questions_df. Rows were filtered out of questions_df earlier without
# resetting the index, so a default RangeIndex would make a later
# pd.concat(axis=1) pair questions with the encoding of a different row.
types_encoded = pd.DataFrame(one_hot.fit_transform(questions_df['new_tags']),
                             columns=one_hot.classes_,
                             index=questions_df.index)
types_encoded.head()

In [None]:
# Append the one-hot tag columns to questions_df. With axis=1 pandas aligns
# the two frames on their index labels, not on row position.
questions_df = pd.concat([questions_df,types_encoded], axis = 1)
questions_df.head()

In [None]:
questions_df.columns

In [None]:
# Drop the raw / intermediate tag columns now that the tags are encoded.
questions_df = questions_df.drop(['new_tags', 'tags', 'tags_list'],axis = 1)

In [None]:
# Remove rows without a `part`. The explicit copy keeps the column
# assignments that follow from writing to a slice of the previous frame.
questions_df = questions_df[~questions_df['part'].isna()].copy()

In [None]:
questions_df['part'].unique()

In [None]:
# Flag set to 1 for parts 5 and above, 0 otherwise.
questions_df['is_reading_section'] = (questions_df['part'] >= 5).astype('int64')

In [None]:
# Three indicator columns derived from `part`:
#   easy      -> part <= 3
#   difficult -> part 4 or 5
#   medium    -> part > 6
# NOTE(review): part == 6 matches none of the three conditions, so all flags
# stay 0 for it — confirm whether `>= 6` was intended for the medium bucket.
questions_df['is_easy_part'] = 0
questions_df['is_medium_part'] = 0
questions_df['is_difficult_part'] = 0
questions_df.loc[questions_df['part'] <= 3 ,'is_easy_part'] = 1
questions_df.loc[(questions_df['part'] == 4)|(questions_df['part'] == 5) ,'is_difficult_part'] = 1
questions_df.loc[questions_df['part'] >6 ,'is_medium_part'] = 1

In [None]:
questions_df.head()

In [None]:
questions_df.head(20)

## Lectures.csv
cf https://www.kaggle.com/jsylas/utilize-lecture-in-your-model-before-answering

In [None]:
# Number of lecture rows per user (content_type_id is 1 for lectures, so the
# sum over lecture rows counts them).
lecture_cnt = train_df[train_df.content_type_id == True][['user_id','content_type_id']].groupby(['user_id'],as_index = False).agg(['sum']).reset_index()
lecture_cnt.columns = ["user_id","lecture_heard_count"]
# Users who have at least one lecture row.
user_list = lecture_cnt['user_id'].unique()
# Question rows only, split by whether their user appears in user_list.
train = train_df[train_df.content_type_id == False]
train_lecture_not_heard = train[~train.user_id.isin(user_list)]
train_lecture_heard = train[train.user_id.isin(user_list)]
# One row per user, labelled 'Not Heard' / 'Heard', stacked into one frame.
train_lecture_not_heard_unq = pd.DataFrame(train_lecture_not_heard['user_id'].unique())
train_lecture_not_heard_unq.columns = ['user_id']
train_lecture_not_heard_unq['lecture'] = 'Not Heard'
train_lecture_heard_unq = pd.DataFrame(train_lecture_heard['user_id'].unique())
train_lecture_heard_unq.columns = ['user_id']
train_lecture_heard_unq['lecture'] = 'Heard'
train_lecture = pd.concat([train_lecture_not_heard_unq,train_lecture_heard_unq],axis=0)

In [None]:
# Number of students who have / have not followed at least one lecture.
plt.figure(figsize=(10, 5))
ax = sns.countplot(x=train_lecture['lecture'], palette=['#f76f6f', "#0cdeed"])
ax.set_xlabel('Lecture Heard Student VS Lecture Not Heard Student', size=15)
# train_lecture holds one row per student, so its length is the student
# count of the loaded sample (the title used to show a hard-coded number).
plt.title(f"Number of Student : {len(train_lecture)}", size=15)

# Write the height of each bar just above it.
for p in ax.patches:
    x = p.get_bbox().get_points()[:, 0]
    y = p.get_bbox().get_points()[1, 1]
    ax.annotate('{:.0f}'.format(p.get_height()), (x.mean(), y), ha='center', va='bottom')

In [None]:
# Mean of answered_correctly per user, computed separately for the users
# without and with lecture rows.
train_lecture_not_heard_mean = train_lecture_not_heard[['user_id','answered_correctly']].groupby(['user_id'],as_index = False).agg(['mean']).reset_index()
train_lecture_not_heard_mean.columns= ['user_id','train_lecture_not_heard_mean']
train_lecture_heard_mean = train_lecture_heard[['user_id','answered_correctly']].groupby(['user_id'],as_index = False).agg(['mean']).reset_index()
train_lecture_heard_mean.columns = ['user_id','train_lecture_heard_mean']

In [None]:
# Density of the per-user mean correctness for both groups. The vertical
# lines mark the overall (row-level) mean correctness of each group.
plt.rcParams['figure.figsize'] = [15,10]
plt.rcParams['font.size'] = 14
sns.kdeplot(train_lecture_heard_mean.train_lecture_heard_mean, label="Lecture Heard", clip=[0,1])
plt.axvline(train_lecture_heard['answered_correctly'].mean(), color='blue')
sns.kdeplot(train_lecture_not_heard_mean.train_lecture_not_heard_mean, label="Lecture Not Heard", clip=[0,1])
plt.axvline(train_lecture_not_heard['answered_correctly'].mean(), color='orange')

# Text annotations showing the overall mean of each group, rounded to 2 decimals.
plt.text(train_lecture_heard_mean.train_lecture_heard_mean.mean()-.3, 3,
         f"Lecture Heard Student Mean  mean={round(train_lecture_heard['answered_correctly'].mean(), 2)}")

plt.text(train_lecture_not_heard_mean.train_lecture_not_heard_mean.mean()-.25, 2.5,
         f"Lecture Not Heard Student Mean mean={round(train_lecture_not_heard['answered_correctly'].mean(), 2)}")

plt.title("Lecture Heard Vs Lecture Not Heard")
plt.xlabel("average answer correctness per user")
plt.ylabel("pdf")

plt.legend()
plt.show()

As we can see in this analysis, the students who attended a lecture before answering perform better than the students who did not.

In [None]:
# count must be called: without the parentheses the bound method itself was
# printed instead of the per-user counts.
print(train_lecture.groupby('user_id').count())

## Merge questions and train

In [None]:
train_df = pd.merge(train_df,questions_df, how='left', left_on='content_id', right_on='question_id').sort_values('row_id')


In [None]:
train_df.head()
#train_df[train_df['1'] == 1]

## Pivot tables

In [None]:
train_df[['user_answer', 'answered_correctly']].groupby(['user_answer'], as_index=False).mean().sort_values(by='answered_correctly', ascending=False)

In [None]:
train_df[['part', 'answered_correctly']].groupby(['part'], as_index=False).mean().sort_values(by='answered_correctly', ascending=False)


In [None]:
# Display information by user ID 
print(pd.pivot_table(train_df, index='user_id', values=['timestamp', 'prior_question_elapsed_time', 'answered_correctly'], aggfunc='mean'))

# Target

In [None]:
(train_df['answered_correctly']==-1).mean()
# We should exclude information about lectures.
train_df_questions = train_df[train_df['answered_correctly']!=-1]
train_df_questions['answered_correctly'].mean()

In [None]:
'''cids = train_df.content_id.value_counts()[:30]

fig = plt.figure(figsize=(12,6))
ax = cids.plot.bar()
plt.title("Thirty most used content id's")
plt.xticks(rotation=90)
ax.get_yaxis().set_major_formatter(FuncFormatter(lambda x, p: format(int(x), ','))) #add thousands separator
plt.show()'''

In [None]:
# Flag the rows for which no prior elapsed time exists.
# `prior_question_elapsed_time` has already been mean-imputed at this point,
# so testing it for nulls would always give 0. The minutes column was derived
# before the imputation and still carries the original nulls.
train_df['is_first_question'] = train_df['prior_question_elapsed_time_in_min'].isna().astype('int64')


In [None]:
train_df.head()

In [None]:
# Per-user statistics of the target. Lecture rows carry answered_correctly
# == -1; they are excluded so they neither lower the sum nor inflate the count.
question_rows = train_df[train_df['answered_correctly'] != -1]
user_agg = question_rows.groupby('user_id')['answered_correctly'].agg(['sum', 'count'])

# Users without any question row get a count of 0.
train_df['user_count'] = train_df['user_id'].map(user_agg['count']).fillna(0).astype('int32')
# The mean lies in [0, 1]: it must stay a float (an integer cast truncates
# almost every value to 0).
train_df['user_nb_mean'] = train_df['user_id'].map(user_agg['sum'] / user_agg['count']).astype('float32')

In [None]:
# Rows without a matching question get part 4. Assigning back avoids
# fillna(inplace=True) on a column selection (may act on a temporary copy).
train_df['part'] = train_df['part'].fillna(4)

In [None]:
# Per-content statistics of the target, computed on question rows only:
# lecture rows hold answered_correctly == -1 and their content_id is a
# lecture id, so they must not be mixed into the question statistics.
question_rows = train_df[train_df['content_type_id'] == 0]
content_agg = question_rows.groupby('content_id')['answered_correctly'].agg(['sum', 'count'])

# Content ids without any question row get a count of 0.
train_df['content_count'] = train_df['content_id'].map(content_agg['count']).fillna(0).astype('int32')
train_df['content_nb_mean'] = train_df['content_id'].map(content_agg['sum'] / content_agg['count'])

# Baseline

In [None]:
train_df.columns


In [None]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

In [None]:
# Keep question rows only: lecture rows have answered_correctly == -1.
train_df = train_df[train_df['answered_correctly']!= -1]

# Columns fed to the baseline model.
features_bsl = ['timestamp_by_month', 'tags1', 'tags2', 'tags3', 
                'tags4', 'tags5', 'tags6', 
                'is_first_question', 'part',
                'prior_had_explanation_le', 
                'prior_question_elapsed_time_in_min',
                'content_id', 'content_count', 'content_nb_mean',
                'user_nb_mean', 'user_count',
                'is_easy_part', 'is_medium_part', 'is_difficult_part'
                ]


# X is the standardised feature matrix (zero mean, unit variance per column);
# y is the binary target.
X = train_df[features_bsl]
X = sc.fit_transform(X)
y = train_df['answered_correctly']

# Random 80 / 20 split with a fixed seed.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)

In [None]:
from lightgbm import LGBMClassifier
import lightgbm as lgb

In [None]:
def _decayed_learning_rate(base_learning_rate, decay, current_iter, floor=1e-3):
    """Return ``base_learning_rate * decay ** current_iter``, never below ``floor``."""
    lr = base_learning_rate * np.power(decay, current_iter)
    return lr if lr > floor else floor


def learning_rate_010_decay_power_099(current_iter):
    """Learning rate starting at 0.1, multiplied by 0.99 per iteration, floored at 1e-3."""
    return _decayed_learning_rate(0.1, .99, current_iter)


def learning_rate_010_decay_power_0995(current_iter):
    """Learning rate starting at 0.1, multiplied by 0.995 per iteration, floored at 1e-3."""
    return _decayed_learning_rate(0.1, .995, current_iter)


def learning_rate_005_decay_power_099(current_iter):
    """Learning rate starting at 0.05, multiplied by 0.99 per iteration, floored at 1e-3."""
    return _decayed_learning_rate(0.05, .99, current_iter)

In [None]:
# Keyword arguments meant to be forwarded to the estimator's fit(): AUC is
# evaluated on the held-out split, named 'valid'.
# NOTE(review): recent LightGBM releases appear to have replaced the
# `early_stopping_rounds` and `verbose` fit arguments with callbacks —
# confirm against the installed version before re-enabling the search.
fit_params={"early_stopping_rounds":30, 
            "eval_metric" : 'auc', 
            "eval_set" : [(X_test,y_test)],
            'eval_names': ['valid'],
            #'callbacks': [lgb.reset_parameter(learning_rate=learning_rate_010_decay_power_099)],
            'verbose': 100,
            'categorical_feature': 'auto'}

In [None]:
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
# Search space for the randomized search: scipy distributions are sampled,
# plain lists are drawn from uniformly.
# sp_uniform(loc, scale) covers the interval [loc, loc + scale].
param_test ={'num_leaves': sp_randint(6, 50), 
             'min_child_samples': sp_randint(100, 500), 
             'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
             'subsample': sp_uniform(loc=0.2, scale=0.8), 
             'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),
             'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
             'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100]}

In [None]:
# Number of hyper-parameter combinations sampled by the randomized search.
n_HP_points_to_test = 100

from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# n_estimators (200 here) is the upper bound on the number of trees.
clf = lgb.LGBMClassifier(max_depth=7, random_state=1, metric='None', n_jobs=4, n_estimators=200)
# 3-fold cross-validated search scored by ROC AUC; the best configuration is
# refitted on the whole training input.
gs = RandomizedSearchCV(
    estimator=clf, param_distributions=param_test, 
    n_iter=n_HP_points_to_test,
    scoring='roc_auc',
    cv=3,
    refit=True,
    random_state=1,
    verbose=True)

In [None]:
#gs.fit(X_train, y_train, **fit_params)
#print('Best score reached: {} with params: {} '.format(gs.best_score_, gs.best_params_))

Best score reached: 0.7514851143810443 with params: {'colsample_bytree': 0.9080972808940212, 'min_child_samples': 343, 'min_child_weight': 0.01, 'num_leaves': 42, 'reg_alpha': 5, 'reg_lambda': 0, 'subsample': 0.863184719640143} 

In [None]:
y.describe()

In [None]:
clf = lgb.LGBMClassifier(max_depth=7,
                         random_state=1, metric='None',
                         n_jobs=4, n_estimators=200)


# Parameters of the final model, taken from the best search result with a
# hand-set learning rate.
# NOTE(review): `min_data_in_leaf` and `min_child_samples` look like two
# names for the same LightGBM setting with different values (20 vs 100) —
# confirm which one is intended and drop the other.
params = {
    'feature_fraction': 0.6884588361650144,
    'learning_rate': 0.4,
    'max_depth': 7,
    'min_child_samples': 100,
    'min_child_weight': 0.01,
    'min_data_in_leaf': 20,
    'n_estimators': 200,
    # Was misspelled 'n_job', which is not the estimator's thread-count
    # parameter.
    'n_jobs': 4,
    'num_leaves': 42,
    'random_state': 1,
    'reg_alpha': 5,
    'reg_lambda': 0,
    'subsample': 0.863184719640143,
}

lgbm = LGBMClassifier(**params)

In [None]:
train_df[features_bsl].shape

In [None]:
lgbm.fit(train_df[features_bsl], y)

In [None]:
from sklearn.metrics import roc_auc_score
# ROC AUC of the predicted probability of class 1.
# NOTE: this is computed on the same rows the model was fitted on, so it is
# a training score, not an estimate of generalisation performance.
roc_auc_score(y.values, lgbm.predict_proba(train_df[features_bsl])[:,1])

In [None]:
#displaying the most important features
lgb.plot_importance(lgbm)
plt.show()

In [None]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
import riiideducation
import pandas as pd

env = riiideducation.make_env()

In [None]:
from collections import defaultdict


def _as_default_dict(values, dtype):
    """Cast a Series to `dtype` and return it as a defaultdict(int) keyed by its index."""
    return values.astype(dtype).to_dict(defaultdict(int))


# Lookup tables used at inference time; unknown keys yield 0.
user_sum_dict = _as_default_dict(user_agg['sum'], 'int16')
user_count_dict = _as_default_dict(user_agg['count'], 'int16')
content_sum_dict = _as_default_dict(content_agg['sum'], 'int32')
content_count_dict = _as_default_dict(content_agg['count'], 'int32')

In [None]:
iter_test = env.iter_test()

# Score every batch served by the competition API. Each feature is built the
# same way as its training counterpart so the model sees consistent inputs.
for (test_df, sample_prediction_df) in iter_test:
    # Only question rows are scored. The explicit copy makes the column
    # assignments below act on an independent frame, not on a slice.
    test_df = test_df[test_df['content_type_id'] == 0].copy()
    if test_df.empty:
        # No question in this batch: submit an empty prediction frame.
        env.predict(test_df.assign(answered_correctly=0.5)[['row_id', 'answered_correctly']])
        continue

    # A null explanation flag marks rows with no previous question bundle.
    had_explanation = test_df['prior_question_had_explanation']
    test_df['is_first_question'] = had_explanation.isna().astype('int64')
    had_explanation = had_explanation.fillna(False).astype(bool)
    test_df['prior_question_had_explanation'] = had_explanation
    test_df['prior_had_explanation_le'] = had_explanation.astype('float64')

    # Milliseconds -> minutes with the same factor as in training
    # (/1000/60), derived before imputation so nulls stay null as they do
    # in the training column.
    test_df['prior_question_elapsed_time_in_min'] = test_df['prior_question_elapsed_time'] / 1000 / 60
    # Impute with the mean computed on the training data instead of a
    # per-batch mean, which is noisy and undefined when a batch has only nulls.
    test_df['prior_question_elapsed_time'] = test_df['prior_question_elapsed_time'].fillna(prior_question_elapsed_time_mean)

    test_df = pd.merge(test_df, questions_df, how='left', left_on='content_id', right_on='question_id')

    # Historical statistics looked up from the training aggregates.
    user_sum = test_df['user_id'].map(user_sum_dict)
    user_count = test_df['user_id'].map(user_count_dict)
    content_sum = test_df['content_id'].map(content_sum_dict)
    content_count = test_df['content_id'].map(content_count_dict)

    test_df['user_count'] = user_count
    test_df['user_sum'] = user_sum
    # Unseen users / contents give 0 / 0 = NaN, i.e. a missing feature value.
    test_df['user_nb_mean'] = user_sum / user_count
    test_df['content_count'] = content_count
    test_df['content_nb_mean'] = content_sum / content_count

    # Same treatment as in training: `part` is used as is (no `- 1` shift)
    # and unknown questions get part 4.
    test_df['part'] = test_df['part'].fillna(4)

    test_df['timestamp_by_month'] = test_df['timestamp'] / nbMillisecByMonth

    X = test_df[features_bsl]

    # Submit the probability of a correct answer rather than the hard 0/1
    # label, which would discard the ranking information of the model.
    test_df['answered_correctly'] = lgbm.predict_proba(X)[:, 1]
    env.predict(test_df[['row_id', 'answered_correctly']])