In [None]:
import pandas as pd
import numpy as np
import math
from tqdm import tqdm_notebook as tqdm

# Notebook display options.
# NOTE(review): -1 is presumably meant as "do not truncate cell text"; newer pandas
# releases expect None for that — confirm against the pandas version in use.
pd.set_option('display.max_colwidth', -1)
# Allow up to 500 rows before a displayed frame is truncated.
pd.set_option("display.max_rows", 500)

In [None]:
# Load the input tables from CSV files in the current working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
# Existing submission whose item_recommendations column is re-ranked in the "Experiment 2" cell.
submission = pd.read_csv('submission_popular.csv')
# NOTE(review): item_meta is not referenced anywhere in the visible part of this notebook.
item_meta = pd.read_csv('item_metadata.csv')

In [None]:
# users and their actions

# Keep only the test rows that actually carry a reference value.
test_drop_nan = test[test["reference"].notnull()]

# One row per user with all of that user's references joined by single spaces.
# The grouping is done inline rather than through group_concat because that helper
# is only defined in a later cell, so calling it here fails on a fresh kernel.
test_user_reference = (
    test_drop_nan
    .groupby("user_id")["reference"]
    .apply(' '.join)
    .to_frame()
    .reset_index()
)

# user_id -> list of purely numeric references, in the order the rows appear in `test`.
# The digit check is applied to each whole reference (not to space-separated tokens of
# the joined string), so a multi-word reference that merely contains a number is not
# mistaken for an item id.
test_user_digit_reference_dict = (
    test_drop_nan
    .groupby("user_id")["reference"]
    .apply(lambda refs: [ref for ref in refs if ref.isdigit()])
    .to_dict()
)

In [None]:
# Experiment 2: consider the order of actions

# For every submission row, move each item the user has referenced in the test log to
# the front of the recommendation list. Items are processed in the order stored in
# test_user_digit_reference_dict, so later references end up closer to the front.
for i in tqdm(range(submission.shape[0])):
    user = submission.at[i, 'user_id']
    if user not in test_user_digit_reference_dict:
        continue  # nothing known about this user: keep the list as it is

    recommendations = submission.at[i, 'item_recommendations'].split(' ')
    for item in test_user_digit_reference_dict[user]:
        if item in recommendations:
            recommendations.remove(item)
            recommendations.insert(0, item)

    # .at replaces DataFrame.set_value, which no longer exists in pandas >= 1.0.
    submission.at[i, 'item_recommendations'] = ' '.join(recommendations)

# NOTE(review): the frame index is written as an extra first column — confirm the
# submission format tolerates it (otherwise pass index=False).
submission.to_csv('submission_consider_ordered_actions.csv')

In [None]:
# count the times for all actions

from functools import reduce
# (value of the action_type column, name of the resulting count column), in output column order.
_ITEM_ACTION_COUNT_COLUMNS = (
    ("clickout item", "n_click_outs"),
    ("interaction item image", "image_clicks"),
    ("interaction item rating", "rating_clicks"),
    ("interaction item info", "info_clicks"),
    ("interaction item deals", "deals_clicks"),
    ("search for item", "search_clicks"),
)


def _count_item_actions(df, action_type, count_col):
    """Count the rows of one action type per numeric reference.

    :param df: Data frame with "action_type" and "reference" columns
    :param action_type: Value of "action_type" to select
    :param count_col: Name of the count column in the result
    :return: Data frame with an integer "reference" column and an integer count_col column
    """

    refs = df.loc[df["action_type"] == action_type, "reference"]

    # Only references made up entirely of decimal digits can be converted to int.
    # Missing values and free-text references (place names, "unknown", ...) are
    # dropped here instead of being excluded through a hard-coded list of names.
    is_item_id = refs.astype(str).str.isdecimal()
    item_ids = refs[is_item_id].astype(int)

    return (
        item_ids
        .groupby(item_ids)
        .size()
        .rename_axis("reference")
        .reset_index(name=count_col)
    )


def get_item_score(df):
    """Count, per item, how often each kind of action referenced it in df.

    For each action type in _ITEM_ACTION_COUNT_COLUMNS the rows with a numeric
    reference are counted per reference; the per-action tables are then combined
    with outer merges on "reference".

    :param df: Data frame with "action_type" and "reference" columns
    :return: Data frame with an integer "reference" column followed by the columns
        n_click_outs, image_clicks, rating_clicks, info_clicks, deals_clicks and
        search_clicks. An item that never received a given action has NaN in that
        column (the outer merge leaves the gap unfilled).
    """

    dfs = [
        _count_item_actions(df, action_type, count_col)
        for action_type, count_col in _ITEM_ACTION_COUNT_COLUMNS
    ]
    df_final = reduce(lambda left, right: pd.merge(left, right, how='outer', on=['reference']), dfs)

    return df_final

In [None]:
# dataframe for all items and actions on each of them

# Per-item action counts computed from the training log (one count column per action type).
df_final = get_item_score(train)
df_final

In [None]:
# cols_to_norm = ['n_click_outs','image_clicks','rating_clicks','info_clicks','deals_clicks','search_clicks']

# Rescale only image_clicks: subtract the column minimum, then divide by
# (60th percentile - minimum). df_final is overwritten in place.
# NOTE(review): if the 60th percentile equals the minimum the denominator is 0 and the
# column turns into inf/NaN — confirm this cannot happen on the actual data.
cols_to_norm = ['image_clicks']
df_final[cols_to_norm] = df_final[cols_to_norm].apply(lambda x: (x - x.min()) / (x.quantile(.6) - x.min()))
df_final

In [None]:
# Pairwise Pearson correlation between the columns of df_final.
# NOTE(review): presumably the source of the hand-copied weights used in the score cell — confirm.
df_final.corr(method='pearson', min_periods=1)

In [None]:
# fill in the NaN and calculate weighted sum as score

# (count column, weight) pairs, listed in the order in which the terms are added up.
score_weights = (
    ('n_click_outs', 3),
    ('image_clicks', 0.770488),
    ('rating_clicks', 0.537540),
    ('info_clicks', 0.768803),
    ('deals_clicks', 0.779586),
    ('search_clicks', 0.426440),
)

# An item that never received some action has NaN there; count it as zero.
df_final.fillna(0, inplace=True)
df_final['score'] = sum(df_final[column] * weight for column, weight in score_weights)

# Two-column view (item reference, score) handed to calc_recommendation.
df_item_score = df_final[['reference', 'score']]
df_item_score

In [None]:
# functions from the baseline

# Key columns used for sorting and grouping the exploded impressions
# in calc_recommendation and raw.
GR_COLS = ["user_id", "session_id", "timestamp", "step"]


def get_submission_target(df):
    """Select the clickout rows whose reference is missing.

    :param df: Data frame with "action_type" and "reference" columns
    :return: The rows of df that are "clickout item" actions with a null reference
    """

    is_clickout = df["action_type"] == "clickout item"
    reference_missing = df["reference"].isnull()

    return df[reference_missing & is_clickout]


def get_popularity(df):
    """Count the clickouts each item received in df.

    :param df: Data frame with "action_type" and "reference" columns
    :return: Data frame with integer columns "reference" and "n_clicks"
    """

    clickouts = df[df["action_type"] == "clickout item"]
    clicks_per_item = clickouts.groupby("reference").size()
    popularity = clicks_per_item.reset_index(name="n_clicks")

    # Both columns are cast to int; the references must therefore be numeric.
    return popularity.astype(int)


def string_to_array(s):
    """Convert pipe separated string to array.

    :param s: Pipe separated string, or NaN for a missing value
    :return: List of the pipe separated parts; an empty list for NaN
    :raises ValueError: If s is neither a string nor NaN
    """

    if isinstance(s, str):
        return s.split("|")

    try:
        is_missing = math.isnan(s)
    except TypeError:
        # math.isnan rejects non-numeric values (None, lists, ...); report those
        # through the ValueError below instead of leaking a TypeError.
        is_missing = False

    if is_missing:
        return []

    raise ValueError("Value must be either string or nan")

def explode(df_in, col_expl):
    """Explode column col_expl of array type into multiple rows.

    Each pipe separated value in col_expl becomes its own row; the values of all
    other columns are repeated accordingly. df_in itself is not modified.

    :param df_in: Data frame holding the column to explode
    :param col_expl: Name of the column with pipe separated strings (or NaN)
    :return: Data frame with a fresh default index, the remaining columns of df_in
        followed by col_expl as an int64 column. Rows whose col_expl is NaN
        produce no output rows.
    """

    arrays = df_in[col_expl].apply(string_to_array)
    # Built with plain len() so that an empty input yields an integer array,
    # which np.repeat requires.
    lengths = np.array([len(items) for items in arrays], dtype=np.int64)

    df_out = pd.DataFrame(
        {col: np.repeat(df_in[col].values, lengths)
         for col in df_in.columns.drop(col_expl)}
    )

    # Flattening with a comprehension also works when there is nothing to
    # concatenate (np.concatenate raises on an empty sequence). The column is
    # assigned directly so that it really gets the integer dtype; assigning through
    # .loc[:, col] may keep an existing object dtype in newer pandas.
    df_out[col_expl] = np.array(
        [int(item) for items in arrays for item in items],
        dtype=np.int64,
    )

    return df_out

def group_concat(df, gr_cols, col_concat):
    """Join the col_concat strings of each group into one space separated string.

    :param df: Data frame to aggregate
    :param gr_cols: Column name or list of column names to group by
    :param col_concat: Name of the string column to concatenate
    :return: Data frame with one row per group: the grouping columns plus col_concat
    """

    values_by_group = df.groupby(gr_cols)[col_concat]
    joined = values_by_group.apply(' '.join)

    return joined.to_frame().reset_index()

def calc_recommendation(df_expl, df_pop):
    """Rank every impression list by item score.

    The exploded impressions are left-joined with df_pop (impressions against
    reference), sorted by the GR_COLS keys ascending and by "score" descending,
    and concatenated back into one space separated string per GR_COLS group.
    :param df_expl: Data frame with exploded impression list (GR_COLS plus "impressions")
    :param df_pop: Data frame with "reference" and "score" columns
    :return: Data frame with the GR_COLS columns and "item_recommendations"
    """

    scored_impressions = pd.merge(
        df_expl[GR_COLS + ["impressions"]],
        df_pop,
        how="left",
        left_on="impressions",
        right_on="reference",
    )

    # Item ids must be strings before they can be joined in group_concat.
    scored_impressions = scored_impressions.assign(
        impressions=scored_impressions["impressions"].apply(str)
    )
    ranked = scored_impressions.sort_values(
        GR_COLS + ["score"],
        ascending=[True, True, True, True, False],
    )

    recommendations = group_concat(ranked, GR_COLS, "impressions")

    return recommendations.rename(columns={'impressions': 'item_recommendations'})


# Test rows that are clickouts with a missing reference.
df_target = get_submission_target(test)
# One row per (target row, impression item), with the impression as an integer.
df_expl = explode(df_target, "impressions")


In [None]:
# make recommendations based on score

# Impression list of every target row, re-ordered by the weighted item score.
df_item_score_rec = calc_recommendation(df_expl, df_item_score)

In [None]:
# Experiment 3: test_user_digit_reference_dict for score submission

# For every score-based recommendation row, move each item the user has referenced in
# the test log to the front of the list. Items are processed in the order stored in
# test_user_digit_reference_dict, so later references end up closer to the front.
for i in tqdm(range(df_item_score_rec.shape[0])):
    user = df_item_score_rec.at[i, 'user_id']
    if user not in test_user_digit_reference_dict:
        continue  # nothing known about this user: keep the list as it is

    recommendations = df_item_score_rec.at[i, 'item_recommendations'].split(' ')
    for item in test_user_digit_reference_dict[user]:
        if item in recommendations:
            recommendations.remove(item)
            recommendations.insert(0, item)

    # .at replaces DataFrame.set_value, which no longer exists in pandas >= 1.0.
    df_item_score_rec.at[i, 'item_recommendations'] = ' '.join(recommendations)

# NOTE(review): the frame index is written as an extra first column — confirm the
# submission format tolerates it (otherwise pass index=False).
df_item_score_rec.to_csv('submission_all_no_nomarlize_score_consider_ordered_actions.csv')

In [None]:
# Experiment 4: Using the raw impression list, combined with users' preceding actions

def raw(df_expl):
    """Rebuild the impression list of every target row without scoring it.

    The exploded impressions are sorted by the GR_COLS keys only and concatenated
    back into one space separated string per GR_COLS group; no popularity or score
    information is used.
    :param df_expl: Data frame with exploded impression list (GR_COLS plus "impressions")
    :return: Data frame with the GR_COLS columns and "item_recommendations"
    """

    impressions = df_expl[GR_COLS + ["impressions"]]

    # Item ids must be strings before they can be joined in group_concat.
    impressions = impressions.assign(
        impressions=impressions["impressions"].apply(str)
    )
    ordered = impressions.sort_values(GR_COLS, ascending=[True, True, True, True])

    recommendations = group_concat(ordered, GR_COLS, "impressions")

    return recommendations.rename(columns={'impressions': 'item_recommendations'})


In [None]:
# Unscored impression list per target row; re-ranked by user history in the next cell.
submission_raw = raw(df_expl)

In [None]:
# rerank impression list according to users' preceding actions

# For every row, move each item the user has referenced in the test log to the front
# of the impression list. Items are processed in the order stored in
# test_user_digit_reference_dict, so later references end up closer to the front.
for i in tqdm(range(submission_raw.shape[0])):
    user = submission_raw.at[i, 'user_id']
    if user not in test_user_digit_reference_dict:
        continue  # nothing known about this user: keep the list as it is

    recommendations = submission_raw.at[i, 'item_recommendations'].split(' ')
    for item in test_user_digit_reference_dict[user]:
        if item in recommendations:
            recommendations.remove(item)
            recommendations.insert(0, item)

    # .at replaces DataFrame.set_value, which no longer exists in pandas >= 1.0.
    submission_raw.at[i, 'item_recommendations'] = ' '.join(recommendations)

# NOTE(review): the frame index is written as an extra first column — confirm the
# submission format tolerates it (otherwise pass index=False).
submission_raw.to_csv('submission_raw_consider_ordered_actions.csv')