In [1]:
# %load_ext autoreload
# %autoreload 2

import pandas as pd
import numpy as np
import time
import turicreate as tc
from sklearn.model_selection import train_test_split

import os
import sys
# sys.path.append("..")

In [2]:
# True  -> rebuild data.csv from the raw product files.
# False -> load the previously aggregated data.csv.
PROCESS_DATA = False

if PROCESS_DATA:
    # Read every raw file first and concatenate once: concatenating inside the
    # loop copies the whole accumulated frame again on every iteration.
    product_frames = []
    for i in range(1, 7):
        products_folder = f"sbermarket_tab_2_{i}"
        for file in os.listdir(products_folder):
            products_file = os.path.join(products_folder, file)
            product_frames.append(
                pd.read_csv(products_file, usecols=["user_id", "product_id", "quantity"])
            )
    products = pd.concat(product_frames, ignore_index=True)
    del product_frames

    products = products.rename(columns={"quantity": "purchase_count",
                                        "user_id": "customerId",
                                        "product_id": "productId"})

    # One row per (customer, product) with the summed purchased quantity.
    data = products.groupby(["customerId", "productId"]).agg({"purchase_count": "sum"}).reset_index()
    del products
    data.to_csv("data.csv")
    # `data` is kept in memory on purpose: the following cells use it, so
    # deleting it here would make them fail with a NameError.
else:
    data = pd.read_csv("data.csv", index_col=0)

print(f"Size  of the dataframe is {sys.getsizeof(data) / 1024**3:.1f} GB")
print(f"Shape of the dataframe is {data.shape}")

  mask |= (ar1 == a)


Size  of the dataframe is 1.4 GB
Shape of the dataframe is (46194236, 3)


In [3]:
# Preview the first rows of the aggregated (customerId, productId, purchase_count) table.
data.head()

Unnamed: 0,customerId,productId,purchase_count
0,51,0,9
1,51,159,1
2,51,397,2
3,51,407,16
4,51,456,20


In [4]:
# Column names and list length shared by the recommender calls.
user_id = 'customerId'
item_id = 'productId'
n_rec = 50  # number of items to recommend

# The users to score are the ids listed in the sample submission file.
sample_submission = "sample_submission.csv"
customers = pd.read_csv(sample_submission, usecols=["Id"])
users_to_recommend = customers["Id"].tolist()

In [5]:
# FREQ_TYPE selects the target column the recommender is trained on:
#   'dummy'    -> constant 1 for every (customer, product) pair
#   'rel_freq' -> purchase_count divided by the largest count seen for that product
#   otherwise  -> the raw purchase_count
FREQ_TYPE = 'dummy'
SIMILARITY = 'cosine'

if FREQ_TYPE == 'rel_freq':
    target = 'scaled_purchase_freq'
    data["scaled_purchase_freq"] = (
        data["purchase_count"]
        / data.groupby('productId')["purchase_count"].transform('max')
    )
    data.drop(columns=['purchase_count'], inplace=True)
elif FREQ_TYPE == 'dummy':
    target = 'purchase_dummy'
    data['purchase_dummy'] = 1
else:
    target = 'purchase_count'

# Hand the frame to turicreate, then drop the pandas name.
train_data = tc.SFrame(data)
del data

In [6]:
def create_output(df_rec, name):
    """Collapse per-item recommendations into one submission row per user.

    Reads the notebook-level ``user_id``, ``item_id`` and ``FREQ_TYPE`` globals.

    Args:
        df_rec (pandas.DataFrame): one row per recommended (user, item) pair,
            with the user in column ``user_id`` and the item in column ``item_id``.
            The frame is not modified.
        name (str): model name, used in the output file name.

    Returns:
        pandas.DataFrame: indexed by ``Id`` (sorted), with a single ``Predicted``
        column holding the user's item ids joined by spaces, in the order they
        appear in ``df_rec``. The same frame is written to
        ``submission_{name}_{FREQ_TYPE}.csv``.
    """
    # Aggregate once per user instead of broadcasting the joined string to every
    # row and de-duplicating afterwards; groupby also sorts the user ids.
    predicted = (
        df_rec.groupby(user_id, sort=True)[item_id]
        .agg(lambda items: ' '.join(items.astype(str)))
        .rename('Predicted')
        .rename_axis('Id')
    )
    df_output = predicted.to_frame()
    df_output.to_csv(f'submission_{name}_{FREQ_TYPE}.csv')
    return df_output

def fit_predict(train_data, name, user_id, item_id, target, users_to_recommend, n_rec):
    """Train a turicreate recommender and write the submission for the given users.

    Args:
        train_data (tc.SFrame): training interactions.
        name (str): 'popularity', 'cosine' or 'pearson'.
        user_id (str): name of the user column in ``train_data``.
        item_id (str): name of the item column in ``train_data``.
        target (str): name of the column used as the training target.
        users_to_recommend (list): user ids to produce recommendations for.
        n_rec (int): number of items to recommend per user.

    Returns:
        pandas.DataFrame: the frame returned by ``create_output``.

    Raises:
        ValueError: if ``name`` is not one of the supported model names.
    """
    if name == 'popularity':
        model = tc.popularity_recommender.create(train_data,
                                                 user_id=user_id,
                                                 item_id=item_id,
                                                 target=target)
    elif name in ('cosine', 'pearson'):
        # Both variants are the same recommender; only the similarity differs.
        model = tc.item_similarity_recommender.create(train_data,
                                                      user_id=user_id,
                                                      item_id=item_id,
                                                      target=target,
                                                      similarity_type=name)
    else:
        # Fail with a clear message instead of an unbound `model` further down.
        raise ValueError(
            f"Unknown model name {name!r}; expected 'popularity', 'cosine' or 'pearson'"
        )

    recom = model.recommend(users=users_to_recommend, k=n_rec).to_dataframe()
    return create_output(recom, name)

In [7]:
# Train the model selected by SIMILARITY on the target selected by FREQ_TYPE;
# fit_predict also writes the submission CSV through create_output.
submission = fit_predict(train_data, SIMILARITY, user_id, item_id, target, users_to_recommend, n_rec)
print(submission.shape)
submission.head()
# del popularity

(107068, 1)


Unnamed: 0_level_0,Predicted
Id,Unnamed: 1_level_1
51,100789 69669 3817484 63072 100849 5479511 3959...
65,5469728 94333 3817484 69669 55133 5479511 687 ...
187,88085 3302679 88081 9221154 88095 89494 88889 ...
400,709 69669 100789 55133 63072 3817542 3497570 1...
576,709 3817484 5469728 69669 94333 39590 1300 100...


In [8]:
# Cosine item-similarity run trained on the raw purchase counts.
# NOTE(review): this rebinds the notebook-level `target` chosen in the FREQ_TYPE cell.
# NOTE(review): 'purchase_count' is dropped from the data when FREQ_TYPE == 'rel_freq',
# so this cell presumably only works for the other FREQ_TYPE settings — confirm.
# NOTE(review): create_output builds the CSV name from `name` and FREQ_TYPE only, so with
# SIMILARITY == 'cosine' this call writes to the same file as the previous fit_predict
# call and replaces it — confirm that is intended.
name = 'cosine'
target = 'purchase_count'
cos = fit_predict(train_data, name, user_id, item_id, target, users_to_recommend, n_rec)
# cos.to_csv("cos.csv")
# del cos

In [31]:
def create_output(df_rec, name):
    """Collapse per-item recommendations into one submission row per user.

    NOTE(review): this redefines the ``create_output`` declared earlier in the
    notebook; the only intended difference is the output file name, which here
    does not include FREQ_TYPE.

    Reads the notebook-level ``user_id`` and ``item_id`` globals.

    Args:
        df_rec (pandas.DataFrame): one row per recommended (user, item) pair,
            with the user in column ``user_id`` and the item in column ``item_id``.
            The frame is not modified.
        name (str): model name, used in the output file name.

    Returns:
        pandas.DataFrame: indexed by ``Id`` (sorted), with a single ``Predicted``
        column holding the user's item ids joined by spaces, in the order they
        appear in ``df_rec``. The same frame is written to
        ``submission_<name>.csv``.
    """
    # Aggregate once per user instead of broadcasting the joined string to every
    # row and de-duplicating afterwards; groupby also sorts the user ids.
    joined_items = df_rec.groupby(user_id, sort=True)[item_id].agg(
        lambda items: ' '.join(items.astype(str))
    )
    df_output = joined_items.rename('Predicted').rename_axis('Id').to_frame()
    df_output.to_csv('submission_' + name + '.csv')
    return df_output

In [32]:
# Rebuild the popularity submission from recommendations saved earlier.
# NOTE(review): nothing visible in this notebook writes popularity.csv — confirm
# where that file comes from before relying on Restart & Run All.
name = 'popularity'
popularity = pd.read_csv(name + ".csv", index_col=0)
popularity_submission = create_output(popularity, name)

In [33]:
# Preview the popularity submission built by create_output.
popularity_submission.head()

Unnamed: 0_level_0,Predicted
Id,Unnamed: 1_level_1
51,13344338 5061861 5481337 13344339 96412 746894...
65,13344338 5061861 5481337 13344339 96412 746894...
187,13344338 5061861 5481337 13344339 96412 746894...
400,13344338 5061861 5481337 13344339 96412 746894...
576,13344338 5061861 5481337 13344339 96412 746894...


In [19]:
# Load the full sample submission to compare its shape with the generated files.
sample_sub = pd.read_csv(sample_submission)
print(sample_sub.shape)
sample_sub.head()

(107068, 2)


Unnamed: 0,Id,Predicted
0,51,3239211 10720024 8493844 1965540 5009002 95218...
1,65,11693356 13392267 2464515 2245175 4748570 1134...
2,766,12773054 3692749 12726751 10041624 11893515 23...
3,1132,8613934 3599065 575614 4821474 11609320 699047...
4,1578,289404 4328240 3129788 9390886 11166655 173077...


In [10]:
# Read the cosine submission back from disk as a sanity check.
# NOTE(review): the file name has no FREQ_TYPE suffix, so it presumably comes from the
# create_output variant that writes 'submission_<name>.csv' — confirm.
cosine_submission = pd.read_csv("submission_cosine.csv")
print(cosine_submission.shape)
cosine_submission.head()

(107068, 2)


Unnamed: 0,Id,Predicted
0,51,5074743 99474 86371 7342276 27555 5061861 3819...
1,65,5469728 5479511 94333 55133 100812 69669 54816...
2,187,88085 88095 7407972 3302679 9220725 88889 8949...
3,400,709 100789 69669 100849 55133 63072 3817542 34...
4,576,10526485 10527160 85433 4275615 13577 10527066...


In [34]:
# Read the popularity submission back from disk as a sanity check.
# NOTE(review): this rebinds popularity_submission; read without index_col, Id is an
# ordinary column here rather than the index that create_output returns.
popularity_submission = pd.read_csv("submission_popularity.csv")
print(popularity_submission.shape)
popularity_submission.head()

(107068, 2)


Unnamed: 0,Id,Predicted
0,51,13344338 5061861 5481337 13344339 96412 746894...
1,65,13344338 5061861 5481337 13344339 96412 746894...
2,187,13344338 5061861 5481337 13344339 96412 746894...
3,400,13344338 5061861 5481337 13344339 96412 746894...
4,576,13344338 5061861 5481337 13344339 96412 746894...


In [None]:
# Popularity baseline trained on the constant purchase_dummy target.
# The notebook defines no `model` function and no `n_display`; the trainer is
# fit_predict, which takes exactly these seven arguments.
# NOTE(review): train_data_dummy is only created by the split_data cell further down,
# so that cell has to run first.
name = 'popularity'
target = 'purchase_dummy'
pop_dummy = fit_predict(train_data_dummy, name, user_id, item_id, target, users_to_recommend, n_rec)

In [None]:
# Popularity baseline trained on the scaled_purchase_freq target.
# The notebook defines no `model` function and no `n_display`; the trainer is
# fit_predict, which takes exactly these seven arguments.
# NOTE(review): train_data_norm is only created by the split_data cell further down,
# so that cell has to run first.
name = 'popularity'
target = 'scaled_purchase_freq'
pop_norm = fit_predict(train_data_norm, name, user_id, item_id, target, users_to_recommend, n_rec)

In [None]:
# Copy of the purchase table with a constant target of 1 for every row.
data_dummy = data.assign(purchase_dummy=1)

In [None]:
# Copy of the purchase table where purchase_count is replaced by the count
# divided by the largest count recorded for the same product.
data_norm = (
    data
    .assign(
        scaled_purchase_freq=lambda frame: frame["purchase_count"]
        / frame.groupby('productId')["purchase_count"].transform('max')
    )
    .drop(columns=['purchase_count'])
)

In [None]:
def split_data(data, test_size=.2, random_state=0):
    '''
    Splits dataset into training and test set.

    The split is seeded so that re-running the notebook gives the same rows in
    each part; pass random_state=None for an unseeded split.

    Args:
        data (pandas.DataFrame): rows to split.
        test_size (float): fraction of rows placed in the test set.
        random_state (int or None): seed forwarded to train_test_split.

    Returns:
        train_data (tc.SFrame)
        test_data (tc.SFrame)
    '''
    train, test = train_test_split(data, test_size=test_size, random_state=random_state)
    return tc.SFrame(train), tc.SFrame(test)

In [None]:
# Train/test SFrames for the raw, dummy and normalised targets.
# NOTE(review): this rebinds train_data, which an earlier cell built from the full table.
train_data, test_data = split_data(data)
train_data_dummy, test_data_dummy = split_data(data_dummy)
train_data_norm, test_data_norm = split_data(data_norm)

In [None]:
# Customer x product matrix of purchase counts (customers as rows, products as columns).
# NOTE(review): a dense pivot of the full table is presumably too large for memory — confirm
# this is only run on a sample.
df_matrix = pd.pivot_table(data, values='purchase_count', index='customerId', columns='productId')

In [None]:
# Min-max scale each product column of the matrix.
# NOTE(review): a column whose max equals its min divides by zero here — confirm how
# such products should be handled.
df_matrix_norm = (df_matrix-df_matrix.min())/(df_matrix.max()-df_matrix.min())

In [None]:
# Toy example: groupby(...).transform('max') repeats each product's maximum count on
# every row of that product.
df = pd.DataFrame({'cus': [1, 1, 2, 2, 3],
                   'prod': [1, 2, 1, 2, 1],
                   'cnt': [1, 5, 3, 2, 42]})
df['max_cnt'] = df.groupby('prod')["cnt"].transform('max')
df

In [None]:
# User id whose orders are cross-checked between the orders table and the product files.
USR = 51

In [None]:
# Order ids of user USR according to the orders table.
orders_file = "tab_1_orders.csv"
orders = pd.read_csv(orders_file, usecols=["order_id", "user_id"])
# Boolean masks need .loc (.iloc rejects a boolean Series), and Series.values is an
# attribute rather than a method, so go straight from the selection to a list.
orders1 = orders.loc[orders.user_id == USR, "order_id"].tolist()

In [None]:
# Order ids of user USR according to the per-order product files.
# NOTE(review): the data-building cell reads folders named "sbermarket_tab_2_{i}";
# confirm which folder name is the right one.
orders2 = []
for i in range(1, 7):
    products_folder = f"kaggle_tab_2_{i}"
    for file in os.listdir(products_folder):
        products_file = os.path.join(products_folder, file)
        products = pd.read_csv(products_file, usecols=["order_id", "user_id"])
        # Filter with this file's own user_id column (not the `orders` frame) and
        # use .loc, which accepts a boolean mask.
        user_orders = products.loc[products.user_id == USR, "order_id"]
        orders2 += user_orders.unique().tolist()

In [None]:
# Order ids found in the orders table but not in the product files ...
print(set(orders1).difference(orders2))
# ... and order ids found in the product files but not in the orders table.
print(set(orders2).difference(orders1))