In [2]:
import pandas as pd
import numpy as np
import time
import turicreate as tc
from sklearn.model_selection import train_test_split

import os
import sys

# Collaborative filtering

In [36]:
PROCESS_DATA = False

if PROCESS_DATA:
    is_first = True
    for i in range(1, 7):
        products_folder = f"sbermarket_tab_2_{i}"
        for file in os.listdir(products_folder):
            products_file = os.path.join(products_folder, file)
            if is_first:
                products = pd.read_csv(products_file, usecols=["user_id", "product_id", "quantity"])
                is_first = False
            else:
                new_products = pd.read_csv(products_file, usecols=["user_id", "product_id", "quantity"])
                products = pd.concat([products, new_products])

    products.rename(columns={"quantity": "purchase_count",
                             "user_id": "customerId",
                             "product_id": "productId"}, inplace=True)

    data = products.groupby(["customerId", "productId"]).agg({"purchase_count": "sum"})
    del products
    data.reset_index(inplace=True)
    data.to_csv("data.csv")

else:
    data = pd.read_csv("data.csv", index_col=0)
    print(f"Size  of the dataframe is {sys.getsizeof(data) / 1024**3:.1f} GB")
    print(f"Shape of the dataframe is {data.shape}")

  mask |= (ar1 == a)


Size  of the dataframe is 1.4 GB
Shape of the dataframe is (46194236, 3)


In [37]:
data.head()

Unnamed: 0,customerId,productId,purchase_count
0,51,0,9
1,51,159,1
2,51,397,2
3,51,407,16
4,51,456,20


In [6]:
sample_submission = "sample_submission.csv"
customers = pd.read_csv(sample_submission, usecols=["Id"])
users_to_recommend = customers.Id.tolist()

user_id = 'customerId'
item_id = 'productId'
n_rec = 50  # number of items to recommend

In [5]:
FREQ_TYPE = 'scaled_purchase_freq'

if FREQ_TYPE == 'dummy':
    target = 'purchase_dummy'
    data['purchase_dummy'] = 1
elif FREQ_TYPE == 'rel_freq':
    target = 'scaled_purchase_freq'
    data["max_count_prod"] = data.groupby('productId')["purchase_count"].transform('max')
    data["scaled_purchase_freq"] = data["purchase_count"] / data["max_count_prod"]
    data.drop(columns=['max_count_prod', 'purchase_count'], inplace=True)
else:
    target = 'purchase_count'

train_data = tc.SFrame(data)
del data

In [6]:
def create_output(df_rec, name):
    df_rec['Predicted'] = df_rec.groupby([user_id])[item_id] \
        .transform(lambda x: ' '.join(x.astype(str)))
    df_rec.rename(columns={"customerId": "Id"}, inplace=True)
    df_output = df_rec[['Id', 'Predicted']].drop_duplicates() \
        .sort_values('Id').set_index('Id')
    df_output.to_csv(f'submission_{name}_{FREQ_TYPE}.csv')
    return df_output

def fit_predict(train_data, name, user_id, item_id, target, users_to_recommend, n_rec):
    if name == 'popularity':
        model = tc.popularity_recommender.create(train_data, 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target=target)
    elif name == 'cosine':
        model = tc.item_similarity_recommender.create(train_data, 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target=target, 
                                                    similarity_type='cosine')
    elif name == 'pearson':
        model = tc.item_similarity_recommender.create(train_data, 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target=target, 
                                                    similarity_type='pearson')
        
    recom = model.recommend(users=users_to_recommend, k=n_rec).to_dataframe()
    return create_output(recom, name)

In [7]:
SIMILARITY = 'cosine'
submission = fit_predict(train_data, SIMILARITY, user_id, item_id, target, users_to_recommend, n_rec)

In [10]:
submission.head()

(107068, 2)


Unnamed: 0,Id,Predicted
0,51,5074743 99474 86371 7342276 27555 5061861 3819...
1,65,5469728 5479511 94333 55133 100812 69669 54816...
2,187,88085 88095 7407972 3302679 9220725 88889 8949...
3,400,709 100789 69669 100849 55133 63072 3817542 34...
4,576,10526485 10527160 85433 4275615 13577 10527066...


# Recommend items that user buys frequently

In [4]:
is_first = True
for i in range(1, 7):
    products_folder = f"sbermarket_tab_2_{i}"
    for file in os.listdir(products_folder):
        products_file = os.path.join(products_folder, file)
        if is_first:
            products = pd.read_csv(products_file, usecols=["user_id", "product_id"])
            is_first = False
        else:
            new_products = pd.read_csv(products_file, usecols=["user_id", "product_id"])
            products = pd.concat([products, new_products])

products['dummy'] = 1
data = products.groupby(["user_id", "product_id"]).agg({"dummy": "sum"})
del products
data.reset_index(inplace=True)

In [16]:
data2 = data.loc[data.user_id.isin(users_to_recommend)]
data3 = data2.sort_values("dummy", ascending=False)
data3.head()

Unnamed: 0,user_id,product_id,dummy
567820,56037,0,471
2759789,222648,3818317,364
570080,56037,6375406,360
2759213,222648,709,332
568229,56037,12565,330


In [24]:
data4 = data3.groupby(['user_id'])['product_id'] \
             .apply(lambda x: ' '.join([str(el) for el in x[:min(50, len(x))]])).reset_index()
data4.head()

Unnamed: 0,user_id,product_id
0,51,14863 63057 19562 3562687 709 9979 661 456 550...
1,65,54728 18450 3817542 709 26062 18439 105764 771...
2,187,88084 88086
3,400,0 813 94333 3817507 57085 61053 55034 26751 38...
4,576,177 2600 46469 5628 10002 102 4443 5046703 107...


In [None]:
cosine_dummy_submission = pd.read_csv("submission_cosine_dummy.csv")

In [25]:
sub_me = []
for idx, row in data4.iterrows():
    rec = row['product_id']
    user_id = row['user_id']
    assert isinstance(rec, str)
    assert isinstance(user_id, int)
    rec_list = rec.split(' ')
    lack = 50 - len(rec_list)
    assert lack >= 0
    if lack > 0:
        cos_rec = cosine_dummy_submission.loc[cosine_dummy_submission['Id'] == user_id].Predicted.item().split(' ')
        while lack > 0:
            rec_item = cos_rec.pop(0)
            if rec_item not in rec_list:
                rec_list.append(rec_item)
                lack -= 1
        rec = ' '.join(rec_list)
    sub_me.append(rec)

In [29]:
data5 = data4.copy()
data5['product_id'] = sub_me
data5.rename(columns={"user_id": "Id", "product_id": "Predicted"}, inplace=True)
data5.head()

Unnamed: 0,Id,Predicted
0,51,14863 63057 19562 3562687 709 9979 661 456 550...
1,65,54728 18450 3817542 709 26062 18439 105764 771...
2,187,88084 88086 88085 3302679 88081 9221154 88095 ...
3,400,0 813 94333 3817507 57085 61053 55034 26751 38...
4,576,177 2600 46469 5628 10002 102 4443 5046703 107...


In [30]:
data5.to_csv("dumb_solution.csv", index=False)

# Some crazy heuristics

In [70]:
from collections import defaultdict
from tqdm import tqdm
from itertools import permutations

In [39]:
is_first = True
for i in range(1, 7):
    products_folder = f"sbermarket_tab_2_{i}"
    for file in os.listdir(products_folder):
        products_file = os.path.join(products_folder, file)
        if is_first:
            products = pd.read_csv(products_file, usecols=["user_id", "product_id", "order_id"])
            is_first = False
        else:
            new_products = pd.read_csv(products_file, usecols=["user_id", "product_id", "order_id"])
            products = pd.concat([products, new_products])

data = products.loc[products.user_id.isin(users_to_recommend)]
del products

In [57]:
data2 = data.groupby(['user_id', 'order_id'])['product_id'].apply(list).reset_index().groupby('user_id')

In [69]:
Ids = []
Predictions = []
for _, group in tqdm(data2):
    Ids.append(group.user_id.iloc[0])
    group.sort_values('order_id', ascending=False)
    n_orders = group.shape[0]
    late_mid_early = []
    products_masks = defaultdict(int)
    for i in range(3):
        if i < n_orders:
            for prod_id in group.product_id.iloc[i]:
                products_masks[prod_id] += 2**i
        else:
            break
    masks_lists = defaultdict(list)
    for prod_id, mask in products_masks.items():
        masks_lists[mask].append(prod_id)
    Predictions.append(masks_lists)

100%|██████████| 107068/107068 [00:45<00:00, 2360.83it/s]


In [77]:
permuts = list(permutations([1, 2, 5, 6]))
valid_permuts = [perm for perm in permuts if perm.index(5) < perm.index(1)]
valid_permuts

[(2, 5, 1, 6),
 (2, 5, 6, 1),
 (2, 6, 5, 1),
 (5, 1, 2, 6),
 (5, 1, 6, 2),
 (5, 2, 1, 6),
 (5, 2, 6, 1),
 (5, 6, 1, 2),
 (5, 6, 2, 1),
 (6, 2, 5, 1),
 (6, 5, 1, 2),
 (6, 5, 2, 1)]

In [90]:
for PERM_i in tqdm(range(len(valid_permuts))):
    PERM = valid_permuts[PERM_i]

    Preds = []
    for def_dict in Predictions:
        rec = def_dict[7] + def_dict[3]
        for mask in PERM:
            rec += def_dict[mask]
        rec += def_dict[4]
        Preds.append(' '.join([str(r) for r in rec]))

    sub_perm = pd.DataFrame({'Id': Ids, 'Predicted': Preds})

    sub_me = []
    for idx, row in sub_perm.iterrows():
        rec = row['Predicted']
        user_id = row['Id']
        assert isinstance(rec, str)
        assert isinstance(user_id, int)
        rec_list = rec.split(' ')
        lack = 50 - len(rec_list)

        if lack > 0:
            cos_rec = data5.loc[data5['Id'] == user_id].Predicted.item().split(' ')
            while lack > 0:
                rec_item = cos_rec.pop(0)
                if rec_item not in rec_list:
                    rec_list.append(rec_item)
                    lack -= 1
            rec = ' '.join(rec_list)
        elif lack < 0:
            rec_list = rec_list[:50]
            rec = ' '.join(rec_list)
        sub_me.append(rec)

    sub_perm2 = sub_perm.copy()
    sub_perm2.Predicted = sub_me
    sub_perm2.to_csv(f"perm_{PERM_i}.csv", index=False)

100%|██████████| 11/11 [05:55<00:00, 32.30s/it]


# Even more crazy heuristics

In [121]:
TYPE = 'lin'
# TYPE = 'exp'
GAMMA = 0.13
MIN_WEIGHT = 0.1

Ids = []
Predictions = []
for _, group in tqdm(data2):
    Ids.append(group.user_id.iloc[0])
    n_orders = group.shape[0]
    group.sort_values('order_id', ascending=False, inplace=True)
    if TYPE == 'lin':
        group['weights'] = [max(1. - i*GAMMA, MIN_WEIGHT) for i in range(n_orders)]
    elif TYPE == 'exp':
        group['weights'] = [GAMMA ** i for i in range(n_orders)]
    
    products_scores = defaultdict(float)
    for idx, row in group.iterrows():
        assert isinstance(row['product_id'], list)
        assert isinstance(row['weights'], float)
        for prod_id in row['product_id']:
            products_scores[prod_id] += row['weights']
    rec_list = [itm[0] for itm in
                sorted(products_scores.items(), key=lambda item: item[1])][::-1]
    
    lack = 50 - len(rec_list)
    if lack > 0:
        cos_rec = data5.loc[data5['Id'] == user_id].Predicted.item().split(' ')
        while lack > 0:
            rec_item = cos_rec.pop(0)
            if rec_item not in rec_list:
                rec_list.append(rec_item)
                lack -= 1
    elif lack < 0:
        rec_list = rec_list[:50]
    Predictions.append(' '.join([str(r) for r in rec_list]))

sub_new_idea = pd.DataFrame({'Id': Ids, 'Predicted': Predictions})
sub_new_idea.to_csv(f"{TYPE}_{str(GAMMA)[2:]}.csv", index=False)

100%|██████████| 107068/107068 [03:44<00:00, 477.83it/s]


In [114]:
sub_new_idea.head()

Unnamed: 0,Id,Predicted
0,51,14863 709 19562 63057 3562687 3817489 54710 26...
1,65,3817542 54728 709 18439 26062 105764 5639 1845...
2,187,88086 88084 9741 63762 63763 63760 67694 68465...
3,400,3817507 94333 813 0 5481691 6565232 61053 5503...
4,576,177 2600 5046703 7103983 46469 14007 102 4443 ...


### Check

In [103]:
df = pd.DataFrame({'user_id':    [1, 1, 1, 1, 2, 2, 2, 2],
                   'order_id':   [1, 3, 3, 1, 2, 2, 4, 2],
                   'product_id': [7, 7, 3, 2, 4, 5, 4, 1]})
df

Unnamed: 0,user_id,order_id,product_id
0,1,1,7
1,1,3,7
2,1,3,3
3,1,1,2
4,2,2,4
5,2,2,5
6,2,4,4
7,2,2,1


In [104]:
df2 = df.groupby(['user_id', 'order_id'])['product_id'].apply(list).reset_index().groupby('user_id')

In [111]:
GAMMA = 0.1
MIN_WEIGHT = 0.1

Ids = []
Predictions = []
for _, group in tqdm(df2):
    Ids.append(group.user_id.iloc[0])
    n_orders = group.shape[0]
    group.sort_values('order_id', ascending=False, inplace=True)
    group['weights'] = [max(1. - i*GAMMA, MIN_WEIGHT) for i in range(n_orders)]
    print(group)
    
    products_scores = defaultdict(float)
    for idx, row in group.iterrows():
        assert isinstance(row['product_id'], list)
        assert isinstance(row['weights'], float)
        for prod_id in row['product_id']:
            products_scores[prod_id] += row['weights']
    rec_list = [itm[0] for itm in
                sorted(products_scores.items(), key=lambda item: item[1])][::-1]
    Predictions.append(' '.join([str(r) for r in rec_list]))

100%|██████████| 2/2 [00:00<00:00, 199.98it/s]

   user_id  order_id product_id  weights
1        1         3     [7, 3]      1.0
0        1         1     [7, 2]      0.9
   user_id  order_id product_id  weights
3        2         4        [4]      1.0
2        2         2  [4, 5, 1]      0.9





In [112]:
sub_new_df = pd.DataFrame({'Id': Ids, 'Predicted': Predictions})
sub_new_df

Unnamed: 0,Id,Predicted
0,1,7 3 2
1,2,4 1 5
