In [3]:
import pandas as pd
import scipy.sparse as sparse
import numpy as np
from scipy.sparse.linalg import spsolve
from collections import defaultdict
from surprise.model_selection import train_test_split
from tqdm import tqdm
import io
from pandasql import sqldf

from surprise import KNNBaseline
from surprise import Dataset
from surprise import get_dataset_dir
from surprise import NormalPredictor
from surprise import Reader
from surprise.model_selection import cross_validate
from sklearn.preprocessing import LabelEncoder

import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

from Tfidfalg import TfidfAlgorithm

In [4]:
# Load the per-client table and discard the 'Unnamed: 0' column
# (presumably an index written out when the CSV was saved -- TODO confirm).
path = 'client_base.csv'
df = pd.read_csv(path).drop(columns='Unnamed: 0')

#### Select the clients who purchased two or more orders: the recommender system will predict each client's last purchase, and we compare the prediction with what the client actually bought

In [5]:
# Keep only the clients whose cnt_purchased_orders is greater than 1.
df = df.loc[df['cnt_purchased_orders'].gt(1)]
df

Unnamed: 0,client_id,DistinctCountOrders,AverageNumItemsInOrder,RevenueClient,cnt_purchased_orders,AvgOrderSumPurchased,AvgNumItemsOrderPurchased,RevenueClientPurchased,PurchasedItems,AvgMarginPurchased,...,ОДЕЖДА,ППКП,CENTRAL,FAR EAST,NORTH,PRIVOLZIE,SIBERIA,SOUTHERN,URAL,Cluster
0,55575053-54505550565678,3.5,2.440,13580.0,2.5,1575.0,2.275,9930.0,13.5,183.20,...,0.6390,0.02460,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2
1,55575048-48524956565375,3.5,3.787,13580.0,2.5,542.0,2.867,9930.0,13.5,132.60,...,0.3200,0.46000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,55575555-57524953505673,3.5,2.703,13580.0,2.5,816.0,2.680,9930.0,13.5,93.10,...,0.6943,0.00685,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,55574953-50505150484978,3.5,4.168,13580.0,2.5,3218.0,2.223,9930.0,13.5,391.00,...,0.5693,0.10570,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2
4,55575054-51515151484875,3.5,1.718,13580.0,2.5,2690.0,1.800,9930.0,13.5,314.20,...,0.4050,0.08860,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24484,55574848-48575054575179,2.0,1.000,8920.0,2.0,4508.0,1.000,8920.0,2.0,1282.00,...,0.5000,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
24485,55574848-48545256485273,2.0,1.000,2810.0,2.0,1504.0,1.000,2810.0,2.0,392.50,...,0.6943,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
24486,55574848-48504852575373,3.0,2.000,872.0,2.0,681.0,2.000,582.0,4.0,13.64,...,0.0000,1.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
24487,55574848-48495057545270,3.0,7.668,13580.0,2.0,6576.0,8.500,9930.0,13.5,1787.00,...,0.6943,0.00000,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [6]:
# Load the purchase rows, discard the 'Unnamed: 0' column and expose the
# 'Telephone_new' column under the name 'client_id' used by the client table.
path = 'data_markets.csv'
markets = (
    pd.read_csv(path)
    .drop(columns='Unnamed: 0')
    .rename(columns={'Telephone_new': 'client_id'})
)

In [7]:
# Remove rows that lack any of the item-description fields
# (Type, Group2, Group3, Group4, Nomenclature), then renumber the rows from 0.
markets = (
    markets
    .dropna(subset=['Type', 'Group2', 'Group3', 'Group4', 'Nomenclature'])
    .reset_index(drop=True)
)

### Prepare dataframes so they both contain data on the same clients

In [8]:
# Keep the clients that have at least two rows in `markets`
# (every remaining row carries a full item description).
def pysqldf(q):
    """Run the SQL query `q` with pandasql, resolving table names from this notebook's globals."""
    return sqldf(q, globals())


qq1 = """
SELECT client_id
FROM markets
GROUP BY client_id
HAVING COUNT(*) >= 2
"""

clients = pysqldf(qq1)

In [9]:
# Keep every `markets` row whose client_id is listed in `clients`.
qq2 = """
SELECT *
FROM markets
where client_id in (select client_id from clients)
"""

# sqldf looks up the `markets` and `clients` frames in the notebook globals.
markets_c = sqldf(qq2, globals())

In [10]:
# client_id values of `df` that also occur in `markets_c`.
intersecting_client_ids = df.loc[df['client_id'].isin(markets_c['client_id']), 'client_id']

In [11]:
# Restrict the client table (df) and the purchase rows (markets_c) to the shared client_ids.
df_filtered = df.loc[df['client_id'].isin(intersecting_client_ids)]
markets_filtered = markets_c.loc[markets_c['client_id'].isin(intersecting_client_ids)]

In [12]:
# Renumber the rows from 0; the previous index is kept as a regular 'index' column.
# NOTE(review): re-running this cell inserts yet another index column.
markets_filtered = markets_filtered.reset_index(drop=False)

In [13]:
# 1-based row identifier; later cells use it to single out individual purchase rows.
markets_filtered = markets_filtered.assign(num_row=range(1, len(markets_filtered) + 1))

In [14]:
#markets_filtered['item_descr'] = markets_filtered[['Type', 'Group2', 'Group3', 'Group4', 'Nomenclature']].apply(lambda x: '.'.join(x.astype(str)), axis=1)

### For each client, find their last purchase
These last purchases will serve as our test data.

In [15]:
# Parse the purchase timestamps so the latest row is chosen chronologically.
markets_filtered = markets_filtered.assign(Date=pd.to_datetime(markets_filtered['Date']))

# Row label of the maximum Date per client (idxmax returns one label per
# client, so only a single row is picked even when several share that Date).
last_purchase_indices = markets_filtered.groupby('client_id')['Date'].idxmax()
last_purchase = markets_filtered.loc[
    last_purchase_indices,
    ['client_id', 'Date', 'Nomenclature', 'num_row'],
]

In [16]:
# Test frame: one row per client holding the item of the last purchase.
test_df = (
    last_purchase[['client_id', 'Nomenclature']]
    .rename(columns={'Nomenclature': 'item'})
    .reset_index(drop=True)
)
test_df

Unnamed: 0,client_id,item
0,55574848-48494948544878,"LEADER KIDS, КОМПЛЕКТ (майка+шорты) Сафари, (м..."
1,55574848-48495057545270,"BUTTON BLUE, ПЛАТЬЕ (бел+роз), р.128"
2,55574848-48545256485273,"TRANSFORMERS, ПОЛУБОТИНКИ школ., (син), р. 31-36"
3,55574848-48575054575179,"LEADER KIDS, БОРТИК в кроватку Мыльные пузыри,..."
4,55574848-48575448515270,"БУСИНКА, ТАРЕЛКА с подогревом, на присоске"
...,...,...
24003,55575757-57565749535776,"ПОЛЕСЬЕ, ЛОПАТА средняя, (41 см)"
24004,55575757-57565752495570,"КОТОФЕЙ, САПОГИ (красн), р. 23-26"
24005,55575757-57575751494871,"ИГРАЕМ ВМЕСТЕ, НАБОР ОРУЖИЯ ТРИ БОГАТЫРЯ (МЕЧ+..."
24006,55575757-57575753534870,"МУНИ, ПОДГУЗНИКИ (L), (9-14кг) (54шт)"


In [17]:
### Delete last purchase from train data
# Every row whose num_row was selected into `last_purchase` is excluded.
qq3 = """
SELECT *
FROM markets_filtered
WHERE num_row NOT IN (SELECT num_row FROM last_purchase)
"""

# sqldf looks up `markets_filtered` and `last_purchase` in the notebook globals.
marks = sqldf(qq3, globals())

In [18]:
# Preview the first rows that remain after the last purchases were removed.
marks.head()

Unnamed: 0,index,Date,DateDelivery,OrderNumberOnTheWebsite,NewStatus,SumOrderOnTheWebsite,SumDocument,MethodDelivery,PaymentForm,Region,...,OrderDateOnTheWebsite,client_id,E-mail_new,Client,ID_SKU,CityShop,ShopOrder,City,Region_max,num_row
0,6,2017-03-01 11:57:00.000000,2017-03-09 00:00:00,3999687_TR,Частичный возврат,19991,19991.0,Курьерская,Наличная,Москва,...,2017-03-01,55574853-53565050515377,111117_ou23@workmail.ru,Николаева,IDL00011556856,0,,Москва,CENTRAL,1
1,7,2017-03-01 09:02:00.000000,2017-03-03 00:00:00,3999713_TR,Доставлен,1196,1196.0,Магазины,Безналичная,Подольск (Московская область район),...,2017-03-01,55574948-52515357485779,111117_ou25@yandex.ru,Ольга,IDL00025316250,0,,Подольск,CENTRAL,2
2,8,2017-03-01 09:02:00.000000,2017-03-03 00:00:00,3999713_TR,Доставлен,1196,1196.0,Магазины,Безналичная,Подольск (Московская область район),...,2017-03-01,55574948-52515357485779,111117_ou25@yandex.ru,Ольга,IDL00025320553,0,,Подольск,CENTRAL,3
3,9,2017-03-01 09:02:00.000000,2017-03-03 00:00:00,3999713_TR,Доставлен,1196,1196.0,Магазины,Безналичная,Подольск (Московская область район),...,2017-03-01,55574948-52515357485779,111117_ou25@yandex.ru,Ольга,IDL00024877755,0,,Подольск,CENTRAL,4
4,14,2017-03-01 14:01:00.000000,2017-03-04 00:00:00,4000127_TR,Доставлен,4754,4095.0,Магазины,Безналичная,Красково (Люберецкий район),...,2017-03-01,55574948-52504948534879,56117_8u19@mail.ru,Ольга,IDL00036503553,0,,Красково,CENTRAL,5


In [19]:
# Find penultimate purchase of every client
marks['Date'] = pd.to_datetime(marks['Date'])
penu_purchase_indices = marks.groupby('client_id')['Date'].idxmax()
penu_purchase = marks.loc[penu_purchase_indices, ['client_id', 'Date', 'Nomenclature', 'num_row']]

In [20]:
# One row per client holding the item of the penultimate purchase.
test_df_2 = (
    penu_purchase[['client_id', 'Nomenclature']]
    .rename(columns={'Nomenclature': 'item'})
    .reset_index(drop=True)
)
test_df_2

Unnamed: 0,client_id,item
0,55574848-48494948544878,"FUN TIME, КОМПЛЕКТ (брюки, футболка) (гол), р. 92"
1,55574848-48495057545270,"FUN TIME, БРЮКИ (син), р. 92"
2,55574848-48545256485273,"КОТОФЕЙ, ПОЛУБОТИНКИ (черн/гол), р. 33-37"
3,55574848-48575054575179,"ФЕЯ, КРОВАТЬ-трансформер 1100, маятник, 2+3 ящ..."
4,55574848-48575448515270,"АВЕНТ, БУТЫЛОЧКА для кормления Naturale, (260 ..."
...,...,...
24003,55575757-57565749535776,СИТЕЧКО Крепость
24004,55575757-57565752495570,"КОТОФЕЙ, ВАЛЕНКИ, (черн), р. 23-26"
24005,55575757-57575751494871,"ТРИ БОГАТЫРЯ, МЕЧ Восточный"
24006,55575757-57575753534870,"МУНИ, ПОДГУЗНИКИ (L), (9-14кг) (54шт)"


### Prepare train data
The data on clients' previous purchases

In [21]:
# Training interactions: (client_id, item) pairs, each given a constant rating of 1.
marks = (
    marks[['client_id', 'Nomenclature']]
    .rename(columns={'Nomenclature': 'item'})
    .assign(rating=1)
)

In [22]:
# Display the training interactions (client_id, item, rating).
marks

Unnamed: 0,client_id,item,rating
0,55574853-53565050515377,"AMOS, МАРКЕРЫ Малыш, (12 цв)",1
1,55574948-52515357485779,"FUN TIME, СОРОЧКА верхняя с воротом поло, р. 3 г",1
2,55574948-52515357485779,"FUN TIME, СОРОЧКА верхняя с воротом поло, р. 3 г",1
3,55574948-52515357485779,"FUN TIME, СОРОЧКА верхняя, р. 3 г",1
4,55574948-52504948534879,"LEADER KIDS, ТРУСЫ Сладкая вишня, (набивка), р...",1
...,...,...,...
341463,55574857-57485151544876,"СЭМПЕР, ПЮРЕ телятина, (мономясо), с 6 мес., (...",1
341464,55574953-50495349574974,"НУК, СОСКА для чая (латекс), с возд. клап., с ...",1
341465,55575348-48504953555074,"BEMBI, ФУТБОЛКА (бел), р.128",1
341466,55575450-53495456535575,"ХИПП, ПЮРЕ каша груша-зерновые хлопья, с 4 мес...",1


In [23]:
# Columns handed to Surprise: user id, item id, then the rating column.
choose_column = ['rating']
selected_columns = ['client_id', 'item', *choose_column]

### Prepare the algorithm

In [24]:
# Surprise reader created with its default settings (no rating_scale is passed).
reader = Reader()

In [25]:
# Wrap the (client_id, item, rating) columns into a Surprise dataset.
data = Dataset.load_from_df(df=marks[selected_columns], reader=reader)

In [26]:
# Train on every interaction in `data`; the held-out purchases live in test_df.
trainset = data.build_full_trainset()

# TfidfAlgorithm comes from the local Tfidfalg module; the options below are
# passed through unchanged (their exact effect depends on that module).
algo = TfidfAlgorithm(sim_options=dict(name='cosine', user_based=False, k=10))

In [27]:
# Fit on the training set; whatever fit() returns is rebound to `algo`.
algo = algo.fit(trainset)

### Make predictions

In [28]:
%%time

# Predictions are computed from the penultimate purchases (test_df_2) and then
# attached to test_df by position. That is only correct when both frames list
# the same clients in the same order, so check it before the slow loop.
if not np.array_equal(test_df['client_id'].to_numpy(), test_df_2['client_id'].to_numpy()):
    raise ValueError('test_df and test_df_2 must list the same clients in the same order')

preds = []
# Iterate over the two needed columns directly instead of building a Series per row.
for client_id, item in tqdm(zip(test_df_2['client_id'], test_df_2['item']), total=test_df_2.shape[0]):
    # The algorithm works with Surprise inner ids, so translate the raw ids first.
    pred = algo.get_best_item(trainset.to_inner_uid(client_id), trainset.to_inner_iid(item))
    # pred[0] is converted back to a raw item id (presumably the inner id of
    # the best-matching item -- confirm against TfidfAlgorithm.get_best_item).
    preds.append(trainset.to_raw_iid(pred[0]))
test_df['predicted_item'] = preds
test_df

100%|██████████| 24008/24008 [02:41<00:00, 148.57it/s]

CPU times: user 2min 41s, sys: 676 ms, total: 2min 42s
Wall time: 2min 41s





Unnamed: 0,client_id,item,predicted_item
0,55574848-48494948544878,"LEADER KIDS, КОМПЛЕКТ (майка+шорты) Сафари, (м...","FUN TIME, КОМПЛЕКТ (брюки, футболка) (гол), р. 92"
1,55574848-48495057545270,"BUTTON BLUE, ПЛАТЬЕ (бел+роз), р.128","FUN TIME, БРЮКИ (т.син), р. 92"
2,55574848-48545256485273,"TRANSFORMERS, ПОЛУБОТИНКИ школ., (син), р. 31-36","КОТОФЕЙ, ПОЛУБОТИНКИ (черн/гол), р. 33-37"
3,55574848-48575054575179,"LEADER KIDS, БОРТИК в кроватку Мыльные пузыри,...","ФЕЯ, КРОВАТЬ-трансформер 1100, маятник, 2+3 ящ..."
4,55574848-48575448515270,"БУСИНКА, ТАРЕЛКА с подогревом, на присоске","АВЕНТ, БУТЫЛОЧКА для кормления Naturale, (260 ..."
...,...,...,...
24003,55575757-57565749535776,"ПОЛЕСЬЕ, ЛОПАТА средняя, (41 см)","КАНПОЛ, СИТЕЧКО для кормления"
24004,55575757-57565752495570,"КОТОФЕЙ, САПОГИ (красн), р. 23-26","КОТОФЕЙ, ВАЛЕНКИ, (черн), р. 23-26"
24005,55575757-57575751494871,"ИГРАЕМ ВМЕСТЕ, НАБОР ОРУЖИЯ ТРИ БОГАТЫРЯ (МЕЧ+...","ТРИ БОГАТЫРЯ, МЕЧ Восточный"
24006,55575757-57575753534870,"МУНИ, ПОДГУЗНИКИ (L), (9-14кг) (54шт)","МУНИ, ПОДГУЗНИКИ (L), (9-14кг) (54шт)"


In [29]:
# for each item description find its Group3 and Group4
check = markets_filtered[['Nomenclature', 'Group4', 'Group3']]
check = check.drop_duplicates(keep = 'first')
check = check.rename(columns = {'Nomenclature': 'item'})
temp = test_df.copy()
gr3ch = temp.merge(check, on = 'item')
gr3ch = gr3ch.rename(columns = {'Group4' : 'real_group4', 'Group3' : 'real_group3'}) 
check = check.rename(columns = {'item': 'predicted_item'})
gr3ch = gr3ch.merge(check, on = 'predicted_item')
gr3ch = gr3ch.rename(columns = {'Group4' : 'pred_group4', 'Group3' : 'pred_group3'}) 
prediction_results = gr3ch[['client_id', 'item', 'predicted_item', 'real_group3', 'pred_group3', 'real_group4', 'pred_group4']]

In [35]:
# Evaluate on the first 5000 rows only.
prediction_results = prediction_results.iloc[:5000]

### Precision and recall scores TF-IDF Levenshtein

In [36]:
from collections import defaultdict
import numpy as np

def precision_recall(df):
    """Average per-client precision and recall of the predicted Group4.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'client_id', 'real_group4' and 'pred_group4'.
        A client may occupy several rows.

    Returns
    -------
    tuple
        ``(avg_precision, avg_recall)``, each the mean over clients of:

        * precision: 1 if the client's first 'pred_group4' value occurs among
          the client's 'real_group4' values, otherwise 0;
        * recall: the number of the client's rows whose 'pred_group4' occurs
          among the client's 'real_group4' values, divided by the number of
          the client's rows.

        Both values are NaN when `df` has no rows.
    """
    precision_scores = []
    recall_scores = []
    # A single pass over the per-client groups replaces re-filtering the whole
    # frame for each client; sort=False keeps rows and clients in frame order.
    for _, client_data in df.groupby('client_id', sort=False):
        relevant = client_data['real_group4'].tolist()
        predicted = client_data['pred_group4']
        # Precision only looks at the first prediction recorded for the client.
        precision_scores.append(1 if predicted.iloc[0] in relevant else 0)
        # Every group has at least one row, so len(relevant) is never zero.
        recall_scores.append(predicted.isin(relevant).sum() / len(relevant))
    if not precision_scores:
        # No clients at all: report NaN explicitly instead of averaging an empty list.
        return float('nan'), float('nan')
    avg_precision = np.mean(precision_scores)
    avg_recall = np.mean(recall_scores)
    return avg_precision, avg_recall

In [37]:
# Score the evaluated rows and report both per-client averages.
avg_precision, avg_recall = precision_recall(df=prediction_results)
print('Average precision:', avg_precision)
print('Average recall:', avg_recall)

Average precision: 0.4048241206030151
Average recall: 0.40472361809045226


In [38]:
# Calculate Accuracy
accuracy = accuracy_score(prediction_results['real_group4'], prediction_results['pred_group4'])
accuracy

0.4028

In [39]:
# Calculate F1 Score
f1 = f1_score(prediction_results['real_group4'], prediction_results['pred_group4'], average='weighted')
f1

0.3777807941285764