In [46]:
import pandas as pd
import sqldf
import plotly.express as px
import numpy as np

from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

import scipy.sparse as sparse
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD

import warnings
warnings.filterwarnings('ignore')

In [32]:
df_r = pd.read_csv('data_markets.csv')

### Rules

In [189]:
df1 = df_r[['Telephone_new', 'Date', 'Group3']]

In [190]:
df1 = df1.dropna()

In [191]:
df1 = df1.rename(columns = {'Telephone_new' : 'client_id', 'Date': 'timestamp', 'Group3': 'item_id'})

In [192]:
df1['timestamp'] = pd.to_datetime(df1['timestamp'])

In [193]:
df1

Unnamed: 0,client_id,timestamp,item_id
0,55574854-48574951555577,2017-03-01 11:41:00,КОЛЯСКИ
1,55575453-56535648535679,2017-03-01 12:22:00,ОДЕЖДА ДЛЯ НОВОРОЖДЕННЫХ (0-2 лет)
2,55574950-57515657535772,2017-03-01 12:31:00,ИГРУШКИ ДЛЯ ДЕВОЧЕК
3,55574851-55545249535475,2017-03-01 03:06:00,ОДЕЖДА ДЛЯ НОВОРОЖДЕННЫХ (0-2 лет)
4,55574854-56495552515179,2017-03-01 11:38:00,ИГРУШКИ ДЛЯ РАЗВИТИЯ МАЛЫШЕЙ
...,...,...,...
1247882,55574953-50495349574974,2017-06-30 23:08:00,ЗАМЕНИТЕЛИ МОЛОКА
1247883,55574953-50495349574974,2017-06-30 23:08:00,ЗАМЕНИТЕЛИ МОЛОКА
1247884,55575348-48504953555074,2017-06-30 23:13:00,ДЕТСКАЯ ОДЕЖДА (7-16 лет)
1247885,55575450-53495456535575,2017-06-30 23:58:00,ПЮРЕ


In [194]:
df1 = df1.sample(n=500000, random_state=1)

In [195]:
# Convert the data into a transactional format
te = TransactionEncoder()
te_ary = te.fit(df.groupby('client_id')['item_id'].apply(list)).transform(df.groupby('client_id')['item_id'].apply(list))
df = pd.DataFrame(te_ary, columns=te.columns_)

In [196]:
# Run the Apriori algorithm to find frequent item sets and generate association rules
frequent_itemsets = apriori(df, min_support=0.1, use_colnames=True)
association_rules = association_rules(frequent_itemsets, metric='lift', min_threshold=1)

In [197]:
# Show the association rules
print(association_rules)

Empty DataFrame
Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, leverage, conviction]
Index: []


### Rec

In [241]:
df = df_r[['Telephone_new', 'Date', 'Group3', 'Group4']]
df = df.rename(columns = {'Telephone_new' : 'client_id', 'Date': 'timestamp_1', 'Group3': 'item_id', 'Group4': 'item_subcategory'})
df['timestamp_1'] = pd.to_datetime(df['timestamp_1'])
df = df.drop_duplicates(subset=['client_id', 'item_id'], keep='first')
df['timestamp_1'] = (df['timestamp_1'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')
df = df.dropna()

In [243]:
def get_user_item_matrix(df):
    # Create a matrix with users as rows and items as columns
    user_item = df.pivot(index='client_id', columns='item_id', values='timestamp_1')
    user_item = user_item.fillna(0)
    user_item = user_item.astype('int')
    
    return user_item

def get_content_similarity_matrix(df):
    # Get the item-item similarity matrix based on item content
    df = df.drop_duplicates(subset=['item_id', 'item_subcategory'], keep='first')
    item_content = df.pivot(index='item_id', columns='item_subcategory', values='timestamp_1')
    item_content = item_content.fillna(0)
    
    # Use Truncated SVD to reduce the dimensionality of the content matrix
    svd = TruncatedSVD(n_components=100, random_state=42)
    item_content = svd.fit_transform(item_content)
    
    # Calculate the cosine similarity between items
    item_similarity = cosine_similarity(item_content)
    
    return item_similarity

def recommend(user_item_matrix, item_similarity_matrix, n=10):
    # Get a list of all clients
    clients = user_item_matrix.index
    
    # Initialize an empty dataframe to store the recommendations
    recommendations = pd.DataFrame(columns=['client_id', 'recommended_items'])
    
    # Iterate over each client
    for client_id in clients:
        # Get the predicted ratings for all items
        item_predictions = item_similarity_matrix.dot(user_item_matrix.loc[client_id].T) / np.array([np.abs(item_similarity_matrix).sum(axis=1)])

        # Get the top n items with the highest predicted ratings
        top_n_items = item_predictions.argsort()[-n:][::-1]
        
        # Store the recommendations for this client in the dataframe
        recommendations = recommendations.append({'client_id': client_id, 'recommended_items': top_n_items}, ignore_index=True)
    
    return recommendations

In [244]:
# Get the user-item matrix
user_item_matrix = get_user_item_matrix(df)

In [245]:
df

Unnamed: 0,client_id,timestamp_1,item_id,item_subcategory
0,55574854-48574951555577,1488368460,КОЛЯСКИ,АКСЕССУАРЫ ДЛЯ КОЛЯСОК
1,55575453-56535648535679,1488370920,ОДЕЖДА ДЛЯ НОВОРОЖДЕННЫХ (0-2 лет),боди
2,55574950-57515657535772,1488371460,ИГРУШКИ ДЛЯ ДЕВОЧЕК,имитационные игрушки для девочек
3,55574851-55545249535475,1488337560,ОДЕЖДА ДЛЯ НОВОРОЖДЕННЫХ (0-2 лет),ползунки
4,55574854-56495552515179,1488368280,ИГРУШКИ ДЛЯ РАЗВИТИЯ МАЛЫШЕЙ,игрушки для ванной
...,...,...,...,...
1247862,55575054-55574849485475,1498854060,ИГРУШКИ ДЛЯ РАЗВИТИЯ МАЛЫШЕЙ,игрушки из дерева
1247863,55575053-56515157524878,1498857600,КАШИ,каши с наполнителем
1247874,55574851-50525757504974,1498862340,ИГРУШКИ ДЛЯ РАЗВИТИЯ МАЛЫШЕЙ,прочие игрушки для развития
1247878,55574857-57485151544876,1498863300,ПЮРЕ,пюре мясные


In [246]:
# Get the item-item similarity matrix based on item content
item_similarity_matrix = get_content_similarity_matrix(df)

In [247]:
recommendations = recommend(user_item_matrix, item_similarity_matrix)
print(recommendations)

                      client_id  \
0                             0   
1       32555749-545749525150 .   
2       49484949-49494949494911   
3       52535348-48484848484840   
4       52575355-48514856495345   
...                         ...   
193042  57565349-55535352575597   
193043  57565352-56495256545597   
193044  57575054-51535748525390   
193045  57575757-48485048565195   
193046  71764848-485149524951G3   

                                        recommended_items  
0       [[76, 24, 67, 38, 57, 68, 16, 64, 41, 7, 56, 7...  
1       [[44, 83, 51, 23, 45, 64, 48, 26, 14, 79, 80, ...  
2       [[33, 76, 58, 63, 22, 28, 5, 26, 7, 78, 14, 16...  
3       [[10, 59, 54, 24, 28, 77, 79, 15, 63, 6, 5, 27...  
4       [[58, 14, 5, 26, 63, 2, 45, 46, 71, 34, 11, 37...  
...                                                   ...  
193042  [[23, 35, 26, 13, 24, 18, 79, 74, 0, 5, 81, 72...  
193043  [[0, 27, 17, 61, 18, 50, 44, 51, 12, 35, 83, 6...  
193044  [[34, 22, 38, 77, 56, 67, 41, 2

In [162]:
def get_user_item_matrix(df):
    # Create a matrix with users as rows and items as columns
    user_item = df.pivot(index='client_id', columns='item_id', values='timestamp')
    user_item = user_item.fillna(0)
    user_item = user_item.astype('int')
    
    return user_item

def get_content_similarity_matrix(df):
    # Get the item-item similarity matrix based on item content
    item_content = df.pivot(index='item_id', columns='item_subcategory', values='timestamp')
    item_content = item_content.fillna(0)
    
    # Use Truncated SVD to reduce the dimensionality of the content matrix
    svd = TruncatedSVD(n_components=100, random_state=42)
    item_content = svd.fit_transform(item_content)
    
    # Calculate the cosine similarity between items
    item_similarity = cosine_similarity(item_content)
    
    return item_similarity

def recommend(user_item_matrix, item_similarity_matrix, n=10):
    # Get a list of all clients
    clients = user_item_matrix.index
    
    # Initialize an empty dataframe to store the recommendations
    recommendations = pd.DataFrame(columns=['client_id', 'recommended_items'])
    
    # Iterate over each client
    for client_id in clients:
        # Get the predicted ratings for all items
        item_predictions = item_similarity_matrix.dot(user_item_matrix.loc[client_id]) / np.array([np.abs(item_similarity_matrix).sum(axis=1)]).T

        # Convert item_predictions to a pandas DataFrame
        item_predictions = pd.DataFrame(item_predictions, index=user_item_matrix.columns, columns=[client_id])
        item_predictions = pd.Series(item_predictions[0], index=user_item_matrix.columns)

        # Get the top n items with the highest predicted ratings
        top_n_items = item_predictions.sort_values(by=client_id, ascending=False).iloc[:n].index
        
        # Store the recommendations for this client in the dataframe
        recommendations = recommendations.append({'client_id': client_id, 'recommended_items': top_n_items}, ignore_index=True)
    
    return recommendations

In [156]:
df = df.drop_duplicates(subset=['client_id', 'item_id'], keep='first')

In [None]:
df['timestamp'] = (df['timestamp'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')

In [158]:
df_sim = df
df_sim = df_sim.drop_duplicates(subset=['item_id', 'item_subcategory'], keep='first')
df_sim

Unnamed: 0,client_id,timestamp,item_id,item_subcategory
0,55574854-48574951555577,1488368460,КОЛЯСКИ,АКСЕССУАРЫ ДЛЯ КОЛЯСОК
1,55575453-56535648535679,1488370920,ОДЕЖДА ДЛЯ НОВОРОЖДЕННЫХ (0-2 лет),боди
2,55574950-57515657535772,1488371460,ИГРУШКИ ДЛЯ ДЕВОЧЕК,имитационные игрушки для девочек
3,55574851-55545249535475,1488337560,ОДЕЖДА ДЛЯ НОВОРОЖДЕННЫХ (0-2 лет),ползунки
4,55574854-56495552515179,1488368280,ИГРУШКИ ДЛЯ РАЗВИТИЯ МАЛЫШЕЙ,игрушки для ванной
...,...,...,...,...
1046922,55574954-53484951535771,1498772340,ТОВАРЫ ДЛЯ МАМ,сумки для мам
1068583,55574854-51515455485474,1498257000,СПОРТ.ИНВЕНТАРЬ,халахупы
1074302,55575452-54505750495079,1498589760,КОЛЯСКИ ДЛЯ КУКОЛ,коляски для кукол КОРОЛЬ
1120256,55575049-52525448535572,1496673180,ТЕХНИКА ДЛЯ КУХНИ,микроволновки и мультиварки


In [159]:
# Get the user-item matrix
user_item_matrix = get_user_item_matrix(df)

In [160]:
# Get the item-item similarity matrix based on item content
item_similarity_matrix = get_content_similarity_matrix(df_sim)

In [163]:
recommendations = recommend(user_item_matrix, item_similarity_matrix)
print(recommendations)

ValueError: Shape of passed values is (84, 84), indices imply (84, 1)