In [1]:
import random
import pandas as pd
import numpy as np
import implicit
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
from scipy import stats
from sklearn.preprocessing import MinMaxScaler
import json
import dask.dataframe as dd

In [13]:
df = pd.read_csv('./data/processed_points_fulldb.csv')
df.columns = ['index', 'time', 'event', 'user_id', 'product_id', 'product_name', 'amount', 'price', 'points']
df = df.drop(columns=["index"])
# df["time"] = pd.to_datetime(df['time'], format='%Y-%m-%d %H:%M:%S')
df["time"] = df["time"].astype(np.datetime64)
df["points"] = df["points"].astype(np.float64)
df["user_id"] = df["user_id"].astype("category")
df["product_id"] = df["product_id"].astype("category")
df["is_train"] = 1
df.sort_values(by=['time'], inplace=True)
df.head()

Unnamed: 0,time,event,user_id,product_id,product_name,amount,price,points,is_train
0,2015-05-03 03:00:04,addtocart,1067045,110526,item noname #297662,1.0,82.0,3.0,1
1,2015-05-03 03:00:11,view,1217632,213271,item noname #60987,1.0,10266.0,1.0,1
2,2015-05-03 03:00:13,view,1021692,85424,item noname #252860,1.0,4920.0,1.0,1
3,2015-05-03 03:00:24,view,139933,132332,item noname #33661,1.0,9015.5,1.0,1
4,2015-05-03 03:00:26,view,1067045,110526,item noname #297662,1.0,2235.6,1.0,1


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2758516 entries, 0 to 2756781
Data columns (total 9 columns):
time            datetime64[ns]
event           object
user_id         category
product_id      category
product_name    object
amount          float64
price           float64
points          float64
is_train        int64
dtypes: category(2), datetime64[ns](1), float64(3), int64(1), object(2)
memory usage: 251.9+ MB


In [15]:
df.time.is_monotonic

True

In [16]:
gr_obj = df.groupby(["user_id"])

In [17]:
test_ratio = 0.3
count = 0
for name, group in gr_obj:
    test_idx = int((1-test_ratio)*len(group))
    if test_idx > 0:
        group.is_train.iloc[test_idx:] = 0
    
    if count == 0:
        group.to_csv('./data/processed_points_fulldb_train_test_split.csv', index=False)
    else:
        group.to_csv('./data/processed_points_fulldb_train_test_split.csv', index=False, mode="a", header=False)
    
    if count % 100000 == 0:
        print(count)
    count += 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


0
100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000
1300000
1400000


In [18]:
df.columns

Index(['time', 'event', 'user_id', 'product_id', 'product_name', 'amount',
       'price', 'points', 'is_train'],
      dtype='object')

In [19]:
temp_df = pd.read_csv('./data/processed_points_fulldb_train_test_split.csv').sort_values("time")
temp_df

Unnamed: 0,time,event,user_id,product_id,product_name,amount,price,points,is_train
2082703,2015-05-03 03:00:04,addtocart,1067045,110526,item noname #297662,1.0,82.0,3.0,1
2380950,2015-05-03 03:00:11,view,1217632,213271,item noname #60987,1.0,10266.0,1.0,1
1996036,2015-05-03 03:00:13,view,1021692,85424,item noname #252860,1.0,4920.0,1.0,1
268396,2015-05-03 03:00:24,view,139933,132332,item noname #33661,1.0,9015.5,1.0,1
2082704,2015-05-03 03:00:26,view,1067045,110526,item noname #297662,1.0,2235.6,1.0,1
...,...,...,...,...,...,...,...,...,...
2757511,2017-04-28,transaction,1407601,235099,New Bork Chair F310,720.0,10512.0,5.0,0
2757512,2017-04-28,transaction,1407601,235087,New Birkemose Chair_F310,1976.0,20056.4,5.0,0
2757513,2017-04-28,transaction,1407601,235099,New Bork Chair F310,720.0,10512.0,5.0,0
2757217,2017-04-28,transaction,1407600,235555,Nút nhựa f27.6 (bali),5040.0,0.0,5.0,0


In [20]:
temp_df.to_csv('./data/processed_points_fulldb_train_test_split_sorted.csv', index=False)

In [21]:
temp_df.is_train.value_counts()

1    2072194
0     686322
Name: is_train, dtype: int64

In [22]:
temp_df.is_train.unique()

array([1, 0], dtype=int64)

In [6]:
def train_test_split(x, test_ratio=0.3):
    test_idx = int((1-test_ratio)*len(x))
    x = x.sort_values(by=['time'])
    x.is_train.iloc[test_idx:] = 0
    return x
train_test_split_df = df.groupby(["user_id"]).apply(train_test_split).reset_index()

KeyboardInterrupt: 

In [3]:
dask_df = dd.read_csv('./data/processed_points_fulldb.csv')
dask_df.columns = ['index', 'time', 'event', 'user_id', 'product_id', 'product_name', 'amount', 'price', 'points']
dask_df = dask_df.drop(columns=["index"])
dask_df["points"] = dask_df["points"].astype(np.float64)
dask_df["user_id"] = dask_df["user_id"].astype("category")
dask_df["product_id"] = dask_df["product_id"].astype("category")
dask_df["is_train"] = 1
dask_df.sort_values(by=['time'])
dask_df.head()

Unnamed: 0,time,event,user_id,product_id,product_name,amount,price,points,is_train
0,2015-05-03 03:00:04,addtocart,1067045,110526,item noname #297662,1.0,82.0,3.0,1
1,2015-05-03 03:00:11,view,1217632,213271,item noname #60987,1.0,10266.0,1.0,1
2,2015-05-03 03:00:13,view,1021692,85424,item noname #252860,1.0,4920.0,1.0,1
3,2015-05-03 03:00:24,view,139933,132332,item noname #33661,1.0,9015.5,1.0,1
4,2015-05-03 03:00:26,view,1067045,110526,item noname #297662,1.0,2235.6,1.0,1


In [5]:
"From `{}` to `{}`".format(dask_df.time.max().compute(),dask_df.time.min().compute())

'From `2017-04-28` to `2015-05-03 03:00:04`'

In [6]:
dask_df.describe(include="all").compute().T

Unnamed: 0,unique,count,top,freq,mean,std,min,25%,50%,75%,max
time,2357456.0,2758516.0,2017-01-21,98.0,,,,,,,
event,3.0,2758516.0,view,2664312.0,,,,,,,
user_id,1407620.0,2758520.0,166767,7757.0,,,,,,,
product_id,235557.0,2758520.0,49063,3412.0,,,,,,,
product_name,235556.0,2758516.0,item noname #187946,3412.0,,,,,,,
amount,,2758520.0,,,1.3342,20.2253,1.0,1.0,1.0,1.0,5040.0
price,,2758520.0,,,3601.19,4748.36,0.0,552.0,1870.0,4524.0,31620.0
points,,2758520.0,,,1.08633,0.487181,1.0,1.0,1.0,1.0,5.0
is_train,,2758520.0,,,1.0,0.0,1.0,1.0,1.0,1.0,1.0


In [None]:
def train_test_split(x, test_ratio=0.3):
    test_idx = int((1-test_ratio)*len(x))
    x = x.sort_values(by=['time'])
    x.is_train.iloc[test_idx:] = 0
    return x
dask_df.groupby(["user_id"]).apply(train_test_split).compute().reset_index()


In [25]:
# User that have more than 1 action performed in database
user_appearance_df[user_appearance_df.time > 1]

Unnamed: 0,user_id,time
5,26,3
8,46,2
14,69,9
16,73,2
19,84,5
...,...,...
1407620,1407620,8
1407621,1407621,24
1407622,1407622,25
1407623,1407623,7


In [51]:
user_appearance_df.time.value_counts()

1       1001563
2        205994
3         79614
4         38798
5         22968
         ...   
291           1
293           1
2345          1
297           1
178           1
Name: time, Length: 345, dtype: int64

In [34]:
user_appearance_df.time.unique()
np.sort(user_appearance_df.time.unique())

array([   1,    2,    3,    4,    5,    6,    7,    8,    9,   10,   11,
         12,   13,   14,   15,   16,   17,   18,   19,   20,   21,   22,
         23,   24,   25,   26,   27,   28,   29,   30,   31,   32,   33,
         34,   35,   36,   37,   38,   39,   40,   41,   42,   43,   44,
         45,   46,   47,   48,   49,   50,   51,   52,   53,   54,   55,
         56,   57,   58,   59,   60,   61,   62,   63,   64,   65,   66,
         67,   68,   69,   70,   71,   72,   73,   74,   75,   76,   77,
         78,   79,   80,   81,   82,   83,   84,   85,   86,   87,   88,
         89,   90,   91,   92,   93,   94,   95,   96,   97,   98,   99,
        100,  101,  102,  103,  104,  105,  106,  107,  108,  109,  110,
        111,  112,  113,  114,  115,  116,  117,  118,  119,  120,  121,
        122,  123,  125,  126,  127,  128,  129,  130,  131,  132,  133,
        134,  135,  136,  137,  138,  139,  140,  141,  142,  143,  145,
        146,  147,  149,  150,  151,  152,  153,  1

In [35]:
user_appearance_df[user_appearance_df.time == 7757]

Unnamed: 0,user_id,time
462309,166767,7757


In [41]:
temp_user_history = dask_df[dask_df.user_id == 166767].compute()
temp_user_history.head()

Unnamed: 0,index,time,event,user_id,product_id,product_name,amount,price,points
9075,815333,2015-06-11 14:55:17,view,166767,18819,item noname #133542,1.0,1230.0,1.0
9380,815638,2015-06-11 15:15:35,view,166767,37850,item noname #167873,1.0,2054.0,1.0
9448,815706,2015-06-11 15:20:06,view,166767,73460,item noname #231726,1.0,9120.96,1.0
9511,815769,2015-06-11 15:24:48,view,166767,183369,item noname #427777,1.0,238.0,1.0
9517,815775,2015-06-11 15:25:32,view,166767,166781,item noname #398115,1.0,1156.0,1.0


In [47]:
temp_user_history.describe(include="all").T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
index,7757,,,,1689290.0,530979.0,815333.0,1217440.0,1672870.0,2071590.0,2745940.0
time,7757,7690.0,2015-07-24 19:21:02,5.0,,,,,,,
event,7757,3.0,view,6479.0,,,,,,,
user_id,7757,1.0,166767,7757.0,,,,,,,
product_id,7757,3814.0,11078,38.0,,,,,,,
product_name,7757,3814.0,item noname #119736,38.0,,,,,,,
amount,7757,,,,1.0,0.0,1.0,1.0,1.0,1.0,1.0
price,7757,,,,3595.11,4764.94,0.0,549.0,1885.13,4471.2,31620.0
points,7757,,,,1.47364,1.14001,1.0,1.0,1.0,1.0,5.0


In [46]:
temp_user_history["user_id"] = temp_user_history["user_id"].astype("category")
temp_user_history["product_id"] = temp_user_history["product_id"].astype("category")

## Processing

In [13]:
dask_df = dask_df[["user_id","product_id","points"]]
dask_df.head()

Unnamed: 0,user_id,product_id,points
0,1067045,110526,3.0
1,1217632,213271,1.0
2,1021692,85424,1.0
3,139933,132332,1.0
4,1067045,110526,1.0


In [6]:
processed_ddf = dask_df.groupby(["user_id","product_id"]).mean().compute().reset_index()

In [7]:
processed_ddf.head()

Unnamed: 0,user_id,product_id,points
0,7,231138,1.0
1,12,44468,1.0
2,20,16005,1.0
3,22,195254,1.0
4,24,221570,1.0


In [8]:
processed_ddf.describe(include="all").T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
user_id,2145773.0,706585.30642,406408.412822,0.0,355481.0,707453.0,1059410.0,1407624.0
product_id,2145773.0,117814.355813,67671.294299,0.0,59868.0,117262.0,176428.0,235556.0
points,2145773.0,1.039973,0.259406,1.0,1.0,1.0,1.0,5.0


In [9]:
users = list(np.sort(processed_ddf.user_id.unique()))
products = list(np.sort(processed_ddf.product_id.unique()))
points = list(processed_ddf.points)

In [10]:
# Get the rows and columns for our new matrix
rows = processed_ddf.user_id.astype(int)
cols = processed_ddf.product_id.astype(int)

In [11]:
user_item_sparse = sparse.csr_matrix((points, (rows, cols)), shape=(len(users), len(products)))
sparse_item_user = sparse.csr_matrix((points, (cols, rows)), shape=(len(products), len(users)))

In [17]:
model = implicit.approximate_als.FaissAlternatingLeastSquares(factors=20, regularization=0.1, iterations=4, use_gpu=True)
alpha_val = 15
data_conf = (sparse_item_user * alpha_val).astype('double')

#Fit the model
model.fit(data_conf)



ModuleNotFoundError: No module named 'faiss'

In [18]:
%%time
model_nmslib = implicit.approximate_als.NMSLibAlternatingLeastSquares(factors=20, regularization=0.1, iterations=1)
alpha_val = 15
data_conf = (sparse_item_user * alpha_val).astype('double')

#Fit the model
model_nmslib.fit(data_conf)

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))


Wall time: 2min 43s


In [14]:
%%timerecommend_all= implicit.als.AlternatingLeastSquares(factors=20, regularization=0.1, iterations=1)
alpha_val = 15
data_conf = (user_item_sparse * alpha_val).astype('double')

#Fit the model
model_als.fit(data_conf)

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))


Wall time: 41.6 s


In [20]:
%%time
similar = model_nmslib.recommend_all(user_item_sparse,show_progress=True)

HBox(children=(IntProgress(value=0, max=1407625), HTML(value='')))




In [21]:
similar.shape

(1407625, 10)

In [22]:
similar

array([[  6455, 110721, 159091, ..., 131978, 218007,  97661],
       [223555, 191155,  19015, ..., 113911, 194523, 123283],
       [ 49063, 223555, 163949, ..., 131978,  61262,  73321],
       ...,
       [ 28765, 159091,  74902, ..., 191155,  82933,  87749],
       [118906, 215646, 106791, ...,   1301, 187259, 161837],
       [209418, 151431, 142287, ...,  34416, 193240,  50220]], dtype=int32)

In [44]:
item_id = 5 
n_similar = 10

# Use implicit to get similar items.
similar = model.similar_items(item_id, n_similar)

# Print the names of our most similar artists
for item in similar:
    idx, score = item
    print("#{}: {}".format(idx,score))


#294554: 0.8752251267433167
#1402246: 0.8752214908599854
#452112: 0.8752204179763794
#923998: 0.875220000743866
#1063389: 0.8752188682556152
#116671: 0.8752177953720093
#632357: 0.8752171993255615
#682286: 0.875217080116272
#424435: 0.8752157688140869
#1098435: 0.8752156496047974


In [14]:
user_id = 2000

# Use the implicit recommender.
recommended = model.recommend(user_id, sparse_user_item, N=len(item_lookup), recalculate_user=True)

product_ids = []
scores = []

# Get artist names from ids
for item in recommended:
    idx, score = item
    product_ids.append(item_lookup.product_name.loc[raw_data.product_id.loc[data.product_id_code == idx].iloc[0]==item_lookup.product_id].iloc[0])
    scores.append(score)

# Create a dataframe of artist names and scores

recommendations = pd.DataFrame({'product': product_ids, 'score': scores})


print (recommendations)

                                             product     score
0                               giay the thao ls0891  0.067265
1                                 quan lot nu uniqlo  0.054395
2                   tui xach chanel classic f1 chuan  0.049489
3                     bo gen toan than sexy tb 24713  0.037868
4                                  ao so mi nam f348  0.036700
..                                               ...       ...
853              ao so mi co duc phong cach han quoc -0.038736
854                             dong ho casio baby g -0.040554
855  tinker bell kids bo gap ngan 512 ice cream pink -0.041546
856          combo ban phim va chuot apple khong day -0.042863
857                                     quan lot nam -0.048139

[858 rows x 2 columns]
