## Collaborative recommender system on primitive data
### Playground for intensive, day 2, part 1

In [59]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

In [60]:
# Toy ratings in long format: position k of each list describes one
# (user, item, rating) observation.
ratings_dict = {
    "item": ["i1", "i3", "i4", "i2", "i4", "i2", "i3", "i4", "i5", "i1", "i2", "i3", "i1", "i2", "i4", "i5"],
    "user": ["u1", "u1", "u1", "u2", "u2", "u3", "u3", "u3", "u3", "u4", "u4", "u4", "u5", "u5", "u5", "u5"],
    "rating": [5, 4, 1, 3, 3, 2, 4, 4, 1, 4, 4, 5, 2, 4, 5, 2],
}
df = pd.DataFrame(ratings_dict)

# Per-user side information; gender is encoded numerically (m -> 0, f -> 1)
# so it can be used as a feature alongside the ratings.
user_df = pd.DataFrame(
    {
        "user": ["u1", "u2", "u3", "u4", "u5"],
        "age": [20, 22, 30, 45, 40],
        "gender": ["m", "f", "m", "f", "m"],
    }
).assign(gender=lambda frame: frame["gender"].map({"m": 0, "f": 1}))

df

Unnamed: 0,item,user,rating
0,i1,u1,5
1,i3,u1,4
2,i4,u1,1
3,i2,u2,3
4,i4,u2,3
5,i2,u3,2
6,i3,u3,4
7,i4,u3,4
8,i5,u3,1
9,i1,u4,4


In [61]:
# Reshape the long ratings table into a user x item matrix.
# User/item pairs without a rating show up as NaN.
df_pivot = df.pivot(
    index="user",
    columns="item",
    values="rating",
)

df_pivot

item,i1,i2,i3,i4,i5
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
u1,5.0,,4.0,1.0,
u2,,3.0,,3.0,
u3,,2.0,4.0,4.0,1.0
u4,4.0,4.0,5.0,,
u5,2.0,4.0,,5.0,2.0


In [62]:
# Standardise age so it is on a scale comparable to the standardised ratings.
# The double brackets keep the input 2-D, as the scaler expects; the result is
# an (n_users, 1) array.
age_scaler = StandardScaler()
ages_normalized = age_scaler.fit_transform(user_df[["age"]])

ages_normalized

array([[-1.16447843],
       [-0.96018397],
       [-0.14300612],
       [ 1.38920234],
       [ 0.87846618]])

In [63]:
# Standardise the ratings column by column, i.e. per item.
# Missing ratings are left as NaN in the result (see the output below).
rating_scaler = StandardScaler()
normalized_ratings = rating_scaler.fit_transform(df_pivot)

normalized_ratings

array([[ 1.06904497,         nan, -0.70710678, -1.52127766,         nan],
       [        nan, -0.30151134,         nan, -0.16903085,         nan],
       [        nan, -1.50755672, -0.70710678,  0.50709255, -1.        ],
       [ 0.26726124,  0.90453403,  1.41421356,         nan,         nan],
       [-1.33630621,  0.90453403,         nan,  1.18321596,  1.        ]])

In [64]:
# The scaler returns a bare array; wrap it back into a frame that carries the
# same user (row) and item (column) labels as the original pivot.
df_pivot_normalized = pd.DataFrame(
    normalized_ratings,
    index=df_pivot.index,
    columns=df_pivot.columns,
)

df_pivot_normalized

item,i1,i2,i3,i4,i5
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
u1,1.069045,,-0.707107,-1.521278,
u2,,-0.301511,,-0.169031,
u3,,-1.507557,-0.707107,0.507093,-1.0
u4,0.267261,0.904534,1.414214,,
u5,-1.336306,0.904534,,1.183216,1.0


In [65]:
# Attach the user-level features (standardised age, encoded gender) to the
# rating matrix.
#
# The features are aligned on the user label instead of on row position:
# `user_df` is in hand-written order while the pivot index is sorted by user,
# so a positional assignment is only correct when the two orders happen to
# coincide. `.loc` raises a KeyError if a rated user has no row in `user_df`,
# rather than silently producing NaN features.
user_features = pd.DataFrame(
    {"age": ages_normalized.ravel(), "gender": user_df["gender"].to_numpy()},
    index=pd.Index(user_df["user"], name="user"),
).loc[df_pivot_normalized.index]

df_pivot_normalized["age"] = user_features["age"]
df_pivot_normalized["gender"] = user_features["gender"]

df_pivot_normalized

item,i1,i2,i3,i4,i5,age,gender
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
u1,1.069045,,-0.707107,-1.521278,,-1.164478,0
u2,,-0.301511,,-0.169031,,-0.960184,1
u3,,-1.507557,-0.707107,0.507093,-1.0,-0.143006,0
u4,0.267261,0.904534,1.414214,,,1.389202,1
u5,-1.336306,0.904534,,1.183216,1.0,0.878466,0


In [66]:
# Replace missing ratings with 0. Because every item column was standardised
# to zero mean, 0 stands for "this item's average rating".
#
# The frame is rebound instead of using `inplace=True`: the in-place form
# mutates an object that earlier cells already displayed, and may also write
# through to any array the frame shares memory with.
df_pivot_normalized = df_pivot_normalized.fillna(0)

df_pivot_normalized

item,i1,i2,i3,i4,i5,age,gender
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
u1,1.069045,0.0,-0.707107,-1.521278,0.0,-1.164478,0
u2,0.0,-0.301511,0.0,-0.169031,0.0,-0.960184,1
u3,0.0,-1.507557,-0.707107,0.507093,-1.0,-0.143006,0
u4,0.267261,0.904534,1.414214,0.0,0.0,1.389202,1
u5,-1.336306,0.904534,0.0,1.183216,1.0,0.878466,0


In [67]:
# Pairwise cosine similarity between the user rows (ratings + age + gender).
# Entry [a, b] compares the user in row a with the user in row b, so the
# matrix is square and symmetric, with 1.0 on the diagonal.
cosine_sim = cosine_similarity(X=df_pivot_normalized)

cosine_sim

array([[ 1.        ,  0.41758017, -0.02261304, -0.41937875, -0.76748555],
       [ 0.41758017,  1.        ,  0.17601966, -0.17599665, -0.3833197 ],
       [-0.02261304,  0.17601966,  1.        , -0.52776769, -0.39061512],
       [-0.41937875, -0.17599665, -0.52776769,  1.        ,  0.2900224 ],
       [-0.76748555, -0.3833197 , -0.39061512,  0.2900224 ,  1.        ]])

In [68]:
# Label both axes of the similarity matrix with the user ids for readability.
user_labels = df_pivot_normalized.index
cosine_sim_df = pd.DataFrame(cosine_sim, index=user_labels, columns=user_labels)

cosine_sim_df

user,u1,u2,u3,u4,u5
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
u1,1.0,0.41758,-0.022613,-0.419379,-0.767486
u2,0.41758,1.0,0.17602,-0.175997,-0.38332
u3,-0.022613,0.17602,1.0,-0.527768,-0.390615
u4,-0.419379,-0.175997,-0.527768,1.0,0.290022
u5,-0.767486,-0.38332,-0.390615,0.290022,1.0


In [69]:
# Turn the "user" index into a regular column so that the frame's integer row
# labels match the row/column positions of `cosine_sim`.
#
# Guarded so that re-running the cell is a no-op: an unconditional in-place
# `reset_index` would insert a spurious "index" column on a second run.
if "user" not in df_pivot_normalized.columns:
    df_pivot_normalized = df_pivot_normalized.reset_index()

df_pivot_normalized

item,user,i1,i2,i3,i4,i5,age,gender
0,u1,1.069045,0.0,-0.707107,-1.521278,0.0,-1.164478,0
1,u2,0.0,-0.301511,0.0,-0.169031,0.0,-0.960184,1
2,u3,0.0,-1.507557,-0.707107,0.507093,-1.0,-0.143006,0
3,u4,0.267261,0.904534,1.414214,0.0,0.0,1.389202,1
4,u5,-1.336306,0.904534,0.0,1.183216,1.0,0.878466,0


In [70]:
# Locate the row of the user we want neighbours for ("u3").
# The result is an Index (not a scalar) holding the matching row label(s).
is_target_user = df_pivot_normalized["user"] == "u3"
idx = df_pivot_normalized.loc[is_target_user].index

idx

Index([2], dtype='int64')

In [71]:
# Pull out the target user's row of similarities. Indexing with the Index
# object yields a 2-D slice, so flatten it to a 1-D vector with one score
# per user.
target_rows = cosine_sim[idx]
sim_scores = target_rows.flatten()

sim_scores



array([-0.02261304,  0.17601966,  1.        , -0.52776769, -0.39061512])

In [72]:
# Pick the two users most similar to the target user as (row position, score)
# pairs, best first.
#
# The target user is excluded explicitly by position. Dropping "whatever sorts
# first" is not reliable: another user with an identical feature vector also
# scores 1.0 and could sort ahead of the target, which would then be kept as
# its own neighbour. The scores are re-read from `cosine_sim` rather than from
# the previous value of `sim_scores`, so re-running this cell gives the same
# result.
target_pos = idx[0]
neighbour_scores = [
    (pos, score)
    for pos, score in enumerate(cosine_sim[target_pos])
    if pos != target_pos
]
sim_scores = sorted(neighbour_scores, key=lambda pair: pair[1], reverse=True)[:2]

sim_scores

[(1, 0.1760196596196068), (0, -0.02261303539223646)]

In [73]:
# Keep only the row positions of the selected neighbours; the scores are no
# longer needed for the lookup.
indices = [row_pos for row_pos, _score in sim_scores]

indices

[1, 0]

In [74]:
# Show the feature rows of the nearest neighbours, most similar first.
df_pivot_normalized.iloc[indices, :]

item,user,i1,i2,i3,i4,i5,age,gender
1,u2,0.0,-0.301511,0.0,-0.169031,0.0,-0.960184,1
0,u1,1.069045,0.0,-0.707107,-1.521278,0.0,-1.164478,0


### The same example using the `surprise` library

In [75]:
from surprise import Dataset
from surprise import Reader
from surprise import KNNBasic

In [76]:
# Ratings in this toy data set are on a 1-5 scale.
reader = Reader(rating_scale=(1, 5))

# The columns are passed in (user, item, rating) order, which is the order the
# raw rating tuples below are printed in.
rating_columns = ["user", "item", "rating"]
data = Dataset.load_from_df(df[rating_columns], reader)

data.raw_ratings

[('u1', 'i1', 5.0, None),
 ('u1', 'i3', 4.0, None),
 ('u1', 'i4', 1.0, None),
 ('u2', 'i2', 3.0, None),
 ('u2', 'i4', 3.0, None),
 ('u3', 'i2', 2.0, None),
 ('u3', 'i3', 4.0, None),
 ('u3', 'i4', 4.0, None),
 ('u3', 'i5', 1.0, None),
 ('u4', 'i1', 4.0, None),
 ('u4', 'i2', 4.0, None),
 ('u4', 'i3', 5.0, None),
 ('u5', 'i1', 2.0, None),
 ('u5', 'i2', 4.0, None),
 ('u5', 'i4', 5.0, None),
 ('u5', 'i5', 2.0, None)]

In [77]:
# Cosine similarity with user_based=False, i.e. similarities are computed
# between items rather than between users.
# NOTE(review): the hand-written example above compares users; confirm that
# item-based similarity is what is intended for "the same example".
sim_options = dict(name="cosine", user_based=False)
algo = KNNBasic(sim_options=sim_options)

In [78]:
# Build the training set from every available rating; nothing is held out for
# evaluation.
train_set = data.build_full_trainset()

In [79]:
# Fit the KNN model; this computes the similarity matrix (see the log lines
# below). The fitted estimator is the cell's last expression, so its repr is
# echoed as the cell output.
algo.fit(train_set)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x31b787690>

In [80]:
# Estimate the rating user "u1" would give item "i1".
# Note that u1 already rated i1 (a 5) in the training data, so the estimate
# can be compared with the known rating. No true rating is passed to
# predict(), hence r_ui=None in the result.
prediction = algo.predict(uid="u1", iid="i1")

prediction

Prediction(uid='u1', iid='i1', r_ui=None, est=3.7466953361697586, details={'actual_k': 3, 'was_impossible': False})