# Unsupervised Learning

In [1]:
import numpy as np
import pandas as pd

from sklearnex import patch_sklearn
patch_sklearn()

from sklearn.decomposition import NMF
from joblib import dump, load

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [2]:
movies  = pd.read_csv("../data/movies.csv", index_col=0)
ratings = pd.read_csv("../data/ratings.csv", index_col=0)
tags    = pd.read_csv("../data/tags.csv", index_col=1)

In [3]:
movies.drop(columns='genres', inplace=True)
ratings.drop(columns='timestamp', inplace=True)

In [4]:
ratitle = pd.merge(ratings, movies, left_on='movieId', right_index=True)

In [5]:
avgrat = ratitle.groupby('movieId').agg({"title": "first", "rating": "mean"})
avgrat.sort_values(['rating','title'], ascending=[False, True])

Unnamed: 0_level_0,title,rating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
27751,'Salem's Lot (2004),5.0
77846,12 Angry Men (1997),5.0
141816,12 Chairs (1976),5.0
5468,20 Million Miles to Earth (1957),5.0
27373,61* (2001),5.0
...,...,...
8137,"Wasp Woman, The (1959)",0.5
8236,While the City Sleeps (1956),0.5
157172,Wizards of the Lost Kingdom II (1989),0.5
102025,Yongary: Monster from the Deep (1967),0.5


In [6]:
avgrat20users = avgrat[ratitle.value_counts('movieId') >= 20]

  avgrat20users = avgrat[ratitle.value_counts('movieId') >= 20]


In [7]:
avgrat20users.sort_values('rating', ascending=False)

Unnamed: 0_level_0,title,rating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1104,"Streetcar Named Desire, A (1951)",4.475000
318,"Shawshank Redemption, The (1994)",4.429022
922,Sunset Blvd. (a.k.a. Sunset Boulevard) (1950),4.333333
898,"Philadelphia Story, The (1940)",4.310345
475,In the Name of the Father (1993),4.300000
...,...,...
2720,Inspector Gadget (1999),2.095238
2053,"Honey, I Blew Up the Kid (1992)",2.050000
546,Super Mario Bros. (1993),2.000000
1882,Godzilla (1998),1.954545


In [8]:
ratings20 = ratings.value_counts('movieId') >= 20
ratings20 = ratings20[ratings20 == True]
ratings20 = ratings20.sort_index()

In [9]:
ratwide = ratings.pivot(columns='movieId', values='rating')
ratwide = ratwide.loc[:,ratings20.index]

In [10]:
R = ratwide.fillna(ratwide.mean())

In [11]:
%%script echo skip

model = NMF(n_components=76, solver='cd', init='nndsvda', random_state=10, max_iter=5000, tol=5e-5)
model.fit(R)
print(model.reconstruction_err_)

skip


In [12]:
#dump(model,'nmf_model')

In [13]:
model = load('../models/nmf_model')
print(model.reconstruction_err_)
Q = model.components_

163.00006783146634


In [14]:
%%script echo skip

Q = model.components_
P = model.transform(R)  
nR = np.dot(P, Q)

print(nR)

skip


In [15]:
def id_to_title(id:int)-> str:
    return movies.loc[id][0]

In [16]:
def title_to_id(title:str) -> int:
    return movies.index[movies['title'] == title][0]

In [17]:
dictoftitles = {}
for id in R.columns:
    dictoftitles[f"{id}"] = id_to_title(id)

movies_filtered = pd.DataFrame.from_dict(dictoftitles, orient='index', columns=['title'])
movies_filtered.index.name = 'movieId'

In [18]:
movies_filtered[movies_filtered['title'].str.contains("Star")]

Unnamed: 0_level_0,title
movieId,Unnamed: 1_level_1
260,Star Wars: Episode IV - A New Hope (1977)
316,Stargate (1994)
329,Star Trek: Generations (1994)
1196,Star Wars: Episode V - The Empire Strikes Back...
1210,Star Wars: Episode VI - Return of the Jedi (1983)
1356,Star Trek: First Contact (1996)
1371,Star Trek: The Motion Picture (1979)
1372,Star Trek VI: The Undiscovered Country (1991)
1373,Star Trek V: The Final Frontier (1989)
1374,Star Trek II: The Wrath of Khan (1982)


In [19]:
new_user = {
    "Titanic (1997)": 3,
    "Pulp Fiction (1994)": 5,
    "Interstellar (2014)": 5,
    "Terminator 2: Judgment Day (1991)": 3,
    "American Beauty (1999)": 5,
    "Kill Bill: Vol. 1 (2003)": 4,
    "Whiplash (2014)": 4,
    "Dead Poets Society (1989)": 4,
    "Fight Club (1999)": 5,
    "Grand Budapest Hotel, The (2014)": 3,
    "Fear and Loathing in Las Vegas (1998)": 5
}

In [20]:
def convert_dict(dict:dict) -> dict:
    new_dict = {}
    items = dict.items()

    for item in items:
        new_dict.update({title_to_id(item[0]): item[1]})
    
    return new_dict

In [21]:
new_user_dict = convert_dict(new_user)

In [22]:
movieIds = list(R.columns)
empty_list = [np.nan]*len(movieIds)
ratings_dict = dict(zip(movieIds, empty_list))

In [23]:
for movie, rating in new_user_dict.items():
    ratings_dict[movie] = rating

In [24]:
new_user_df = pd.DataFrame(list(ratings_dict.values()), index=movieIds)
new_user_df.columns = ['rating']

In [25]:
new_user_df.count()

rating    11
dtype: int64

In [26]:
neutralrating = (5.0 + 0.5) / 2
new_user_df_filled = new_user_df.fillna(neutralrating)
new_user_df_filled = new_user_df_filled.T
new_user_df_filled

Unnamed: 0,1,2,3,5,6,7,10,11,16,17,...,122920,122922,134130,134853,139385,148626,152081,164179,166528,168252
rating,2.75,2.75,2.75,2.75,2.75,2.75,2.75,2.75,2.75,2.75,...,2.75,2.75,2.75,2.75,2.75,2.75,2.75,2.75,2.75,2.75


In [27]:
new_user_P = model.transform(new_user_df_filled)
new_user_P

array([[0.00000000e+00, 0.00000000e+00, 2.04296772e-03, 0.00000000e+00,
        2.83170303e-03, 0.00000000e+00, 1.11280006e-03, 5.63880317e-04,
        3.41807118e-03, 0.00000000e+00, 2.42893469e-03, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 2.47691886e-04, 4.97793607e-04,
        0.00000000e+00, 0.00000000e+00, 2.11682469e-04, 1.26133147e-03,
        0.00000000e+00, 0.00000000e+00, 6.97499436e-04, 2.62195380e-04,
        5.27051144e-04, 6.31508570e-04, 6.32214729e-04, 7.39428818e-04,
        0.00000000e+00, 6.13405011e-04, 2.20882007e-03, 4.56737218e-03,
        4.58048143e-03, 1.28148758e-03, 6.41923171e-04, 0.00000000e+00,
        4.26561459e-04, 8.82303787e-04, 5.03617827e-04, 0.00000000e+00,
        0.00000000e+00, 1.84789447e-03, 0.00000000e+00, 7.55334647e-05,
        1.20800853e-04, 5.27882169e-03, 4.32242126e-03, 0.00000000e+00,
        5.94210242e-04, 3.85904980e-04, 0.00000000e+00, 3.05471459e-04,
        6.87111875e-04, 1.31307070e-03, 4.92020881e-06, 1.032043

In [28]:
preds = np.dot(new_user_P, Q)

In [29]:
recs = pd.DataFrame(preds, columns=new_user_df_filled.columns)

In [30]:
not_rated_mask = np.isnan(new_user_df.T.values[0])
not_rated = recs.columns[not_rated_mask]
items_to_recommend = recs[not_rated]
items_to_recommend = items_to_recommend.T
items_to_recommend.columns = ['predicted_rating']
items_to_recommend_sorted = items_to_recommend.sort_values(by='predicted_rating', ascending=False)

In [31]:
items_to_recommend_sorted['predicted_rating'] = items_to_recommend_sorted['predicted_rating'].map(lambda x: round(x,2))
items_to_recommend_sorted.reset_index(names='title', inplace=True)
items_to_recommend_sorted['title'] = items_to_recommend_sorted['title'].map(lambda x: id_to_title(x))
items_to_recommend_sorted.index = items_to_recommend_sorted.index + 1
items_to_recommend_sorted

Unnamed: 0,title,predicted_rating
1,"Streetcar Named Desire, A (1951)",3.41
2,Dances with Wolves (1990),3.37
3,"Wolf of Wall Street, The (2013)",3.36
4,"Shining, The (1980)",3.34
5,Pretty Woman (1990),3.30
...,...,...
1282,Twilight (2008),1.76
1283,Inspector Gadget (1999),1.75
1284,Buffy the Vampire Slayer (1992),1.75
1285,"Honey, I Blew Up the Kid (1992)",1.66


In [32]:
recdf = items_to_recommend_sorted.loc[:10]
list = []
for i in range(recdf.shape[0]):
    list.append(f"{recdf.index[i]}, {recdf['title'][i+1]}, {recdf['predicted_rating'][i+1]}")

In [33]:
list

['1, Streetcar Named Desire, A (1951), 3.41',
 '2, Dances with Wolves (1990), 3.37',
 '3, Wolf of Wall Street, The (2013), 3.36',
 '4, Shining, The (1980), 3.34',
 '5, Pretty Woman (1990), 3.3',
 '6, Notorious (1946), 3.3',
 '7, Seven (a.k.a. Se7en) (1995), 3.28',
 '8, Shutter Island (2010), 3.28',
 "9, Rosemary's Baby (1968), 3.28",
 '10, Airplane! (1980), 3.28']

In [40]:
n_ratings = []
mov_ratio = []
for i in range(101):
    ratings_series = ratings.value_counts('movieId') >= i
    den = ratings_series.shape[0]
    nom = ratings_series.value_counts().loc[True]
    n_ratings.append(i)
    mov_ratio.append(nom*100/den)

data = {"n_ratings": n_ratings, "mov_ratio": mov_ratio}
n_rat_vs_movratio = pd.DataFrame(data)

In [41]:
n_rat_vs_movratio

Unnamed: 0,n_ratings,mov_ratio
0,0,100.000000
1,1,100.000000
2,2,64.561909
3,3,51.213492
4,4,42.986425
...,...,...
96,96,1.542575
97,97,1.501440
98,98,1.450021
99,99,1.439737


In [36]:
import plotly.express as px

In [45]:
fig = px.line(
    n_rat_vs_movratio,
    x='n_ratings',
    y='mov_ratio',
    template='plotly_dark',
    labels={
        'n_ratings': '# of ratings',
        'mov_ratio': '% of movies'
    }
    )
#fig.write_json("../code_streamlit/img/plot.json")
fig.show()