In [28]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.decomposition import TruncatedSVD

In [29]:
# Load the ratings file: one row per rating a user gave to a movie.
# The file is tab-separated and carries no header row, so the column
# names are supplied here.
columns = ['user_id', 'item_id', 'rating', 'timestamp']
data = pd.read_csv(
    'ml-100k/u.data',
    sep='\t',
    header=None,
    names=columns,
)
data.head()


Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [30]:
# Column names for the pipe-separated movie file. Only 'item_id' and
# 'movie title' are used for the name lookup table built below.
# NOTE: this rebinds `columns`, which previously held the ratings column names.
columns = [
    'item_id', 'movie title', 'release date', 'video release date', 'IMDb URL',
    'unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
movies = pd.read_csv(
    'ml-100k/u.item',
    sep='|',
    names=columns,
    encoding='latin-1',
)

# Slim id -> title table.
movie_names = pd.DataFrame(movies, columns=['item_id', 'movie title'])
print(movie_names)


      item_id                                movie title
0           1                           Toy Story (1995)
1           2                           GoldenEye (1995)
2           3                          Four Rooms (1995)
3           4                          Get Shorty (1995)
4           5                             Copycat (1995)
...       ...                                        ...
1677     1678                          Mat' i syn (1997)
1678     1679                           B. Monkey (1998)
1679     1680                       Sliding Doors (1998)
1680     1681                        You So Crazy (1994)
1681     1682  Scream of Stone (Schrei aus Stein) (1991)

[1682 rows x 2 columns]


In [31]:
# Attach the movie metadata to every rating row; rows are matched on item_id
# (inner join, which is pd.merge's default).
combined_movies_data = pd.merge(
    left=data,
    right=movies,
    how='inner',
    on='item_id',
)
print(combined_movies_data)

       user_id  item_id  rating  timestamp  \
0          196      242       3  881250949   
1           63      242       3  875747190   
2          226      242       5  883888671   
3          154      242       3  879138235   
4          306      242       5  876503793   
...        ...      ...     ...        ...   
99995      840     1674       4  891211682   
99996      655     1640       3  888474646   
99997      655     1637       3  888984255   
99998      655     1630       3  887428735   
99999      655     1641       3  887427810   

                                             movie title release date  \
0                                           Kolya (1996)  24-Jan-1997   
1                                           Kolya (1996)  24-Jan-1997   
2                                           Kolya (1996)  24-Jan-1997   
3                                           Kolya (1996)  24-Jan-1997   
4                                           Kolya (1996)  24-Jan-1997   
...      

In [32]:
# Find the movies with the largest number of ratings (top 5 item_ids by count).
ratings_per_item = combined_movies_data.groupby('item_id')['rating'].count()
most_rated = pd.DataFrame(ratings_per_item.sort_values(ascending=False).head())

# `ating` is the original (misspelled) name of this table; it is kept as an
# alias so any later reference to it keeps working.
ating = most_rated

# Display the table: its first index value is the most-rated item_id, which is
# what the follow-up lookup of a specific item_id is based on.
most_rated


In [33]:
# Look up which title(s) are stored under item_id 50.
Filter = (combined_movies_data['item_id'] == 50)
print(combined_movies_data.loc[Filter, 'movie title'].unique())


['Star Wars (1977)']


In [34]:
# Build the utility matrix: one row per user_id, one column per movie title,
# cells hold the rating (averaged if a user/title pair occurs more than once)
# and 0 where the user has not rated the title.
rating_crosstab_mat = combined_movies_data.pivot_table(
    values='rating',
    index='user_id',
    columns='movie title',
    aggfunc='mean',
    fill_value=0,
)
print(rating_crosstab_mat)


movie title  'Til There Was You (1997)  1-900 (1994)  101 Dalmatians (1996)  \
user_id                                                                       
1                                    0             0                      2   
2                                    0             0                      0   
3                                    0             0                      0   
4                                    0             0                      0   
5                                    0             0                      2   
...                                ...           ...                    ...   
939                                  0             0                      0   
940                                  0             0                      0   
941                                  0             0                      0   
942                                  0             0                      0   
943                                  0             0

In [35]:
# Step 5: check the utility matrix dimensions before it is transposed
# (rows are user_id values, columns are movie titles).
print(rating_crosstab_mat.shape)


(943, 1664)


In [36]:
# Transpose so that each row of X is a movie title and each column a user.
X = rating_crosstab_mat.to_numpy().T
print(X.shape)


(1664, 943)


In [37]:
# Step 6: compress every row of X into 10 latent components with truncated SVD.
# random_state is pinned so repeated runs give the same decomposition.
SVD = TruncatedSVD(
    n_components=10,
    random_state=18,
)
result_mat = SVD.fit_transform(X)
print(result_mat.shape)


(1664, 10)


In [38]:
# Pearson correlation between every pair of rows of result_mat
# (rowvar=True: each row is treated as one variable).
corr_mat = np.corrcoef(result_mat, rowvar=True)
print(corr_mat.shape)


(1664, 1664)


In [39]:
# Pick out the Star Wars row of the correlation matrix.
# The position of a title among the utility-matrix columns is used as the
# row index into corr_mat.
movies_names = rating_crosstab_mat.columns
movies_list = list(movies_names)

star_wars = movies_list.index('Star Wars (1977)')
print(star_wars)

# Correlation of Star Wars with every title (one value per column title).
corr_star_wars = corr_mat[star_wars, :]
print(corr_star_wars.shape)

1398
(1664,)


In [40]:
# Recommend the titles whose correlation with Star Wars is above 0.9.
#
# The query movie is removed by position instead of with `corr < 1.0`:
# np.corrcoef does not guarantee that the diagonal is exactly 1.0 (rounding can
# leave it slightly below), so a `< 1.0` test may let Star Wars recommend
# itself, and it would also drop any other title that correlates at exactly 1.0.
highly_correlated = corr_star_wars > 0.9   # fresh boolean array; NaN compares False
highly_correlated[star_wars] = False       # never recommend the query movie itself
print(list(movies_names[highly_correlated]))

['Aliens (1986)', 'Blade Runner (1982)', 'Die Hard (1988)', 'Empire Strikes Back, The (1980)', 'Fugitive, The (1993)', 'Indiana Jones and the Last Crusade (1989)', "Jackie Chan's First Strike (1996)", 'Raiders of the Lost Ark (1981)', 'Return of the Jedi (1983)', 'Star Trek: First Contact (1996)', 'Strange Days (1995)', 'Terminator 2: Judgment Day (1991)', 'Terminator, The (1984)', 'Toy Story (1995)']
