In [1]:
import numpy as np
import pandas as pd

import sklearn
from sklearn.decomposition import TruncatedSVD

In [2]:
# Load the raw ratings file. It is parsed with a tab separator and the
# column names are supplied explicitly through `names`.
columns = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv(
    'ml-100k/u.data',
    names=columns,
    sep='\t',
)
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [3]:
# Column names for the item file: five descriptive fields followed by the
# per-genre columns.
columns = [
    'item_id', 'movie title', 'release date', 'video release date', 'IMDb URL',
] + [
    'unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]

# The item file is pipe-separated and decoded as latin-1.
movies = pd.read_csv(
    'ml-100k/u.item',
    names=columns,
    sep='|',
    encoding='latin-1',
)

# Only the id -> title mapping is needed for the merge with the ratings.
movie_names = movies[['item_id', 'movie title']]
movie_names.head()

Unnamed: 0,item_id,movie title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [4]:
# Attach the movie title to every rating row by joining on the shared item_id
# key (default inner join).
combined_movies_data = df.merge(movie_names, on='item_id')
combined_movies_data.head()

Unnamed: 0,user_id,item_id,rating,timestamp,movie title
0,196,242,3,881250949,Kolya (1996)
1,63,242,3,875747190,Kolya (1996)
2,226,242,5,883888671,Kolya (1996)
3,154,242,3,879138235,Kolya (1996)
4,306,242,5,876503793,Kolya (1996)


In [5]:
# Number of ratings per title, most-rated first, truncated to the first ten.
top = (
    combined_movies_data
    .groupby('movie title')['rating']
    .count()
    .sort_values(ascending=False)
    [:10]
)
# The Series index holds the titles, so list it directly.
print(list(top.index))

['Star Wars (1977)', 'Contact (1997)', 'Fargo (1996)', 'Return of the Jedi (1983)', 'Liar Liar (1997)', 'English Patient, The (1996)', 'Scream (1996)', 'Toy Story (1995)', 'Air Force One (1997)', 'Independence Day (ID4) (1996)']


In [6]:
# Sanity check: which title(s) does item_id 258 correspond to?
# The mask is deliberately NOT called `filter`: binding that name at notebook
# level would shadow the built-in filter() for every later cell in the kernel.
is_item_258 = combined_movies_data['item_id'] == 258

# A single .loc selection (row mask + column label) avoids chained indexing.
combined_movies_data.loc[is_item_258, 'movie title'].unique()

array(['Contact (1997)'], dtype=object)

In [7]:
# User x title rating matrix. Cells with no rating are filled with 0; where
# several rows fall into the same (user, title) cell, pivot_table's default
# aggregation (mean) is applied.
rating_crosstab = combined_movies_data.pivot_table(
    index='user_id',
    columns='movie title',
    values='rating',
    fill_value=0,
)
rating_crosstab.head()

movie title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,2,5,0,0,3,4,0,0,...,0,0,0,5,3,0,0,0,4,0
2,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,2,0,0,0,0,4,0,0,...,0,0,0,4,0,0,0,0,4,0


In [8]:
# Transpose so that each row is a movie title and each column is a user.
X = rating_crosstab.transpose()
X.shape

(1664, 943)

In [9]:
# Project every row of X onto 12 components; the fixed random_state keeps the
# decomposition repeatable between runs.
SVD = TruncatedSVD(
    n_components=12,
    random_state=17,
)
resultant_matrix = SVD.fit_transform(X)

# Peek at the 12-component representation of the row at position 1.
resultant_matrix[1, :]

array([ 0.43658434, -0.25726146,  0.3529551 , -0.66866719, -0.2933699 ,
       -0.00608773, -0.23155994, -0.5680511 ,  0.33680062, -0.24743227,
        0.35260685, -0.08358355])

In [10]:
# Pearson correlation between every pair of rows of the reduced matrix
# (np.corrcoef treats each row as one variable, which is its default).
corr_mat = np.corrcoef(resultant_matrix, rowvar=True)
corr_mat[1, :]

array([-0.10298113,  1.        ,  0.06549218, ...,  0.16134137,
        0.5091753 ,  0.23355053])

In [11]:
# NOTE: from here on `movie_names` is the column Index of the pivot table
# (the titles), no longer the item_id/title DataFrame built when the item
# file was loaded. Re-running the merge cell after this one will not work.
movie_names = rating_crosstab.columns
movie_list = movie_names.tolist()

# Positions of titles in the column order, which is also the row order of X.
star_wars = movie_list.index('Star Wars (1977)')
movie_list.index('Scream (1996)')

1284

In [12]:
# Correlation rows for the two reference movies.
# The row positions are looked up by title instead of being hard-coded, so
# they stay correct if the set or order of titles in the pivot table changes.
# list.index raises ValueError if a title is missing, which is preferable to
# silently reading the wrong row.
corr_star_wars = corr_mat[movie_list.index('Star Wars (1977)')]
corr_toy_story = corr_mat[movie_list.index('Toy Story (1995)')]

In [30]:
# Titles that correlate strongly with BOTH reference movies:
# above 0.9 with Toy Story and above 0.85 with Star Wars.
# The strict "< 1.0" upper bounds presumably drop the reference movies
# themselves (self-correlation) -- TODO confirm this is robust to rounding.
fun = (
    (corr_toy_story > 0.9) & (corr_toy_story < 1.0)
    & (corr_star_wars > 0.85) & (corr_star_wars < 1.0)
)
' '.join(movie_names[fun])

'Men in Black (1997) Return of the Jedi (1983) Rumble in the Bronx (1995) Star Trek: First Contact (1996) Willy Wonka and the Chocolate Factory (1971)'

In [14]:
import os
import pickle

# Persist the correlation matrix and the title index under app/pkl_objects.
dest = os.path.join('app', 'pkl_objects')

# Create the target directory on first run; without this, open(..., 'wb')
# raises FileNotFoundError when app/pkl_objects does not exist yet.
os.makedirs(dest, exist_ok=True)

with open(os.path.join(dest, r'corr_mat.pickle'), 'wb') as f:
    pickle.dump(corr_mat, f)

with open(os.path.join(dest, r'movie_names.pickle'), 'wb') as f:
    pickle.dump(movie_names, f)
