In [1]:
import pandas as pd
import sklearn

## Creating DataFrames from MovieLens Data

In [3]:
# Column schemas for the three MovieLens files under ml-100k/.
users_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
ratings_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
genre_cols = [
    "genre_unknown", "Action", "Adventure", "Animation", "Children", "Comedy",
    "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror",
    "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"
]
# u.item: five descriptive fields followed by one flag column per genre.
movies_cols = [
    'movie_id', 'title', 'release_date', "video_release_date", "imdb_url"
] + genre_cols

# Column names are supplied explicitly via `names=` for every file;
# u.user and u.item are '|'-separated, u.data is tab-separated.
users = pd.read_csv(
    'ml-100k/u.user', names=users_cols, sep='|', encoding='latin-1')
ratings = pd.read_csv(
    'ml-100k/u.data', names=ratings_cols, sep='\t', encoding='latin-1')
movies = pd.read_csv(
    'ml-100k/u.item', names=movies_cols, sep='|', encoding='latin-1')

In [7]:
# Shift every id down by one and store it as a string (vectorized instead of
# a per-element lambda).
# NOTE: this cell overwrites the columns in place and cannot be re-run as-is:
# after one run the id columns hold strings, so `- 1` raises TypeError until
# the loading cell is executed again.
users["user_id"] = (users["user_id"] - 1).astype(str)
movies["movie_id"] = (movies["movie_id"] - 1).astype(str)
ratings["movie_id"] = (ratings["movie_id"] - 1).astype(str)
ratings["user_id"] = (ratings["user_id"] - 1).astype(str)

# Year = text after the last '-' of release_date ('01-Jan-1995' -> '1995').
# str() is kept on purpose: a missing (NaN) date still becomes the literal
# string 'nan', exactly as before.
movies["year"] = movies['release_date'].apply(lambda x: str(x).split('-')[-1])

# Ratings are stored as floats.
ratings["rating"] = ratings["rating"].astype(float)

In [8]:
# Preview the first rows of `users` after the id shift.
users.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,0,24,M,technician,85711
1,1,53,F,other,94043
2,2,23,M,writer,32067
3,3,24,M,technician,43537
4,4,33,F,other,15213


In [9]:
# Preview the first rows of `movies`, including the derived `year` column.
movies.head()

Unnamed: 0,movie_id,title,release_date,video_release_date,imdb_url,genre_unknown,Action,Adventure,Animation,Children,...,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,year
0,0,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,1995
1,1,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,1,0,0,1995
2,2,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1995
3,3,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1995
4,4,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1995


In [10]:
# Preview the first rows of `ratings` (string ids, float rating).
ratings.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,195,241,3.0,881250949
1,185,301,3.0,891717742
2,21,376,1.0,878887116
3,243,50,2.0,880606923
4,165,345,1.0,886397596


## Create a Sparse Matrix

In [78]:
# https://www.geeksforgeeks.org/how-to-create-a-sparse-matrix-in-python/#
from scipy.sparse import csr_matrix

In [82]:
# Draw a tiny sample (0.01% of the ratings) for a toy example. The seed is
# fixed so that every later cell sees the same rows on Restart & Run All;
# without it the sample, and everything derived from it, changes per run.
ratings_small = (
    ratings
    .sample(frac=0.0001, random_state=42)
    .reset_index(drop=True)
)
len(ratings_small)

10

In [83]:
# Integer user ids (rows) and movie ids (columns) of the sampled ratings;
# these are the coordinates used to build the rating matrix.
row = ratings_small["user_id"].astype(int).tolist()
col = ratings_small["movie_id"].astype(int).tolist()
print(row, col, sep="\n")

[144, 268, 711, 936, 36, 550, 2, 677, 560, 327]
[12, 201, 1118, 235, 824, 75, 349, 180, 227, 510]


In [84]:
from sklearn.preprocessing import LabelEncoder
# Re-index the raw ids as consecutive integer positions. One encoder is kept
# per axis (le1: users/rows, le2: movies/columns) so positions can be mapped
# back to the original ids later.
le1, le2 = LabelEncoder(), LabelEncoder()
row1 = le1.fit_transform(row)
col1 = le2.fit_transform(col)
print(row1, col1, sep="\n")

[2 3 8 9 1 5 0 7 6 4]
[0 3 9 5 8 1 6 2 4 7]


In [85]:
# Rating values, in the same order as row/col, cast to int; these become the
# non-zero cells of the rating matrix.
data = ratings_small["rating"].astype(int).tolist()
print(data)

[5, 2, 4, 4, 2, 4, 3, 3, 3, 4]


In [86]:
# Assemble the user x movie rating matrix from (value, (row, col)) triplets.
# One row per distinct encoded user, one column per distinct encoded movie.
# Note: despite its name, `sparseMatrix` holds a dense array because of
# the trailing .toarray().
matrix_shape = (len(set(row1)), len(set(col1)))
sparseMatrix = csr_matrix((data, (row1, col1)), shape=matrix_shape).toarray()

In [116]:
# Show the dense rating matrix (rows: encoded users, columns: encoded movies).
sparseMatrix

array([[0, 0, 0, 0, 0, 0, 3, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 2, 0],
       [5, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 2, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 4, 0, 0],
       [0, 4, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 3, 0, 0, 0, 0, 0],
       [0, 0, 3, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 4],
       [0, 0, 0, 0, 0, 4, 0, 0, 0, 0]])

In [89]:
# Translate every encoded position back to its original id.
# The range is sized from the encoder itself rather than a hard-coded 10:
# if the sample contains a repeated user or movie there are fewer than 10
# classes, and inverse_transform raises ValueError on the unseen labels.
rows_og = le1.inverse_transform(list(range(len(le1.classes_))))
cols_og = le2.inverse_transform(list(range(len(le2.classes_))))
print(rows_og)
print(cols_og)

[  2  36 144 268 327 550 560 677 711 936]
[  12   75  180  201  227  235  349  510  824 1118]


In [124]:
# Label the dense matrix with the original ids:
# index = user ids, columns = movie ids.
df = pd.DataFrame(data=sparseMatrix, index=rows_og, columns=cols_og)
df.head()

Unnamed: 0,12,75,180,201,227,235,349,510,824,1118
2,0,0,0,0,0,0,3,0,0,0
36,0,0,0,0,0,0,0,0,2,0
144,5,0,0,0,0,0,0,0,0,0
268,0,0,0,2,0,0,0,0,0,0
327,0,0,0,0,0,0,0,4,0,0


## Truncated SVD

In [123]:
# Show the full user x movie rating frame that is decomposed below.
df

Unnamed: 0,12,75,180,201,227,235,349,510,824,1118
2,0,0,0,0,0,0,3,0,0,0
36,0,0,0,0,0,0,0,0,2,0
144,5,0,0,0,0,0,0,0,0,0
268,0,0,0,2,0,0,0,0,0,0
327,0,0,0,0,0,0,0,4,0,0
550,0,4,0,0,0,0,0,0,0,0
560,0,0,0,0,3,0,0,0,0,0
677,0,0,3,0,0,0,0,0,0,0
711,0,0,0,0,0,0,0,0,0,4
936,0,0,0,0,0,4,0,0,0,0


In [120]:
# https://towardsdatascience.com/a-complete-guide-to-recommender-system-tutorial-with-sklearn-surprise-keras-recommender-5e52e8ceace1
from sklearn.decomposition import TruncatedSVD

# hyperparameters
epsilon = 1e-9       # constant added to every latent feature (presumably to avoid exact zeros downstream - TODO confirm)
latent_factors = 4   # number of SVD components kept for users and for items
svd_seed = 42        # TruncatedSVD's default solver is randomized; fix the seed for reproducible factors

# generate item latent features: fit on the transposed frame so that each
# row (sample) is a movie
item_svd = TruncatedSVD(n_components=latent_factors, random_state=svd_seed)
item_features = item_svd.fit_transform(df.transpose()) + epsilon

# generate user latent features: each row (sample) is a user
# NOTE(review): the two decompositions are fitted independently, so the user
# and item factors are not guaranteed to share a latent space - confirm that
# downstream code does not rely on their dot product.
user_svd = TruncatedSVD(n_components=latent_factors, random_state=svd_seed)
user_features = user_svd.fit_transform(df) + epsilon

In [125]:
# Sanity check: both feature matrices should have `latent_factors` columns.
item_features.shape, user_features.shape

((10, 4), (10, 4))