In [2]:
import pandas as pd
import numpy as np
import os
from collections import defaultdict
import re

In [3]:
dataset_path = './ml-20M'

In [108]:
num_users = 3000
num_movies = 3000

In [4]:
# 20M ratings record
# 27,278 movies
# 138,493 users
ratings = pd.read_csv(os.path.join(dataset_path, 'ratings.csv'))
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [62]:
ratings.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,20000260.0,20000260.0,20000260.0,20000260.0
mean,69045.87,9041.567,3.525529,1100918000.0
std,40038.63,19789.48,1.051989,162169400.0
min,1.0,1.0,0.5,789652000.0
25%,34395.0,902.0,3.0,966797700.0
50%,69141.0,2167.0,3.5,1103556000.0
75%,103637.0,4770.0,4.0,1225642000.0
max,138493.0,131262.0,5.0,1427784000.0


In [96]:
# 27,278 movies
movies = pd.read_csv(os.path.join(dataset_path,'movies.csv'))

In [97]:
movies.head(10)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [98]:
movies.describe()

Unnamed: 0,movieId
count,27278.0
mean,59855.48057
std,44429.314697
min,1.0
25%,6931.25
50%,68068.0
75%,100293.25
max,131262.0


In [86]:
#num_movies = 27278
movie_id, genres = list(movies['movieId']), list(movies['genres'])
genre_names = ["action", "adventure", "animation", "children", "comedy", "crime",
               "documentary", "drama", "fantasy", "film-noir", "horror", "imax", "musical",
               "mystery", "romance", "sci-fi", "thriller", "war", "western", "(no genres listed)"]

for genre_name in genre_names:
    movies[genre_name] = np.zeros([num_movies, 1]).astype(int)

for idx in range(len(movie_id)):
    genre = genres[idx].lower()
    all_genres = genre.split("|")
    #print(all_genres)
    
    for g in all_genres:
        movies.at[idx,g] = 1

#for col in range(3,23):
    #movies.rename(columns = { movies.columns[col]: movies.columns[col].lower()},inplace=True)
    #print(movies.columns[col].lower())

movies = movies[movies['movieId']<=3000]
movies = movies.set_index(['movieId']).drop(columns=['title','genres'])
movies.head(5)

Unnamed: 0_level_0,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,film-noir,horror,imax,musical,mystery,romance,sci-fi,thriller,war,western,(no genres listed)
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0
5,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [109]:
ratings = ratings[ratings['userId']<=num_users]
ratings = ratings[ratings['movieId']<=num_movies]
ratings.describe()
print(len(ratings))

274295


In [90]:
from scipy.sparse import csr_matrix
from scipy import sparse

# 20M ratings record
# 27,278 movies
# 138,493 users

num_ratings = len(ratings)  # 274295

# userId matrix with shape[274295,3000]
uid_mat = csr_matrix(([], ([], [])), shape=(num_ratings, num_users))

# movieId matrix with shape[274295,3000]
mid_mat = csr_matrix(([], ([], [])), shape=(num_ratings, num_movies))

# uids contains a sequence of user_id in the order of ratings
uids = list(ratings['userId']-1)
mids = list(ratings['movieId']-1)

# In each rating, the user who rated the movie is set to one
uid_mat[range(num_ratings), uids] = 1
# In each rating, the movie which is rated is set to one
mid_mat[range(num_ratings), mids] = 1



In [92]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [93]:
# stop_words = set(stopwords.words('english'))   # have to download
movie_id = list(ratings['movieId'])
genres = movies.columns
#print(movie_id)

movie_genres = []
for iloc in range(num_movies):
    judge = list(movies.iloc[iloc]==1)
    judge_index = [i for i, x in enumerate(judge) if x]
    movie_genres.append(' '.join([genres[i] for i in judge_index]))

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(movie_genres)
movie_genres_map = X.toarray()

In [94]:
print(X.shape)
print(X.toarray()) # X.toarray() is numpy.ndarray
print(movie_genres_map[0])

(3000, 21)
[[0.         0.40293069 0.57234773 ... 0.         0.         0.        ]
 [0.         0.51844871 0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]
[0.         0.40293069 0.57234773 0.44774109 0.26153614 0.
 0.         0.         0.49111271 0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.        ]


In [99]:
genres_mat = np.zeros([num_ratings, 21])

for i in range(num_ratings):
    genres_mat[i] = movie_genres_map[int(mids[i])]

In [100]:
genres_mat = sparse.csr_matrix(genres_mat)

In [101]:
ratings_mat = np.array(list(ratings['rating']))#.reshape((100000,1))
print(ratings_mat)

[3.5 3.5 3.5 ... 4.5 4.  4. ]


In [102]:
from scipy.sparse import hstack
mat = hstack((uid_mat, mid_mat, genres_mat))
print(mat.shape)
print(type(mat))

(274295, 6021)
<class 'scipy.sparse.coo.coo_matrix'>


In [103]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(mat, ratings_mat, test_size=0.2)

In [104]:
print(X_train.shape)
print(type(X_train))
print(y_train)

(219436, 6021)
<class 'scipy.sparse.csr.csr_matrix'>
[5.  4.  4.  ... 2.5 3.  3. ]


In [105]:
from pyfm import pylibfm
from sklearn.metrics import mean_squared_error
fm = pylibfm.FM(num_factors=10, num_iter=10, verbose=True, task="regression", initial_learning_rate=0.01,learning_rate_schedule="optimal")
fm.fit(X_train,y_train)

Creating validation dataset of 0.01 of training for adaptive regularization
-- Epoch 1
Training MSE: 0.43152
-- Epoch 2
Training MSE: 0.38859
-- Epoch 3
Training MSE: 0.37603
-- Epoch 4
Training MSE: 0.36555
-- Epoch 5
Training MSE: 0.35388
-- Epoch 6
Training MSE: 0.34327
-- Epoch 7
Training MSE: 0.33424
-- Epoch 8
Training MSE: 0.32744
-- Epoch 9
Training MSE: 0.32044
-- Epoch 10
Training MSE: 0.31466


In [106]:
y_pred = fm.predict(X_test)
print(y_pred.shape)

(54859,)


In [107]:
print(mean_squared_error(y_test, y_pred))
print(y_test[:100])
print(y_pred[:100])

0.7275589454019264
[2.  4.  3.5 4.  2.  3.  3.  4.  3.  4.  4.  4.  5.  1.  5.  5.  3.  4.5
 4.  4.5 1.  4.  3.  2.  4.5 3.  4.  5.  3.  3.  4.  3.  4.  3.  3.5 5.
 3.  5.  4.  4.  3.  3.  3.5 4.  5.  4.  5.  4.  4.  5.  4.5 2.  4.  5.
 2.5 4.  3.  3.5 3.  3.  5.  1.5 4.  3.  4.  4.  3.  2.5 2.5 4.  5.  5.
 4.  4.  4.  3.  3.5 4.  5.  4.5 5.  5.  4.  3.  5.  4.  4.  5.  5.  2.
 4.  5.  3.  1.  3.  4.  4.  4.  2.  5. ]
[3.740907   3.85277232 4.41097062 3.86376403 3.28242707 3.34230071
 3.0661987  3.8861593  3.93682639 3.98850021 4.31411353 3.9171819
 3.44147025 1.48437086 3.7606433  3.79877485 4.14005028 3.15908186
 3.89816843 3.26701911 2.35846008 3.99958229 2.55995252 2.14383694
 3.12541247 3.35824072 3.25041574 4.0410966  3.8203086  3.12894671
 4.0346086  3.67937339 3.7877409  2.33423841 3.69318966 4.00504644
 3.63365387 3.22560599 3.59001323 4.34578321 2.48058813 4.13453341
 3.88677081 3.73412448 4.89571649 3.38089764 4.64290468 4.1960587
 4.3270353  4.48374451 3.50481992 3.5364244 

In [None]:
### Draft

In [66]:
tags = pd.read_csv(os.path.join(dataset_path, "tags.csv"), encoding="utf-8", sep=",")
tags.head(20)
tags.describe()

Unnamed: 0,userId,movieId,timestamp
count,465564.0,465564.0,465564.0
mean,68712.354263,32627.76292,1298711000.0
std,41877.674053,36080.241157,79208910.0
min,18.0,1.0,1135429000.0
25%,28780.0,2571.0,1245007000.0
50%,70201.0,7373.0,1302291000.0
75%,107322.0,62235.0,1366218000.0
max,138472.0,131258.0,1427771000.0


In [23]:
movie_id, tags = list(tags_df[u'movieId']), list(tags_df[u'tag'])
tags = [str(tag) for tag in tags]
len(tags)
len(movie_id)

465564

In [20]:
movie_tag_map = defaultdict(list)
stop_words = set()
for idx in range(len(movie_id)):
    tag = tags[idx].lower()
    tag = re.sub("[^a-zA-Z0-9 ]", " ", tag)
    tag = tag.strip()
    tag = re.sub("\s+", " ", tag)
    
    if len(tag) > 0:
        tag_words = tag.split(" ")
        tag = " ".join([x for x in tag_words if x not in stop_words])
        
        movie_tag_map[mid_map[movie_id[idx]]].append(tag)

NameError: name 'mid_map' is not defined

In [26]:
df = pd.DataFrame([[0, 2, 3], [0, 4, 1], [10, 20, 30]],columns=['A', 'B', 'C'])

In [27]:
df

Unnamed: 0,A,B,C
0,0,2,3
1,0,4,1
2,10,20,30


In [29]:
df.at[1, 'B']

4