In [115]:
import os
import math
import datetime

from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

In [2]:
MOVIES_DAT_FILENAME = "data/movies.dat"
RATINGS_DAT_FILENAME = "data/ratings.dat"

font = {'size': 22}
matplotlib.rc('font', **font)
plt.rcParams["figure.figsize"] = (20,9)

### Read in the data

In [3]:
movies_df = pd.read_csv(MOVIES_DAT_FILENAME, delimiter="::", names=["MovieID", "Title", "Genres"])
movies_df.head(3)

  movies_df = pd.read_csv(MOVIES_DAT_FILENAME, delimiter="::", names=["MovieID", "Title", "Genres"])


Unnamed: 0,MovieID,Title,Genres
0,1.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2.0,Jumanji (1995),Adventure|Children|Fantasy
2,3.0,Grumpier Old Men (1995),Comedy|Romance


In [4]:
ratings_df = pd.read_csv(RATINGS_DAT_FILENAME, delimiter="::", names=["UserID", "MovieID", "Rating", "Timestamp"])
ratings_df.head(3)

  ratings_df = pd.read_csv(RATINGS_DAT_FILENAME, delimiter="::", names=["UserID", "MovieID", "Rating", "Timestamp"])


Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1.0,122.0,5.0,838985046
1,1.0,185.0,5.0,838983525
2,1.0,231.0,5.0,838983392


In [5]:
movies_df["Genres"] = movies_df["Genres"].apply(lambda x: x.split("|"))
movies_df.head(3)

Unnamed: 0,MovieID,Title,Genres
0,1.0,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2.0,Jumanji (1995),"[Adventure, Children, Fantasy]"
2,3.0,Grumpier Old Men (1995),"[Comedy, Romance]"


In [6]:
all_genres = set()
movies_df["Genres"].apply(lambda x: [all_genres.add(x_i) for x_i in x])
all_genres

{'(no genres listed)',
 'Action',
 'Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'IMAX',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western'}

## Data Cleaning

In [8]:
movies_df[movies_df["MovieID"].isna()]

Unnamed: 0,MovieID,Title,Genres
76,,French Twist (Gazon maudit) (1995),"[Comedy, Romance]"
96,,Black Sheep (1996),[Comedy]
9158,,"March of the Penguins (Marche de l'empereur, L...",[Documentary]
9242,,Why We Fight (2005),[Documentary]


In [9]:
movies_df["MovieID"] = movies_df.index

In [10]:
print(ratings_df[ratings_df["MovieID"].isna()])
print(ratings_df[ratings_df["Rating"].isna()])
print(ratings_df[ratings_df["Rating"] > 5])
print(ratings_df[ratings_df["Rating"] < 0])

          UserID  MovieID  Rating   Timestamp
57           3.0      NaN     4.0  1164885590
4097459  29338.0      NaN     2.0   938741134
          UserID  MovieID  Rating   Timestamp
4096580  29330.0   4007.0     NaN  1037022521
4096702  29330.0   5308.0     NaN  1052604830
4097255  29334.0   3897.0     NaN  1007958704
4097563  29339.0   1188.0     NaN   953451230
          UserID  MovieID        Rating   Timestamp
14           1.0    466.0  1.000000e+10   838984679
165          5.0    780.0  1.000000e+10   857911264
383          8.0    215.0  1.000000e+10  1115858875
6872139  49077.0   4848.0  5.000000e+01  1043427601
6872174  49077.0   5292.0  9.999000e+03  1043352336
6872203  49077.0   5582.0  9.999990e+05  1043370871
6872220  49077.0   5952.0  1.000000e+10  1043350326
          UserID  MovieID  Rating   Timestamp
431          8.0    522.0    -3.5  1115859283
461          8.0    784.0    -3.5  1115858371
4796401  34264.0   5962.0    -4.0  1108230893
4796410  34264.0   6266.0    -4.

In [11]:
ratings_df = ratings_df[~ratings_df["MovieID"].isna()]
ratings_df = ratings_df[~ratings_df["Rating"].isna()]
ratings_df["MovieID"] = ratings_df["MovieID"].apply(int)
ratings_df = ratings_df[ratings_df["Rating"] <= 5]
ratings_df = ratings_df[ratings_df["Rating"] >= 0]

# (Q1) What are the titles of top 5 most popular movies i.e. have the most ranking in the whole dataset?

In [22]:
count_df = ratings_df[["MovieID", "Rating"]].groupby(["MovieID"]).count()
top_5 = list(count_df.sort_values(by="Rating")[-5:].index)
for movie_id in top_5:
    print(movies_df[movies_df["MovieID"] == movie_id]["Title"].values[0])

Stuart Saves His Family (1995)
In the Name of the Father (1993)
Dances with Wolves (1990)
Corrina, Corrina (1994)
Once Were Warriors (1994)


# (Q2) What are the top 5 ranked movie genres on average in the whole dataset?

In [36]:
movie_id_genre_to_genre_vector = dict()
for m_id in range(len(movies_df)):
    movie_id_genre_to_genre_vector[m_id] = dict(movies_df.iloc[m_id][list(all_genres)])

In [40]:
for genre in all_genres:
    ratings_df[genre] = ratings_df["MovieID"].apply(lambda m_id: movie_id_genre_to_genre_vector[m_id][genre] if m_id in movie_id_genre_to_genre_vector else -1)
    ratings_df = ratings_df[ratings_df[genre] != -1]
ratings_df.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp,Sci-Fi,Thriller,Adventure,Drama,Action,Children,...,(no genres listed),Musical,Mystery,Documentary,Western,Comedy,Crime,War,Romance,Animation
0,1.0,122,5.0,838985046,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,1.0,185,5.0,838983525,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
2,1.0,231,5.0,838983392,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1.0,292,5.0,838983421,1,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,1.0,316,5.0,838983392,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [48]:
mean_ratings = {}
for genre in all_genres:
    genre_rating = ratings_df[ratings_df[genre] == 1]
    mean_ratings[genre] = genre_rating["Rating"].mean()
mean_ratings = {k: v for k, v in sorted(mean_ratings.items(), key=lambda item: item[1])}
print(list(mean_ratings.keys())[-5:])
print(list(mean_ratings.values())[-5:])

['Film-Noir', 'Mystery', 'IMAX', 'War', 'Western']
[3.591536590033077, 3.621238491393564, 3.648477212027225, 3.661950168008622, 3.716090862100278]


# (Q3) How many movies have been ranked the most consecutive days?


In [65]:
ratings_df["datetime"] = ratings_df["Timestamp"].apply(lambda x: datetime.datetime.utcfromtimestamp(x).date())
ratings_df.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp,Sci-Fi,Thriller,Adventure,Drama,Action,Children,...,Musical,Mystery,Documentary,Western,Comedy,Crime,War,Romance,Animation,datetime
0,1.0,122,5.0,838985046,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1996-08-02
1,1.0,185,5.0,838983525,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,1996-08-02
2,1.0,231,5.0,838983392,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1996-08-02
3,1.0,292,5.0,838983421,1,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1996-08-02
4,1.0,316,5.0,838983392,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1996-08-02


In [75]:
def count_longest_consecutive(date_list):
    if len(date_list) == 0:
        return 0
    max_left = 0
    max_right = 0
    left = 0
    for right, date in enumerate(date_list):
        if date - date_list[left] > datetime.timedelta(days=1):
            current_max = max_right - max_left
            new_max = right - left
            if new_max > current_max:
                max_left = left
                max_right = right
            left = right
    return right - left

In [76]:
all_movie_id = list(movies_df.index.unique())
longest_consecutive = {}
for movie_id in all_movie_id:
    movie_sub_df = ratings_df[ratings_df["MovieID"] == movie_id]
    movie_sub_df = movie_sub_df.sort_values(by="datetime")
    longest_consecutive[movie_id] = count_longest_consecutive(list(movie_sub_df["datetime"]))

In [78]:
longest_consecutive = {k: v for k, v in sorted(longest_consecutive.items(), key=lambda item: item[1])}
print(list(longest_consecutive.values())[-1])
# there are 18 movies that have been ranked the most consecutive days

18


# (Q4) What are the top 5 recommended movies made to one user, e.g. , UserID = 122 (any user can be selected)

In [None]:
# strategy is to use genre to recommend movies

In [95]:
USER_ID = 122
def top_n_recommned_to_user(ratings_df, movies_df, user_id, n=5):
    ratings_sub_df = ratings_df[ratings_df["UserID"] == user_id]
    ratings_train_x = ratings_sub_df[list(all_genres)]
    ratings_train_y = ratings_sub_df["Rating"]

    reg = LinearRegression().fit(ratings_train_x, ratings_train_y)

    all_movie_x_minus_already_watched = movies_df[~movies_df["MovieID"].isin(list(ratings_sub_df["MovieID"]))]
    all_movie_x_genre_vector = all_movie_x_minus_already_watched[list(all_genres)]
    all_movie_x_genre_vector.reset_index()
    rating_hat = reg.predict(all_movie_x_genre_vector)
    top_movies = np.array(rating_hat).argsort()[-n:][::-1]
    return all_movie_x_minus_already_watched.iloc[top_movies]
print(top_n_recommned_to_user(ratings_df, movies_df, USER_ID))

       MovieID                                              Title  \
10478    10478  Friend Among Strangers, Stranger Among Friends...   
1384      1384                   Last of the Mohicans, The (1992)   
3959      3959                                  Alamo, The (1960)   
2452      2452                                   Westworld (1973)   
4464      4464                                      Sunset (1988)   

                                    Genres  Sci-Fi  Thriller  Adventure  \
10478               [Action, War, Western]       0         0          0   
1384       [Action, Romance, War, Western]       0         0          0   
3959         [Action, Drama, War, Western]       0         0          0   
2452   [Action, Sci-Fi, Thriller, Western]       1         1          0   
4464   [Action, Comedy, Thriller, Western]       0         1          0   

       Drama  Action  Children  Film-Noir  ...  (no genres listed)  Musical  \
10478      0       1         0          0  ...         

# (Q5) What are the top 5 movies that are most frequently recommended by your model? (use training set)

In [109]:
def top_n_recommned_to_user_training(ratings_df, movies_df, user_id, n=5):
    ratings_sub_df = ratings_df[ratings_df["UserID"] == user_id]
    ratings_train_x = ratings_sub_df[list(all_genres)]
    ratings_train_y = ratings_sub_df["Rating"]
    if len(ratings_train_x) == 0:
        return None

    reg = LinearRegression().fit(ratings_train_x, ratings_train_y)
    rating_hat = reg.predict(ratings_train_x)

    top_movies = np.array(rating_hat).argsort()[-n:][::-1]
    return all_movie_x_minus_already_watched.iloc[top_movies]

In [114]:
all_users = list(ratings_df["UserID"].unique())
training_recommend_frequency = {}
for user_id in tqdm(all_users):
    top_n_df = top_n_recommned_to_user_training(ratings_df, movies_df, user_id)
    if top_n_df is None:
        continue
    for movie_id in list(top_n_df["MovieID"]):
        if movie_id not in training_recommend_frequency:
            training_recommend_frequency[movie_id] = 0
        training_recommend_frequency[movie_id] += 1

 28%|██▊       | 19304/69877 [04:40<12:14, 68.88it/s]


KeyboardInterrupt: 

In [107]:
training_recommend_frequency = {k: v for k, v in sorted(training_recommend_frequency.items(), key=lambda item: item[1])}

In [113]:
movie_ids = list(training_recommend_frequency.keys())[-5:]
print(movie_ids)
print(list(training_recommend_frequency.values())[-5:])
print(movies_df[movies_df["MovieID"].isin(movie_ids)]["Title"])

[1, 4, 3, 2, 0]
[616, 638, 642, 706, 729]
0                      Toy Story (1995)
1                        Jumanji (1995)
2               Grumpier Old Men (1995)
3              Waiting to Exhale (1995)
4    Father of the Bride Part II (1995)
Name: Title, dtype: object


# (Q6) Calculate the RMSE of your model for your test set.

In [None]:
# I will consider 20% of the individual's movie rating at random as the test set

In [126]:
def get_rmse_recommned_to_user(ratings_df, movies_df, user_id, n=5):
    ratings_sub_df = ratings_df[ratings_df["UserID"] == user_id]
    train_df = ratings_sub_df.iloc[:int(len(ratings_sub_df) * 0.8)]
    test_df = ratings_sub_df.iloc[int(len(ratings_sub_df) * 0.8):]

    ratings_train_x = train_df[list(all_genres)]
    ratings_train_y = train_df["Rating"]
    
    ratings_test_x = test_df[list(all_genres)]
    ratings_test_y = test_df["Rating"]
    if len(ratings_train_x) == 0 or len(ratings_test_x) == 0:
        return None

    reg = LinearRegression().fit(ratings_train_x, ratings_train_y)

    rating_hat = reg.predict(ratings_test_x)
    squared_error = (rating_hat - ratings_test_y) ** 2
    root_mean_squared_error = math.sqrt(squared_error.mean())
    return root_mean_squared_error

In [None]:
all_users = list(ratings_df["UserID"].unique())
training_recommend_frequency = {}
mean_rmse = []
for i, user_id in tqdm(enumerate(all_users)):
    rmse = get_rmse_recommned_to_user(ratings_df, movies_df, user_id)
    if rmse:
        mean_rmse.append(rmse)
    if i % 100 == 99:
        print(sum(mean_rmse) / len(mean_rmse))

111it [00:01, 70.95it/s]

1.1565808676489493


213it [00:03, 67.15it/s]

1.2435351503527878


312it [00:04, 67.11it/s]

1.2541324460337933


411it [00:05, 69.71it/s]

1.2511884124159471


508it [00:07, 67.60it/s]