# CS 550 - Massive Data Mining 
# Recommendation Systems
## Team Members
### 1. Rushabh Bid ()
### 2. Fatima AlSaadeh ()
### 3. Keya Desai ()
### 4. Naveen Narayanan Meyyappan (nm941)

#### Data set: Movie review data set from "https://grouplens.org/datasets/movielens/latest/" 



## Data selection and preprocessing


In [0]:
# Installing the required packages
pip install surprise

Collecting surprise
  Downloading https://files.pythonhosted.org/packages/61/de/e5cba8682201fcf9c3719a6fdda95693468ed061945493dea2dd37c5618b/surprise-0.1-py2.py3-none-any.whl
Collecting scikit-surprise
[?25l  Downloading https://files.pythonhosted.org/packages/f5/da/b5700d96495fb4f092be497f02492768a3d96a3f4fa2ae7dea46d4081cfa/scikit-surprise-1.1.0.tar.gz (6.4MB)
[K     |████████████████████████████████| 6.5MB 4.0MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.0-cp36-cp36m-linux_x86_64.whl size=1678565 sha256=85f9799d95ce25ba04a8246c980eea6bc5be516a8617b18ee8622769075354ea
  Stored in directory: /root/.cache/pip/wheels/cc/fa/8c/16c93fccce688ae1bde7d979ff102f7bee980d9cfeb8641bcf
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.0 surprise-0.1


In [0]:
# Importing the required libraries
from surprise import Reader, Dataset
from surprise import SVD, accuracy, SVDpp, SlopeOne, BaselineOnly, CoClustering
import datetime
import requests, zipfile, io
from os import path
import pandas as pd
import tqdm as tqdm
from numpy import *
from sklearn.model_selection import train_test_split 
from collections import defaultdict
import time

In [0]:
# Reading the data 
rating_data = {}
movie_data = {}
training_data = []
testing_data = []
mapping_data = []
unique_user_id = []

# download http://files.grouplens.org/datasets/movielens/ml-latest-small.zip with 1M records File
# all files should be placed inside ml-latest folder
if not path.exists('ml-latest-small'):
    print("Downloading Files for first time use: ")
    download_file = requests.get('http://files.grouplens.org/datasets/movielens/ml-latest-small.zip')
    zipped_file = zipfile.ZipFile(io.BytesIO(download_file.content)) # having First.csv zipped file.
    zipped_file.extractall()


# Loading the mapping data which is to map each movie Id
# in the ratings with it's title and genre
# the resulted data structure is a dictionary where the
# movie id is the key, the genre and titles are values
def load_mapping_data():
    chunk_size = 500000
    df_dtype = {
        "movieId": int,
        "title": str,
        "genres": str
    }
    cols = list(df_dtype.keys())
    for df_chunk in tqdm.tqdm(pd.read_csv('ml-latest-small/movies.csv', usecols=cols, dtype=df_dtype, chunksize=chunk_size)):
        df_chunk.shape[0]
        combine_data = [list(a) for a in
                        zip(df_chunk["movieId"].tolist(), df_chunk["title"].tolist(),
                            df_chunk["genres"].tolist())]
        for a in combine_data:
            movie_data[a[0]] = [a[1], a[2]]
    del df_chunk

# Loading the rating data which is around 27M records it takes around 2 minutes
# the resulted data structure us a dictionary where the
# user id is the key and all their raings are values for example for user 1 :
# 1 = {
#     [movieId,rating,timestamp],
#     [movieId,rating,timestamp],
#     [movieId,rating,timestamp],
#   }


def load_data():
    chunk_size = 50000
    df_dtype = {
        "userId": int,
        "movieId": int,
        "rating": float,
        "timestamp": int,
    }
    cols = list(df_dtype.keys())
    for df_chunk in tqdm.tqdm(pd.read_csv('ml-latest-small/ratings.csv', usecols=cols, dtype=df_dtype, chunksize=chunk_size)):
        user_id = df_chunk["userId"].tolist()
        unique_user_id.extend(set(user_id))
        movie_id = df_chunk["movieId"].tolist()
        rating = df_chunk["rating"].tolist()
        timestamp = df_chunk["timestamp"].tolist()
        combine_data = [list(a) for a in zip(user_id, movie_id, rating, timestamp)]
        for a in combine_data:
            if a[0] in rating_data.keys():
                rating_data[a[0]].extend([[a[0], a[1], a[2], a[3]]])
            else:
                rating_data[a[0]] = [[a[0], a[1], a[2], a[3]]]
    del df_chunk
    return(rating_data)

# Split the data into training and testing
# this processes isn't being done for the whole dataset instead it's being done
# for each user id, for each user we split their ratings 80 training and 20 testing
# the resulted training and testing datasets are including the whole original dataset

def spilt_data():
    t0 = time.time()
    t1 = time.time()
    for u in unique_user_id:
        if len(rating_data[u]) == 1:
            x_test = rating_data[u]
            x_train = rating_data[u]
        else:
            x_train, x_test = train_test_split(rating_data[u], test_size=0.2)
        training_data.extend(x_train)
        testing_data.extend(x_test)
    total = t1 - t0
    print(int(total))

def get_movie_title(movie_id):
    if movie_id in movie_data.keys():
        return movie_data[movie_id][0]

def get_movie_genre(movie_id):
    if movie_id in movie_data.keys():
        return movie_data[movie_id][1]

In [0]:
def convert_traintest_dataframe_forsurprise(training_dataframe, testing_dataframe):
    training_dataframe = training_dataframe.iloc[:, :-1]
    testing_dataframe = testing_dataframe.iloc[:, :-1]
    reader = Reader(rating_scale=(0,5))
    trainset = Dataset.load_from_df(training_dataframe[['userId', 'movieId', 'rating']], reader)
    testset = Dataset.load_from_df(testing_dataframe[['userId', 'movieId', 'rating']], reader)
    trainset = trainset.construct_trainset(trainset.raw_ratings)
    testset=testset.construct_testset(testset.raw_ratings)
    return([trainset,testset])

In [0]:
def movie_recommendation(predictions, n=10):
    # First map the predictions to each user.
    recommendations_for_each_user = defaultdict(list)
    # Creating a dictionary with user_id as the key and the movie_id and the estimated_rating as the value
    for user_id, movie_id, true_rating, estimated_rating, _ in predictions:
        recommendations_for_each_user[user_id].append((movie_id, estimated_rating))
    # Now we will sort the Estimated_rating of different movies for each user
    for user_id, user_ratings in recommendations_for_each_user.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        # Filtering the values to top n
        recommendations_for_each_user[user_id] = user_ratings[:n]
    return(recommendations_for_each_user)

In [0]:
def baseline(trainset, testset):
    algo = BaselineOnly()
    algo.fit(trainset)
    print("Predictions")
    predictions = algo.test(testset)
    accuracy.rmse(predictions)
    accuracy.mae(predictions)
    return(predictions)

In [0]:
def svdalgorithm(trainset, testset):
    algo = SVD()
    algo.fit(trainset)
    print("Predictions")
    predictions = algo.test(testset)
    accuracy.rmse(predictions)
    accuracy.mae(predictions)
    return(predictions)

In [0]:
if __name__ == "__main__":
    print("Data Loading and Processing, Estimated Time 2 minutes :")
    load_data()
    print("Training and Testing DataSets Construction, Estimated Time 40 seconds :")
    spilt_data()
    print("Mapping Data Processing :")
    load_mapping_data()
    print("Movie name with id = 1 :")
    print(get_movie_title(1))
    print("Movie genre with id = 1 :")
    print(get_movie_genre(1))
    training_dataframe=pd.DataFrame.from_records(training_data)
    training_dataframe.columns=["userId","movieId","rating","timestamp"]
    testing_dataframe=pd.DataFrame.from_records(testing_data)
    testing_dataframe.columns=["userId","movieId","rating","timestamp"]
    trainset,testset=convert_traintest_dataframe_forsurprise(training_dataframe,testing_dataframe)
    print("Baseline algorithm using surprise package")
    baseline(trainset, testset)
    print("SVD algorithm using surprise package")
    predictions=svdalgorithm(trainset,testset)

1it [00:00,  5.55it/s]

Data Loading and Processing, Estimated Time 2 minutes :


3it [00:00,  7.90it/s]


Training and Testing DataSets Construction, Estimated Time 40 seconds :


1it [00:00, 55.10it/s]

0
Mapping Data Processing :
Movie name with id = 1 :
Toy Story (1995)
Movie genre with id = 1 :
Adventure|Animation|Children|Comedy|Fantasy





Baseline algorithm using surprise package
Estimating biases using als...
Predictions
RMSE: 0.8157
MAE:  0.6272
SVD algorithm using surprise package
Predictions
RMSE: 0.2974
MAE:  0.2224


In [0]:
top_10_movie_recommendations = movie_recommendation(predictions, n=10)
# Print the recommended movies for each user
for user_id, user_ratings in top_10_movie_recommendations.items():
  print(user_id, [movie_id for (movie_id, estimated_rating) in user_ratings])

1 [2018, 1625, 1220, 1625, 1023, 1927, 1220, 1732, 260, 1196, 1732, 1927, 1210, 1210, 1210, 1224, 1073, 2329, 1256, 1097, 2947, 608, 608, 1031, 1298, 1617, 1089, 101, 101, 2005, 2596, 2596, 2529, 2529, 2993, 2033, 2872, 940, 1291, 1291, 2991, 2991, 1278, 1278, 2137, 2427, 2502, 2502, 596, 954, 1206, 3450, 3729, 2948, 2948, 1240, 2571, 3703, 2899, 2654, 2115, 1282, 1080, 3033, 3033, 3033, 2161, 3053, 3053, 457, 457, 3441, 3147, 1213, 2700, 2139, 2139, 2141, 2692, 2116, 2116, 2094, 1049, 1025, 1025, 1025, 5060, 5060, 1024, 2944, 2944, 151, 1197, 1197, 2858, 2858, 2470, 2470, 163, 2058, 2058, 2459, 2459, 1222, 1222, 1222, 3253, 3253, 1226, 1226, 1226, 2949, 47, 1270, 2291, 2387, 2048, 2048, 2987, 2987, 1517, 2353, 2353, 2353, 3702, 3702, 157, 157, 3034, 3034, 1208, 592, 480, 480, 356, 356, 356, 1029, 1029, 1029, 1029, 1029, 3273, 1214, 1214, 2000, 1090, 110, 110, 2997, 1265, 1265, 1265, 1265, 2406, 2273, 2273, 2366, 2916, 235, 2268, 2648, 1777, 3479, 3479, 2096, 733, 2012, 3639, 3639, 363