# Group Members

**Name:** Gayathri Prerepa

**Net ID:** gp2254


**Name:** Rohan Sardana

**Net ID:** rs7445

# Setting up the environment

## Installing required dependencies

In [1]:
!pip install neattext
!pip install pyspark

Collecting neattext
  Downloading neattext-0.1.3-py3-none-any.whl (114 kB)
[K     |████████████████████████████████| 114 kB 8.3 MB/s 
[?25hInstalling collected packages: neattext
Successfully installed neattext-0.1.3
Collecting pyspark
  Downloading pyspark-3.2.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 33 kB/s 
[?25hCollecting py4j==0.10.9.3
  Downloading py4j-0.10.9.3-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 56.6 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.1-py2.py3-none-any.whl size=281853642 sha256=da32324e4ebbc09ffdfd1260829b92b69ae85db683f166b789b532d9996e7b43
  Stored in directory: /root/.cache/pip/wheels/9f/f5/07/7cd8017084dce4e93e84e92efd1e1d5334db05f2e83bcef74f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.3 pyspark-3.2.1

## Importing Libraries

In [2]:
import pandas as pd
import numpy as np
import neattext.functions as nfx
import os
import glob
import math
import random
from collections import Counter
from multiprocessing import Pool
import sys
import time
import pyspark
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf
from pyspark.ml.recommendation import ALS
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings("ignore")

## Mouting drive

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Setting up Spark

In [4]:
conf = SparkConf().set("spark.ui.port", "4050")
sc = pyspark.SparkContext(conf=conf)
spark = SparkSession.builder.getOrCreate()

# Dataset Pre-Processing

## Functions for Cleaning Datasets

In [5]:
#Function: To split String to List
def convert_stol(lst):
    return (lst.split())

In [6]:
#Function: Lowercase List to String
def lowcase_list_to_string(df, column_name):
  df[column_name] = df[column_name].apply(lambda x : ' '.join(str(y) for y in x))
  df[column_name] = df[column_name].str.lower()
  return df

In [7]:
#Function: To convert String to List
def string_to_list(df, column_name):
  df[column_name] = df[column_name].apply(lambda x: ' '.join(sorted(set(x.split(' ')))))
  df[column_name] = df[column_name].apply(lambda x: convert_stol(x))
  return df

In [8]:
#Function: To remove stopwords and punctuations from String
def clean_column(df, column_name):
  df[column_name] = df[column_name].apply(nfx.remove_puncts)
  df[column_name] = df[column_name].apply(nfx.remove_special_characters)
  df[column_name] = df[column_name].apply(nfx.remove_numbers)
  df[column_name] = df[column_name].apply(nfx.remove_stopwords)
  return df

## Datasets for Movies

### Information Dataset

In [9]:
#Movie genre dataset
movies = pd.read_csv('/content/drive/MyDrive/MMDS_Project/datasets/movielens/movies.csv')
movies['genres'] = movies['genres'].apply(lambda x: x.split('|'))
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men (1995),"[Comedy, Romance]"
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]"
4,5,Father of the Bride Part II (1995),[Comedy]


In [10]:
#Movie tags dataset
tags = pd.read_csv('/content/drive/MyDrive/MMDS_Project/datasets/movielens/tags.csv')
tags = tags.drop(['timestamp'], axis = 1)
tags.head()

Unnamed: 0,userId,movieId,tag
0,3,260,classic
1,3,260,sci-fi
2,4,1732,dark comedy
3,4,1732,great dialogue
4,4,7569,so bad it's good


In [11]:
#Combining Movie datasets
movies_df = pd.merge(movies, tags, on="movieId")
aggregation_functions = { 'tag': [lambda x: x.drop_duplicates().tolist()]}
movies_df = movies_df.groupby('movieId').agg(aggregation_functions).reset_index()
movies_df = pd.merge(movies_df, movies, on="movieId")
movies_df = movies_df.drop(["movieId"], axis = 1)
movies_df.columns = ['movieId','tags', 'title', 'genres']
movies_df.head()

Unnamed: 0,movieId,tags,title,genres
0,1,"[Owned, imdb top 250, Pixar, time travel, chil...",Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,"[Robin Williams, time travel, fantasy, based o...",Jumanji (1995),"[Adventure, Children, Fantasy]"
2,3,"[funny, best friend, duringcreditsstinger, fis...",Grumpier Old Men (1995),"[Comedy, Romance]"
3,4,"[based on novel or book, chick flick, divorce,...",Waiting to Exhale (1995),"[Comedy, Drama, Romance]"
4,5,"[aging, baby, confidence, contraception, daugh...",Father of the Bride Part II (1995),[Comedy]


In [12]:
#Preprocessing, Normalizing and Saving

#Lowercasing genres
movies_df = lowcase_list_to_string(movies_df, 'genres')
movies_df = string_to_list(movies_df,'genres')

#Lowercasing tags and removing filler words
movies_df = lowcase_list_to_string(movies_df, 'tags')
movies_df = clean_column(movies_df,'tags')
movies_df = string_to_list(movies_df, 'tags')
movies_data = movies_df

#Saving final movies dataframe to file
list_of_movies = movies_data['title'].tolist()
movies_data.to_csv('/content/drive/MyDrive/MMDS_Project/datasets/final_data/movies_data.csv',index = False)
movies_data.head()

Unnamed: 0,movieId,tags,title,genres
0,1,"[acting, action, adventure, allen, american, a...",Toy Story (1995),"[adventure, animation, children, comedy, fantasy]"
1,2,"[action, adaptation, adapted, adventure, allsb...",Jumanji (1995),"[adventure, children, fantasy]"
2,3,"[actually, ann, best, burgess, clv, comedinha,...",Grumpier Old Men (1995),"[comedy, romance]"
3,4,"[based, book, characters, chick, clv, divorce,...",Waiting to Exhale (1995),"[comedy, drama, romance]"
4,5,"[aging, baby, child, clv, comedy, confidence, ...",Father of the Bride Part II (1995),[comedy]


### Ratings Dataset

In [13]:
#Movie ratings dataset
ratings = pd.read_csv('/content/drive/MyDrive/MMDS_Project/datasets/movielens/ratings.csv')
movies_ratings = ratings.drop(['timestamp'], axis = 1)
movies_ratings.to_csv('/content/drive/MyDrive/MMDS_Project/datasets/final_data/movies_ratings.csv',index = False)
movies_ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,296,5.0
1,1,306,3.5
2,1,307,5.0
3,1,665,5.0
4,1,899,3.5


## Datasets for Books

### Information Dataset

In [14]:
#Books genre dataset
books_df = pd.read_csv("/content/drive/MyDrive/MMDS_Project/datasets/books_1.Best_Books_Ever.csv")
books_df = books_df[["title", "genres","description", "isbn"]]
books_df.columns = ["title", "genres","tags", "isbn"]

books_df['title'] = books_df['title'].str.lower()
books_df.head()

Unnamed: 0,title,genres,tags,isbn
0,the hunger games,"['Young Adult', 'Fiction', 'Dystopia', 'Fantas...",WINNING MEANS FAME AND FORTUNE.LOSING MEANS CE...,9780439023481
1,harry potter and the order of the phoenix,"['Fantasy', 'Young Adult', 'Fiction', 'Magic',...",There is a door at the end of a silent corrido...,9780439358071
2,to kill a mockingbird,"['Classics', 'Fiction', 'Historical Fiction', ...",The unforgettable novel of a childhood in a sl...,9999999999999
3,pride and prejudice,"['Classics', 'Fiction', 'Romance', 'Historical...",Alternate cover edition of ISBN 9780679783268S...,9999999999999
4,twilight,"['Young Adult', 'Fantasy', 'Romance', 'Vampire...",About three things I was absolutely positive.\...,9780316015844


In [15]:
#Preprocessing, Normalizing and Saving

#Normalizing tags
books_df['tags'] = books_df.tags.astype(str)
books_df['tags'] = books_df['tags'].str.lower()
books_df = clean_column(books_df, 'tags')
books_df = string_to_list(books_df, 'tags')

#Normalizing genres
books_df['genres'] = books_df['genres'].str.lower()
books_df = clean_column(books_df, 'genres')
books_df = string_to_list(books_df, 'genres')

#Saving final books dataframe to file
list_of_books = books_df['title'].tolist()
books_df.to_csv('/content/drive/MyDrive/MMDS_Project/datasets/final_data/books_data.csv',index = False)
books_df.head()

Unnamed: 0,title,genres,tags,isbn
0,the hunger games,"[action, adult, adventure, apocalyptic, dystop...","[ages, america, annual, beforeand, begun, boy,...",9780439023481
1,harry potter and the order of the phoenix,"[adult, adventure, audiobook, childrens, class...","[arts, authorities, big, boundless, corridor, ...",9780439358071
2,to kill a mockingbird,"[adult, classics, fiction, high, historical, l...","[academy, alabama, american, appeal, awardwinn...",9999999999999
3,pride and prejudice,"[adult, classic, classics, fiction, historical...","[alternate, appeared, austen, austens, beau, b...",9999999999999
4,twilight,"[adult, fantasy, fiction, paranormal, romance,...","[absolutely, bethat, bite, bloodand, didnt, do...",9780316015844


### Ratings Dataset

In [16]:
#Books ratings dataset
path2 = r'/content/drive/MyDrive/MMDS_Project/datasets/user/'
all_files2 = glob.glob(os.path.join(path2, "*.csv"))          # Used os.path.join as this makes concatenation OS independent

df_from_each_file2 = (pd.read_csv(f) for f in all_files2)
books_ratings = pd.concat(df_from_each_file2, ignore_index=True)
books_ratings.head()

Unnamed: 0,ID,Name,Rating
0,1,Agile Web Development with Rails: A Pragmatic ...,it was amazing
1,1,The Restaurant at the End of the Universe (Hit...,it was amazing
2,1,Siddhartha,it was amazing
3,1,The Clock of the Long Now: Time and Responsibi...,really liked it
4,1,"Ready Player One (Ready Player One, #1)",really liked it


In [17]:
#Preprocessing, Normalizing and Saving

#Normalizing
d = {"This user doesn't have any rating":0, 'really liked it':4, 'liked it':3,'it was amazing':5, 'it was ok':2,'did not like it':1 }
books_ratings['rating'] = books_ratings['Rating'].map(d)
books_ratings.columns = ['userId','title', 'review', 'rating']
books_ratings['title'] = books_ratings['title'].str.lower()
books_ratings = books_ratings.drop(columns=['review'])
books_ratings = books_ratings.loc[books_ratings['title'].isin(list_of_books)]
books_ratings = books_ratings.sort_values(by=['userId'])
books_ratings = books_ratings.reset_index(drop=True)

#Saving final books ratings dataframe to file
books_ratings.to_csv('/content/drive/MyDrive/MMDS_Project/datasets/final_data/books_ratings.csv',index = False)
books_ratings.head()

Unnamed: 0,userId,title,rating
0,1,siddhartha,5
1,1,one hundred years of solitude,4
2,1,deception point,2
3,1,the tale of mr. jeremy fisher,5
4,1,the tale of peter rabbit,5


### Books PySpark (Spark) DataFrame
*(For Collaborative Filtering)*

In [19]:
#Conversion of Pandas to Spark and Saving in new file
books_ratings_temp = books_ratings.reset_index()
books_ratings_schema = StructType([
                                   StructField("index", IntegerType(), True),
                                   StructField("userId", IntegerType(), True),
                                   StructField("title", StringType(), True),
                                   StructField("rating", IntegerType(), True)
                                   ])
books_ratings_spark = spark.createDataFrame(books_ratings_temp, schema = books_ratings_schema)
books_ratings_spark.write.option("header",True).csv("/content/drive/MyDrive/MMDS_Project/datasets/final_data/books_ratings_spark.csv")
books_ratings_spark.show()

+-----+------+--------------------+------+
|index|userId|               title|rating|
+-----+------+--------------------+------+
|    0|     1|          siddhartha|     5|
|    1|     1|one hundred years...|     4|
|    2|     1|     deception point|     2|
|    3|     1|the tale of mr. j...|     5|
|    4|     1|the tale of peter...|     5|
|    5|     1|the life-changing...|     5|
|    6|     1|team of rivals: t...|     5|
|    7|     1|losing my virgini...|     5|
|    8|     1|   lord of the flies|     5|
|    9|     1|                open|     5|
|   10|     1|zero to one: note...|     4|
|   11|     1|elon musk: tesla,...|     5|
|   12|     1|            airframe|     3|
|   13|     1|    the terminal man|     3|
|   14|     1|                  s.|     4|
|   15|     1|all the light we ...|     4|
|   16|     1|sapiens: a brief ...|     5|
|   17|     1|death in the afte...|     4|
|   18|     1|delivering happin...|     5|
|   19|     1|the diary of a yo...|     4|
+-----+----

# Recommendation Functions

## Loading Pre-Processed Datasets

In [20]:
#Movies Datasets
movies_data = pd.read_csv('/content/drive/MyDrive/MMDS_Project/datasets/final_data/movies_data.csv')
list_of_movies = movies_data['title'].tolist()
movies_ratings = pd.read_csv('/content/drive/MyDrive/MMDS_Project/datasets/final_data/movies_ratings.csv')

#Books Datasets
books_data = pd.read_csv('/content/drive/MyDrive/MMDS_Project/datasets/final_data/books_data.csv')
list_of_books = books_data['title'].tolist()
books_ratings = pd.read_csv('/content/drive/MyDrive/MMDS_Project/datasets/final_data/books_ratings.csv')

#Books Spark Data
books_ratings_schema = StructType([
                                   StructField("index", IntegerType(), True),
                                   StructField("userId", IntegerType(), True),
                                   StructField("title", StringType(), True),
                                   StructField("rating", IntegerType(), True)
                                   ])
books_ratings_spark = spark.read.option("header",True).schema(books_ratings_schema).csv("/content/drive/MyDrive/MMDS_Project/datasets/final_data/books_ratings_spark.csv")

In [21]:
#Printing Shapes of all the loaded databases
print("----------MOVIES DATASET----------")
print("Number of Total Movies:", movies_data['title'].count())
print("Number of Total Movie Reviews:", movies_ratings['rating'].count())
print("\n----------BOOKS DATASET----------")
print("Number of Total Books:", books_data['title'].count())
print("Number of Total Book Reviews:", books_ratings['rating'].count())

----------MOVIES DATASET----------
Number of Total Movies: 45251
Number of Total Movie Reviews: 25000095

----------BOOKS DATASET----------
Number of Total Books: 52478
Number of Total Book Reviews: 155635


## Initial processing of the input book

In [22]:
#Function: Initial processing of the input book name and getting it's genre
def process_input_book(book_name):
  book_index= books_data.loc[books_data['title'] == book_name].index.values.astype(int)[0]
  # print("Input Book Index:",book_index)

  input_book_genre = books_data['genres'][book_index].strip("][''").split("', '")
  # print("Input Book Genre:", input_book_genre)
  return book_name, book_index, input_book_genre

## Books Recommendation

### Get users with similar preferences

In [23]:
#Function: Get list of users who gave ratings to the input book
def sim_users(book_name):
  similar_users_df = books_ratings.loc[books_ratings['title'] == book_name]
  similar_users_df = similar_users_df.sort_values(by='rating', ascending = False)
  max_rating = similar_users_df['rating'].tolist()[0]
  similar_users_df = similar_users_df[similar_users_df['rating'] == max_rating]
  similar_users = similar_users_df['userId'].tolist()
  return similar_users

### Similar Books by Collaborative Filtering

In [24]:
#Function: Recommendation Model using ALS (Alternating Least Squares)
def ALS_Rec(training):
  als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="index", ratingCol="rating", coldStartStrategy="drop")
  model = als.fit(training)
  return model

In [25]:
#Getting Highly rated Books from similar users
def sim_books(input_book_name, all_similar_users):
  similar_users = []
  if(len(all_similar_users)>5):
    similar_users.append(random.sample(all_similar_users, 5))
  else:
    similar_users = all_similar_users
  print("\nRecommending...")
  similar_books_index = []
  ALS_model = ALS_Rec(books_ratings_spark)
  ALS_recs = ALS_model.recommendForAllUsers(5)
  for similar_user in similar_users:
    # print(similar_user)
    dfa = ALS_recs.filter(ALS_recs.userId == similar_user)
    sbi = dfa.select("recommendations.index").collect()[0][0]
    sbs = dfa.select("recommendations.rating").collect()[0][0]
    for each_b in range(len(sbi)):
      similar_books_index.append([sbi[each_b],sbs[each_b]])

  similar_books_df = pd.DataFrame(similar_books_index)
  similar_books_df = similar_books_df.sort_values(by=1,ascending = False)
  similar_books_df = similar_books_df.head(6)
  similar_books_i = similar_books_df[0].tolist()

  similar_books = []
  for each_bindex in similar_books_i:
    dfb = books_ratings_spark.filter(books_ratings_spark.index == each_bindex)
    sbn = dfb.select("title").collect()[0][0]
    similar_books.append(sbn)
  if (input_book_name in similar_books):
    similar_books.remove(input_book_name)
  else:
    similar_books = similar_books[0:5]

  return similar_books

## Movie Recommendation

### Reducing Movies Dataset by Genre


In [26]:
#Function: Reduce movies dataset by genre matching
def matching_movies(input_book_genre):
  matched_genre = []
  for movie_index,each_movie in enumerate(list_of_movies):
    matched = 0
    movie_genre = movies_data['genres'][movie_index].strip("][''").split("', '")
    for each_movie_genre in movie_genre:
      for each_book_genre in input_book_genre:
        if(each_movie_genre == each_book_genre):
          matched+=1
    matched_genre.append([movie_index,matched])

  matched_genre = pd.DataFrame(matched_genre, columns = ['movieId', 'genreMatches'])
  matched_genre = matched_genre.sort_values(by='genreMatches', ascending=False)
  matched_genre = matched_genre.reset_index(drop=True)
  highest_matches = matched_genre['genreMatches'][0]
  matched_genre = matched_genre[matched_genre.genreMatches == highest_matches]
  matched_genre_movies = matched_genre['movieId'].tolist()

  reduced_movies = movies_data.loc[movies_data['movieId'].isin(matched_genre_movies)]
  return reduced_movies

### Cosine Similarity

In [27]:
def Cosine_Similarity(tags1, tags2):
    tags1 = ''.join(map(str, tags1))
    tags2 = ''.join(map(str, tags2))
    data = [tags1,tags2]

    # #Cosine Similarity using CountVectorizer
    # count_vectorizer = CountVectorizer()
    # vector_matrix = count_vectorizer.fit_transform(data)
    # tokens = count_vectorizer.get_feature_names()

    #Cosine Similarity using TfidfVectorizer
    Tfidf_vect = TfidfVectorizer()
    vector_matrix = Tfidf_vect.fit_transform(data)
    tokens = Tfidf_vect.get_feature_names_out()

    cosine_similarity_matrix = cosine_similarity(vector_matrix)
    doc_names = [f'doc_{i+1}' for i, _ in enumerate(cosine_similarity_matrix)]
    df_cosine = pd.DataFrame(data=cosine_similarity_matrix, index=doc_names, columns=['doc_1','doc_2'])

    return df_cosine['doc_1']['doc_2']

### Jaccard Similarity

In [28]:
def Jaccard_Similarity(tags1, tags2):
  tags1 = set(tags1)
  tags2 = set(tags2)
  
  #Find the intersection of tags1 & tags2
  intersection = tags1.intersection(tags2)

  #Find the union of tags1 & tags2
  union = tags1.union(tags2)
    
  #Calculate Jaccard similarity score using length of intersection set divided by length of union set
  return float(len(intersection)) / len(union)

### Movie Recommendation Function

In [32]:
#Function: Recommendation of Similar Movies according to the input book by applying Jaccard Similarity on the descriptions of both
def rec_movies(similar_books, reduced_movies):
  movie_sim = []
  red_movies = reduced_movies['title'].tolist()
  # print(red_movies)
  for every_book in similar_books:
    a1 = books_data[books_data.title == every_book]
    book_tags = a1['tags'].tolist()[0].strip("][''").split("', '")
    
    for m,every_movie in enumerate(red_movies):
      a2 = reduced_movies[reduced_movies.title == every_movie]
      movie_tags = reduced_movies['tags'].tolist()[m].strip("][''").split("', '")

      # sim_score = Cosine_Similarity(book_tags,movie_tags)
      sim_score = Jaccard_Similarity(book_tags,movie_tags)

      movie_sim.append([every_movie,sim_score])

  similar_movies_df = pd.DataFrame(movie_sim)
  similar_movies_df = similar_movies_df.sort_values(by=1,ascending = False)
  similar_movies_df = similar_movies_df.drop_duplicates(subset=[0])
  similar_movies_df = similar_movies_df.head(5)
  similar_movies = similar_movies_df[0].tolist()
  return similar_movies

We used Jaccard Similarity because:

1.   Jaccard similarity takes only **unique set of words** for each sentence /document while cosine similarity takes **total length of the vectors**.
2.   This means that if we repeat the word “friendship” in description 1 several times, cosine similarity **changes** but Jaccard similarity does not.
3. Jaccard similarity is good for cases where duplication does not matter, cosine similarity is good for cases where duplication matters while analyzing text similarity.

So, as we have two descriptions (one for book and one for movie), it will be better to use **Jaccard similarity** as repetition of a word does not reduce their similarity.



## Driver Function

In [29]:
#The main driver function for the Heterogeneous Recommendation System
def recommender():
  input_book_name = input("\nEnter Book Name: ")
  input_book_name = input_book_name.lower()
  if(input_book_name not in list_of_books):
    print("This book is not present in the dataset. Please enter a different book!")
    recommender()
  else:
    input_book_name, book_index, input_book_genre = process_input_book(input_book_name)
    reduced_movies = matching_movies(input_book_genre)
    similar_users = sim_users(input_book_name)
    print("\nThe UserIDs of users with similar reading preferences are:", end=" ")
    print(*similar_users, sep = ", ")

    recc_books = sim_books(input_book_name, similar_users)
    recc_books_p = [recc_book.title() for recc_book in recc_books]
    print("\n----------RECOMMENDED BOOKS----------")
    print(*recc_books_p, sep = "\n")

    recc_movies = rec_movies(recc_books, reduced_movies)
    print("\n----------RECOMMENDED MOVIES----------")
    print(*recc_movies, sep = "\n")

# Heterogeneous Recommender

In [34]:
#Run this cell to get recommendations
recommender()


Enter Book Name: Twilight

The UserIDs of users with similar reading preferences are: 284, 307

Recommending...

----------RECOMMENDED BOOKS----------
The Guernsey Literary And Potato Peel Pie Society
The Namesake
Where The Sidewalk Ends
Extremely Loud & Incredibly Close
The Curious Incident Of The Dog In The Night-Time

----------RECOMMENDED MOVIES----------
Borrowers, The (1997)
Goodfellas (1990)
Hell Up in Harlem (1973)
13th Warrior, The (1999)
Someone Like You (2001)
