### IMPORTING LIBRARIES AND DATA

In [3]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from ast import literal_eval
from sklearn.feature_extraction.text import CountVectorizer

In [4]:
print("GATHERING DATA")
# Raw movie metadata export consumed by the rest of the notebook.
MOVIE_DATA_PATH = "kafka-consumer/data/movie_data_1.csv"
data = pd.read_csv(MOVIE_DATA_PATH)
# User watch-history inputs, currently disabled:
# user_data = pd.read_csv("kafka-consumer/data/user_watched_256.csv")
# user_data = pd.read_csv("kafka-consumer/data/user_watched_ongoing.csv")

### DATA PREPROCESSING

In [None]:
print("PREPROCESSING")
# Convert date form to year
def _release_year(value):
  """Return the text before the first '-' of a date string, or "" for float cells."""
  # Float cells (presumably NaN for dates missing from the CSV) are marked with ""
  # so the filter below can drop them.
  if isinstance(value, float):
    return ""
  return value.split('-')[0]

# Map cell by cell on the values themselves instead of looking rows up by integer
# label, so this does not depend on `data` having a gap-free 0..n-1 index.
data['release_date'] = data['release_date'].map(_release_year)
# Drop the rows that had no usable release date.
data = data[data['release_date'] != ""]

In [None]:
# Grabbing the names of all the belongs_to_collection attached to each movie
# for i in range(len(data['belongs_to_collection'])):
#   if isinstance(data['belongs_to_collection'][i], float):
#     data.at[i, 'belongs_to_collection'] = ""
#   else:
#     row = data['belongs_to_collection'][i]
#     row = ast.literal_eval(row)
#     if not row == {}:
#       row = row['name']
#     else:
#       row = ""
#     data.at[i, 'belongs_to_collection'] = row

In [None]:
# Keep a row only if the movie was found (message is not "movie not found"),
# its status is not "Rumored", and its status is not missing.
rows_to_keep = (
    (data['message'] != "movie not found")
    & (data['status'] != "Rumored")
    & data['status'].notna()
)
data = data[rows_to_keep]

In [None]:
# Remove the columns that are not used in the steps below
unused_columns = [
    'tmdb_id', 'imdb_id', 'original_title', 'belongs_to_collection',
    'budget', 'homepage', 'original_language', 'overview',
    'poster_path', 'production_countries', 'revenue', 'runtime',
    'status', 'vote_average', 'vote_count', 'message',
]
data = data.drop(columns=unused_columns)
# Show which columns and how many rows remain.
print(data.columns)
print(data.shape)

In [None]:
# Reduce each movie's serialized genre list to its genre names,
# lower-cased and with spaces removed so each name is a single token.
data['genres'] = data['genres'].apply(
    lambda raw: [entry['name'].lower().replace(' ', '') for entry in literal_eval(raw)]
)

In [None]:
# Reduce each movie's serialized production_companies list to the company names,
# lower-cased and with spaces removed so each name is a single token.
data['production_companies'] = data['production_companies'].apply(
    lambda raw: [entry['name'].lower().replace(' ', '') for entry in literal_eval(raw)]
)

In [None]:
# Grabbing the names of all the production_countries attached to each movie
# data['production_countries'] = data['production_countries'].apply(literal_eval)
# data['production_countries'] = data['production_countries'].apply(lambda x: [i['name'].lower() for i in x])

In [None]:
# Reduce each movie's serialized spoken_languages list to the language names,
# lower-cased and with spaces removed so each name is a single token.
data['spoken_languages'] = data['spoken_languages'].apply(
    lambda raw: [entry['name'].lower().replace(' ', '') for entry in literal_eval(raw)]
)

In [None]:
# Notebook preview of the first five rows after preprocessing.
data.head(n=5)

In [None]:
# 'popularity' and 'release_date' are not part of the merged metadata, so remove them.
data = data.drop(columns=['popularity', 'release_date'])

### MERGING ALL THE FEATURES

In [None]:
print("FEATURE MERGING")
def _merge_features(movie):
  """Build the space-separated metadata string for one movie row."""
  parts = [
    movie['title'],
    str(movie['adult']),
    ' '.join(movie['genres']),
    ' '.join(movie['production_companies']),
    '',  # reproduces the two consecutive spaces that precede the languages
    ' '.join(movie['spoken_languages']),
  ]
  return ' '.join(parts)

# One metadata string per row: title, adult flag, genres, companies, languages.
data['metadata'] = data.apply(_merge_features, axis=1)

In [None]:
# Notebook preview of the id / merged-metadata pairs.
data.loc[:, ['id', 'metadata']]

### WRITING CLEAN DATA TO FILE

In [None]:
# Announce the step that actually runs here; the previous message said
# "MODEL TRAINING" before the file was written.
print("WRITING CLEAN DATA")
# Persist the preprocessed frame (without the index column).
data.to_csv("kafka-consumer/data/clean_data_1.csv", index=False)
# The training stage starts right after the file is written.
print("MODEL TRAINING")

### MODEL TRAINING

In [None]:
# Token counts over the merged metadata strings, with English stop words removed.
count_vec = CountVectorizer(stop_words='english')
count_vec_matrix = count_vec.fit_transform(data['metadata'])
# Pairwise cosine similarity between all movies; row/column k belongs to the
# k-th row of `data` by position.
cosine_sim_matrix = cosine_similarity(count_vec_matrix, count_vec_matrix)
# Movies index mapping: movie id -> row position in cosine_sim_matrix.
# `data` was row-filtered during preprocessing without resetting its index, so
# data.index labels have gaps and must not be used to address the positional
# similarity matrix (wrong rows or IndexError). Store positions instead.
mapping = pd.Series(np.arange(len(data)), index=data['id'])

In [None]:
# Recommender function to recommend movies based on metadata
def similar_movies(input, top_n=19):
  """Return the ids of the movies most similar to the given movie.

  Args:
    input: Movie id, looked up in the module-level `mapping` to find the
      movie's row in `cosine_sim_matrix`. The name shadows the builtin but is
      kept so existing callers keep working.
    top_n: Maximum number of ids to return. Defaults to 19, the number the
      previous hard-coded `[1:20]` slice produced.

  Returns:
    The `data['id']` entries of the selected rows, most similar first.
  """
  index = mapping[input]
  # Get similarity values with other movies
  scores = cosine_sim_matrix[index]
  # Stable descending sort: rows with equal similarity keep ascending row order.
  ranked = sorted(range(len(scores)), key=lambda pos: scores[pos], reverse=True)
  # Exclude the query movie by position rather than assuming it sorts first;
  # another movie with an equal score could otherwise take its place and the
  # query movie would be returned as its own recommendation.
  neighbours = [pos for pos in ranked if pos != index][:top_n]
  return data['id'].iloc[neighbours]

In [None]:
# set(similar_movies('Live Free or Die Hard'))
# user_data[user_data["user_id"]==775307]
# data.loc[data['id']=='live+free+or+die+hard+2007']['title'].tolist()[0]

### MODEL PREDICTION

In [None]:
# def recommend(userid):
#   watched = set(user_data[user_data["user"] == userid]['movie'].tolist())
#   watchlist = set()
#   print("Movies watched by user " + str(userid) + ": ")
#   for movie in watched:
#     title = data.loc[data['id'] == movie]['id'].tolist()[0]
#     print(title)
#     watchlist = watchlist.union(set(similar_movies(title)))
#   return list(watchlist.difference(watched))

# recommendations = recommend(775307)
# print("\nRecommended movie list:\n", recommendations)

In [None]:
# UPDATE 1
# A user may watch several movies and may also rate several of them. When building the recommendations,
# the movies similar to the user's highest-rated movie should come first, followed by those for the
# remaining rated movies in decreasing order of rating. Once all of these candidates are collected,
# sort them by popularity and return the top 20 movies.

# UPDATE 2
# we can also use a correlation matrix to see how features affect each other.