### Import Dependencies

In [2]:
import pandas as pd
from pathlib import Path
# Import dependencies 
import os
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from datetime import datetime
# from scipy.sparse import csr_matrix

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.preprocessing import StandardScaler
import hvplot.pandas

import tensorflow as tf
import warnings
warnings.simplefilter("ignore")

from matplotlib import pyplot as plt
from pprint import pprint
from sklearn.cluster import DBSCAN
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from pymongo import MongoClient
from mongo_credentials import connection_string

ModuleNotFoundError: No module named 'mongo_credentials'

### Connect to the MongoDB Database

In [None]:
# The connection URI embeds the cluster username and password, so it is loaded
# from the untracked `mongo_credentials` module and must never be pasted here.
mongo = MongoClient(connection_string)
# List the databases visible to this client to confirm the connection works
print(mongo.list_database_names())

In [None]:
# Keep a handle on the movie_recommendations database
db = mongo.movie_recommendations
# Show which collections that database holds
collection_names = db.list_collection_names()
print(collection_names)

### Data Cleaning and Pre-processing

In [None]:
### This is the small Dataset
# Load CSVs
##links_csv = Path('Resources/ml-latest-small/links.csv')
##ratings_csv = Path('Resources/ml-latest-small/ratings.csv')
##movies_csv = Path('Resources/ml-latest-small/movies.csv')
##tags_csv = Path('Resources/ml-latest-small/tags.csv')

In [None]:
# Read CSV to create Dataframes
##tags_df = pd.read_csv(tags_csv)
##ratings_df = pd.read_csv(ratings_csv)
##movies_df = pd.read_csv(movies_csv)
##links_df = pd.read_csv(links_csv)

In [None]:
# Handle on the 'movies' collection
movies_collection = db.movies

# Pull every document out of the cursor, then build the DataFrame from the
# resulting list of dictionaries
movies_data = [document for document in movies_collection.find()]
movies_df = pd.DataFrame(movies_data)

# Preview the first rows
movies_df.head()

In [None]:
# Handle on the 'ratings' collection
ratings_collection = db.ratings

# Pull every document out of the cursor, then build the DataFrame from the
# resulting list of dictionaries
ratings_data = [document for document in ratings_collection.find()]
ratings_df = pd.DataFrame(ratings_data)

# Preview the first rows
ratings_df.head()

In [None]:
# Split "Title (YYYY)" into the bare title and its four-digit release year.
# Anchoring on the trailing "(YYYY)" keeps an alternate title in parentheses,
# e.g. "City of Lost Children, The (Cité des enfants perdus, La) (1995)",
# from being mistaken for the year; titles without a year get NaN.
title_parts = movies_df['title'].str.extract(
    r'^(?P<title>.*?)\s*(?:\((?P<release_year>\d{4})\)\s*)?$'
)
movies_df[['title', 'release_year']] = title_parts[['title', 'release_year']]
movies_df.head()

In [None]:
# Turn the pipe-delimited genre string of every movie into a list of genres
movies_df['genres'] = [genre_string.split('|') for genre_string in movies_df['genres']]
movies_df.head()

In [None]:
# Clean up ratings DF: replace the Unix timestamp (seconds since 1970-01-01)
# with the calendar year in which the rating was made.
# A real datetime conversion is used because dividing by a fixed
# "seconds per year" constant drifts and mislabels ratings near year boundaries.
rated_at = pd.to_datetime(ratings_df['timestamp'], unit='s')
ratings_df['year_rated'] = rated_at.dt.year.astype('int')
ratings_df = ratings_df.drop(columns = 'timestamp')
ratings_df.head()

In [None]:
# Tally how often each genre appears across all movies
from collections import Counter
genre_frequency = Counter()
for movie_genres in movies_df['genres']:
    genre_frequency.update(movie_genres)
print(f"There are {len(genre_frequency)} genres.")
genre_frequency

### Let's group the obscure movie genres into a single new genre, 'Other'

In [None]:
# Low-frequency genres that are folded into a single 'Other' bucket
genre_other = ['Western', 'IMAX','Film-Noir','(no genres listed)']

In [None]:
movies_df.head()

In [None]:
# Replace low-appearance genres with 'Other'.
# dict.fromkeys de-duplicates while keeping order, so a movie tagged with two
# obscure genres carries a single 'Other' entry instead of being counted twice.
obscure_genres = set(genre_other)
movies_df['genres'] = [
    list(dict.fromkeys('Other' if genre in obscure_genres else genre for genre in movie_genres))
    for movie_genres in movies_df['genres']
]

In [None]:
# Re-count the genres to confirm the 'Other' bucket now exists
from collections import Counter
genre_frequency = Counter()
for movie_genres in movies_df['genres']:
    genre_frequency.update(movie_genres)
print(f"There are {len(genre_frequency)} genres.")
genre_frequency

### Data pre-processing is complete
### Let's test how users would rate 'Thriller' movies based on their ratings of movies in other genres

In [None]:
# Making a dataframe with only thriller movies.
# A boolean mask avoids the row-by-row iterrows loop and also works when no
# movie matches (the result is simply empty). The index is reset so rows are
# numbered 0..n-1.
is_thriller = movies_df['genres'].map(lambda movie_genres: 'Thriller' in movie_genres).astype(bool)
thrillers_df = movies_df[is_thriller].reset_index(drop=True)
thrillers_df

In [None]:
# Making a dataframe with non-thriller movies.
# A boolean mask avoids the row-by-row iterrows loop and also works when no
# movie matches (the result is simply empty). The index is reset so rows are
# numbered 0..n-1.
is_not_thriller = movies_df['genres'].map(lambda movie_genres: 'Thriller' not in movie_genres).astype(bool)
other_movies_df = movies_df[is_not_thriller].reset_index(drop=True)
other_movies_df

In [None]:
# Union of every genre that occurs among the non-thriller movies
other_movies_genres = set().union(*other_movies_df['genres'])
other_movies_genres

In [None]:
# One 0/1 indicator column per genre: 1 when the movie carries that genre
for genre in other_movies_genres:
    other_movies_df[genre] = [int(genre in movie_genres) for movie_genres in other_movies_df['genres']]

# Keep only the indicator columns as the clustering features
non_feature_columns = ['movieId', 'title','genres', 'release_year', '_id']
other_movies_genres_df = other_movies_df.drop(columns=non_feature_columns)
other_movies_genres_df

In [None]:
other_movies_genres_df.shape

In [None]:
%%time
# Create DBSCAN model and fit it to other_movies_genres_df
# The features are 0/1 genre indicators, so under the L1 metric the distance
# between two movies is the number of genre flags on which they differ;
# eps = 1 therefore links movies that differ in at most one genre.
# The commented-out lines are alternative settings that were tried.
#model = DBSCAN(eps = 1, metric = 'l1')
model = DBSCAN(eps = 1, metric = 'l1', min_samples = 40)
#model = DBSCAN(eps = 0.5, metric = 'l2', min_samples = 36)
model.fit(other_movies_genres_df)

In [None]:
len(model.components_)

In [None]:
pd.Series(model.labels_).unique()

In [None]:
pd.Series(model.labels_).value_counts()

In [None]:
# Work on a copy so the extra columns added to labeled_df do not leak back
# into other_movies_genres_df (the feature matrix used to fit DBSCAN).
labeled_df = other_movies_genres_df.copy()
# Shift labels by one: DBSCAN marks noise as -1, so noise becomes 0 and the
# clusters are numbered from 1.
labeled_df['label'] = model.labels_ + 1
labeled_df

In [None]:
# Per-cluster mean of every column (for the 0/1 genre flags this is the share
# of movies in the cluster carrying that genre) and per-cluster row counts.
vectors = labeled_df.groupby('label').mean()
counts = labeled_df.groupby('label')['label'].count()

# Iterate over the labels that actually exist: range(max) would skip the last
# cluster and fail on label 0 when DBSCAN produced no noise points.
for cluster in vectors.index:
    print(f'\nCluster {cluster}, count: {counts[cluster]}')
    genre_shares = vectors.loc[cluster].sort_values(ascending=False)
    # Genres present in more than 90% of the cluster's movies identify it
    identifying_categories = genre_shares[genre_shares > 0.9]
    if len(identifying_categories) > 0:
        for category in identifying_categories.index:
            print(category)
    else:
        print('()')
    print('\n')
    print(genre_shares.head())
    print('\n' + '-'*40)

In [None]:
ratings_df.head()

In [None]:
other_movies_df.head()

In [None]:
# Re-attach movieId to the labelled rows; the assignment aligns on the row index
labeled_df['movieId'] = other_movies_df['movieId']
labeled_df.head()

In [None]:
# Attach each rated movie's cluster label to the ratings (left join keeps
# ratings without a label), then discard the year_rated column
cluster_labels = labeled_df[['label', 'movieId']]
df2 = ratings_df.merge(cluster_labels, how='left', on='movieId').drop(columns='year_rated')
df2.head(10)

In [None]:
df2['label'].value_counts()

In [None]:
df2.isna().sum()

In [None]:
# Ratings left without a cluster label get -1, which stands for Thriller Movies
df2 = df2.fillna({'label': -1})

In [None]:
df2.isna().sum()

In [None]:
# The Mongo document id is not needed for modelling
df2 = df2.drop(columns='_id')
df2.head()

In [None]:
# Number of thriller ratings (label -1) per user
thriller_ratings = df2[df2['label'] == -1]
thriller_movie_raters = thriller_ratings.groupby('userId')['label'].count()
# Keep users with at least five thriller ratings
has_enough_ratings = thriller_movie_raters >= 5
top_thriller_movie_raters = thriller_movie_raters[has_enough_ratings]
top_thriller_movie_raters

In [None]:
# The Mongo document id is not needed on the thriller movies
thrillers_df = thrillers_df.drop(columns='_id')

In [None]:
thrillers_df.head()

In [None]:
# The Mongo document id is not needed on the ratings
ratings_df = ratings_df.drop(columns='_id')

In [None]:
ratings_df.head()

In [None]:
# Ratings restricted to thriller movies, with the movie details attached
thriller_users_df = ratings_df.merge(thrillers_df, how='inner', on='movieId')
thriller_users_df.head()

In [None]:
# Distinct thriller movie ids
thriller_movie_ids = set(thrillers_df['movieId'])
# Distinct users who rated at least one thriller
# (this rebinds thriller_movie_raters to an array of user ids)
thriller_movie_raters = thriller_users_df['userId'].unique()

In [None]:
# One row per top thriller rater, one column per label, holding that user's
# mean rating for the movies carrying the label
top_rater_rows = df2[df2['userId'].isin(top_thriller_movie_raters.index)]
mean_rating_by_label = top_rater_rows.groupby(['userId', 'label'])['rating'].mean()
df3 = mean_rating_by_label.unstack('label')
df3

In [None]:
# Target: each user's mean thriller rating (column -1).
# Features: their mean rating per other label; a missing mean is filled with 3.
y = df3[-1]
X = df3.drop(columns=-1).fillna(3)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

# Three stacked single-unit Dense layers, none of which is given an activation
#nn_model.add(tf.keras.layers.Dense(units=36))
nn_model = tf.keras.Sequential([
    tf.keras.layers.Dense(units=1, input_dim=X.shape[1]),
    tf.keras.layers.Dense(units=1),
    tf.keras.layers.Dense(units=1),
])
nn_model.compile(optimizer="adam", loss="MSE", metrics=["mse", "mae"])
fit_model = nn_model.fit(X_train, y_train, epochs=50)

In [None]:
r2_score(y_test, nn_model.predict(X_test))

In [None]:
mean_absolute_error(y_test, nn_model.predict(X_test))

In [None]:
plt.scatter(nn_model.predict(X_test), y_test)
plt.show()

### Run a grid search to find the best parameters for DBSCAN
*Note:* Only use this with the small dataset.

In [None]:
# 20 evenly spaced eps candidates from 0.5 to 1 (both ends included)
epsilon = np.linspace(0.5, 1, num = 20)
epsilon

In [None]:
# min_samples candidates: 2, 6, 10, ..., 38
min_samples = np.arange(2, 40, step=4)
min_samples

In [None]:
import itertools
# Every (eps, min_samples) pair of the grid, eps varying slowest
combinations = [(eps_value, min_count) for eps_value in epsilon for min_count in min_samples]
N = len(combinations)
N

In [None]:
#metrics = ['l1', 'l2', 'manhattan']

In [None]:
from sklearn.metrics import silhouette_score as shs
def Gridsearch(combinations, X, metric='l2'):
    """Grid-search DBSCAN hyper-parameters, ranking them by silhouette score.

    Args:
        combinations: Sequence of ``(eps, min_samples)`` pairs to evaluate.
        X: Feature matrix handed unchanged to ``DBSCAN.fit`` and to the
            silhouette scorer.
        metric: Distance metric used for both the clustering and the
            silhouette score. Defaults to ``'l2'``, the previously hard-coded
            value, so existing calls behave as before.

    Returns:
        dict with the keys ``best_epsilon``, ``best_min_samples``,
        ``best_labels`` and ``best_score``. If every combination yields fewer
        than 2 or more than 100 clusters, ``best_score`` is the sentinel -20
        and ``best_labels`` is the string ``'Poor'``.

    Raises:
        ValueError: If ``combinations`` is empty.
    """
    combinations = list(combinations)
    if not combinations:
        raise ValueError('combinations must contain at least one (eps, min_samples) pair')

    # Sentinel below the silhouette range [-1, 1] so rejected runs never win
    poor_score = -20
    scores = []
    all_label = []

    for i, (eps, num_samples) in enumerate(combinations):
        labels = DBSCAN(eps = eps, min_samples = num_samples, metric = metric).fit(X).labels_
        # The noise label (-1) is not counted as a cluster
        num_clusters = len(set(labels) - {-1})
        if not 2 <= num_clusters <= 100:
            scores.append(poor_score)
            all_label.append('Poor')
            print(' at iteration: ',i,'ep = ',eps,'min_sample = ',num_samples,'number of cluster= ',num_clusters,'moving on..')
            continue
        scores.append(shs(X, labels, metric = metric))
        all_label.append(labels)
        print(' at iteration:',i,'score :',scores[-1],'Number of clusters :', num_clusters )

    best_index = np.argmax(scores)
    best_epsilon, best_min_samples = combinations[best_index]
    return {'best_epsilon': best_epsilon,
            'best_min_samples': best_min_samples,
            'best_labels': all_label[best_index],
            'best_score': scores[best_index]}

In [None]:
#best_para = Gridsearch(combinations, other_movies_genres_df.drop(columns=['movieId', 'label'], errors='ignore'))

In [None]:
#print(best_para)

### Let's see how the predicted ratings compare with the actual ratings

In [None]:
# Actual thriller rating of every test user, keyed by userId
results_dictionary = dict(userId=y_test.index, rating_actual=y_test.values)
results_df = pd.DataFrame(data=results_dictionary)
results_df

In [None]:
# Model output has one column; flatten it into a plain 1-D column of predictions
predictions = nn_model.predict(X_test)
results_df['predictions'] = predictions.ravel()
results_df

In [None]:
# Signed error: actual rating minus predicted rating
results_df['difference'] = results_df['rating_actual'].sub(results_df['predictions'])
results_df.head()

In [None]:
# Rows whose absolute error is below the mean absolute error.
# The MAE is computed from the current results instead of being pasted in from
# an earlier run, and an exact-zero error now counts as "below".
mae_threshold = results_df['difference'].abs().mean()
mae_difference = results_df.loc[results_df['difference'].abs() < mae_threshold]
mae_difference

In [None]:
# Rows predicted to within 0.1 of the actual rating (an exact match counts too)
difference = results_df.loc[results_df['difference'].abs() < 0.1]
difference

In [None]:
# Rows predicted to within 0.01 of the actual rating (an exact match counts too)
small_difference = results_df.loc[results_df['difference'].abs() < 0.01]
small_difference

In [None]:
# Rows predicted to within 0.001 of the actual rating (an exact match counts too)
verysmall_difference = results_df.loc[results_df['difference'].abs() < 0.001]
verysmall_difference

In [None]:
# Rows whose prediction misses the actual rating by more than 0.3
misses_by_over_03 = results_df['difference'].abs() > 0.3
big_difference = results_df.loc[misses_by_over_03]
big_difference

In [None]:
# Rows whose prediction misses the actual rating by more than 0.5
misses_by_over_05 = results_df['difference'].abs() > 0.5
big_difference = results_df.loc[misses_by_over_05]
big_difference

In [None]:
results_df.boxplot(column = 'difference')
plt.show()