### Import Dependencies

In [1]:
import pandas as pd
from pathlib import Path
# Import dependencies 
import os
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from datetime import datetime
# from scipy.sparse import csr_matrix

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.preprocessing import StandardScaler
import hvplot.pandas

import tensorflow as tf
import warnings
warnings.simplefilter("ignore")

from matplotlib import pyplot as plt
from pprint import pprint
from sklearn.cluster import DBSCAN
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split

### Data Cleaning and Pre-processing

In [2]:
# Locations of the MovieLens "ml-latest-small" CSV files, relative to the notebook
links_csv = Path('Resources', 'ml-latest-small', 'links.csv')
ratings_csv = Path('Resources', 'ml-latest-small', 'ratings.csv')
movies_csv = Path('Resources', 'ml-latest-small', 'movies.csv')
tags_csv = Path('Resources', 'ml-latest-small', 'tags.csv')

In [3]:
# Load each CSV into its own DataFrame (read in the order tags, ratings, movies, links)
tags_df, ratings_df, movies_df, links_df = (
    pd.read_csv(csv_path)
    for csv_path in (tags_csv, ratings_csv, movies_csv, links_csv)
)

In [4]:
# Split up the release year from the title.
# The year pattern is anchored to the END of the title so that an earlier
# parenthesised part (e.g. an alternate title) is never taken for the year;
# splitting on the first "(" would put that text into `release_year`.
# A trailing year range such as "(2006-2007)" yields its first year.
year_suffix = r'\s*\((\d{4})(?:[-–]\d{4})?\)\s*$'
release_year = movies_df['title'].str.extract(year_suffix, expand=False)
if 'release_year' in movies_df.columns:
    # Cell is being re-run: titles no longer carry the year, so keep the
    # values extracted on the first run instead of overwriting them with NaN.
    release_year = release_year.fillna(movies_df['release_year'])
movies_df['release_year'] = release_year
movies_df['title'] = movies_df['title'].str.replace(year_suffix, '', regex=True).str.strip()
movies_df.head()

Unnamed: 0,movieId,title,genres,release_year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995


In [5]:
# Turn the pipe-delimited genre string of every movie into a list of genre names
movies_df['genres'] = [genre_string.split('|') for genre_string in movies_df['genres']]
movies_df.head()

Unnamed: 0,movieId,title,genres,release_year
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995
2,3,Grumpier Old Men,"[Comedy, Romance]",1995
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995
4,5,Father of the Bride Part II,[Comedy],1995


In [6]:
# Clean up ratings DF: replace the raw timestamp with the calendar year of the rating.
# A real datetime conversion is used instead of dividing by an approximate
# "seconds per year" constant, which ignores leap days and therefore assigns
# ratings made close to a year boundary to the wrong year.
# NOTE(review): assumes `timestamp` holds Unix epoch seconds (UTC), as the
# original "1970 + seconds / year-length" arithmetic implies — confirm.
year_rated = pd.to_datetime(ratings_df['timestamp'], unit='s').dt.year.astype('int')
ratings_df = (
    ratings_df
    .assign(year_rated=year_rated)
    .drop(columns='timestamp')
)
ratings_df.head()

Unnamed: 0,userId,movieId,rating,year_rated
0,1,1,4.0,2000
1,1,3,4.0,2000
2,1,6,4.0,2000
3,1,47,5.0,2000
4,1,50,5.0,2000


In [7]:
# Tally how many movies carry each genre
from collections import Counter
genre_frequency = Counter()
for movie_genres in movies_df['genres']:
    genre_frequency.update(movie_genres)
print(f"There are {len(genre_frequency)} genres.")
genre_frequency

There are 20 genres.


Counter({'Drama': 4361,
         'Comedy': 3756,
         'Thriller': 1894,
         'Action': 1828,
         'Romance': 1596,
         'Adventure': 1263,
         'Crime': 1199,
         'Sci-Fi': 980,
         'Horror': 978,
         'Fantasy': 779,
         'Children': 664,
         'Animation': 611,
         'Mystery': 573,
         'Documentary': 440,
         'War': 382,
         'Musical': 334,
         'Western': 167,
         'IMAX': 158,
         'Film-Noir': 87,
         '(no genres listed)': 34})

### Let's group movie genres with fewer than 200 appearances into a new genre, 'Other'

In [8]:
# Genres that will be folded into the single 'Other' genre
genre_other = [
    'Western',
    'IMAX',
    'Film-Noir',
    '(no genres listed)',
]

In [9]:
# Preview the first five movies before the rare genres are relabelled
movies_df.head(5)

Unnamed: 0,movieId,title,genres,release_year
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995
2,3,Grumpier Old Men,"[Comedy, Romance]",1995
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995
4,5,Father of the Bride Part II,[Comedy],1995


In [10]:
# Relabel every low-appearance genre as 'Other'.
# The genre lists stored in the DataFrame are modified in place.
for movie_genres in movies_df['genres']:
    for position, genre in enumerate(movie_genres):
        if genre in genre_other:
            movie_genres[position] = 'Other'

In [11]:
# Recount the genres to confirm that the 'Other' genre now exists
from collections import Counter
genre_frequency = Counter()
for movie_genres in movies_df['genres']:
    genre_frequency.update(movie_genres)
print(f"There are {len(genre_frequency)} genres.")
genre_frequency

There are 17 genres.


Counter({'Drama': 4361,
         'Comedy': 3756,
         'Thriller': 1894,
         'Action': 1828,
         'Romance': 1596,
         'Adventure': 1263,
         'Crime': 1199,
         'Sci-Fi': 980,
         'Horror': 978,
         'Fantasy': 779,
         'Children': 664,
         'Animation': 611,
         'Mystery': 573,
         'Other': 446,
         'Documentary': 440,
         'War': 382,
         'Musical': 334})

### Data pre-processing is complete
### Let's test how the users would rank 'Thriller' movies based on their ratings of other movies in different genres

In [61]:
# Making a dataframe with only thriller movies.
# A boolean mask replaces the row-by-row rebuild: it keeps the column dtypes,
# avoids iterating over every row in Python, and yields an empty frame
# (instead of an IndexError) when no movie matches.
is_thriller = movies_df['genres'].apply(lambda movie_genres: 'Thriller' in movie_genres).astype(bool)
# reset_index gives the same 0..n-1 row labels the rebuilt frame had
thrillers_df = movies_df[is_thriller].reset_index(drop=True)
thrillers_df

Unnamed: 0,movieId,title,genres,release_year
0,6,Heat,"[Action, Crime, Thriller]",1995
1,10,GoldenEye,"[Action, Adventure, Thriller]",1995
2,20,Money Train,"[Action, Comedy, Crime, Drama, Thriller]",1995
3,21,Get Shorty,"[Comedy, Crime, Thriller]",1995
4,22,Copycat,"[Crime, Drama, Horror, Mystery, Thriller]",1995
...,...,...,...,...
1889,185033,I Kill Giants,"[Drama, Fantasy, Thriller]",2018
1890,187031,Jurassic World: Fallen Kingdom,"[Action, Adventure, Drama, Sci-Fi, Thriller]",2018
1891,189333,Mission: Impossible - Fallout,"[Action, Adventure, Thriller]",2018
1892,189381,SuperFly,"[Action, Crime, Thriller]",2018


In [65]:
# Making a dataframe with non-thriller movies.
# A boolean mask replaces the row-by-row rebuild: it keeps the column dtypes,
# avoids iterating over every row in Python, and yields an empty frame
# (instead of an IndexError) when no movie matches.
is_thriller = movies_df['genres'].apply(lambda movie_genres: 'Thriller' in movie_genres).astype(bool)
# reset_index returns a new frame with 0..n-1 row labels, so columns can be
# added to it later without touching movies_df
other_movies_df = movies_df[~is_thriller].reset_index(drop=True)
other_movies_df

Unnamed: 0,movieId,title,genres,release_year
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995
2,3,Grumpier Old Men,"[Comedy, Romance]",1995
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995
4,5,Father of the Bride Part II,[Comedy],1995
...,...,...,...,...
7843,193581,Black Butler: Book of the Atlantic,"[Action, Animation, Comedy, Fantasy]",2017
7844,193583,No Game No Life: Zero,"[Animation, Comedy, Fantasy]",2017
7845,193585,Flint,[Drama],2017
7846,193587,Bungo Stray Dogs: Dead Apple,"[Action, Animation]",2018


In [68]:
# Collect the distinct genres that occur among the non-thriller movies
other_movies_genres = set()
for movie_genres in other_movies_df['genres']:
    other_movies_genres.update(movie_genres)
other_movies_genres

{'Action',
 'Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Horror',
 'Musical',
 'Mystery',
 'Other',
 'Romance',
 'Sci-Fi',
 'War'}

In [74]:
# Add one indicator column per genre to other_movies_df:
# 1 when the movie's genre list contains the genre, 0 otherwise
for g in other_movies_genres:
    other_movies_df[g] = other_movies_df.genres.apply(
        lambda movie_genres: int(g in movie_genres)
    )

# Keep only the indicator columns as the feature matrix
other_movies_genres_df = other_movies_df.drop(
    columns=['movieId', 'title', 'genres', 'release_year']
)
other_movies_genres_df

Unnamed: 0,Other,Sci-Fi,Mystery,War,Comedy,Children,Documentary,Musical,Animation,Fantasy,Romance,Adventure,Drama,Action,Horror,Crime
0,0,0,0,0,1,1,0,0,1,1,0,1,0,0,0,0
1,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0
4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7843,0,0,0,0,1,0,0,0,1,1,0,0,0,1,0,0
7844,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0
7845,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
7846,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0


In [75]:
# Build the DBSCAN clusterer, then fit it on the genre indicator matrix
dbscan_params = dict(eps=0.5, metric='l2', min_samples=15)
model = DBSCAN(**dbscan_params)
model.fit(other_movies_genres_df)

In [76]:
# Number of rows in the fitted model's components_ array
model.components_.shape[0]

6269

In [81]:
# Distinct cluster labels, in order of first appearance
pd.unique(model.labels_)

array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
       16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
       33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
       50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66,
       67, 68, 69, 70, 71, 72, 73], dtype=int64)

In [79]:
# Attach the cluster label to a NEW frame rather than aliasing
# other_movies_genres_df: adding the column to the alias would also add it to
# the feature matrix, so re-running the DBSCAN cell would cluster on 'label'.
# Labels are shifted by +1, so the -1 label in model.labels_ (DBSCAN's noise
# label) becomes 0.
labeled_df = other_movies_genres_df.assign(label=model.labels_ + 1)
labeled_df

Unnamed: 0,Other,Sci-Fi,Mystery,War,Comedy,Children,Documentary,Musical,Animation,Fantasy,Romance,Adventure,Drama,Action,Horror,Crime,label
0,0,0,0,0,1,1,0,0,1,1,0,1,0,0,0,0,0
1,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,1
2,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,2
3,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,3
4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7843,0,0,0,0,1,0,0,0,1,1,0,0,0,1,0,0,0
7844,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0
7845,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,9
7846,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0


In [82]:
# Per-cluster summary: the mean of every indicator column (share of the
# cluster's movies having that genre) and the number of movies per cluster.
vectors = labeled_df.groupby('label').mean()
counts = labeled_df.groupby('label')['label'].count()

# Iterate over the labels that actually exist. range(max) skipped the last
# cluster and would raise a KeyError if some label id were missing.
# Label 0 comes from the original -1 label shifted by +1.
for cluster in vectors.index:
    print(f'\nCluster {cluster}, count: {counts.loc[cluster]}')
    # Genre shares for this cluster, largest first (computed once, reused below)
    genre_shares = vectors.loc[cluster].sort_values(ascending=False)
    # Genres present in more than 90% of the cluster's movies
    identifying_categories = genre_shares[genre_shares > 0.9]
    if len(identifying_categories) > 0:
        for category in identifying_categories.index:
            print(category)
    else:
        print('()')
    print('\n')
    print(genre_shares.head())
    print('\n' + '-'*40)


Cluster 0, count: 1579
()


Comedy       0.424319
Adventure    0.369854
Drama        0.323623
Action       0.298923
Fantasy      0.279291
Name: 0, dtype: float64

----------------------------------------

Cluster 1, count: 24
Children
Fantasy
Adventure


Children     1.0
Fantasy      1.0
Adventure    1.0
Other        0.0
Sci-Fi       0.0
Name: 1, dtype: float64

----------------------------------------

Cluster 2, count: 363
Comedy
Romance


Comedy     1.0
Romance    1.0
Other      0.0
Sci-Fi     0.0
Mystery    0.0
Name: 2, dtype: float64

----------------------------------------

Cluster 3, count: 276
Comedy
Romance
Drama


Comedy     1.0
Romance    1.0
Drama      1.0
Other      0.0
Sci-Fi     0.0
Name: 3, dtype: float64

----------------------------------------

Cluster 4, count: 946
Comedy


Comedy     1.0
Other      0.0
Sci-Fi     0.0
Mystery    0.0
War        0.0
Name: 4, dtype: float64

----------------------------------------

Cluster 5, count: 23
Children
Adventure


Children 