## Part 0: Import Dependencies and Set-Up

In [60]:
# Import Dependencies
import hvplot.pandas
import numpy as np
import os
import pandas as pd
import plotly.express as px
import random
#from scipy.spatial import distance
from scipy.spatial.distance import cdist
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import davies_bouldin_score
from sklearn.preprocessing import MinMaxScaler, MultiLabelBinarizer, StandardScaler


In [2]:
# Pandas Settings

# Display All Columns
pd.set_option('display.max_columns', None)


In [3]:
# Path to file directory and variables for input files.
file_dir = os.path.join("Data")

# imdb Titles metadata (Extracted from title.basics.tsv)
titles_metadata_file = f'{file_dir}/title_basics_non-adult_movies.tsv'

# imdb US Titles only ids (Extracted from title.akas.tsv)
titles_us_ids_only_file = f'{file_dir}/US_title_ids_unique.csv'

# imdb Ratings data (Derived from title.ratings.tsv)
ratings_data_file = f'{file_dir}/title_ratings.csv'


In [4]:
# Set Viewer Title for Testing
#viewerTitle = "Apocalypse Now"
#viewerTitle = "The Maltese Falcon (1941)"
viewerTitle = "Toy Story"
#viewerTitle = "Witness (1985)"


## Part 1: Import Data, Clean and Transform Data

In [5]:
# Import imdb Titles metadata, imdb US Title IDs, imdb Ratings data

titles_metadata = pd.read_csv(titles_metadata_file, sep='\t')
titles_us_ids_only = pd.read_csv(titles_us_ids_only_file)
ratings_data = pd.read_csv(ratings_data_file)


In [6]:
# Check titles_metadata DataFrame
print(titles_metadata.shape)
titles_metadata.count()
titles_metadata.head()


(584642, 9)


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000502,movie,Bohemios,Bohemios,0,1905,\N,100,\N
1,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,\N,70,"Action,Adventure,Biography"
2,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,\N,90,Drama
3,tt0000615,movie,Robbery Under Arms,Robbery Under Arms,0,1907,\N,\N,Drama
4,tt0000630,movie,Hamlet,Amleto,0,1908,\N,\N,Drama


In [7]:
# Drop all Titles where primaryTitle differs from originalTitle
# (Since language of titles is not often available, this is an attempt
# to filter out obscure non-English language films)

titles_metadata = titles_metadata.loc[titles_metadata['primaryTitle'] == titles_metadata['originalTitle']]


In [8]:
# Look for Films with the same primaryTitle
# and set primaryTitle to primaryTitle + (startYear)

duplicate_titles_df = pd.concat(g for _, g in titles_metadata.groupby('primaryTitle') if len(g) > 1)

duplicate_titles_df['primaryTitle'] = duplicate_titles_df.apply(lambda row: "".join([row['primaryTitle'], " (", str(row['startYear']), ")"]), axis=1)
duplicate_titles_df['originalTitle'] = duplicate_titles_df['primaryTitle']

duplicate_titles_df


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
417445,tt3120962,movie,#5 (2013),#5 (2013),0,2013,\N,68,"Biography,Comedy,Fantasy"
553039,tt8219776,movie,#5 (2018),#5 (2018),0,2018,\N,\N,Documentary
262785,tt11803670,movie,#Love (\N),#Love (\N),0,\N,\N,\N,Drama
342883,tt15521960,movie,#Love (\N),#Love (\N),0,\N,\N,\N,"Comedy,Romance"
446725,tt4004608,movie,$elfie Shootout (2016),$elfie Shootout (2016),0,2016,\N,86,Comedy
...,...,...,...,...,...,...,...,...,...
580668,tt9686590,movie,Ûmi no kyodai (1935),Ûmi no kyodai (1935),0,1935,\N,\N,Drama
450813,tt4149802,movie,Ümmü Sibyan: Zifir (2014),Ümmü Sibyan: Zifir (2014),0,2014,\N,\N,Horror
513928,tt6448010,movie,Ümmü Sibyan: Zifir (2015),Ümmü Sibyan: Zifir (2015),0,2015,\N,\N,\N
195195,tt0431498,movie,Üvey ana (1967),Üvey ana (1967),0,1967,\N,\N,"Drama,Romance"


In [9]:
# Merge duplicate_titles_df back with titles_metadata

cols = list(titles_metadata.columns)
titles_metadata.loc[titles_metadata['tconst'].isin(duplicate_titles_df['tconst']), cols] = duplicate_titles_df[cols]


In [10]:
# Check titles_us_ids_only DataFrame
print(titles_us_ids_only.shape)
titles_us_ids_only.count
titles_us_ids_only.head()


(1308380, 1)


Unnamed: 0,tconst
0,tt0000001
1,tt0000002
2,tt0000005
3,tt0000005
4,tt0000005


In [11]:
# Drop all Titles from titles_metadata that are not in titles_us_ids_only

titles_metadata = pd.merge(titles_metadata, titles_us_ids_only, on='tconst', how='inner')
titles_metadata = titles_metadata.drop_duplicates()


In [12]:
# Check results
print(titles_metadata.shape)
titles_metadata.head()


(207524, 9)


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,\N,70,"Action,Adventure,Biography"
1,tt0000679,movie,The Fairylogue and Radio-Plays,The Fairylogue and Radio-Plays,0,1908,\N,120,"Adventure,Fantasy"
2,tt0001101,movie,Abraham Lincoln's Clemency,Abraham Lincoln's Clemency,0,1910,\N,\N,\N
3,tt0001159,movie,The Connecticut Yankee,The Connecticut Yankee,0,1910,\N,\N,\N
4,tt0001230,movie,Gentleman Joe,Gentleman Joe,0,1910,\N,\N,\N


In [13]:
# Drop titles_metadata Rows with "\N" for genres and startYear
# Drop titleType isAdult and endYear Columns

#titles_metadata['genres'].value_counts()

titles_metadata = titles_metadata.loc[~(titles_metadata['genres'] == "\\N") & ~(titles_metadata['startYear'] == "\\N")]
titles_metadata.drop(['titleType'], axis=1, inplace=True)
titles_metadata.drop(['isAdult'], axis=1, inplace=True)
titles_metadata.drop(['endYear'], axis=1, inplace=True)


In [14]:
# Check results
print(titles_metadata.shape)
#titles_metadata.dtypes
titles_metadata.head()


(153255, 6)


Unnamed: 0,tconst,primaryTitle,originalTitle,startYear,runtimeMinutes,genres
0,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,70,"Action,Adventure,Biography"
1,tt0000679,The Fairylogue and Radio-Plays,The Fairylogue and Radio-Plays,1908,120,"Adventure,Fantasy"
5,tt0001285,The Life of Moses,The Life of Moses,1909,50,"Biography,Drama,Family"
11,tt0001498,The Battle of Trafalgar,The Battle of Trafalgar,1911,51,War
17,tt0001892,Den sorte drøm,Den sorte drøm,1911,53,Drama


In [15]:
# Convert startYear Column to int

titles_metadata['startYear'] = pd.to_numeric(titles_metadata['startYear'])


In [16]:
# Check results
titles_metadata.dtypes


tconst            object
primaryTitle      object
originalTitle     object
startYear          int64
runtimeMinutes    object
genres            object
dtype: object

In [17]:
# Drop titles_metadata Rows with 'startYear' less than 1920

titles_metadata = titles_metadata.loc[titles_metadata['startYear'] >= 1920]


In [18]:
# Check results
print(titles_metadata.shape)
titles_metadata.head()


(148483, 6)


Unnamed: 0,tconst,primaryTitle,originalTitle,startYear,runtimeMinutes,genres
199,tt0003854,Dodge City Trail,Dodge City Trail,1936,56,"Adventure,Music,Western"
597,tt0005076,Charley's Aunt (1925),Charley's Aunt (1925),1925,80,Comedy
2517,tt0008422,Perils of the West,Perils of the West,1922,\N,Western
2746,tt0008736,The Victim (1920),The Victim (1920),1920,\N,Drama
3089,tt0009187,His Temporary Wife,His Temporary Wife,1920,\N,Comedy


In [19]:
# Check ratings_metadata DataFrame
print(ratings_data.shape)
ratings_data.count()
ratings_data.head()


(1201036, 3)


Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1834
1,tt0000002,6.0,236
2,tt0000003,6.5,1594
3,tt0000004,6.0,153
4,tt0000005,6.2,2410


In [20]:
# Merge titles_metadata and ratings_data on tconst

movies_df = pd.merge(titles_metadata, ratings_data, on="tconst")

print(movies_df.shape)
movies_df.head()


(101078, 8)


Unnamed: 0,tconst,primaryTitle,originalTitle,startYear,runtimeMinutes,genres,averageRating,numVotes
0,tt0003854,Dodge City Trail,Dodge City Trail,1936,56,"Adventure,Music,Western",3.7,28
1,tt0005076,Charley's Aunt (1925),Charley's Aunt (1925),1925,80,Comedy,6.6,70
2,tt0010058,The Deadlier Sex,The Deadlier Sex,1920,60,"Comedy,Drama",6.2,25
3,tt0010495,My Husband's Other Wife,My Husband's Other Wife,1920,\N,Drama,4.2,15
4,tt0010502,Nachtgestalten (1920),Nachtgestalten (1920),1920,106,Horror,6.4,16


In [21]:
# Add url column to movies_df
movies_df['url'] = movies_df.apply(lambda row: "".join(["https://www.imdb.com/title/", row['tconst'], "/"]), axis=1)

print(movies_df.shape)
movies_df.head()


(101078, 9)


Unnamed: 0,tconst,primaryTitle,originalTitle,startYear,runtimeMinutes,genres,averageRating,numVotes,url
0,tt0003854,Dodge City Trail,Dodge City Trail,1936,56,"Adventure,Music,Western",3.7,28,https://www.imdb.com/title/tt0003854/
1,tt0005076,Charley's Aunt (1925),Charley's Aunt (1925),1925,80,Comedy,6.6,70,https://www.imdb.com/title/tt0005076/
2,tt0010058,The Deadlier Sex,The Deadlier Sex,1920,60,"Comedy,Drama",6.2,25,https://www.imdb.com/title/tt0010058/
3,tt0010495,My Husband's Other Wife,My Husband's Other Wife,1920,\N,Drama,4.2,15,https://www.imdb.com/title/tt0010495/
4,tt0010502,Nachtgestalten (1920),Nachtgestalten (1920),1920,106,Horror,6.4,16,https://www.imdb.com/title/tt0010502/


In [22]:
# Check DataFrame

movies_df.dtypes


tconst             object
primaryTitle       object
originalTitle      object
startYear           int64
runtimeMinutes     object
genres             object
averageRating     float64
numVotes            int64
url                object
dtype: object

In [23]:
# Convert 'genres' entries into lists

movies_df['genres_list'] = movies_df.apply(lambda row: row['genres'].split(","), axis=1)

print(movies_df.shape)
movies_df.head()


(101078, 10)


Unnamed: 0,tconst,primaryTitle,originalTitle,startYear,runtimeMinutes,genres,averageRating,numVotes,url,genres_list
0,tt0003854,Dodge City Trail,Dodge City Trail,1936,56,"Adventure,Music,Western",3.7,28,https://www.imdb.com/title/tt0003854/,"[Adventure, Music, Western]"
1,tt0005076,Charley's Aunt (1925),Charley's Aunt (1925),1925,80,Comedy,6.6,70,https://www.imdb.com/title/tt0005076/,[Comedy]
2,tt0010058,The Deadlier Sex,The Deadlier Sex,1920,60,"Comedy,Drama",6.2,25,https://www.imdb.com/title/tt0010058/,"[Comedy, Drama]"
3,tt0010495,My Husband's Other Wife,My Husband's Other Wife,1920,\N,Drama,4.2,15,https://www.imdb.com/title/tt0010495/,[Drama]
4,tt0010502,Nachtgestalten (1920),Nachtgestalten (1920),1920,106,Horror,6.4,16,https://www.imdb.com/title/tt0010502/,[Horror]


In [24]:
# Transform (get_dummies via Multi Label Bin Encoding) movies_df by 'genres'

genres = movies_df['genres_list']

mlb = MultiLabelBinarizer()

X = pd.DataFrame(mlb.fit_transform(genres), columns=mlb.classes_, index=movies_df.index)

print(X.shape)
X.head()


(101078, 27)


Unnamed: 0,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,Film-Noir,Game-Show,History,Horror,Music,Musical,Mystery,News,Reality-TV,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western
0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [25]:
# Merge X back with movies_df

movies_df = pd.merge(movies_df, X, how='inner', left_index=True, right_index=True)

print(movies_df.shape)
movies_df.head()


(101078, 37)


Unnamed: 0,tconst,primaryTitle,originalTitle,startYear,runtimeMinutes,genres,averageRating,numVotes,url,genres_list,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,Film-Noir,Game-Show,History,Horror,Music,Musical,Mystery,News,Reality-TV,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western
0,tt0003854,Dodge City Trail,Dodge City Trail,1936,56,"Adventure,Music,Western",3.7,28,https://www.imdb.com/title/tt0003854/,"[Adventure, Music, Western]",0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1
1,tt0005076,Charley's Aunt (1925),Charley's Aunt (1925),1925,80,Comedy,6.6,70,https://www.imdb.com/title/tt0005076/,[Comedy],0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,tt0010058,The Deadlier Sex,The Deadlier Sex,1920,60,"Comedy,Drama",6.2,25,https://www.imdb.com/title/tt0010058/,"[Comedy, Drama]",0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,tt0010495,My Husband's Other Wife,My Husband's Other Wife,1920,\N,Drama,4.2,15,https://www.imdb.com/title/tt0010495/,[Drama],0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,tt0010502,Nachtgestalten (1920),Nachtgestalten (1920),1920,106,Horror,6.4,16,https://www.imdb.com/title/tt0010502/,[Horror],0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [26]:
# Find viewerTitle Data for Testing Purposes

movies_df.loc[(movies_df['primaryTitle'] == viewerTitle)]


Unnamed: 0,tconst,primaryTitle,originalTitle,startYear,runtimeMinutes,genres,averageRating,numVotes,url,genres_list,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,Film-Noir,Game-Show,History,Horror,Music,Musical,Mystery,News,Reality-TV,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western
28773,tt0114709,Toy Story,Toy Story,1995,81,"Adventure,Animation,Comedy",8.3,932854,https://www.imdb.com/title/tt0114709/,"[Adventure, Animation, Comedy]",0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [27]:
# Integrate 'averageRating' into X DataFrame with 'primaryTitle' as new Index
Z = pd.merge(movies_df[['primaryTitle', 'averageRating']], X, how='outer', left_index=True, right_index=True)
Z.set_index('primaryTitle', inplace=True)

print(Z.shape)
Z.head()


(101078, 28)


Unnamed: 0_level_0,averageRating,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,Film-Noir,Game-Show,History,Horror,Music,Musical,Mystery,News,Reality-TV,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western
primaryTitle,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
Dodge City Trail,3.7,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1
Charley's Aunt (1925),6.6,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
The Deadlier Sex,6.2,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
My Husband's Other Wife,4.2,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Nachtgestalten (1920),6.4,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [28]:
# Standardize the data with StandardScaler()

Z = StandardScaler().fit_transform(Z)
Z[:5]


array([[-1.71653767e+00, -3.77062220e-01,  3.37491431e+00,
        -1.28065202e-01, -2.02193760e-01, -5.90199873e-01,
        -3.45768806e-01, -4.57870903e-01, -8.52523275e-01,
        -1.94734823e-01, -1.82354760e-01, -8.62282712e-02,
        -3.14538512e-03, -1.71714170e-01, -3.60494818e-01,
         5.34368513e+00, -1.44379836e-01, -2.35657951e-01,
        -6.27944836e-02, -1.44153917e-02, -3.67865697e-01,
        -2.05039599e-01, -1.21828679e-02, -1.35221147e-01,
        -8.89680067e-03, -3.62890797e-01, -1.41169264e-01,
         5.23990879e+00],
       [ 3.62269440e-01, -3.77062220e-01, -2.96303820e-01,
        -1.28065202e-01, -2.02193760e-01,  1.69434127e+00,
        -3.45768806e-01, -4.57870903e-01, -8.52523275e-01,
        -1.94734823e-01, -1.82354760e-01, -8.62282712e-02,
        -3.14538512e-03, -1.71714170e-01, -3.60494818e-01,
        -1.87136775e-01, -1.44379836e-01, -2.35657951e-01,
        -6.27944836e-02, -1.44153917e-02, -3.67865697e-01,
        -2.05039599e-01, -1.21

## Part 2: Principal Component Analysis

In [56]:
# Use PCA to reduce dimensions to three principal components
pca = PCA(n_components=3)

movies_pca = pca.fit_transform(Z)
movies_pca


array([[-1.95995254e-01, -6.56189827e-02,  2.30303226e+00],
       [ 2.06306856e-01, -7.38499325e-01,  5.58224341e-01],
       [-2.17631925e-03, -1.31021311e+00, -3.79936373e-01],
       ...,
       [ 2.17001215e+00,  1.02029325e+00,  3.75602713e-01],
       [ 1.89742756e+00,  1.04973476e+00,  4.45170978e-01],
       [-2.58385503e+00,  8.34595718e-01,  1.49176084e+00]])

In [57]:
# Create a DataFrame with the three principal components
col_names = ["PC 1", "PC 2", "PC 3"]
#movies_pca_df = pd.DataFrame(movies_pca, columns=col_names, index=movies_df['primaryTitle'])
movies_pca_df = pd.DataFrame(movies_pca, columns=col_names, index=movies_df.index)

print(movies_pca_df.shape)
movies_pca_df.head()


(101078, 3)


Unnamed: 0,PC 1,PC 2,PC 3
0,-0.195995,-0.065619,2.303032
1,0.206307,-0.738499,0.558224
2,-0.002176,-1.310213,-0.379936
3,-0.5692,-0.280299,-0.584228
4,-0.770555,1.334553,0.52971


In [58]:
# Fetch the explained variance

pca.explained_variance_ratio_


array([0.07437366, 0.05729043, 0.05483386])

## Part 3: Clustering Using K-Means

In [38]:
# Create an elbow curve to find the best value for K
# Note: Comment Out and Skip after finding out the ideal K is 4.

# Inertia Calculation Set-Up
#inertia = []
#k = list(range(1, 11))

# Calculate inertia for the range of K values
#for i in k:
#    km = KMeans(n_clusters=i, random_state=0)
#    km.fit(movies_pca_df)
#    inertia.append(km.inertia_)

# Plot Set-Up
#elbow_data = {"k": k, "inertia": inertia}

#elbow_df = pd.DataFrame(elbow_data)

#elbow_df.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")


In [39]:
# Initialize the K-Means model using k=4

model = KMeans(n_clusters=4, random_state=0)


In [40]:
# Fit the model

model.fit(movies_pca_df)


KMeans(n_clusters=4, random_state=0)

In [41]:
# Predict clusters
predictions = model.predict(movies_pca_df)
predictions


array([3, 1, 1, ..., 0, 0, 3])

In [42]:
# Create a new DataFrame including predicted clusters and movies metadata.
# Concatenate the movies_df and movies_pca_df on the same columns.

clustered_df = pd.concat([movies_df, movies_pca_df], axis=1, sort=False)



# Add a new column, "Class" to the clustered_df DataFrame that holds the predictions.
clustered_df['Class'] = model.labels_

print(clustered_df.shape)
clustered_df.head()


(101078, 42)


Unnamed: 0,tconst,primaryTitle,originalTitle,startYear,runtimeMinutes,genres,averageRating,numVotes,url,genres_list,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,Film-Noir,Game-Show,History,Horror,Music,Musical,Mystery,News,Reality-TV,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western,PC 1,PC 2,PC 3,PC 4,Class
0,tt0003854,Dodge City Trail,Dodge City Trail,1936,56,"Adventure,Music,Western",3.7,28,https://www.imdb.com/title/tt0003854/,"[Adventure, Music, Western]",0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,-0.114388,-0.182104,2.378206,-0.293506,3
1,tt0005076,Charley's Aunt (1925),Charley's Aunt (1925),1925,80,Comedy,6.6,70,https://www.imdb.com/title/tt0005076/,[Comedy],0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.20564,-0.784664,0.588032,0.92063,1
2,tt0010058,The Deadlier Sex,The Deadlier Sex,1920,60,"Comedy,Drama",6.2,25,https://www.imdb.com/title/tt0010058/,"[Comedy, Drama]",0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.021298,-1.35536,-0.42726,0.554558,1
3,tt0010495,My Husband's Other Wife,My Husband's Other Wife,1920,\N,Drama,4.2,15,https://www.imdb.com/title/tt0010495/,[Drama],0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.541462,-0.278372,-0.683578,-0.025347,1
4,tt0010502,Nachtgestalten (1920),Nachtgestalten (1920),1920,106,Horror,6.4,16,https://www.imdb.com/title/tt0010502/,[Horror],0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.742362,1.30612,0.537829,1.09052,2


In [43]:
# Output clustered_df as TSV

#clustered_df.to_csv("Data/K-Means_Model_Output.tsv", sep='\t', index=False, doublequote=False)


#### Check Class Memberships

In [44]:
clustered_df['Class'].value_counts()

1    45637
2    21139
0    19262
3    15040
Name: Class, dtype: int64

## Part 4: Visualize Clusters

#### 3D-Scatter with Clusters

In [45]:
# Create a 3D-Scatter Plot with the PCA data and the clusters
#fig = px.scatter_3d(
#    clustered_df,
#    x="PC 1",
#    y="PC 2",
#    z="PC 3",
#    color="Class",
#    hover_name="primaryTitle",
#    hover_data=["averageRating"],
#    width=800,
#)

#fig.update_layout(legend=dict(x=0, y=1))
#fig.show()


### Evaluate Model

In [63]:
# Evaluate Results of K-Means Clustering Algorithm
# via Davies-Bouldin Index

davies_bouldin_score(Z, model.labels_)


2.662040891162623

## Part 5: Generate Recommendation for User

In [46]:
# Find viewerTitle Data for Testing Purposes

clustered_df.loc[(clustered_df['primaryTitle'] == viewerTitle)]


Unnamed: 0,tconst,primaryTitle,originalTitle,startYear,runtimeMinutes,genres,averageRating,numVotes,url,genres_list,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,Film-Noir,Game-Show,History,Horror,Music,Musical,Mystery,News,Reality-TV,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western,PC 1,PC 2,PC 3,PC 4,Class
28773,tt0114709,Toy Story,Toy Story,1995,81,"Adventure,Animation,Comedy",8.3,932854,https://www.imdb.com/title/tt0114709/,"[Adventure, Animation, Comedy]",0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.32155,-2.154057,4.212114,-2.306199,3


In [47]:
# Find tconst for viewerTitle

viewer_tconst = clustered_df.loc[(clustered_df['primaryTitle'] == viewerTitle)]['tconst']
viewer_tconst


28773    tt0114709
Name: tconst, dtype: object

#### Take viewerTitle and find Closest Neighbor

In [48]:
# Find Class of viewerTitle

#clustered_df.head()
#clustered_df.loc[clustered_df['primaryTitle'] == viewerTitle]
viewerTitleClass = clustered_df.loc[clustered_df['primaryTitle'] == viewerTitle]['Class'].values[0]
viewerTitleClass


3

In [59]:
# Create a Distance Matrix by 'tconst'

# First, create a DataFrame of only the three Principal Components
# of Titles in the same Class as viewerTitle

clustered_df = clustered_df.loc[clustered_df['Class'] == viewerTitleClass]

distance_inputs_df = clustered_df[['tconst', 'PC 1', 'PC 2', 'PC 3']]
distance_inputs_df.set_index('tconst', inplace=True)

print(distance_inputs_df.shape)
distance_inputs_df.head()


(15040, 3)


Unnamed: 0_level_0,PC 1,PC 2,PC 3
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
tt0003854,-0.114388,-0.182104,2.378206
tt0010949,-0.601295,0.421821,0.669588
tt0010950,-0.672553,-0.127657,1.625129
tt0010960,-1.079742,-0.073123,1.72493
tt0010965,-1.810465,-0.677173,3.112306


In [52]:
# Find Principal Component Coordinates
# for viewer_tconst

viewer_input_df = distance_inputs_df.loc[viewer_tconst]
viewer_input_df


Unnamed: 0_level_0,PC 1,PC 2,PC 3,PC 4
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
tt0114709,0.32155,-2.154057,4.212114,-2.306199


In [53]:
# Convert distance_inputs_df to Numpy Array

distance_inputs = distance_inputs_df.to_numpy()
distance_inputs

viewer_input = viewer_input_df.to_numpy()
viewer_input


array([[ 0.32154983, -2.15405712,  4.21211408, -2.30619893]])

In [54]:
# Calculate Euclidean Distances

distance_results = cdist(viewer_input, distance_inputs, 'euclidean')
distance_results[0]


array([3.39010246, 4.48063809, 3.58364051, ..., 4.53524966, 4.9123777 ,
       5.05811726])

#### Output Recommendations

In [55]:
# Find the k Smallest Non-Zero Distance and their Positions
# Change k to change the number of Recommendations output

k = 5

# Sort distance_results Array
# and exclude Zeroes

distance_results_sorted = np.sort(distance_results[0])
distance_results_sorted = distance_results_sorted[np.nonzero(distance_results_sorted)]

print(f'\nInput Movie: {viewerTitle}\n')

print(f'Number of entries in Distance Array: {len(distance_results[0])}\n')

print(f'{k} Recommendations:\n')

recommendation_list = []

recommendation_dict = {}

# Loop until j = k

j = 0

# Initiate iterator for distance_results_sorted Array:
i = 0

# Get first result outside of loop
# Grab the first distance from distance_results_sorted:
entry = distance_results_sorted[i]


# Dictionary Output:
recommendation_index = list(distance_results[0]).index(entry)
recommendation_dict['title'] = clustered_df.iloc[recommendation_index]['primaryTitle']
recommendation_dict['url'] = clustered_df.iloc[recommendation_index]['url']
recommendation_dict['releaseYear'] = clustered_df.iloc[recommendation_index]['startYear']
recommendation_dict['averageRating'] = clustered_df.iloc[recommendation_index]['averageRating']
recommendation_dict['genres'] = clustered_df.iloc[recommendation_index]['genres'].replace(",", ", ")

recommendation_list.append(recommendation_dict)

i = i + 1
j = j + 1

#for entry in k_min_non_zero:
while j < k:
        
    recommendation_dict = {}
    
    if i == len(distance_results_sorted):
        break

    entry = distance_results_sorted[i]

    # Dictionary Output:
    recommendation_index = list(distance_results[0]).index(entry)
    title = clustered_df.iloc[recommendation_index]['primaryTitle']

    # If title is equal to the previous title, increment i and restart loop:

    if title == recommendation_list[j-1]['title']:
        i = i + 1
        continue

    recommendation_dict['title'] = clustered_df.iloc[recommendation_index]['primaryTitle']
    recommendation_dict['url'] = clustered_df.iloc[recommendation_index]['url']
    recommendation_dict['releaseYear'] = clustered_df.iloc[recommendation_index]['startYear']
    recommendation_dict['averageRating'] = clustered_df.iloc[recommendation_index]['averageRating']
    recommendation_dict['genres'] = clustered_df.iloc[recommendation_index]['genres'].replace(",", ", ")

    recommendation_list.append(recommendation_dict)

    j = j + 1


# Sort recommendatioin_list by 'averageRating'
recommendation_list = sorted(recommendation_list, key=lambda d: d['averageRating'], reverse=True)

# Output recommendation_list

recommendation_list



Input Movie: Toy Story

Number of entries in Distance Array: 15040

5 Recommendations:



[{'title': 'Coco (2017)',
  'url': 'https://www.imdb.com/title/tt2380307/',
  'releaseYear': 2017,
  'averageRating': 8.4,
  'genres': 'Adventure, Animation, Comedy'},
 {'title': 'Toy Story 3',
  'url': 'https://www.imdb.com/title/tt0435761/',
  'releaseYear': 2010,
  'averageRating': 8.2,
  'genres': 'Adventure, Animation, Comedy'},
 {'title': 'Monsters, Inc.',
  'url': 'https://www.imdb.com/title/tt0198781/',
  'releaseYear': 2001,
  'averageRating': 8.1,
  'genres': 'Adventure, Animation, Comedy'},
 {'title': 'Aladdin (1992)',
  'url': 'https://www.imdb.com/title/tt0103639/',
  'releaseYear': 1992,
  'averageRating': 8.0,
  'genres': 'Adventure, Animation, Comedy'},
 {'title': 'Toy Story 2',
  'url': 'https://www.imdb.com/title/tt0120363/',
  'releaseYear': 1999,
  'averageRating': 7.9,
  'genres': 'Adventure, Animation, Comedy'}]

In [None]:
# Output recommendation_list

recommendation_list
