In [2]:
# Import Dependencies
import os
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer


In [3]:
# Pandas display configuration.

# Remove the cap on how many columns are rendered when a DataFrame is displayed.
pd.options.display.max_columns = None

In [4]:
# Directory holding the input files, relative to the notebook's working directory.
file_dir = "Data"

# IMDb titles metadata file.
# Built with os.path.join so the separator is chosen by the platform rather
# than hard-coded as "/".
titles_metadata_file = os.path.join(file_dir, 'title_basics_non-adult_movies.tsv')

# IMDb ratings data file.
ratings_data_file = os.path.join(file_dir, 'title_ratings.csv')

In [5]:
# Load both IMDb inputs into DataFrames.
#   - ratings data: read with read_csv's default (comma) separator
#   - titles metadata: tab-separated
ratings_data = pd.read_csv(filepath_or_buffer=ratings_data_file)
titles_metadata = pd.read_csv(filepath_or_buffer=titles_metadata_file, sep="\t")


In [6]:
# Sanity-check titles_metadata: print its (rows, columns) shape, then preview
# the first rows as the cell's displayed result.
# Note: only the last expression of a cell is displayed, so a bare
# `titles_metadata.count()` in the middle of the cell did work for no output
# and has been removed.
print(titles_metadata.shape)
titles_metadata.head()


(584642, 9)


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000502,movie,Bohemios,Bohemios,0,1905,\N,100,\N
1,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,\N,70,"Action,Adventure,Biography"
2,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,\N,90,Drama
3,tt0000615,movie,Robbery Under Arms,Robbery Under Arms,0,1907,\N,\N,Drama
4,tt0000630,movie,Hamlet,Amleto,0,1908,\N,\N,Drama


In [7]:
# Keep only the rows whose 'genres' value is not the literal string "\N".
titles_metadata = titles_metadata[titles_metadata['genres'].ne("\\N")]


In [8]:
# Sanity-check ratings_data: print its (rows, columns) shape, then preview
# the first rows as the cell's displayed result.
# Note: only the last expression of a cell is displayed, so a bare
# `ratings_data.count()` in the middle of the cell did work for no output
# and has been removed.
print(ratings_data.shape)
ratings_data.head()

(1201036, 3)


Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1834
1,tt0000002,6.0,236
2,tt0000003,6.5,1594
3,tt0000004,6.0,153
4,tt0000005,6.2,2410


In [9]:
# Inner-join the titles metadata with the ratings data on the shared 'tconst'
# key; only titles present in both frames are kept.
movies_df = titles_metadata.merge(ratings_data, how="inner", on="tconst")

print(movies_df.shape)
movies_df.head()


(256027, 11)


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes
0,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,\N,70,"Action,Adventure,Biography",6.1,736
1,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,\N,90,Drama,5.2,16
2,tt0000615,movie,Robbery Under Arms,Robbery Under Arms,0,1907,\N,\N,Drama,4.5,23
3,tt0000630,movie,Hamlet,Amleto,0,1908,\N,\N,Drama,3.8,23
4,tt0000675,movie,Don Quijote,Don Quijote,0,1908,\N,\N,Drama,4.9,19


In [10]:
# Add a 'url' column holding each title's IMDb page address, built from 'tconst'.
# Vectorized string concatenation replaces the row-wise apply(axis=1): it avoids
# calling a Python lambda per row and also behaves correctly on an empty frame
# (where a row-wise apply returns a DataFrame rather than a Series).
movies_df['url'] = "https://www.imdb.com/title/" + movies_df['tconst'] + "/"

print(movies_df.shape)
movies_df.head()


(256027, 12)


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes,url
0,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,\N,70,"Action,Adventure,Biography",6.1,736,https://www.imdb.com/title/tt0000574/
1,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,\N,90,Drama,5.2,16,https://www.imdb.com/title/tt0000591/
2,tt0000615,movie,Robbery Under Arms,Robbery Under Arms,0,1907,\N,\N,Drama,4.5,23,https://www.imdb.com/title/tt0000615/
3,tt0000630,movie,Hamlet,Amleto,0,1908,\N,\N,Drama,3.8,23,https://www.imdb.com/title/tt0000630/
4,tt0000675,movie,Don Quijote,Don Quijote,0,1908,\N,\N,Drama,4.9,19,https://www.imdb.com/title/tt0000675/


In [11]:
# Inspect the dtype of each movies_df column (shown as the cell's result).

movies_df.dtypes

tconst             object
titleType          object
primaryTitle       object
originalTitle      object
isAdult             int64
startYear          object
endYear            object
runtimeMinutes     object
genres             object
averageRating     float64
numVotes            int64
url                object
dtype: object

In [12]:
# Split each comma-separated 'genres' string into a list of genre names,
# stored in a new 'genres_list' column.
# The vectorized .str accessor replaces the row-wise apply(axis=1), avoiding
# a Python lambda call per row.
movies_df['genres_list'] = movies_df['genres'].str.split(",")

print(movies_df.shape)
movies_df.head()


(256027, 13)


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes,url,genres_list
0,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,\N,70,"Action,Adventure,Biography",6.1,736,https://www.imdb.com/title/tt0000574/,"[Action, Adventure, Biography]"
1,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,\N,90,Drama,5.2,16,https://www.imdb.com/title/tt0000591/,[Drama]
2,tt0000615,movie,Robbery Under Arms,Robbery Under Arms,0,1907,\N,\N,Drama,4.5,23,https://www.imdb.com/title/tt0000615/,[Drama]
3,tt0000630,movie,Hamlet,Amleto,0,1908,\N,\N,Drama,3.8,23,https://www.imdb.com/title/tt0000630/,[Drama]
4,tt0000675,movie,Don Quijote,Don Quijote,0,1908,\N,\N,Drama,4.9,19,https://www.imdb.com/title/tt0000675/,[Drama]


In [14]:
# Multi-label binarize the per-title genre lists: one 0/1 indicator column per
# genre found in the data (a get_dummies-style encoding for list-valued cells).

mlb = MultiLabelBinarizer()
genres = movies_df['genres_list']

# fit_transform learns the genre classes and returns the indicator matrix;
# wrap it in a DataFrame that shares movies_df's index so the two frames
# can be aligned row-for-row.
X = pd.DataFrame(
    data=mlb.fit_transform(genres),
    index=movies_df.index,
    columns=mlb.classes_,
)

print(X.shape)
X.head()

(256027, 27)


Unnamed: 0,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,Film-Noir,Game-Show,History,Horror,Music,Musical,Mystery,News,Reality-TV,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western
0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [15]:
# Attach the genre indicator columns in X to movies_df, aligning on the index
# (X was built with movies_df's index, so every row matches exactly once).
#
# Any indicator columns already present are dropped first so that re-running
# this cell is idempotent; merging twice would otherwise yield duplicated
# columns with '_x' / '_y' suffixes.
movies_df = movies_df.drop(columns=X.columns, errors='ignore').join(X)

print(movies_df.shape)
movies_df.head()

(256027, 40)


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes,url,genres_list,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,Film-Noir,Game-Show,History,Horror,Music,Musical,Mystery,News,Reality-TV,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western
0,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,\N,70,"Action,Adventure,Biography",6.1,736,https://www.imdb.com/title/tt0000574/,"[Action, Adventure, Biography]",1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,\N,90,Drama,5.2,16,https://www.imdb.com/title/tt0000591/,[Drama],0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,tt0000615,movie,Robbery Under Arms,Robbery Under Arms,0,1907,\N,\N,Drama,4.5,23,https://www.imdb.com/title/tt0000615/,[Drama],0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,tt0000630,movie,Hamlet,Amleto,0,1908,\N,\N,Drama,3.8,23,https://www.imdb.com/title/tt0000630/,[Drama],0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,tt0000675,movie,Don Quijote,Don Quijote,0,1908,\N,\N,Drama,4.9,19,https://www.imdb.com/title/tt0000675/,[Drama],0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [16]:
# Look up the row(s) whose primaryTitle is exactly "Toy Story" (used for testing).

movies_df[movies_df['primaryTitle'].eq("Toy Story")]

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes,url,genres_list,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,Film-Noir,Game-Show,History,Horror,Music,Musical,Mystery,News,Reality-TV,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western
60233,tt0114709,movie,Toy Story,Toy Story,0,1995,\N,81,"Adventure,Animation,Comedy",8.3,932854,https://www.imdb.com/title/tt0114709/,"[Adventure, Animation, Comedy]",0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
