# IMDB Data Project
Craig Gossen



## Load and Clean the Data

In [2]:
#Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
#Gather the data
titles_url = 'https://datasets.imdbws.com/title.basics.tsv.gz'
akas_url = 'https://datasets.imdbws.com/title.akas.tsv.gz'
ratings_url = 'https://datasets.imdbws.com/title.ratings.tsv.gz'

In [4]:
#Load the data
titles = pd.read_csv(titles_url, sep='\t', low_memory=False)
akas = pd.read_csv(akas_url, sep='\t', low_memory=False)
ratings = pd.read_csv(ratings_url, sep='\t', low_memory=False)

In [5]:
titles.head()


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [6]:
akas.head()


Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


In [7]:
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1942
1,tt0000002,5.8,262
2,tt0000003,6.5,1768
3,tt0000004,5.6,178
4,tt0000005,6.2,2577


In [8]:
titles.shape

(9553998, 9)

In [9]:
akas.shape

(34675513, 8)

In [10]:
ratings.shape

(1271748, 3)

### Cleaning Titles

In [11]:
titles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9553998 entries, 0 to 9553997
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 656.0+ MB


In [12]:
#Replacing \N placeholders with NaN 
titles.replace({'\\N':np.nan},inplace = True)

In [13]:
#Drop null for runtimeMinutes and genres
titles.dropna(subset = ['runtimeMinutes', 'genres'], inplace=True)
titles.shape

(2700288, 9)

In [14]:
titles['titleType'].value_counts()

tvEpisode       1334460
short            589095
movie            375632
video            178404
tvMovie           90632
tvSeries          88838
tvSpecial         17536
tvMiniSeries      16748
tvShort            8626
videoGame           317
Name: titleType, dtype: int64

In [15]:
#Keep only Type == Movies
titles_moviefilter = titles['titleType'] == 'movie'
titles = titles[titles_moviefilter]
titles.shape

(375632, 9)

In [16]:
#Understand startYear values
titles['startYear'].value_counts().sort_index(ascending = False)

2029        2
2025        3
2024        6
2023     1381
2022    12034
        ...  
1900        2
1899        1
1897        1
1896        1
1894        1
Name: startYear, Length: 129, dtype: int64

In [17]:
#Keep startYear == 2000, 2001, 2002
titles = titles.query('startYear == "2000" or startYear == "2001" or startYear == "2002"')
titles.shape

(11565, 9)

In [18]:
#Drop 'Documentary' in column genre
is_documentary = titles['genres'].str.contains('documentary',case=False)
titles = titles[~is_documentary]
titles.shape

(8578, 9)

In [24]:
#Keep only US movies
#Filter the titles table down to only include the US by using the filter akas dataframe
keepers =titles['tconst'].isin(akas['titleId'])
keepers.shape

(8578,)

In [25]:
#Filter basics for keepers
titles = titles[keepers]
titles.shape

(8569, 9)

In [26]:
titles.head(20)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34803,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
77964,tt0079644,movie,November 1828,November 1828,0,2001,,140,"Drama,War"
87114,tt0089067,movie,El día de los albañiles 2,El día de los albañiles 2,0,2001,,90,Comedy
93938,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002,,126,Drama
100076,tt0102362,movie,Istota,Istota,0,2000,,80,"Drama,Romance"
110366,tt0112912,movie,Dune 7,Dune 7,0,2002,,97,Adventure
110478,tt0113026,movie,The Fantasticks,The Fantasticks,0,2000,,86,"Musical,Romance"
110541,tt0113092,movie,For the Cause,For the Cause,0,2000,,100,"Action,Adventure,Drama"
111852,tt0114447,movie,The Silent Force,The Silent Force,0,2001,,90,Action
112120,tt0114722,movie,3 noches,3 noches,0,2001,,105,"Crime,Thriller"


### Cleaning AKAs

In [27]:
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


In [29]:
#Replacing \N placeholders with NaN 
akas.replace({'\\N':np.nan},inplace = True)

In [30]:
akas.shape

(34675513, 8)

In [31]:
#Keep only US in region
akas_regionfilter = akas['region'] == 'US'
akas = akas[akas_regionfilter]
akas.shape


(1408224, 8)

In [33]:
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,,imdbDisplay,,0
14,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0
33,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0
36,tt0000005,1,Blacksmithing Scene,US,,alternative,,0
41,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0


### Cleaning Ratings

In [34]:
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1942
1,tt0000002,5.8,262
2,tt0000003,6.5,1768
3,tt0000004,5.6,178
4,tt0000005,6.2,2577


In [35]:
#Replacing \N placeholders with NaN 
ratings.replace({'\\N':np.nan},inplace = True)

In [36]:
#Keep only US movies
#Filter the ratings table down to only include the US by usign the filter akas dataframe
keepers2 =ratings['tconst'].isin(akas['titleId'])
keepers2.shape

(1271748,)

In [37]:
#Filter ratings for keepers
ratings = ratings[keepers2]
ratings.shape

(486104, 3)

In [38]:
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1942
1,tt0000002,5.8,262
4,tt0000005,6.2,2577
5,tt0000006,5.1,177
6,tt0000007,5.4,808
