# IMDB Business-Problem

Some information was Adapted from:https://github.com/jirvingphd/how-to-make-successful-movies

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import os

In [2]:
def check_nulls_nunique(df,plot=True):
    report = pd.DataFrame({"# null":df.isna().sum(),
                 "% null":df.isna().sum()/len(df)*100,
                 '# unique':df.nunique(),
                          '% unique':df.nunique()/len(df)*100})
    display(report.round(2))
#     if plot:
#         missingno.matrix(df)
#         plt.show()

In [3]:
os.makedirs("Data/", exist_ok=True)
sorted(os.listdir("Data/"))

['.ipynb_checkpoints', 'title_akas.csv.gz']

# Loading Data

## Basics Dataframe

In [4]:
basics_url = "https://datasets.imdbws.com/title.basics.tsv.gz"
basics = pd.read_csv(basics_url,sep='\t', low_memory=False)
basics.info()
basics.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9119487 entries, 0 to 9119486
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 626.2+ MB


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


# Exploring Title Basics Data

In [5]:
check_nulls_nunique(basics)

Unnamed: 0,# null,% null,# unique,% unique
tconst,0,0.0,9119487,100.0
titleType,0,0.0,11,0.0
primaryTitle,11,0.0,4193001,45.98
originalTitle,11,0.0,4213240,46.2
isAdult,0,0.0,9,0.0
startYear,0,0.0,151,0.0
endYear,0,0.0,98,0.0
runtimeMinutes,0,0.0,872,0.01
genres,10,0.0,2320,0.03


In [6]:
basics.replace({'\\N':np.nan},inplace=True)
check_nulls_nunique(basics)

Unnamed: 0,# null,% null,# unique,% unique
tconst,0,0.0,9119487,100.0
titleType,0,0.0,11,0.0
primaryTitle,11,0.0,4193001,45.98
originalTitle,11,0.0,4213240,46.2
isAdult,1,0.0,8,0.0
startYear,1208964,13.26,150,0.0
endYear,9024806,98.96,97,0.0
runtimeMinutes,6665013,73.09,871,0.01
genres,415264,4.55,2319,0.03


In [7]:
# basics.replace({'\\N':np.nan}, inplace=True)
# clean_basics = basics.dropna(inplace=True)

In [8]:
## Eliminate movies that are null for runtimeMinute, genres, and startYear
basics = basics.dropna(subset=['runtimeMinutes','genres','startYear'])
check_nulls_nunique(basics, plot=False)

Unnamed: 0,# null,% null,# unique,% unique
tconst,0,0.0,2350627,100.0
titleType,0,0.0,10,0.0
primaryTitle,1,0.0,1671231,71.1
originalTitle,1,0.0,1688651,71.84
isAdult,0,0.0,2,0.0
startYear,0,0.0,147,0.01
endYear,2305870,98.1,95,0.0
runtimeMinutes,0,0.0,852,0.04
genres,0,0.0,2193,0.09


In [9]:
## drop endYear
basics = basics.drop(columns=['endYear'])
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,1,"Comedy,Short"
...,...,...,...,...,...,...,...,...
9119437,tt9916754,movie,Chico Albuquerque - Revelações,Chico Albuquerque - Revelações,0,2013,49,Documentary
9119443,tt9916766,tvEpisode,Episode #10.15,Episode #10.15,0,2019,43,"Family,Game-Show,Reality-TV"
9119478,tt9916840,tvEpisode,Horrid Henry's Comic Caper,Horrid Henry's Comic Caper,0,2014,11,"Adventure,Animation,Comedy"
9119485,tt9916856,short,The Wind,The Wind,0,2015,27,Short


In [10]:
basics['titleType'].value_counts()

tvEpisode       1035983
short            565881
movie            359737
video            173942
tvMovie           87803
tvSeries          85476
tvSpecial         16360
tvMiniSeries      15873
tvShort            9279
videoGame           293
Name: titleType, dtype: int64

In [11]:
## keep only titleType==Movie
basics = basics.loc[ basics['titleType']=='movie']
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,45,Romance
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,90,Drama
672,tt0000679,movie,The Fairylogue and Radio-Plays,The Fairylogue and Radio-Plays,0,1908,120,"Adventure,Fantasy"
1172,tt0001184,movie,Don Juan de Serrallonga,Don Juan de Serrallonga,0,1910,58,"Adventure,Drama"
...,...,...,...,...,...,...,...,...
9119252,tt9916362,movie,Coven,Akelarre,0,2020,92,"Drama,History"
9119336,tt9916538,movie,Kuambil Lagi Hatiku,Kuambil Lagi Hatiku,0,2019,123,Drama
9119377,tt9916622,movie,Rodolpho Teóphilo - O Legado de um Pioneiro,Rodolpho Teóphilo - O Legado de um Pioneiro,0,2015,57,Documentary
9119404,tt9916680,movie,De la ilusión al desconcierto: cine colombiano...,De la ilusión al desconcierto: cine colombiano...,0,2007,100,Documentary


In [12]:
is_documentary = basics['genres'].str.contains('documentary',case=False)
is_documentary.value_counts()

False    274094
True      85643
Name: genres, dtype: int64

In [13]:
# Exclude movies that are included in the documentary category.
is_documentary = basics['genres'].str.contains('documentary',case=False)
basics = basics[~is_documentary]

In [14]:
basics['startYear'] = basics['startYear'].astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  basics['startYear'] = basics['startYear'].astype(float)


In [15]:
basics = basics[(basics['startYear']>=2000)&(basics['startYear']<2002)]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,runtimeMinutes,genres
34790,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,118,"Comedy,Fantasy,Romance"
77925,tt0079644,movie,November 1828,November 1828,0,2001.0,140,"Drama,War"
87074,tt0089067,movie,El día de los albañiles 2,El día de los albañiles 2,0,2001.0,90,Comedy
100034,tt0102362,movie,Istota,Istota,0,2000.0,80,"Drama,Romance"
110437,tt0113026,movie,The Fantasticks,The Fantasticks,0,2000.0,86,"Musical,Romance"
...,...,...,...,...,...,...,...,...
9029425,tt9722990,movie,Emily Rising,Emily Rising,0,2001.0,45,"Drama,Fantasy"
9029965,tt9724228,movie,Johnny Winter: Pieces & Bits,Johnny Winter: Pieces & Bits,0,2001.0,78,Music
9064809,tt9798698,movie,Gay holocaust,Gay holocaust,0,2001.0,50,Comedy
9083134,tt9837894,movie,Tizca. Gli Uccelli Dipinti del Caucaso,Tizca. Gli Uccelli Dipinti del Caucaso,0,2001.0,91,Drama


# Exploring Ratings

## Ratings Dataframe

In [16]:
ratings_url = "https://datasets.imdbws.com/title.ratings.tsv.gz"
ratings = pd.read_csv(ratings_url,sep='\t', low_memory=False)
ratings.info()
ratings.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1257603 entries, 0 to 1257602
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1257603 non-null  object 
 1   averageRating  1257603 non-null  float64
 2   numVotes       1257603 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 28.8+ MB


Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1901
1,tt0000002,5.9,255
2,tt0000003,6.5,1696
3,tt0000004,5.7,167
4,tt0000005,6.2,2515


In [17]:
# Replace "\N" with np.nan (if any)
ratings.replace({'\\N':np.nan},inplace=True)
check_nulls_nunique(ratings)

Unnamed: 0,# null,% null,# unique,% unique
tconst,0,0.0,1257603,100.0
averageRating,0,0.0,91,0.01
numVotes,0,0.0,20435,1.62


# Exploring AKAs Data

## AKAs Dataframe

In [18]:
akas_url = "https://datasets.imdbws.com/title.akas.tsv.gz"
akas = pd.read_csv(akas_url, sep='\t', low_memory=False)
akas.info()
akas.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32734474 entries, 0 to 32734473
Data columns (total 8 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   titleId          object
 1   ordering         int64 
 2   title            object
 3   region           object
 4   language         object
 5   types            object
 6   attributes       object
 7   isOriginalTitle  object
dtypes: int64(1), object(7)
memory usage: 2.0+ GB


Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


In [19]:
## The AKAs file has the information on country where it released and language
akas = akas[(akas['region'] == 'US')]
akas

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,\N,imdbDisplay,\N,0
14,tt0000002,7,The Clown and His Dogs,US,\N,\N,literal English title,0
33,tt0000005,10,Blacksmith Scene,US,\N,imdbDisplay,\N,0
36,tt0000005,1,Blacksmithing Scene,US,\N,alternative,\N,0
41,tt0000005,6,Blacksmith Scene #1,US,\N,alternative,\N,0
...,...,...,...,...,...,...,...,...
32734146,tt9916702,1,Loving London: The Playground,US,\N,\N,\N,0
32734183,tt9916720,10,The Demonic Nun,US,\N,tv,\N,0
32734185,tt9916720,12,The Nun 2,US,\N,imdbDisplay,\N,0
32734202,tt9916756,1,Pretty Pretty Black Girl,US,\N,imdbDisplay,\N,0


In [20]:
## replace\N placeholders with NaN and re-check for nuls
akas.replace({'\\N':np.nan},inplace=True)
check_nulls_nunique(akas)

Unnamed: 0,# null,% null,# unique,% unique
titleId,0,0.0,1260617,94.12
ordering,0,0.0,115,0.01
title,0,0.0,1033809,77.19
region,0,0.0,1,0.0
language,1335651,99.73,8,0.0
types,376239,28.09,13,0.0
attributes,1294720,96.67,163,0.01
isOriginalTitle,1375,0.1,2,0.0


In [21]:
akas.drop(columns='language',inplace=True)
akas.head()

Unnamed: 0,titleId,ordering,title,region,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,imdbDisplay,,0
14,tt0000002,7,The Clown and His Dogs,US,,literal English title,0
33,tt0000005,10,Blacksmith Scene,US,imdbDisplay,,0
36,tt0000005,1,Blacksmithing Scene,US,alternative,,0
41,tt0000005,6,Blacksmith Scene #1,US,alternative,,0


# Final Filtering: Removing Non-US Movies

In [22]:
## Only keep movie ids that are in the final filtered akas
keepers = basics['tconst'].isin(akas['titleId'])
basics = basics[keepers]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,runtimeMinutes,genres
34790,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,118,"Comedy,Fantasy,Romance"
110437,tt0113026,movie,The Fantasticks,The Fantasticks,0,2000.0,86,"Musical,Romance"
110500,tt0113092,movie,For the Cause,For the Cause,0,2000.0,100,"Action,Adventure,Drama"
111811,tt0114447,movie,The Silent Force,The Silent Force,0,2001.0,90,Action
113246,tt0115937,movie,Consequence,Consequence,0,2000.0,91,Drama
...,...,...,...,...,...,...,...,...
8792214,tt9212730,movie,Yakuza Zombie,Zonbi gokudo,0,2001.0,87,"Horror,Thriller"
8799504,tt9228234,movie,The Narc Enigma,The Narc Enigma,0,2001.0,93,Action
8885230,tt9412476,movie,Contratiempo Mortal,Contratiempo Mortal,0,2000.0,90,Action
8951474,tt9555974,movie,Haunted School,Gui xue xiao,0,2001.0,85,Horror


In [23]:
## Keeping only movie that exist in basics 
keepers_akas = ratings['tconst'].isin(akas['titleId'])
keepers_basics = ratings['tconst'].isin(basics['tconst'])

print('- If filter using AKAs:')
display(keepers_akas.value_counts())

print('- If filter using Basics:')
display(keepers_basics.value_counts())

- If filter using AKAs:


False    783551
True     474052
Name: tconst, dtype: int64

- If filter using Basics:


False    1254898
True        2705
Name: tconst, dtype: int64

In [24]:
ratings_final = ratings[keepers_basics]
ratings_final

Unnamed: 0,tconst,averageRating,numVotes
17916,tt0035423,6.4,84377
83900,tt0113026,5.6,1375
83951,tt0113092,3.4,819
85028,tt0114447,4.2,152
86235,tt0115937,7.7,13
...,...,...,...
1179329,tt7802790,5.9,504
1182901,tt7881990,4.4,73
1224290,tt8954964,4.7,22
1229775,tt9071078,5.5,11


In [25]:
basics.to_csv("Data/title_basics_cleaned.csv.gz",compression='gzip',index=False)
basics = pd.read_csv("Data/title_basics_cleaned.csv.gz", low_memory = False)
basics.info()
basics.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2935 entries, 0 to 2934
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          2935 non-null   object 
 1   titleType       2935 non-null   object 
 2   primaryTitle    2935 non-null   object 
 3   originalTitle   2935 non-null   object 
 4   isAdult         2935 non-null   int64  
 5   startYear       2935 non-null   float64
 6   runtimeMinutes  2935 non-null   int64  
 7   genres          2935 non-null   object 
dtypes: float64(1), int64(2), object(5)
memory usage: 183.6+ KB


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,118,"Comedy,Fantasy,Romance"
1,tt0113026,movie,The Fantasticks,The Fantasticks,0,2000.0,86,"Musical,Romance"
2,tt0113092,movie,For the Cause,For the Cause,0,2000.0,100,"Action,Adventure,Drama"
3,tt0114447,movie,The Silent Force,The Silent Force,0,2001.0,90,Action
4,tt0115937,movie,Consequence,Consequence,0,2000.0,91,Drama


In [26]:
ratings.to_csv("Data/title_ratings_cleaned.csv.gz",compression='gzip',index=False)
ratings = pd.read_csv("Data/title_ratings_cleaned.csv.gz", low_memory = False)
ratings.info()
ratings.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1257603 entries, 0 to 1257602
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1257603 non-null  object 
 1   averageRating  1257603 non-null  float64
 2   numVotes       1257603 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 28.8+ MB


Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1901
1,tt0000002,5.9,255
2,tt0000003,6.5,1696
3,tt0000004,5.7,167
4,tt0000005,6.2,2515


In [27]:
akas.to_csv("Data/title_akas_cleaned.csv.gz",compression='gzip',index=False)
akas = pd.read_csv("Data/title_akas_cleaned.csv.gz", low_memory = False)
akas.info()
akas.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1339306 entries, 0 to 1339305
Data columns (total 7 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   titleId          1339306 non-null  object 
 1   ordering         1339306 non-null  int64  
 2   title            1339306 non-null  object 
 3   region           1339306 non-null  object 
 4   types            963067 non-null   object 
 5   attributes       44586 non-null    object 
 6   isOriginalTitle  1337931 non-null  float64
dtypes: float64(1), int64(1), object(5)
memory usage: 71.5+ MB


Unnamed: 0,titleId,ordering,title,region,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,imdbDisplay,,0.0
1,tt0000002,7,The Clown and His Dogs,US,,literal English title,0.0
2,tt0000005,10,Blacksmith Scene,US,imdbDisplay,,0.0
3,tt0000005,1,Blacksmithing Scene,US,alternative,,0.0
4,tt0000005,6,Blacksmith Scene #1,US,alternative,,0.0


# Final Movie Count

In [28]:
print(f"There are {len(basics):,} rows in title_basics")
print(f"There are {len(ratings):,} rows in title_ratings")
print(f"There are {len(akas):,} rows in akas")

There are 2,935 rows in title_basics
There are 1,257,603 rows in title_ratings
There are 1,339,306 rows in akas
