## **IMDb TV Ratings**

In [1]:
# imports
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

_IMDb_ | https://datasets.imdbws.com/

**title.basics.tsv.gz**

- `tconst` (string) - alphanumeric unique identifier of the title
- `titleType` (string) - the type/format of the title (e.g. movie, short, tvseries, tvepisode, video, etc)
- `primaryTitle` (string) - the more popular title / the title used by the filmmakers on promotional materials at the point of release
- `originalTitle` (string) - original title, in the original language
- `isAdult` (boolean) - 0: non-adult title; 1: adult title
- `startYear` (YYYY) - represents the release year of a title. In the case of TV Series, it is the series start year
- `endYear` (YYYY) - TV Series end year. ‘\N’ for all other title types
- `runtimeMinutes` - primary runtime of the title, in minutes
- `genres` (string array) - includes up to three genres associated with the title

**title.episode.tsv.gz**

- `tconst` (string) - alphanumeric identifier of episode
- `parentTconst` (string) - alphanumeric identifier of the parent TV Series
- `seasonNumber` (integer) - season number the episode belongs to
- `episodeNumber` (integer) - episode number of the tconst in the TV series

**title.ratings.tsv.gz**

- `tconst` (string) - alphanumeric unique identifier of the title
- `averageRating` - weighted average of all the individual user ratings
- `numVotes` - number of votes the title has received

In [2]:
title_basics = pd.read_csv("../data/title_basics.tsv", sep="\t", low_memory=False)
title_episode = pd.read_csv("../data/title_episode.tsv", sep="\t", low_memory=False)
title_ratings = pd.read_csv("../data/title_ratings.tsv", sep="\t", low_memory=False)

In [3]:
print(title_basics.shape)
print('')
print(title_basics.columns)
print('')
title_basics.head()

(10702767, 9)

Index(['tconst', 'titleType', 'primaryTitle', 'originalTitle', 'isAdult',
       'startYear', 'endYear', 'runtimeMinutes', 'genres'],
      dtype='object')



Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [4]:
title_basics.dtypes

tconst            object
titleType         object
primaryTitle      object
originalTitle     object
isAdult           object
startYear         object
endYear           object
runtimeMinutes    object
genres            object
dtype: object

In [5]:
# convert data types
title_basics['titleType'] = title_basics['titleType'].astype('category')

In [6]:
# check for null values
title_basics.isna().sum()

tconst              0
titleType           0
primaryTitle       18
originalTitle      18
isAdult             0
startYear           0
endYear             0
runtimeMinutes      0
genres            312
dtype: int64

In [7]:
title_basics[title_basics.primaryTitle.isna()]

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
625849,tt0648118,tvEpisode,,,0,1987,\N,60,"Crime,Drama,Mystery"
1253522,tt10516578,video,,,0,2017,\N,\N,"Music,Short"
3430926,tt14510930,tvEpisode,,,0,\N,\N,\N,\N
4073479,tt15700278,tvEpisode,,,0,2021,\N,\N,Talk-Show
4554578,tt17042812,movie,,,0,2010,\N,87,Thriller
5062105,tt1971246,tvEpisode,,,0,2011,\N,\N,Biography
5257752,tt2067043,tvEpisode,,,0,1965,\N,\N,Music
5596138,tt21883066,tvEpisode,,,0,2022,\N,\N,"News,Talk-Show"
5865630,tt2305914,tvEpisode,,,0,\N,\N,\N,"Comedy,Talk-Show"
5865637,tt2305918,tvEpisode,,,0,\N,\N,\N,"Comedy,Talk-Show"


In [8]:
title_basics[title_basics.genres.isna()]

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
1096955,tt10233364,tvEpisode,Rolling in the Deep Dish\tRolling in the Deep ...,0,2019,\N,\N,Reality-TV,
1506642,tt10970874,tvEpisode,Die Bauhaus-Stadt Tel Aviv - Vorbild für die M...,0,2019,\N,\N,Talk-Show,
1894071,tt11670006,tvEpisode,...ein angenehmer Unbequemer...\t...ein angene...,0,1981,\N,\N,Documentary,
2004921,tt11868642,tvEpisode,GGN Heavyweight Championship Lungs With Mike T...,0,2020,\N,\N,Talk-Show,
2158796,tt12149332,tvEpisode,Jeopardy! College Championship Semifinal Game ...,0,2020,\N,\N,"Game-Show,Reality-TV,Short",
...,...,...,...,...,...,...,...,...,...
7692078,tt32124610,tvEpisode,Godzilla`s Revenge '1969\tGodzilla`s Revenge '...,0,1987,\N,\N,"Fantasy,Horror,Mystery",
7692810,tt32126116,tvEpisode,Conquest of the Planet of the Apes\tConquest o...,0,1987,\N,\N,"Fantasy,Horror,Mystery",
8029026,tt3984412,tvEpisode,"I'm Not Going to Come Last, I'm Just Going to ...",0,2014,\N,\N,"Game-Show,Reality-TV",
10659462,tt9822816,tvEpisode,Zwischen Vertuschung und Aufklärung - Missbrau...,0,2019,\N,\N,Talk-Show,


In [9]:
title_basics = title_basics.dropna()

In [10]:
title_basics['titleType'].cat.categories

Index(['movie', 'short', 'tvEpisode', 'tvMiniSeries', 'tvMovie', 'tvPilot',
       'tvSeries', 'tvShort', 'tvSpecial', 'video', 'videoGame'],
      dtype='object')

In [11]:
print(title_episode.shape)
print('')
print(title_episode.columns)
print('')
title_episode.head()

(8189406, 4)

Index(['tconst', 'parentTconst', 'seasonNumber', 'episodeNumber'], dtype='object')



Unnamed: 0,tconst,parentTconst,seasonNumber,episodeNumber
0,tt0041951,tt0041038,1,9
1,tt0042816,tt0989125,1,17
2,tt0042889,tt0989125,\N,\N
3,tt0043426,tt0040051,3,42
4,tt0043631,tt0989125,2,16


In [12]:
title_episode.dtypes

tconst           object
parentTconst     object
seasonNumber     object
episodeNumber    object
dtype: object

In [13]:
# check for null values
title_episode.isna().sum()

tconst           0
parentTconst     0
seasonNumber     0
episodeNumber    0
dtype: int64

In [14]:
print(title_ratings.shape)
print('')
print(title_ratings.columns)
print('')
title_ratings.head()

(1427087, 3)

Index(['tconst', 'averageRating', 'numVotes'], dtype='object')



Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,2043
1,tt0000002,5.7,273
2,tt0000003,6.5,2001
3,tt0000004,5.4,178
4,tt0000005,6.2,2760


In [15]:
title_ratings.dtypes

tconst            object
averageRating    float64
numVotes           int64
dtype: object

In [16]:
# check for null values
title_ratings.isna().sum()

tconst           0
averageRating    0
numVotes         0
dtype: int64

In [17]:
# save to csv
title_basics.to_csv('../data/title_basics.csv', index=False)
title_episode.to_csv('../data/title_episode.csv', index=False)
title_ratings.to_csv('../data/title_ratings.csv', index=False)