## Data Loading and Summary

In [1]:
!wget https://datasets.imdbws.com/title.basics.tsv.gz
!wget https://datasets.imdbws.com/title.episode.tsv.gz
!wget https://datasets.imdbws.com/title.ratings.tsv.gz

--2024-08-01 15:17:25--  https://datasets.imdbws.com/title.basics.tsv.gz
Resolving datasets.imdbws.com (datasets.imdbws.com)... 52.85.132.80, 52.85.132.19, 52.85.132.66, ...
Connecting to datasets.imdbws.com (datasets.imdbws.com)|52.85.132.80|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 192392247 (183M) [binary/octet-stream]
Saving to: ‘title.basics.tsv.gz’


2024-08-01 15:17:28 (65.8 MB/s) - ‘title.basics.tsv.gz’ saved [192392247/192392247]

--2024-08-01 15:17:28--  https://datasets.imdbws.com/title.episode.tsv.gz
Resolving datasets.imdbws.com (datasets.imdbws.com)... 52.85.132.80, 52.85.132.19, 52.85.132.66, ...
Connecting to datasets.imdbws.com (datasets.imdbws.com)|52.85.132.80|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 46158848 (44M) [binary/octet-stream]
Saving to: ‘title.episode.tsv.gz’


2024-08-01 15:17:28 (74.0 MB/s) - ‘title.episode.tsv.gz’ saved [46158848/46158848]

--2024-08-01 15:17:28--  https://datasets.imdbws.

In [2]:
!gzip -d title.basics.tsv.gz
!gzip -d title.episode.tsv.gz
!gzip -d title.ratings.tsv.gz

## Data Overview

In [3]:
import pandas as pd

In [4]:
basics = pd.read_csv('title.basics.tsv', sep='\t', low_memory=False, na_values=['\\N'])
episode = pd.read_csv('title.episode.tsv', sep='\t', low_memory=False, na_values=['\\N'])
ratings = pd.read_csv('title.ratings.tsv', sep='\t', low_memory=False, na_values=['\\N'])

In [5]:
basics.head(3)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0.0,1894.0,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0.0,1892.0,,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0.0,1892.0,,5,"Action,Adventure,Animation"


In [6]:
episode.head(3)

Unnamed: 0,tconst,parentTconst,seasonNumber,episodeNumber
0,tt0031458,tt32857063,,
1,tt0041951,tt0041038,1.0,9.0
2,tt0042816,tt0989125,1.0,17.0


In [7]:
ratings.head(3)

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,2064
1,tt0000002,5.6,279
2,tt0000003,6.5,2039


In [8]:
len(episode), len(basics), len(ratings)

(8400596, 10962448, 1464644)

### Checking ID columns for uniqueness

In [9]:
basics['tconst'].is_unique, episode['tconst'].is_unique, ratings['tconst'].is_unique

(True, True, True)

In [10]:
# Check if columns 'tConst' and 'parentTconst' in episode have common values
len(set(episode['tconst']).intersection(set(episode['parentTconst'])))

0

## Data Cleaning

### Data Cleaning for `basics` Dataset

In [11]:
basics.dtypes

Unnamed: 0,0
tconst,object
titleType,object
primaryTitle,object
originalTitle,object
isAdult,float64
startYear,float64
endYear,float64
runtimeMinutes,object
genres,object


In [12]:
# Checking for null values
basics.isna().sum()

Unnamed: 0,0
tconst,0
titleType,0
primaryTitle,17
originalTitle,17
isAdult,1
startYear,1414482
endYear,10835206
runtimeMinutes,7546168
genres,485607


In [13]:
# view all rows with null values of 'primaryTitle' or originalTitle'
basics[(basics['primaryTitle'].isna()) | (basics['originalTitle'].isna())]

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
1253393,tt10516578,video,,,0.0,2017.0,,,"Music,Short"
3430017,tt14510930,tvEpisode,,,0.0,,,,
4072843,tt15700278,tvEpisode,,,0.0,2021.0,,,Talk-Show
4553955,tt17042812,movie,,,0.0,2010.0,,87.0,Thriller
5061656,tt1971246,tvEpisode,,,0.0,2011.0,,,Biography
5257366,tt2067043,tvEpisode,,,0.0,1965.0,,,Music
5595729,tt21883066,tvEpisode,,,0.0,2022.0,,,"News,Talk-Show"
5865218,tt2305914,tvEpisode,,,0.0,,,,"Comedy,Talk-Show"
5865225,tt2305918,tvEpisode,,,0.0,,,,"Comedy,Talk-Show"
7535400,tt31462159,tvEpisode,,,0.0,2024.0,,,Drama


17 rows have missing values for both `primaryTitle` and `originalTitle` fields. We will remove these rows.

In [14]:
# Remove rows with null values in 'primaryTitle' and 'originalTitle'
basics = basics.dropna(subset=['primaryTitle', 'originalTitle'])

In [15]:
# View the only row with missing 'isAdult' value
pd.set_option('max_colwidth', None)
basics[basics['isAdult'].isna()]

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
2989187,tt13704268,tvEpisode,Bay of the Triffids/Doctor of Doom\tBay of the Triffids/Doctor of Doom,0,,,,"Animation,Comedy,Family",


In this case, the `\t` character was intepreted as literal string isntead of a `Tab` seperator.

Instead of removing this row, we will fix it by reassigning the `primaryTitle`, `originalTitle`, and `isAdult` fields to their right values:
- `primaryTitle`: Bay of the Triffids/Doctor of Doom
- `originalTitle`: Bay of the Triffids/Doctor of Doom
- isAdult: 0

In [16]:
# prompt: at row index 2989193, replace these columns primaryTitle, originalTitle, and isAdult with the values ['Bay of the Triffids/Doctor of Doom', 'Bay of the Triffids/Doctor of Doom', 0]

basics.loc[2989193, ['primaryTitle', 'originalTitle', 'isAdult']] = ['Bay of the Triffids/Doctor of Doom', 'Bay of the Triffids/Doctor of Doom', 0]
basics.loc[2989193]

Unnamed: 0,2989193
tconst,tt13704278
titleType,tvEpisode
primaryTitle,Bay of the Triffids/Doctor of Doom
originalTitle,Bay of the Triffids/Doctor of Doom
isAdult,0.0
startYear,
endYear,
runtimeMinutes,
genres,"Animation,Comedy,Family"


Next, we check if all values in the field `isAdult` is either 0 or 1

In [17]:
basics['isAdult'].value_counts()

Unnamed: 0_level_0,count
isAdult,Unnamed: 1_level_1
0.0,10609805
1.0,351998
1985.0,79
1980.0,63
1978.0,54
1984.0,41
1982.0,32
1974.0,32
1972.0,29
2015.0,28


It seems like the some of the values in the field `isAdult` are year values and should belong to the `startYear` columns instead.

We closely inspect these rows

In [18]:
basics_isAdult_filter = basics[~basics['isAdult'].isin([0,1])]
basics_isAdult_filter

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
1096866,tt10233364,tvEpisode,Rolling in the Deep Dish\tRolling in the Deep Dish,0,2019.0,,,Reality-TV,
1506472,tt10970874,tvEpisode,Die Bauhaus-Stadt Tel Aviv - Vorbild für die Metropolen der Moderne?\tDie Bauhaus-Stadt Tel Aviv - Vorbild für die Metropolen der Moderne?,0,2019.0,,,Talk-Show,
1893777,tt11670006,tvEpisode,...ein angenehmer Unbequemer...\t...ein angenehmer Unbequemer...,0,1981.0,,,Documentary,
2004597,tt11868642,tvEpisode,GGN Heavyweight Championship Lungs With Mike Tyson and Snoop\tGGN Heavyweight Championship Lungs With Mike Tyson and Snoop,0,2020.0,,,Talk-Show,
2158217,tt12149332,tvEpisode,Jeopardy! College Championship Semifinal Game 3\tJeopardy! College Championship Semifinal Game 3,0,2020.0,,,Game-Show,
...,...,...,...,...,...,...,...,...,...
7991997,tt33052217,tvEpisode,Track of the Vampire'\tTrack of the Vampire',0,1986.0,,,"Fantasy,Horror,Mystery",
7992037,tt33052263,tvEpisode,The Clone Master\tThe Clone Master,0,1986.0,,,"Fantasy,Horror,Mystery",
8289142,tt3984412,tvEpisode,"I'm Not Going to Come Last, I'm Just Going to Die on The Amazing Race\tI'm Not Going to Come Last, I'm Just Going to Die on The Amazing Race",0,2014.0,,,"Game-Show,Reality-TV",
10919145,tt9822816,tvEpisode,Zwischen Vertuschung und Aufklärung - Missbrauchsgipfel im Vatikan\tZwischen Vertuschung und Aufklärung - Missbrauchsgipfel im Vatikan,0,2019.0,,,Talk-Show,


Looks like this is the same issue as before, where the `\t` seperators were intepreted as literal string.

We will check if all these rows contain a `\t` character, and if every row with value of `isAdult` not equal to 0 or 1 have missing `startYear` values

In [19]:
# Check if every row in basics_isAdult_filter contains `\t` in the primaryTitle
all(basics_isAdult_filter['primaryTitle'].str.contains('\t'))

True

In [20]:
basics_isAdult_filter['originalTitle'].value_counts()

Unnamed: 0_level_0,count
originalTitle,Unnamed: 1_level_1
0,627
1,1


In [21]:
basics_isAdult_filter['startYear'].unique()

array([nan])

We conclude that for every row where `isAdult` is neither 0 nor 1:

- Every value in `primaryTitle` contains '\t', which should be the `Tab` separator
- Every value in `originalTitle` is either 0 or 1, which should belong in the `isAdult` column
- Every value in `startYear` is missing, which is currently in the `isAdult` column

We will fix these rows by:
- Splitting all titles in `primaryTitle` by the `\t` character into `primaryTitle` and `originalTitle`
- Moving all current values in `originalTitle`, which are 0 or 1, to the `isAdult` field
- Moving all current values in `originalTitle` (e.g 1985, 1986) to the `startYear` field

In [22]:
# pd.set_option('mode.chained_assignment', None)
basics_isAdult_filter['startYear'] = basics_isAdult_filter['isAdult']
basics_isAdult_filter['isAdult'] = basics_isAdult_filter['originalTitle']
basics_isAdult_filter[['primaryTitle', 'originalTitle']] = basics_isAdult_filter['primaryTitle'].str.split('\t', expand=True)
basics_isAdult_filter

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  basics_isAdult_filter['startYear'] = basics_isAdult_filter['isAdult']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  basics_isAdult_filter['isAdult'] = basics_isAdult_filter['originalTitle']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  basics_isAdult_filter[['primaryTitle', 'originalTitle']] = ba

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
1096866,tt10233364,tvEpisode,Rolling in the Deep Dish,Rolling in the Deep Dish,0,2019.0,,Reality-TV,
1506472,tt10970874,tvEpisode,Die Bauhaus-Stadt Tel Aviv - Vorbild für die Metropolen der Moderne?,Die Bauhaus-Stadt Tel Aviv - Vorbild für die Metropolen der Moderne?,0,2019.0,,Talk-Show,
1893777,tt11670006,tvEpisode,...ein angenehmer Unbequemer...,...ein angenehmer Unbequemer...,0,1981.0,,Documentary,
2004597,tt11868642,tvEpisode,GGN Heavyweight Championship Lungs With Mike Tyson and Snoop,GGN Heavyweight Championship Lungs With Mike Tyson and Snoop,0,2020.0,,Talk-Show,
2158217,tt12149332,tvEpisode,Jeopardy! College Championship Semifinal Game 3,Jeopardy! College Championship Semifinal Game 3,0,2020.0,,Game-Show,
...,...,...,...,...,...,...,...,...,...
7991997,tt33052217,tvEpisode,Track of the Vampire',Track of the Vampire',0,1986.0,,"Fantasy,Horror,Mystery",
7992037,tt33052263,tvEpisode,The Clone Master,The Clone Master,0,1986.0,,"Fantasy,Horror,Mystery",
8289142,tt3984412,tvEpisode,"I'm Not Going to Come Last, I'm Just Going to Die on The Amazing Race","I'm Not Going to Come Last, I'm Just Going to Die on The Amazing Race",0,2014.0,,"Game-Show,Reality-TV",
10919145,tt9822816,tvEpisode,Zwischen Vertuschung und Aufklärung - Missbrauchsgipfel im Vatikan,Zwischen Vertuschung und Aufklärung - Missbrauchsgipfel im Vatikan,0,2019.0,,Talk-Show,


In [23]:
# Replace the rows in the original DataFrame
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

basics.loc[basics_isAdult_filter.index] = basics_isAdult_filter
basics.head(3)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0.0,1894.0,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0.0,1892.0,,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0.0,1892.0,,5,"Action,Adventure,Animation"


In [24]:
basics['isAdult'] = pd.to_numeric(basics['isAdult'])

In [25]:
basics.isna().sum()

Unnamed: 0,0
tconst,0
titleType,0
primaryTitle,0
originalTitle,0
isAdult,0
startYear,1413851
endYear,10835189
runtimeMinutes,7546155
genres,485606


There are still missing values in `startYear`, `endYear`, `runtimeMinute`, and `genres`. However, these fields are not mandatory and we can safely ignore these missing values.

Quick look at these rows with missing values to ensure they are good data

In [26]:
basics[basics.startYear.isna()]

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
619,tt0000624,short,The Ugly Duckling,The Ugly Duckling,0.0,,,,Short
65748,tt0067098,tvEpisode,Willi Forst,Willi Forst,0.0,,,55,
83793,tt0085677,tvEpisode,High Country,High Country,0.0,,,,Sport
90916,tt0092975,tvEpisode,Erste Liebe und Heiratssachen,Erste Liebe und Heiratssachen,0.0,,,,Documentary
90962,tt0093025,movie,Tales of the Brothers Quay,The Films of the Brothers Quay,0.0,,,78,"Animation,Documentary"
...,...,...,...,...,...,...,...,...,...
10962336,tt9916616,short,Terror,Terror,0.0,,,13,"Drama,Short"
10962338,tt9916620,movie,The Copeland Case,The Copeland Case,0.0,,,,Drama
10962352,tt9916652,short,Untitled well- being Documentary,Untitled well- being Documentary,0.0,,,,Short
10962358,tt9916664,short,Untitled Land Army Girls Documentary,Untitled Land Army Girls Documentary,0.0,,,,Short


Now that all values in `primaryTitle`, `originalTitle`, `isAdult` fields have been reassigned and moved to their corresponding field, we will filter out all adult movies, which have `isAdult` equal to 1

In [27]:
# Remove all rows with 'isAdult' = 1
basics = basics[basics['isAdult'] == 0]
basics['isAdult'].value_counts()

Unnamed: 0_level_0,count
isAdult,Unnamed: 1_level_1
0.0,10610432


Finally, we reformat the `genres` column by adding a whitespace `\s` after every comma `,`

In [28]:
basics['genres'] = basics['genres'].str.replace(r',', r', ')
basics['genres'].head(3)

Unnamed: 0,genres
0,"Documentary, Short"
1,"Animation, Short"
2,"Action, Adventure, Animation"


### Data Cleaning for `episode` Dataset



In [29]:
episode.dtypes

Unnamed: 0,0
tconst,object
parentTconst,object
seasonNumber,float64
episodeNumber,float64


In [30]:
episode.isna().sum()

Unnamed: 0,0
tconst,0
parentTconst,0
seasonNumber,1702596
episodeNumber,1702596


Remove all rows with missing `seasonNumber` or `episodeNumber`

In [49]:
episode.dropna(inplace=True)
episode.isna().sum()

Unnamed: 0,0
tconst,0
parentTconst,0
seasonNumber,0
episodeNumber,0


In [50]:
episode['seasonNumber'] = episode['seasonNumber'].astype(int)
episode['episodeNumber'] = episode['episodeNumber'].astype(int)
episode.dtypes

Unnamed: 0,0
tconst,object
parentTconst,object
seasonNumber,int64
episodeNumber,int64


### Data Cleaning for `ratings` Dataset

In [31]:
ratings.dtypes

Unnamed: 0,0
tconst,object
averageRating,float64
numVotes,int64


In [32]:
ratings.isna().sum()

Unnamed: 0,0
tconst,0
averageRating,0
numVotes,0


## Data Merging for Episodes
Join all 3 tables on `basics.tconst = episode.tconst = ratings.tconst`



In [51]:
# prompt: inner join episode['tconst'] on ratings['tconst'] and basics['tconst'] into 1 table

# Merge ratings and episode on 'tconst'
ratings_episode = pd.merge(ratings, episode, on='tconst', how='inner')

# Merge the result with basics on 'tconst'
episode_all = pd.merge(ratings_episode, basics, on='tconst', how='inner')

In [52]:
print(episode_all['tconst'].is_unique)
print(len(episode_all))
episode_all.head(5)

True
717562


Unnamed: 0,tconst,averageRating,numVotes,parentTconst,seasonNumber,episodeNumber,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0041951,7.6,92,tt0041038,1,9,tvEpisode,The Tenderfeet,The Tenderfeet,0.0,1949.0,,30,Western
1,tt0042816,7.6,12,tt0989125,1,17,tvEpisode,Othello,Othello,0.0,1950.0,,135,Drama
2,tt0044093,4.6,19,tt0959862,1,6,tvEpisode,The Three Musketeers,The Three Musketeers,0.0,1950.0,,60,Drama
3,tt0045960,6.9,195,tt0044284,2,3,tvEpisode,King Lear,King Lear,0.0,1953.0,,75,"Drama, History, Music"
4,tt0046855,5.9,166,tt0046643,1,4,tvEpisode,A Christmas Carol,A Christmas Carol,0.0,1954.0,,60,"Adventure, Drama, Family"


Checking if all `parentTconst` are in `basics.tconst` and `ratings.tconst`

In [53]:
# Check if all episode_all['parentTconst'] is in both basics['tconst'] and ratings['tconst']

# Check if all episode_all['parentTconst'] are in basics['tconst']
tmp = all(episode_all['parentTconst'].isin(basics['tconst']))
print("All 'parentTconst' in basics['tconst']: ", tmp)

# Check if all episode_all['parentTconst'] are in ratings['tconst']
tmp = all(episode_all['parentTconst'].isin(ratings['tconst']))
print("All 'parentTconst' in ratings['tconst']: ", tmp)


All 'parentTconst' in basics['tconst']:  True
All 'parentTconst' in ratings['tconst']:  False


In [54]:
# Only keep episodes with the whole series ratings
# Filter all_data to keep only rows where 'parentTconst' is in ratings['tconst']

episode_all = episode_all[episode_all['parentTconst'].isin(ratings['tconst'])]
print(len(episode_all))
episode_all.head()

716664


Unnamed: 0,tconst,averageRating,numVotes,parentTconst,seasonNumber,episodeNumber,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0041951,7.6,92,tt0041038,1,9,tvEpisode,The Tenderfeet,The Tenderfeet,0.0,1949.0,,30,Western
1,tt0042816,7.6,12,tt0989125,1,17,tvEpisode,Othello,Othello,0.0,1950.0,,135,Drama
2,tt0044093,4.6,19,tt0959862,1,6,tvEpisode,The Three Musketeers,The Three Musketeers,0.0,1950.0,,60,Drama
3,tt0045960,6.9,195,tt0044284,2,3,tvEpisode,King Lear,King Lear,0.0,1953.0,,75,"Drama, History, Music"
4,tt0046855,5.9,166,tt0046643,1,4,tvEpisode,A Christmas Carol,A Christmas Carol,0.0,1954.0,,60,"Adventure, Drama, Family"


Recheck

In [55]:
# Double check that all `episode_all['parentTconst']` is in both basics['tconst'] and ratings ['tconst']
tmp = all(episode_all['parentTconst'].isin(basics['tconst']))
print("All episode['parentTconst'] in basics['tconst']: ", tmp)

# Check if all episode['parentTconst'] are in ratings['tconst']
tmp = all(episode_all['parentTconst'].isin(ratings['tconst']))
print("All episode['parentTconst'] in ratings['tconst']: ", tmp)


All episode['parentTconst'] in basics['tconst']:  True
All episode['parentTconst'] in ratings['tconst']:  True


Add a new column to `episode_all` containing the episode no., szn no., and primary title

In [61]:
episode_all['eSTitle'] = episode_all['seasonNumber'].astype(str) + '.' + episode_all['episodeNumber'].astype(str) + ' - ' + episode_all['primaryTitle']
episode_all.head()

Unnamed: 0,tconst,averageRating,numVotes,parentTconst,seasonNumber,episodeNumber,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,eSTitle
0,tt0041951,7.6,92,tt0041038,1,9,tvEpisode,The Tenderfeet,The Tenderfeet,0.0,1949.0,,30,Western,1.9 - The Tenderfeet
1,tt0042816,7.6,12,tt0989125,1,17,tvEpisode,Othello,Othello,0.0,1950.0,,135,Drama,1.17 - Othello
2,tt0044093,4.6,19,tt0959862,1,6,tvEpisode,The Three Musketeers,The Three Musketeers,0.0,1950.0,,60,Drama,1.6 - The Three Musketeers
3,tt0045960,6.9,195,tt0044284,2,3,tvEpisode,King Lear,King Lear,0.0,1953.0,,75,"Drama, History, Music",2.3 - King Lear
4,tt0046855,5.9,166,tt0046643,1,4,tvEpisode,A Christmas Carol,A Christmas Carol,0.0,1954.0,,60,"Adventure, Drama, Family",1.4 - A Christmas Carol


## Data Merging for Series

In [62]:
# Get distinct values in episode_all['parentTconst']
distinct_parentTconst = episode_all['parentTconst'].unique()

# Filter basics and ratings based on distinct_parentTconst
basics_parentTconst = basics[basics['tconst'].isin(distinct_parentTconst)]
ratings_parentTconst = ratings[ratings['tconst'].isin(distinct_parentTconst)]

# Inner join filtered_basics and filtered_ratings on 'tconst'
series_all = pd.merge(basics_parentTconst, ratings_parentTconst, on='tconst', how='inner')
series_all

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes
0,tt0039123,tvSeries,Kraft Theatre,Kraft Television Theatre,0.0,1947.0,1958.0,60,Drama,8.0,224
1,tt0039125,tvSeries,Public Prosecutor,Public Prosecutor,0.0,1947.0,1951.0,20,"Crime, Drama, Mystery",5.9,35
2,tt0040021,tvSeries,Actor's Studio,Actor's Studio,0.0,1948.0,1950.0,30,Drama,6.9,93
3,tt0040041,tvSeries,The Milton Berle Show,Texaco Star Theatre Starring Milton Berle,0.0,1948.0,1956.0,60,"Comedy, Family",7.6,167
4,tt0040048,tvSeries,Perry Como's Kraft Music Hall,The Perry Como Show,0.0,1948.0,1967.0,60,Music,7.6,122
...,...,...,...,...,...,...,...,...,...,...,...
37041,tt9914546,tvSeries,Moja Generacija Z,Moja Generacija Z,0.0,2019.0,,30,"Comedy, Drama, Family",4.2,196
37042,tt9914700,tvSeries,Aunty Donna: Best Content Ever!!1!,Aunty Donna: Best Content Ever!!1!,0.0,2017.0,2017.0,5,Comedy,8.2,27
37043,tt9915144,tvSeries,Never Ever,Never Ever,0.0,2018.0,,,"Biography, Documentary, History",9.0,91
37044,tt9916128,tvSeries,Salt City (India),Salt City (India),0.0,2022.0,,,Drama,6.0,129


In [63]:
# Check that the sets of value in episode_all['parentTconst'] and data_parentTconst['tconst'] are the same
set1 = set(episode_all['parentTconst'])
set2 = set(series_all['tconst'])

print(set1 == set2)


True


# Export data

In [64]:
episode_all.to_csv('episode_all.csv', index=False)
series_all.to_csv('series_all.csv', index=False)