In [24]:
import pandas as pd
import numpy as np

### Open dataset as pandas DataFrame

In [25]:
# Load the tab-delimited 2019 IMDb extract into a DataFrame.
imdb = pd.read_csv(
    "imdb_2019.tsv",
    delimiter="\t",
)

In [26]:
# Preview the first five rows of the dataset.
imdb.head(n=5)

Unnamed: 0,index,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,87501.0,tt0089435,short,Kokoa,Kokoa,0.0,2019.0,,13.0,
1,89512.0,tt0091490,short,Martina's Playhouse,Martina's Playhouse,0.0,2019.0,,20.0,
2,114407.0,tt0116991,movie,Mariette in Ecstasy,Mariette in Ecstasy,0.0,2019.0,,,
3,126556.0,tt0129960,tvMovie,Eine geschlossene Gesellschaft,Eine geschlossene Gesellschaft,0.0,2019.0,,,
4,166388.0,tt0172112,short,Ambulans,Ambulans,0.0,2019.0,,11.0,


### Unique types of titles

In [27]:
# Distinct values found in the titleType column, in order of first appearance.
imdb.loc[:, "titleType"].unique()

array(['short', 'movie', 'tvMovie', 'video', 'tvSeries', 'tvEpisode',
       'tvMiniSeries', 'tvSpecial', 'videoGame', 'tvShort'], dtype=object)

In [28]:
# Approach 1: unpack the column into a Python set and count its members.
len({*imdb["titleType"]})

10

In [29]:
# Approach 2: size of the array returned by Series.unique().
imdb["titleType"].unique().size

10

In [30]:
# Approach 3: number of rows in the value_counts() frequency table.
# value_counts() drops missing values by default, whereas unique() and set()
# keep them; dropna=False keeps this count comparable with those approaches.
val_counts = imdb["titleType"].value_counts(dropna=False)
print(len(val_counts))

10


### Slice `imdb` dataframe to return only the columns `titleType`, `primaryTitle`, `startYear`, and `runtimeMinutes`

In [31]:
# Keep every row but only the four requested columns.
imdb.loc[
    :,
    ["titleType", "primaryTitle", "startYear", "runtimeMinutes"],
]

Unnamed: 0,titleType,primaryTitle,startYear,runtimeMinutes
0,short,Kokoa,2019.0,13.0
1,short,Martina's Playhouse,2019.0,20.0
2,movie,Mariette in Ecstasy,2019.0,
3,tvMovie,Eine geschlossene Gesellschaft,2019.0,
4,short,Ambulans,2019.0,11.0
...,...,...,...,...
301334,tvEpisode,Talent Coaching with IMOR's Bianca Desmore Mit...,2019.0,
301335,tvEpisode,Escape,2019.0,
301336,tvEpisode,Tinne Oltmans,2019.0,
301337,tvEpisode,Luc Janssens,2019.0,


### Create a subset of `imdb` named `tvEpisodes_2019` that only includes the type `tvEpisode`

In [32]:
# Option A: select the TV-episode rows with a query expression.
tvEpisodes_2019 = imdb.query(
    expr="titleType == 'tvEpisode'",
)

In [33]:
# Option B: select the same rows with a boolean mask on titleType.
tvEpisodes_2019 = imdb.loc[imdb["titleType"].eq("tvEpisode")]

### Percentage of adult titles among all 2019 releases

In [34]:
# The mean of the isAdult column, scaled by 100 to read as a percentage.
100 * imdb["isAdult"].mean()

2.9684176293144926

### Create a column containing number of words in the title

In [38]:
# Count whitespace-separated words in each primary title.
# Missing titles are replaced with "" first so they count as 0 words; calling
# str() on a missing value would give the literal "nan" and count as 1 word.
# astype(str) keeps the original coercion of any non-string title values.
imdb["words_in_title"] = (
    imdb["primaryTitle"]
    .fillna("")
    .astype(str)
    .str.split()
    .str.len()
)
imdb.columns

Index(['index', 'tconst', 'titleType', 'primaryTitle', 'originalTitle',
       'isAdult', 'startYear', 'endYear', 'runtimeMinutes', 'genres',
       'words_in_title'],
      dtype='object')

### Average `runtimeMinutes` for `short` type

In [36]:
# Mean of runtimeMinutes over the rows whose titleType is 'short'.
(
    imdb
    .query("titleType == 'short'")
    .loc[:, "runtimeMinutes"]
    .mean()
)

12.536104279390065

### Filter `imdb` to return `tvMovie` type with 3 or more words in the title, and less than 75 minutes of `runtimeMinutes`

In [45]:
# tvMovie rows with at least 3 words in the title and a runtime below 75.
# Rows with a missing runtimeMinutes fail the comparison and are excluded.
imdb.loc[
    imdb["titleType"].eq("tvMovie")
    & imdb["words_in_title"].ge(3)
    & imdb["runtimeMinutes"].lt(75)
]

Unnamed: 0,index,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,words_in_title
10,290068.0,tt0302617,tvMovie,Great Bear Rainforest,Great Bear Rainforest,0.0,2019.0,,41.0,,3
77,970092.0,tt10002188,tvMovie,"La Sagi, una pionera del Barça","La Sagi, una pionera del Barça",0.0,2019.0,,54.0,,6
655,972490.0,tt10006422,tvMovie,Warrior Women of Dahomey,Warrior Women of Dahomey,0.0,2019.0,,60.0,,4
870,974152.0,tt10009314,tvMovie,Peter Kraus: Immer in Bewegung,Peter Kraus: Immer in Bewegung,0.0,2019.0,,59.0,,5
1549,977406.0,tt10015036,tvMovie,Arts Across the Heartland,Arts Across the Heartland,0.0,2019.0,,42.0,,4
...,...,...,...,...,...,...,...,...,...,...,...
300451,6587871.0,tt9908916,tvMovie,Restless Legs Syndrome: The New Cure?,Restless Legs Syndrome: The New Cure?,0.0,2019.0,,60.0,,6
300975,6590161.0,tt9913680,tvMovie,Olympiad drömmen 2020,Olympiad drömmen 2020,0.0,2019.0,,27.0,,3
301155,6591048.0,tt9915520,tvMovie,Fascism i en svart bil,Fascism i en svart bil,0.0,2019.0,,55.0,,5
301240,6591316.0,tt9916112,tvMovie,"A Priest, a Rabbi, and a Nun walk into a...","A Priest, a Rabbi, and a Nun walk into a...",0.0,2019.0,,21.0,,10
