# Movies Database Search

by Israel Diaz

### Load Libraries

In [1]:
## General Libraries
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter('ignore')

### Loading DataFiles
<h1><strong><em>Jump to the "Start here" section below if you have already filtered and saved the data</em></strong></h1>

In [2]:
%%time
## IMDb dataset locations (gzip-compressed, tab-separated files)
title_basics = 'https://datasets.imdbws.com/title.basics.tsv.gz'
title_aka = 'https://datasets.imdbws.com/title.akas.tsv.gz'
title_ratings = 'https://datasets.imdbws.com/title.ratings.tsv.gz'

## loading urls into pandas dataframes
## low_memory must be the boolean False: the string 'False' is truthy, so it
## silently left chunked parsing enabled (which can yield mixed dtypes in a column).
basics = pd.read_csv(title_basics, sep='\t', low_memory=False)
aka = pd.read_csv(title_aka, sep='\t', low_memory=False)
ratings = pd.read_csv(title_ratings, sep='\t', low_memory=False)

CPU times: total: 50 s
Wall time: 7min 38s


In [3]:
## For each raw table: print its row count, then show the first five rows
for label, frame in (('basics = ', basics), ('\naka = ', aka), ('\nratings = ', ratings)):
    print(label, 'len:', len(frame))
    display(frame.head())

basics =  len: 9634768


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"



aka =  len: 35026517


Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0



ratings =  len: 1282624


Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1953
1,tt0000002,5.8,263
2,tt0000003,6.5,1787
3,tt0000004,5.6,179
4,tt0000005,6.2,2589


### Data Cleaning

#### Replacing \N with NaN

The data dictionary reports that the files represent null values as \N. Left as-is, these would be treated as ordinary strings, so they must be replaced with NaN values.

In [4]:
## Swap every '\N' placeholder for NaN, modifying each table in place
for frame in (basics, aka, ratings):
    frame.replace({'\\N': np.nan}, inplace=True)

In [5]:
## Print each table's name followed by a preview of its first five rows
for label, frame in (('basics', basics), ('\naka', aka), ('\nratings', ratings)):
    print(label)
    display(frame.head())

basics


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"



aka


Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,,imdbDisplay,,0
1,tt0000001,2,Carmencita,DE,,,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,,imdbDisplay,,0
3,tt0000001,4,Καρμενσίτα,GR,,imdbDisplay,,0
4,tt0000001,5,Карменсита,RU,,imdbDisplay,,0



ratings


Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1953
1,tt0000002,5.8,263
2,tt0000003,6.5,1787
3,tt0000004,5.6,179
4,tt0000005,6.2,2589


#### Dropping rows where runtimeMinutes or genres is null

In [6]:
## Remove (in place) every row that is missing runtimeMinutes or genres
required_columns = ['runtimeMinutes', 'genres']
basics.dropna(axis=0, subset=required_columns, inplace=True)

#### Keeping only titleType = movie

In [7]:
## Keep only the rows whose titleType is exactly 'movie'
basics = basics.loc[basics['titleType'].eq('movie')]

#### Keeping startYear between 2000-2022

In [8]:
## Compare years numerically instead of as strings: string comparison is
## lexicographic (only correct while every value is a 4-digit string) and
## raises a TypeError if the column holds a mix of str and int values.
start_year = pd.to_numeric(basics['startYear'], errors='coerce')
## Keep 2000 through 2022 inclusive; missing or unparseable years become NaN
## and are excluded, as they were by the original comparison.
basics = basics[start_year.between(2000, 2022)]

#### Dropping movies whose genres include Documentary

In [9]:
## Boolean mask: True where the genres text mentions "documentary" (any case)
is_documentary = basics['genres'].str.contains('documentary', case=False)
## Retain only the rows that are NOT flagged as documentaries
basics = basics.loc[~is_documentary]
## Preview the first ten remaining rows
basics.head(10)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34803,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
61116,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,,70,Drama
67669,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
77964,tt0079644,movie,November 1828,November 1828,0,2001,,140,"Drama,War"
86801,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
87114,tt0089067,movie,El día de los albañiles 2,El día de los albañiles 2,0,2001,,90,Comedy
90917,tt0092960,movie,En tres y dos,En tres y dos,0,2004,,102,Drama
93938,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002,,126,Drama
98043,tt0100275,movie,The Wandering Soap Opera,La telenovela errante,0,2017,,80,"Comedy,Drama,Fantasy"
100076,tt0102362,movie,Istota,Istota,0,2000,,80,"Drama,Romance"


#### Keeping only US movies

In [10]:
## Keep only the alternate-title rows whose region is 'US'
aka = aka.loc[aka['region'].eq('US')]

Applying the US filter to the other datasets (basics and ratings)

In [11]:
# Boolean mask over basics: True where the tconst appears among the (US-only) aka titleIds
us_movies = basics['tconst'].isin(aka['titleId'])
# Show the first ten mask values
us_movies.iloc[:10]

34803      True
61116      True
67669      True
77964     False
86801      True
87114     False
90917     False
93938      True
98043      True
100076    False
Name: tconst, dtype: bool

In [12]:
# Keep only the basics rows selected by the US mask
basics = basics.loc[us_movies]

In [13]:
## Boolean mask over ratings: True where the tconst appears among the (US-only) aka titleIds
us_ratings = ratings['tconst'].isin(aka['titleId'])
## Show the first ten mask values
us_ratings.iloc[:10]

0     True
1     True
2    False
3    False
4     True
5     True
6     True
7     True
8     True
9     True
Name: tconst, dtype: bool

In [14]:
## Keep only the ratings rows selected by the US mask
ratings = ratings.loc[us_ratings]

## Saving DataFrames

In [15]:
import os

## Create the output folder if it is missing, then list what it already contains
data_dir = 'data/'
os.makedirs(data_dir, exist_ok=True)
os.listdir(data_dir)

['basics.csv.gz', 'aka.csv.gz', 'ratings.csv.gz']

In [16]:
## Write each filtered table to its own gzip-compressed CSV, without the index column
for frame, path in (
    (basics, "data/basics.csv.gz"),
    (aka, "data/aka.csv.gz"),
    (ratings, "data/ratings.csv.gz"),
):
    frame.to_csv(path, compression='gzip', index=False)

<h1>Start here if you have the files in data folder</h1>

#### Loading data

In [17]:
## Read the three saved tables back from the data folder, each parsed in a single pass
basics, aka, ratings = (
    pd.read_csv(path, low_memory=False)
    for path in ('data/basics.csv.gz', 'data/aka.csv.gz', 'data/ratings.csv.gz')
)
## Preview the reloaded movies table
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
4,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002,,126,Drama


In [18]:
## Showing Info
## DataFrame.info() prints its report and returns None, so it is called
## directly; wrapping it in display() only rendered a stray "None".
print('MOVIES DATA:')
basics.info()

MOVIES DATA:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85575 entries, 0 to 85574
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          85575 non-null  object 
 1   titleType       85575 non-null  object 
 2   primaryTitle    85575 non-null  object 
 3   originalTitle   85575 non-null  object 
 4   isAdult         85575 non-null  int64  
 5   startYear       85575 non-null  int64  
 6   endYear         0 non-null      float64
 7   runtimeMinutes  85575 non-null  int64  
 8   genres          85575 non-null  object 
dtypes: float64(1), int64(3), object(5)
memory usage: 5.9+ MB


None

In [19]:
## DataFrame.info() prints its report and returns None, so it is called
## directly; wrapping it in display() only rendered a stray "None".
print('AKA DATA:')
aka.info()

AKA DATA:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1416568 entries, 0 to 1416567
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   titleId          1416568 non-null  object 
 1   ordering         1416568 non-null  int64  
 2   title            1416568 non-null  object 
 3   region           1416568 non-null  object 
 4   language         3833 non-null     object 
 5   types            974118 non-null   object 
 6   attributes       46043 non-null    object 
 7   isOriginalTitle  1415223 non-null  float64
dtypes: float64(1), int64(1), object(6)
memory usage: 86.5+ MB


None

In [20]:
## DataFrame.info() prints its report and returns None, so it is called
## directly; wrapping it in display() only rendered a stray "None".
print('RATINGS DATA:')
ratings.info()

RATINGS DATA:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 489933 entries, 0 to 489932
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   tconst         489933 non-null  object 
 1   averageRating  489933 non-null  float64
 2   numVotes       489933 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 11.2+ MB


None