# IMDB Business-Problem

In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Loading Data

## Basics Dataframe

In [2]:
# Pull the title.basics dump straight from IMDb's public dataset URL.
# read_table is read_csv with a tab separator as its default.
basics_url = "https://datasets.imdbws.com/title.basics.tsv.gz"
basics = pd.read_table(basics_url, low_memory=False)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


## Ratings Dataframe

In [3]:
# Pull the title.ratings dump from IMDb's public dataset URL.
# read_table is read_csv with a tab separator as its default.
ratings_url = "https://datasets.imdbws.com/title.ratings.tsv.gz"
ratings = pd.read_table(ratings_url, low_memory=False)

## AKAs Dataframe

In [4]:
# Pull the title.akas dump (alternative titles per region) from IMDb's public
# dataset URL. read_table is read_csv with a tab separator as its default.
# NOTE(review): pandas' default NA parsing is active here, so some literal
# strings are already read as NaN before '\N' is handled (the missing-value
# report further down shows NaN regions/titles) - confirm this is intended.
akas_url = "https://datasets.imdbws.com/title.akas.tsv.gz"
akas = pd.read_table(akas_url, low_memory=False)
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


In [5]:
# Cache the raw AKAs table locally as a gzip-compressed CSV (index not written),
# then reload it from that cache so the following cells work from the on-disk copy.
# to_csv does not create missing parent directories, so make sure Data/ exists
# first; without this the cell fails on a fresh checkout / Restart & Run All.
os.makedirs("Data", exist_ok=True)
akas.to_csv("Data/title_akas.csv.gz", compression='gzip', index=False)
akas = pd.read_csv("Data/title_akas.csv.gz", low_memory=False)
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


# Exploring Data

## Missing Values


In [6]:
def missing_value_report(df):
    """Summarise missing values per column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    tuple
        ``(total, percent, table)``: ``total`` is the per-column NaN count,
        ``percent`` is that count divided by the number of rows (a fraction,
        not multiplied by 100), each sorted in descending order, and ``table``
        joins the two side by side under the labels 'Total' and 'Percentage'.
    """
    null_mask = df.isnull()  # computed once and reused for both statistics
    total = null_mask.sum().sort_values(ascending=False)
    percent = (null_mask.sum() / null_mask.count()).sort_values(ascending=False)
    table = pd.concat([total, percent], axis=1, keys=['Total', 'Percentage'])
    return total, percent, table


# NOTE(review): this report runs before the '\N' placeholders are replaced with
# NaN (that happens in later cells), so only values pandas already parsed as NaN
# are counted here; columns full of '\N' strings report zero missing values.

# Fully duplicated rows in each table.
print(f'There are {basics.duplicated().sum()} duplicated values is the Basic Dataset\n')
print(f'There are {ratings.duplicated().sum()} duplicated values is the Ratings Dataset\n')
print(f'There are {akas.duplicated().sum()} duplicated values is the AKAs Dataset\n')

# Basic Dataset Missing Values
print('===========================================\n\n Basic Dataset Missing Values:\n')
basic_total, basic_percent, basic_missing_data = missing_value_report(basics)
print(basic_missing_data)

# Ratings Dataset Missing Values
print('===========================================\n\n Ratings Dataset Missing Values:\n')
ratings_total, ratings_percent, ratings_missing_data = missing_value_report(ratings)
print(ratings_missing_data)

# AKAs Dataset Missing Values
print('===========================================\n\n AKAs Dataset Missing Values:\n')
akas_total, akas_percent, akas_missing_data = missing_value_report(akas)
print(akas_missing_data)

There are 0 duplicated values is the Basic Dataset

There are 0 duplicated values is the Ratings Dataset

There are 0 duplicated values is the AKAs Dataset


 Basic Dataset Missing Values:

                Total  Percentage
primaryTitle       11    0.000001
originalTitle      11    0.000001
genres             10    0.000001
tconst              0    0.000000
titleType           0    0.000000
isAdult             0    0.000000
startYear           0    0.000000
endYear             0    0.000000
runtimeMinutes      0    0.000000

 Ratings Dataset Missing Values:

               Total  Percentage
tconst             0         0.0
averageRating      0         0.0
numVotes           0         0.0

 AKAs Dataset Missing Values:

                 Total    Percentage
region             103  3.159215e-06
title                5  1.533600e-07
titleId              0  0.000000e+00
ordering             0  0.000000e+00
language             0  0.000000e+00
types                0  0.000000e+00
attributes  

### Exploring Basics Missing Values

In [7]:
# Convert IMDb's '\N' placeholder into real NaN, then drop incomplete rows.
# dropna(inplace=True) returns None, so assigning its result left clean_basics
# bound to None; build the cleaned frame without inplace so the name holds data.
# `basics` is rebound to the cleaned frame as well because the later filtering
# cells keep operating on `basics`.
# NOTE(review): dropna() without `subset` discards a row if ANY column is NaN,
# including endYear, which is '\N' for every row in the preview above; the
# frames shown after filtering contain only tvSeries rows. Confirm whether a
# narrower `subset=[...]` was intended.
basics = basics.replace({'\\N': np.nan}).dropna()
clean_basics = basics

### Exploring AKAs Missing Values

In [8]:
# Convert IMDb's '\N' placeholder into real NaN, then drop incomplete rows.
# dropna(inplace=True) returns None, so assigning its result left clean_akas
# bound to None; build the cleaned frame without inplace so the name holds data.
# `akas` is rebound to the cleaned frame as well because later cells keep
# using `akas` for filtering and saving.
# NOTE(review): dropna() without `subset` discards a row if ANY column is NaN
# (language, types and attributes are '\N' in several preview rows above) -
# confirm whether a narrower `subset=[...]` was intended.
akas = akas.replace({'\\N': np.nan}).dropna()
clean_akas = akas

## Filtering

In [9]:
# Exclude movies that are included in the documentary category.
# na=False treats a missing genres value as "not a documentary", so the mask is
# strictly boolean and can be negated with ~ even when NaN genres are present
# (without it the mask is object-dtype with NaN and ~ raises a TypeError).
# regex=False because 'documentary' is a plain substring, not a pattern.
is_documentary = basics['genres'].str.contains('documentary', case=False, na=False, regex=False)
basics = basics[~is_documentary]

In [10]:
# Build a boolean mask over basics: True where the row's tconst also occurs as a
# titleId in the akas frame (intended as the "US only" filter for basics).
# NOTE(review): no region == 'US' filter on akas is visible in this part of the
# notebook - confirm akas is already restricted to US rows at this point.
akas_title_ids = akas['titleId']
keepers = basics['tconst'].isin(akas_title_ids)
keepers

25060      False
37589      False
38423      False
38424      False
38426      False
           ...  
9086358    False
9086391    False
9086492    False
9086698    False
9086878    False
Name: tconst, Length: 37662, dtype: bool

In [11]:
# Keep only the rows flagged True in the `keepers` mask and display the result.
basics = basics.loc[keepers]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
45577,tt0046415,tvSeries,The Count of Monte Cristo,Le comte de Monte-Cristo,0,1954,1954,183,"Adventure,Drama,Romance"
69499,tt0071010,tvSeries,The Manhunter,The Manhunter,0,1974,1975,60,Drama
2543361,tt1286039,tvSeries,Stargate Universe,SG.U Stargate Universe,0,2009,2011,43,"Drama,Sci-Fi"
4710451,tt1772752,tvSeries,A.N.T. Farm,A.N.T. Farm,0,2011,2014,30,"Comedy,Drama,Family"
4794041,tt1826071,tvSeries,Adini Feriha Koydum,Adini Feriha Koydum,0,2011,2012,120,"Drama,Romance"
5576072,tt2224968,tvSeries,Jang Geum's Dream,Jang Geum ieui Kkum,0,2005,2007,30,"Adventure,Animation,Comedy"
6268231,tt3672132,tvSeries,Anali Ogullu,Anali Ogullu,0,2014,2014,80,Comedy
6801492,tt4875520,tvSeries,Iliski Durumu: Karisik,Iliski Durumu: Karisik,0,2015,2016,120,"Comedy,Romance"
8349591,tt8315348,tvSeries,4N1K,4N1K,0,2018,2019,100,"Comedy,Romance"
8384082,tt8390060,tvSeries,Her sey yolunda merkez,Her sey yolunda merkez,0,2013,2013,90,Comedy


# Saving Compressed Files


In [12]:
# Write the filtered basics table to a gzip-compressed CSV (index not written),
# then read it back to confirm the saved file loads.
basics_path = "Data/title_basics.csv.gz"
basics.to_csv(basics_path, compression='gzip', index=False)
basics = pd.read_csv(basics_path, low_memory=False)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0046415,tvSeries,The Count of Monte Cristo,Le comte de Monte-Cristo,0,1954,1954,183,"Adventure,Drama,Romance"
1,tt0071010,tvSeries,The Manhunter,The Manhunter,0,1974,1975,60,Drama
2,tt1286039,tvSeries,Stargate Universe,SG.U Stargate Universe,0,2009,2011,43,"Drama,Sci-Fi"
3,tt1772752,tvSeries,A.N.T. Farm,A.N.T. Farm,0,2011,2014,30,"Comedy,Drama,Family"
4,tt1826071,tvSeries,Adini Feriha Koydum,Adini Feriha Koydum,0,2011,2012,120,"Drama,Romance"


In [13]:
# Write the ratings table to a gzip-compressed CSV (index not written),
# then read it back to confirm the saved file loads.
ratings_path = "Data/title_ratings.csv.gz"
ratings.to_csv(ratings_path, compression='gzip', index=False)
ratings = pd.read_csv(ratings_path, low_memory=False)
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1898
1,tt0000002,5.9,254
2,tt0000003,6.5,1692
3,tt0000004,5.7,166
4,tt0000005,6.2,2508


In [14]:
# Overwrite the cached AKAs file with the current (cleaned) akas frame as a
# gzip-compressed CSV (index not written), then read it back to confirm it loads.
akas_path = "Data/title_akas.csv.gz"
akas.to_csv(akas_path, compression='gzip', index=False)
akas = pd.read_csv(akas_path, low_memory=False)
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0022542,1,Di shtime fun Yisroel,US,yi,alternative,YIVO translation,0
1,tt0024265,4,Geleb un gelakht,US,yi,alternative,modern translation,0
2,tt0024751,9,Avram Ovenu,US,yi,alternative,YIVO translation,0
3,tt0026010,3,Der yidishe Kenigen Lir,US,yi,alternative,YIVO translation,0
4,tt0027911,1,Libe un Laydnshaft,US,yi,alternative,modern translation,0
