# Import Libraries

In [1]:
import pandas as pd
import numpy as np

# Load Data

In [2]:
title_basics = 'https://datasets.imdbws.com/title.basics.tsv.gz'
title_akas = 'https://datasets.imdbws.com/title.akas.tsv.gz'
title_ratings = 'https://datasets.imdbws.com/title.ratings.tsv.gz'

In [3]:
basics = pd.read_csv(title_basics, sep='\t', low_memory=False)
akas = pd.read_csv(title_akas, sep='\t', low_memory=False)
ratings = pd.read_csv(title_ratings, sep='\t', low_memory=False)

## Specifications - [LP URL](https://login.codingdojo.com/m/376/12528/88060)
Your stakeholder only wants you to include information for movies based on the following specifications:

- Exclude any movie with missing values for genre or runtime
- Include only full-length movies (titleType = "movie").
- Include only fictional movies (not from documentary genre)
- Include only movies that were released 2000 - 2021 (include 2000 and 2021)
- Include only movies that were released in the United States

## Deliverable
After filtering out movies that do not meet the stakeholder's specifications:

- Before saving, run a final .info() for each of the dataframes to show a summary of how many movies remain and the datatypes of each feature
- Save each file to a compressed csv file "Data/" folder inside your repository.
- Commit your changes to your repository in GitHub desktop and Publish repository / Push Changes.
- Submit the link to your repository

## Required Preprocessing - Details - [LP URL](https://login.codingdojo.com/m/376/12528/88061)
### Filtering/Cleaning Steps:

#### Title Basics:
- Replace "\N" with np.nan
- Eliminate movies that are null for runtimeMinutes
- Eliminate movies that are null for genre
- keep only titleType==Movie
- keep startYear 2000-2022
- Eliminate movies that include "Documentary" in genre (see tip below)
- Keep only US movies (Use AKAs table, see "Filtering one dataframe based on another" section below)

#### AKAs:
- keep only US movies.
- Replace "\N" with np.nan

#### Ratings:
- Replace "\N" with np.nan (if any)
- Keep only US movies (Use AKAs table, see "Filtering one dataframe based on another" section below)

In [4]:
# Display the column names of basics
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9958356 entries, 0 to 9958355
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 683.8+ MB


In [5]:
# Check the number of null values in each column
basics.isna().sum()

tconst             0
titleType          0
primaryTitle      11
originalTitle     11
isAdult            0
startYear          0
endYear            0
runtimeMinutes     0
genres            15
dtype: int64

In [22]:
# Display the number of unique values under each column
basics.nunique()

tconst            9958356
titleType              11
primaryTitle      4497205
originalTitle     4519318
isAdult                11
startYear             154
endYear                96
runtimeMinutes        892
genres               2342
dtype: int64

In [10]:
# Replace "\N" with np.nan
basics.replace({'\N' : np.nan }, inplace=True)

SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes in position 0-1: malformed \N character escape (2460163111.py, line 2)

In [6]:
# Display the rows that contain null values in the genres column
filter1 = basics['genres'].isna() == True
basics[filter1]

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
1097266,tt10233364,tvEpisode,Rolling in the Deep Dish\tRolling in the Deep ...,0,2019,\N,\N,Reality-TV,
1507601,tt10970874,tvEpisode,Die Bauhaus-Stadt Tel Aviv - Vorbild für die M...,0,2019,\N,\N,Talk-Show,
1895426,tt11670006,tvEpisode,...ein angenehmer Unbequemer...\t...ein angene...,0,1981,\N,\N,Documentary,
2006373,tt11868642,tvEpisode,GGN Heavyweight Championship Lungs With Mike T...,0,2020,\N,\N,Talk-Show,
2160471,tt12149332,tvEpisode,Jeopardy! College Championship Semifinal Game ...,0,2020,\N,\N,"Game-Show,Short",
2305590,tt12415330,tvEpisode,Anthony Davis High Brow Tank\tAnthony Davis Hi...,0,2017,\N,\N,Reality-TV,
2994318,tt13704268,tvEpisode,Bay of the Triffids/Doctor of Doom\tBay of the...,0,\N,\N,\N,"Animation,Comedy,Family",
6467980,tt27147391,tvEpisode,LATINO Accents QUIZ! w@MrHReviews @EchoBaseNet...,0,2023,\N,\N,"News,Talk-Show",
6521038,tt27404292,tvEpisode,Nord-Koreas röda prinsessa\tNord-Koreas röda p...,0,2022,\N,\N,Documentary,
6557814,tt27493617,tvEpisode,War Room Round Table: Building an AI Networkin...,0,2023,\N,\N,Talk-Show,
