# Cleaning the dataset

## Read the data

In [1]:
import pandas as pd
from collections import Counter
from IPython.display import display

In [16]:
# Load the raw CSV export into the working frame used by every later cell
df = pd.read_csv(filepath_or_buffer='../../data/raw/netflix.csv')

In [17]:
# Preview the first five raw rows
df.head(n=5)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,TV Show,3%,,"João Miguel, Bianca Comparato, Michel Gomes, R...",Brazil,"August 14, 2020",2020,TV-MA,4 Seasons,"International TV Shows, TV Dramas, TV Sci-Fi &...",In a future where the elite inhabit an island ...
1,s2,Movie,7:19,Jorge Michel Grau,"Demián Bichir, Héctor Bonilla, Oscar Serrano, ...",Mexico,"December 23, 2016",2016,TV-MA,93 min,"Dramas, International Movies",After a devastating earthquake hits Mexico Cit...
2,s3,Movie,23:59,Gilbert Chan,"Tedd Chan, Stella Chung, Henley Hii, Lawrence ...",Singapore,"December 20, 2018",2011,R,78 min,"Horror Movies, International Movies","When an army recruit is found dead, his fellow..."
3,s4,Movie,9,Shane Acker,"Elijah Wood, John C. Reilly, Jennifer Connelly...",United States,"November 16, 2017",2009,PG-13,80 min,"Action & Adventure, Independent Movies, Sci-Fi...","In a postapocalyptic world, rag-doll robots hi..."
4,s5,Movie,21,Robert Luketic,"Jim Sturgess, Kevin Spacey, Kate Bosworth, Aar...",United States,"January 1, 2020",2008,PG-13,123 min,Dramas,A brilliant group of students become card-coun...


## Get the number of shows for each country

In [4]:
# Tally how many titles list each country. A title may name several
# comma-separated countries, so every name is counted individually.
countries = df['country'].dropna()
count_countries = Counter()
for country in countries:
    # update() accumulates in place; `counter = counter + Counter(...)` rebuilt
    # the whole tally on every row. Names left empty by a leading or trailing
    # comma are skipped so they do not show up as a blank country.
    count_countries.update(
        name for name in map(str.strip, country.split(',')) if name
    )

# One row per country: the counter keys become 'Country', the counts 'Number of shows'
country_df = pd.DataFrame.from_dict(count_countries, orient='index').reset_index().rename(columns={'index': 'Country', 0: 'Number of shows'})

country_df.to_csv('../../data/processed/country_shows.csv', index=False)

## Get the list of directors and actors

In [5]:
# Tally how many titles each actor appears in. 'cast' holds a comma-separated
# list of names per title, so every name is counted individually.
actors = df['cast'].dropna()
count_actors = Counter()
for actor in actors:
    # update() accumulates in place; `counter = counter + Counter(...)` rebuilt
    # the whole tally on every row. Names left empty by a leading or trailing
    # comma are skipped so they do not show up as a blank actor.
    count_actors.update(
        name for name in map(str.strip, actor.split(',')) if name
    )

# One row per actor: the counter keys become 'Actor', the counts 'Number of shows'
actor_df = pd.DataFrame.from_dict(count_actors, orient='index').reset_index().rename(columns={'index': 'Actor', 0: 'Number of shows'})

actor_df.to_csv('../../data/processed/actor_shows.csv', index=False)

In [6]:
# Tally how many titles each director is credited on. 'director' may hold
# several comma-separated names, so every name is counted individually.
directors = df['director'].dropna()
count_directos = Counter()
for director in directors:
    # update() accumulates in place; `counter = counter + Counter(...)` rebuilt
    # the whole tally on every row. Names left empty by a leading or trailing
    # comma are skipped so they do not show up as a blank director.
    count_directos.update(
        name for name in map(str.strip, director.split(',')) if name
    )

# One row per director: the counter keys become 'Director', the counts 'Number of shows'
director_df = pd.DataFrame.from_dict(count_directos, orient='index').reset_index().rename(columns={'index': 'Director', 0: 'Number of shows'})

director_df.to_csv('../../data/processed/director_shows.csv', index=False)

## Get the list of genres

In [7]:
# Genre labels ('listed_in') are tallied separately for TV shows and movies.
# .loc selects rows and column in one step instead of chained indexing.
genres_tv = df.loc[df['type'] == 'TV Show', 'listed_in'].dropna()
genres_movie = df.loc[df['type'] == 'Movie', 'listed_in'].dropna()
count_genres_tv = Counter()
count_genres_movie = Counter()
# update() accumulates in place; `counter = counter + Counter(...)` rebuilt the
# whole tally on every row. Labels left empty by a leading or trailing comma
# are skipped so they do not show up as a blank genre.
for genre_tv in genres_tv:
    count_genres_tv.update(
        label for label in map(str.strip, genre_tv.split(',')) if label
    )
for genre_movie in genres_movie:
    count_genres_movie.update(
        label for label in map(str.strip, genre_movie.split(',')) if label
    )

In [8]:
# Turn each genre tally into a two-column table and write it out.
# index=False matches the country/actor/director exports: the positional index
# carries no information and would otherwise be written as an unnamed column.
genre_tv_df = pd.DataFrame.from_dict(count_genres_tv, orient='index').reset_index().rename(columns={'index': 'Genre', 0: 'Number of TV Shows'})
genre_tv_df.to_csv('../../data/processed/tv_shows_genres.csv', index=False)

genre_movie_df = pd.DataFrame.from_dict(count_genres_movie, orient='index').reset_index().rename(columns={'index': 'Genre', 0: 'Number of Movies'})
genre_movie_df.to_csv('../../data/processed/movies_genres.csv', index=False)


## Process the raw dataset

In [18]:
# Discard the identifier and free-text columns, keep only rows with no missing
# values, and renumber the remaining rows from zero.
df = (
    df.drop(columns=['show_id', 'description', 'title'])
      .dropna()
      .reset_index(drop=True)
)

In [19]:
# Preview the first five rows after dropping columns and incomplete rows
df.head(n=5)

Unnamed: 0,type,director,cast,country,date_added,release_year,rating,duration,listed_in
0,Movie,Jorge Michel Grau,"Demián Bichir, Héctor Bonilla, Oscar Serrano, ...",Mexico,"December 23, 2016",2016,TV-MA,93 min,"Dramas, International Movies"
1,Movie,Gilbert Chan,"Tedd Chan, Stella Chung, Henley Hii, Lawrence ...",Singapore,"December 20, 2018",2011,R,78 min,"Horror Movies, International Movies"
2,Movie,Shane Acker,"Elijah Wood, John C. Reilly, Jennifer Connelly...",United States,"November 16, 2017",2009,PG-13,80 min,"Action & Adventure, Independent Movies, Sci-Fi..."
3,Movie,Robert Luketic,"Jim Sturgess, Kevin Spacey, Kate Bosworth, Aar...",United States,"January 1, 2020",2008,PG-13,123 min,Dramas
4,TV Show,Serdar Akar,"Erdal Beşikçioğlu, Yasemin Allen, Melis Birkan...",Turkey,"July 1, 2017",2016,TV-MA,1 Season,"International TV Shows, TV Dramas, TV Mysteries"


In [20]:
# Years between a title's release and the year it was added. The rename map
# below expects an `added_delay` column and the saved outputs further down
# display it, but no remaining cell creates it, so on a fresh kernel the
# column was silently missing from the processed dataset.
# Cast to float to match the values shown in the saved outputs (0.0, 7.0, ...).
added_year = pd.to_datetime(df['date_added']).dt.year
df = df.assign(added_delay=(added_year - df['release_year']).astype(float))

# Switch from the raw snake_case headers to presentation-ready column names
df = df.rename(columns={
    'type': 'Type',
    'director': 'Director',
    'cast': 'Cast',
    'country': 'Country',
    'date_added': 'Date Added',
    'release_year': 'Release Year',
    'rating': 'Rating',
    'duration': 'Duration',
    'listed_in': 'Genre',
    'added_delay': 'Added Delay'
})

In [12]:
# Preview the first five rows with the renamed columns
df.head(n=5)

Unnamed: 0,Type,Director,Cast,Country,Date Added,Release Year,Rating,Duration,Genre,Added Delay
0,Movie,Jorge Michel Grau,"Demián Bichir, Héctor Bonilla, Oscar Serrano, ...",Mexico,"December 23, 2016",2016,TV-MA,93 min,"Dramas, International Movies",0.0
1,Movie,Gilbert Chan,"Tedd Chan, Stella Chung, Henley Hii, Lawrence ...",Singapore,"December 20, 2018",2011,R,78 min,"Horror Movies, International Movies",7.0
2,Movie,Shane Acker,"Elijah Wood, John C. Reilly, Jennifer Connelly...",United States,"November 16, 2017",2009,PG-13,80 min,"Action & Adventure, Independent Movies, Sci-Fi...",8.0
3,Movie,Robert Luketic,"Jim Sturgess, Kevin Spacey, Kate Bosworth, Aar...",United States,"January 1, 2020",2008,PG-13,123 min,Dramas,12.0
4,TV Show,Serdar Akar,"Erdal Beşikçioğlu, Yasemin Allen, Melis Birkan...",Turkey,"July 1, 2017",2016,TV-MA,1 Season,"International TV Shows, TV Dramas, TV Mysteries",1.0


In [13]:
# Parse the textual dates, then keep only the calendar-date part
added_on = pd.to_datetime(df['Date Added'])
df['Date Added'] = added_on.dt.date

# Parse the release year as a '%Y' datetime and read the year back out
release_dt = pd.to_datetime(df['Release Year'], format='%Y')
df['Release Year'] = release_dt.dt.year

In [14]:
# Preview the first five rows after the date conversion
df.head(n=5)

Unnamed: 0,Type,Director,Cast,Country,Date Added,Release Year,Rating,Duration,Genre,Added Delay
0,Movie,Jorge Michel Grau,"Demián Bichir, Héctor Bonilla, Oscar Serrano, ...",Mexico,2016-12-23,2016,TV-MA,93 min,"Dramas, International Movies",0.0
1,Movie,Gilbert Chan,"Tedd Chan, Stella Chung, Henley Hii, Lawrence ...",Singapore,2018-12-20,2011,R,78 min,"Horror Movies, International Movies",7.0
2,Movie,Shane Acker,"Elijah Wood, John C. Reilly, Jennifer Connelly...",United States,2017-11-16,2009,PG-13,80 min,"Action & Adventure, Independent Movies, Sci-Fi...",8.0
3,Movie,Robert Luketic,"Jim Sturgess, Kevin Spacey, Kate Bosworth, Aar...",United States,2020-01-01,2008,PG-13,123 min,Dramas,12.0
4,TV Show,Serdar Akar,"Erdal Beşikçioğlu, Yasemin Allen, Melis Birkan...",Turkey,2017-07-01,2016,TV-MA,1 Season,"International TV Shows, TV Dramas, TV Mysteries",1.0


In [None]:
# Keep only the leading number of strings such as '93 min' or '1 Season'.
# NOTE(review): minutes and season counts end up in the same integer column;
# the 'Type' column is needed to tell them apart.
duration_parts = df['Duration'].str.split()
leading_token = duration_parts.map(lambda parts: parts[0])
df['Duration'] = leading_token.astype(int)

In [None]:
# Preview the first five rows after converting the duration to an integer
df.head(n=5)

In [None]:
# Persist the processed frame. The positional index is written as the first,
# unnamed column (index=True is the to_csv default, spelled out here).
df.to_csv('../../data/processed/processed_netflix.csv', index=True)