# Importing Packages

In [81]:
import kaggle
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from zipfile import ZipFile
import ast

import warnings

warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)

# Gathering Data

In [2]:
# Ensure kaggle.json is in the correct location and configure the environment
os.environ['KAGGLE_CONFIG_DIR'] = os.path.expanduser("~/.kaggle")

# Download the dataset. This command downloads the dataset into the current directory
kaggle.api.dataset_download_files('rounakbanik/the-movies-dataset', path='./', unzip=True)

Dataset URL: https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset


In [3]:
kaggle.api.authenticate()
kaggle.api.dataset_download_files('shivamb/netflix-shows', path='./', unzip=True)

Dataset URL: https://www.kaggle.com/datasets/shivamb/netflix-shows


# Cleaning Data

In [197]:
with ZipFile('Netflix Data.zip', 'r') as z:
    with z.open('netflix_titles.csv') as file:
        NT = pd.read_csv(file)

In [198]:
MMD = pd.read_csv('movies_metadata.csv')

In [199]:
#MMD.head(3)

In [200]:
#NT.head(3)

In [201]:
print(MMD.columns)
print(NT.columns)

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')
Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description'],
      dtype='object')


In [202]:
df = pd.merge(MMD, NT, left_on='original_title', right_on='title', how='inner')

In [203]:
df['genres']

0       [{'id': 35, 'name': 'Comedy'}, {'id': 10749, '...
1       [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...
2       [{'id': 12, 'name': 'Adventure'}, {'id': 28, '...
3       [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...
4       [{'id': 10751, 'name': 'Family'}, {'id': 16, '...
                              ...                        
2479                  [{'id': 99, 'name': 'Documentary'}]
2480                  [{'id': 99, 'name': 'Documentary'}]
2481    [{'id': 35, 'name': 'Comedy'}, {'id': 80, 'nam...
2482    [{'id': 10749, 'name': 'Romance'}, {'id': 35, ...
2483    [{'id': 80, 'name': 'Crime'}, {'id': 35, 'name...
Name: genres, Length: 2484, dtype: object

In [204]:
df['belongs_to_collection'] = df['belongs_to_collection'].apply(lambda x: ast.literal_eval(x) if not pd.isna(x) and x != 'nan' else np.nan)
df['belongs_to_collection'] = df['belongs_to_collection'].apply(lambda x: x['name'] if isinstance(x,dict) and 'name' in x else np.nan)

In [205]:
split_cols = ['production_companies','production_countries', 'genres', 'spoken_languages']

In [206]:
def process_columns(df, split_cols):
    for col in split_cols:
        df[col] = df[col].apply(lambda x: ast.literal_eval(x) if not pd.isna(x) and x != 'nan' else np.nan)
        df[col] = df[col].apply(lambda x: [d['name'] for d in x] if isinstance(x, list) else [])
        df[col] = df[col].apply(lambda x: '|'.join(x))
        temp_df = df[col].str.split('|', expand=True)
        temp_df.columns = [f"{col}_{i}" for i in range(temp_df.shape[1])]
        df = pd.concat([df, temp_df], axis=1)
        df.drop([col], inplace=True, axis=1)
    
    return df

In [207]:
df = process_columns(df, split_cols)

In [208]:
df['duration'] = df['duration'].str.extract('(\d+)').astype(float)

In [209]:
df.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,release_date,revenue,runtime,status,tagline,title_x,video,vote_average,vote_count,show_id,type,title_y,director,cast,country,date_added,release_year,rating,duration,listed_in,description,production_companies_0,production_companies_1,production_companies_2,production_companies_3,production_companies_4,production_companies_5,production_companies_6,production_companies_7,production_companies_8,production_companies_9,production_companies_10,production_companies_11,production_companies_12,production_companies_13,production_companies_14,production_companies_15,production_companies_16,production_companies_17,production_countries_0,production_countries_1,production_countries_2,production_countries_3,production_countries_4,production_countries_5,production_countries_6,production_countries_7,production_countries_8,production_countries_9,genres_0,genres_1,genres_2,genres_3,genres_4,genres_5,genres_6,spoken_languages_0,spoken_languages_1,spoken_languages_2,spoken_languages_3,spoken_languages_4,spoken_languages_5,spoken_languages_6,spoken_languages_7,spoken_languages_8
0,False,,58000000,,11860,tt0114319,en,Sabrina,An ugly duckling having undergone a remarkable...,6.677277,/jQh15y5YB7bWz1NtffNZmRw0s9D.jpg,1995-12-15,0.0,127.0,Released,You are cordially invited to the most surprisi...,Sabrina,False,6.2,141.0,s4375,Movie,Sabrina,Rocky Soraya,"Luna Maya, Christian Sugiono, Sara Wijayanto, ...",Indonesia,"November 20, 2018",2018,TV-MA,114.0,"Horror Movies, International Movies",A toy manufacturer and his wife are terrorized...,Paramount Pictures,Scott Rudin Productions,Mirage Enterprises,Sandollar Productions,Constellation Entertainment,Worldwide,Mont Blanc Entertainment GmbH,,,,,,,,,,,,Germany,United States of America,,,,,,,,,Comedy,Romance,,,,,,Français,English,,,,,,,
1,False,,2238813,,6620,tt0047437,en,Sabrina,Linus and David Larrabee are the two sons of a...,7.359741,/7ITDmatHa2yf5UTzjwaKAvf3Xr6.jpg,1954-09-28,10000000.0,113.0,Released,...the chauffeur's daughter who learned her st...,Sabrina,False,7.4,284.0,s4375,Movie,Sabrina,Rocky Soraya,"Luna Maya, Christian Sugiono, Sara Wijayanto, ...",Indonesia,"November 20, 2018",2018,TV-MA,114.0,"Horror Movies, International Movies",A toy manufacturer and his wife are terrorized...,Paramount Pictures,,,,,,,,,,,,,,,,,,United States of America,,,,,,,,,,Comedy,Drama,Romance,,,,,English,Italiano,,,,,,,
2,False,James Bond Collection,58000000,http://www.mgm.com/view/movie/757/Goldeneye/,710,tt0113189,en,GoldenEye,James Bond must unmask the mysterious head of ...,14.686036,/5c0ovjT41KnYIHYuF4AWsTe3sKh.jpg,1995-11-16,352194034.0,130.0,Released,No limits. No fears. No substitutes.,GoldenEye,False,6.6,1194.0,s6867,Movie,GoldenEye,Martin Campbell,"Pierce Brosnan, Sean Bean, Izabella Scorupco, ...","United Kingdom, United States","December 31, 2019",1995,PG-13,130.0,Action & Adventure,Pierce Brosnan takes his first turn as debonai...,United Artists,Eon Productions,,,,,,,,,,,,,,,,,United Kingdom,United States of America,,,,,,,,,Adventure,Action,Thriller,,,,,English,Pусский,Español,,,,,,


In [212]:
new_names = {old_name: f"prod_comp_{old_name.split('_')[-1]}" for old_name in df.columns if 'production_companies_' in old_name}
df.rename(columns=new_names, inplace=True)

In [215]:
new_names = {old_name: f"lang{old_name.split('_')[-1]}" for old_name in df.columns if 'spoken_languages' in old_name}
df.rename(columns=new_names, inplace=True)

In [216]:
new_names = {old_name: f"g{old_name.split('_')[-1]}" for old_name in df.columns if 'genre' in old_name}
df.rename(columns=new_names, inplace=True)

Unnamed: 0,adult,belongs_to_collection,budget,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,release_date,revenue,runtime,status,tagline,title_x,video,vote_average,vote_count,show_id,type,title_y,director,cast,country,date_added,release_year,rating,duration,listed_in,description,prod_comp_0,prod_comp_1,prod_comp_2,prod_comp_3,prod_comp_4,prod_comp_5,prod_comp_6,prod_comp_7,prod_comp_8,prod_comp_9,prod_comp_10,prod_comp_11,prod_comp_12,prod_comp_13,prod_comp_14,prod_comp_15,prod_comp_16,prod_comp_17,production_countries_0,production_countries_1,production_countries_2,production_countries_3,production_countries_4,production_countries_5,production_countries_6,production_countries_7,production_countries_8,production_countries_9,genres_0,genres_1,genres_2,genres_3,genres_4,genres_5,genres_6,lang0,lang1,lang2,lang3,lang4,lang5,lang6,lang7,lang8
0,False,,58000000,,11860,tt0114319,en,Sabrina,An ugly duckling having undergone a remarkable...,6.677277,/jQh15y5YB7bWz1NtffNZmRw0s9D.jpg,1995-12-15,0.0,127.0,Released,You are cordially invited to the most surprisi...,Sabrina,False,6.2,141.0,s4375,Movie,Sabrina,Rocky Soraya,"Luna Maya, Christian Sugiono, Sara Wijayanto, ...",Indonesia,"November 20, 2018",2018,TV-MA,114.0,"Horror Movies, International Movies",A toy manufacturer and his wife are terrorized...,Paramount Pictures,Scott Rudin Productions,Mirage Enterprises,Sandollar Productions,Constellation Entertainment,Worldwide,Mont Blanc Entertainment GmbH,,,,,,,,,,,,Germany,United States of America,,,,,,,,,Comedy,Romance,,,,,,Français,English,,,,,,,
1,False,,2238813,,6620,tt0047437,en,Sabrina,Linus and David Larrabee are the two sons of a...,7.359741,/7ITDmatHa2yf5UTzjwaKAvf3Xr6.jpg,1954-09-28,10000000.0,113.0,Released,...the chauffeur's daughter who learned her st...,Sabrina,False,7.4,284.0,s4375,Movie,Sabrina,Rocky Soraya,"Luna Maya, Christian Sugiono, Sara Wijayanto, ...",Indonesia,"November 20, 2018",2018,TV-MA,114.0,"Horror Movies, International Movies",A toy manufacturer and his wife are terrorized...,Paramount Pictures,,,,,,,,,,,,,,,,,,United States of America,,,,,,,,,,Comedy,Drama,Romance,,,,,English,Italiano,,,,,,,
2,False,James Bond Collection,58000000,http://www.mgm.com/view/movie/757/Goldeneye/,710,tt0113189,en,GoldenEye,James Bond must unmask the mysterious head of ...,14.686036,/5c0ovjT41KnYIHYuF4AWsTe3sKh.jpg,1995-11-16,352194034.0,130.0,Released,No limits. No fears. No substitutes.,GoldenEye,False,6.6,1194.0,s6867,Movie,GoldenEye,Martin Campbell,"Pierce Brosnan, Sean Bean, Izabella Scorupco, ...","United Kingdom, United States","December 31, 2019",1995,PG-13,130.0,Action & Adventure,Pierce Brosnan takes his first turn as debonai...,United Artists,Eon Productions,,,,,,,,,,,,,,,,,United Kingdom,United States of America,,,,,,,,,Adventure,Action,Thriller,,,,,English,Pусский,Español,,,,,,


In [153]:
df.rename(columns={0:'g1', 1:'g2', 2:'g3', 3:'g4', 4:'g5', 5:'g5', 6:'g6'}, inplace=True)