In [None]:
import pandas as pd
import os
import numpy as np
import missingno as msno
import string
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Rename/move the downloaded CSV into the assignment folder, then load it.
# Error handling covers exceptions that may occur during the rename.
NETFLIX_DOWNLOAD_PATH = "C:/Users/Personal/Downloads/netflix_data.csv"
NETFLIX_CSV_PATH = "C:/Users/Personal/Dropbox/MSDA/Course 2/Module 4/Assignment/Netflix_shows_movies.csv"

try:
    # Skip the rename when the target is already in place, so that re-running
    # this cell (after the file was moved by an earlier run) is not an error.
    if not os.path.exists(NETFLIX_CSV_PATH):
        os.rename(NETFLIX_DOWNLOAD_PATH, NETFLIX_CSV_PATH)

except FileNotFoundError:
    print("Error: File not found.")

except Exception as e:
    print(f"An unexpected error occured: {e}")

# Read the file from where the rename put it. If it is not there, fall back to
# the bare file name in the current working directory, which is what this cell
# read originally.
netflix_csv = NETFLIX_CSV_PATH if os.path.exists(NETFLIX_CSV_PATH) else 'Netflix_shows_movies.csv'
netflix = pd.read_csv(netflix_csv)
print(netflix.head())

In [None]:
#data cleaning

#getting information about the dataset
netflix.info()  #info() prints its own report and returns None, so it is not wrapped in print()
print(netflix.shape) #rows and columns of the dataset

#checking if there're any duplicates by show_id
#(printed explicitly: a bare expression in the middle of a cell is not displayed)
print("show_id is unique:", netflix["show_id"].is_unique)

#checking for missing data
missing_values = netflix.isnull()
print("Missing values (True/False):\n", missing_values)
#counting missing values per column
missing_counts = missing_values.sum()
print("\nMissing values count per column:\n", missing_counts)

#filling missing data with 'Unknown' in the non-numeric columns only, so that a
#numeric column with gaps is not turned into a mixed text/number column;
#any missing values in numeric columns are left as NaN and show up in the count below
text_columns = netflix.select_dtypes(exclude="number").columns
netflix[text_columns] = netflix[text_columns].fillna('Unknown')
missing_counts = netflix.isnull().sum()
print("\nMissing values count per column:\n", missing_counts) #confirming that the fill worked




In [None]:
#data exploration

print(netflix.describe())  #only numeric variables are described

#to explore the other text variables better, frequency visualisations are drawn
#for the relevant variables in the dataset

#text variables listed here are skipped because they're not useful to plot
col_skip = ["show_id", "title", "cast", "date_added", "duration", "description", "director", "listed_in", "country"]

for x in netflix.columns:

    if x in col_skip:
        continue

    fig = plt.figure(figsize=(6, 4))

    #branch on "is this column numeric?" rather than on dtype == 'object', so that
    #text stored under any non-numeric dtype still gets a count plot instead of
    #being sent to the histogram/KDE branch
    if pd.api.types.is_numeric_dtype(netflix[x]):  # If numeric
        sns.histplot(netflix[x], kde=True, color='coral')

        plt.title(f'Distribution of {x}')
        plt.xlabel(x)
        plt.ylabel('Frequency')

    else:  # If categorical (text)
        sns.countplot(data=netflix, x=x, palette="viridis")

        plt.title(f'Frequency of {x}')
        plt.xlabel(x)
        plt.ylabel('Frequency')
        plt.xticks(rotation=45)

    plt.tight_layout()
    plt.savefig(f"Frequency of {x}")  #saved before show() so the figure is written while it is still open
    plt.show()
    plt.close(fig)  #release the figure explicitly; one is created per loop iteration

In [None]:
#crosstabs

#crosstab between type and rating: one row per rating, one column per type
table = pd.crosstab(netflix["rating"], netflix["type"])

print(table)

#stacked bar chart of the crosstab; pandas returns the Axes it drew on
ax = table.plot(kind="bar", stacked=True, color=["yellow", "red"])

ax.set_title("showtype by rating")
ax.set_xlabel("rating")
ax.set_ylabel("count")
ax.legend(title="type")
ax.tick_params(axis="x", labelrotation=45)
ax.figure.savefig('Rating_showtype crosstab')
plt.show()  #was `plt.show` without parentheses, which only referenced the function and never called it



In [None]:
#data visualisation

#ratings
rating_counts = netflix["rating"].value_counts()
fig1, ax1 = plt.subplots()
chart1 = ax1.pie(rating_counts, labels=rating_counts.index, autopct='%1.1f%%', startangle=90)

# Add title
ax1.set_title('Ratings distribution')
fig1.savefig('Rating distribution')
plt.show()  #show() does not take a chart argument; it displays the open figure(s)

#genres
#listed_in holds several comma-separated genres per title, so it is split and
#exploded into one row per (title, genre). This is done on a copy: netflix itself
#is left unchanged, so the cell gives the same result when it is re-run.
netflix_exploded = (
    netflix.assign(listed_in=netflix['listed_in'].str.split(','))
    .explode("listed_in")
    .reset_index(drop=True)
)
#the source separates genres with ", ", so strip the leftover spaces; otherwise
#" Dramas" and "Dramas" would be counted as two different genres
netflix_exploded["listed_in"] = netflix_exploded["listed_in"].str.strip()

#the exploded data is also saved separately; the in-memory frame is used directly
#below, so the file is not read back in
netflix_exploded.to_csv('exploded_items.csv', index=False)

genre_counts = netflix_exploded["listed_in"].value_counts()
fig2, ax2 = plt.subplots()
chart2 = ax2.pie(genre_counts, labels=genre_counts.index, autopct='%1.1f%%', startangle=90)

ax2.set_title('Genre distribution')
fig2.savefig('Genre distribution')
#NOTE: the slices show how many titles are listed under each genre (catalogue
#frequency); the dataset columns used here say nothing about how often a genre is watched
plt.show()





