# Data Loading:

In [None]:
# import our own data reader for json
import jsonreader as jr

# import pandas for structuring the data
import pandas as pd

# import numpy for numerical analysis
import numpy as np

import os

In [None]:
# Build the path to the JSON sample relative to the current working directory.
# os.path.join picks the right separator for the platform, so the notebook
# also runs outside Windows (a hard-coded '\\' only works there).
cwd = os.getcwd()
path = os.path.join(cwd, 'data', 'tv_shows_and_movies_sample.json')

In [None]:
# Load the JSON file with jsonreader (jr); the call returns two values,
# unpacked here as (metadata, text):
metadata, text = jr.read_json(path)

In [None]:
# Display the file metadata (first value returned by jr.read_json):
metadata

In [None]:
# Display the file content (second value returned by jr.read_json):
text

In [None]:
# Empty catalogue of loaded files: one row per file, holding its type,
# its metadata and its content.
dataf = pd.DataFrame(columns=['filetype', 'meta', 'text'])

In [None]:
# Display the frame: columns are defined, but no rows have been added yet
dataf

In [None]:
# All the JSON files to load (currently only the single sample file):
in_json = [path]

In [None]:
# Read every listed JSON file and store it as one row of dataf.
for uri in in_json:
    mymeta, text = jr.read_json(uri)

    # Append at the next free integer label: [file type, metadata, content]
    next_row = len(dataf)
    dataf.loc[next_row] = ['json', mymeta, text]

In [None]:
# Display the frame again, now with one row per loaded file
dataf

In [None]:
# Take the content of the first loaded file (row label 0) as the working table
df = dataf.loc[0, 'text']

In [None]:
# Preview the first rows of the working table
df.head()

In [None]:
# Size of the working table as (rows, columns)
df.shape

In [None]:
# Column labels of the working table
df.columns

# Exploring and Cleaning Data

In [None]:
# Count the missing values in each column of the DataFrame:
df.isna().sum()

In [None]:
# Remove the columns that are not used in the analysis
df = df.drop(columns=['link', 'poster', 'scraped_at', 'uniq_id', 'country'])

In [None]:
# Clean up 'imdb_rating': pull the first decimal number (digits.digits) out of
# each string and store it as a float; cells without such a number become NaN.
rating_text = df['imdb_rating'].str.extract(r'(\d+\.\d+)')
df['imdb_rating'] = rating_text.astype(float)
df.head()

In [None]:
# Remove unwanted characters from 'content_rating' column: everything that is
# not a letter, digit or whitespace is stripped.
# regex=True is required explicitly: since pandas 2.0 str.replace treats the
# pattern as a literal string by default, which would leave the column unchanged.
df['content_rating'] = df['content_rating'].str.replace(r'[^A-Za-z0-9\s]', '', regex=True)
df

In [None]:
# Derive two columns from 'cast_and_crew': the first 'name' value and the
# first 'description' value found in each cell's text.
# NOTE(review): presumes the first entry in 'cast_and_crew' is the director - confirm.
crew = df['cast_and_crew']
df['director'] = crew.str.extract(r"'name':\s+'([^']+)'", expand=False)
df['director_description'] = crew.str.extract(r"'description':\s+'([^']+)'", expand=False)
df

In [None]:
# The raw 'cast_and_crew' column is no longer needed once the director
# columns have been extracted
df = df.drop(columns=['cast_and_crew'])

In [None]:
# Function for replacing all empty strings with NA
def replace_empty(columns, frame=None):
    """Replace empty-string cells with ``pd.NA`` in the given columns.

    The frame is modified in place and each processed column name is printed.

    Args:
        columns: Iterable of column names to clean.
        frame: DataFrame to modify. When omitted, the notebook-level ``df``
            is used, so existing ``replace_empty(df.columns)`` calls keep
            working unchanged.
    """
    # Look up the global only when no frame was passed explicitly.
    target = df if frame is None else frame
    for name in columns:
        print(name)
        target[name] = target[name].replace('', pd.NA)

In [None]:
# Use replace_empty to turn empty-string cells into NA in every column
replace_empty(df.columns)


In [None]:
# Function that counts all NA values per column
def get_cell_count_na(columns, frame=None):
    """Count the missing (NA) cells in each of the given columns.

    Args:
        columns: Iterable of column names to inspect.
        frame: DataFrame to inspect. When omitted, the notebook-level ``df``
            is used, so existing ``get_cell_count_na(df.columns)`` calls keep
            working unchanged.

    Returns:
        A dict mapping each column name to its number of NA cells.
    """
    # Look up the global only when no frame was passed explicitly.
    target = df if frame is None else frame
    return {name: target[name].isna().sum() for name in columns}

In [None]:
# Count the NA cells per column and print one "column : count" line for each
na_count = get_cell_count_na(df.columns)
for key, value in na_count.items():
    print(f'{key} : {value}')

In [None]:
# Data type of each column
df.dtypes

In [None]:
# Handle NA values: rows without a release date are removed, every other
# gap is filled with a per-column placeholder.
df = df.dropna(subset=['released_at'])

# NOTE(review): -1 marks "no rating" in a numeric column and '0' / '0+' are
# string placeholders - confirm downstream plots and statistics account for them.
fill_values = {
    'genre': 'Unknown',
    'imdb_rating': -1,
    'director': 'Unknown',
    'director_description': 'Unknown',
    'number_of_seasons': '0',
    'content_rating': '0+',
}
for column, placeholder in fill_values.items():
    df[column] = df[column].fillna(placeholder)
df

In [None]:
# Reset index: renumber the rows from 0 and discard the old index
# (drop=True keeps it from being added back as a column)
df = df.reset_index(drop=True)
df

# Visualisation 

In [None]:
# Other utilities
from sklearn import datasets, preprocessing, metrics
import matplotlib.pyplot as plt

In [None]:
# Histogram of Release Dates:
# Shows how the release dates are distributed, which helps with
# understanding the release pattern over the years.

# Parse the release dates once, then bin them
release_dates = pd.to_datetime(df['released_at'])

plt.figure(figsize=(10, 6))
plt.hist(release_dates, bins=30, color='skyblue', edgecolor='black')

# Labels and layout
plt.xlabel('Release Date')
plt.ylabel('Frequency')
plt.title('Histogram of Release Dates')
plt.xticks(rotation=45)
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Bar chart showing the genre distribution.
# This can help with understanding the popular genres in the dataset.

# Count the frequency of each genre
genre_counts = df['genre'].value_counts()

# Genres seen fewer than this many times are grouped into one 'Other' bar
min_genre_count = 15  # Adjust the threshold as needed
is_frequent = genre_counts >= min_genre_count

# Copy before adding the extra entry so the filtered result is modified
# independently of genre_counts
top_genres = genre_counts[is_frequent].copy()
other_count = genre_counts[~is_frequent].sum()

# Add to an existing 'Other' genre instead of overwriting its own count
top_genres['Other'] = top_genres.get('Other', 0) + other_count

# Plot the bar chart
plt.figure(figsize=(10, 6))
top_genres.plot(kind='bar', color='skyblue')
plt.title('Top Genre Distribution')
plt.xlabel('Genre')
plt.ylabel('Frequency')
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y')
plt.tight_layout()
plt.show()

In [None]:
# Distribution of the number of seasons for TV shows.
# NOTE(review): missing 'number_of_seasons' values were filled with '0' during
# cleaning, so the NA filter below presumably removes nothing - confirm.
# NOTE(review): if the season values are strings, sort_index orders them
# lexically ('10' before '2') - confirm the dtype.

# Keep only the TV shows that have a season count
tv_shows = df.loc[df['type'] == 'TV Show']
tv_shows_cleaned = tv_shows[tv_shows['number_of_seasons'].notna()]

# Count the frequency of each number of seasons, ordered by the season label
season_counts = tv_shows_cleaned['number_of_seasons'].value_counts()
season_counts = season_counts.sort_index()

# Plot the bar chart
plt.figure(figsize=(10, 6))
season_counts.plot(kind='bar', color='skyblue')
plt.xlabel('Number of Seasons')
plt.ylabel('Frequency')
plt.title('Number of Seasons Distribution for TV Shows')
plt.xticks(rotation=0)
plt.grid(axis='y')
plt.tight_layout()
plt.show()

In [None]:
# Box plot - IMDb ratings.

# Missing ratings are stored as the placeholder -1 after cleaning; keep only
# real (non-negative) ratings so the placeholder does not distort the
# quartiles or show up as outliers. NaN values are excluded by the same test.
rated_scores = df.loc[df['imdb_rating'] >= 0, 'imdb_rating']

# Plot the box plot
plt.figure(figsize=(8, 6))
plt.boxplot(rated_scores.values, vert=False)
plt.title('Box Plot of IMDb Ratings')
plt.xlabel('IMDb Rating')
plt.yticks([])  # a single box needs no y-axis ticks
plt.grid(axis='x')
plt.tight_layout()
plt.show()

In [None]:
# Scatter plot showing the relationship between IMDb ratings and release dates.

# Missing ratings are stored as the placeholder -1 after cleaning; plot only
# titles with a real (non-negative) rating so the placeholder does not appear
# as a band of points below the rating scale.
rated_titles = df[df['imdb_rating'] >= 0]

# Plot the scatter plot
plt.figure(figsize=(10, 6))
plt.scatter(
    pd.to_datetime(rated_titles['released_at']),
    rated_titles['imdb_rating'],
    color='skyblue',
    alpha=0.5,
)
plt.title('IMDb Ratings vs. Release Dates')
plt.xlabel('Release Date')
plt.ylabel('IMDb Rating')
plt.xticks(rotation=45)
plt.grid(True)
plt.tight_layout()
plt.show()