# This notebook is for getting a general idea of what the WhatsOnNetflix data looks like. 

## Findings
1. Data has inconsistencies but there does not appear to be enough of them to be a huge deal. 
    - There are no empty titles
    - Languages are occasionally misspelled, or the country of origin is used instead of a language. 
    - Language split is close to 50:50 English:Other. 
    - Languages with only 1 movie are often incorrect fields or values. EX: tt7866320
    - A few release years are invalid or empty, but this is uncommon. 
    - Ratings come from several different countries' rating systems. It would be best to group them with a mask into three or four categories. 
    - The IMDb Score field holds either an IMDb score or a Rotten Tomatoes critic score. To account for this, multiply values below 1 by 10 so both are on the same scale, or pull the score in from another source for greater accuracy. 
2. Most movies are modern. None older than 1960, though some have poorly formatted dates. 
3. The top rated movies are not all Hollywood films. 


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load the scraped WhatsOnNetflix dataset into `df`.
try:
    df = pd.read_json('../data_collection/whatsOnNetflix_2082025.json')
except FileNotFoundError as e:
    # Report the missing file, then re-raise: every later cell needs `df`, so
    # carrying on would only surface as a confusing NameError further down.
    print('Error: ',e)
    raise

In [None]:
# Preview the first and the last ten rows of the raw frame.
first_rows, last_rows = df.head(10), df.tail(10)
print(first_rows, last_rows)

## There are a few things which I think are going to be valuable for us to look at. 
1. *Languages* - we need to know what languages are available and the distribution of languages across the dataset. 
2. *Maturity Ratings* - figuring out what the ratings are and distributions would be valuable. 
3. *Top Rated* - organize the dataset by IMDb Score. 
4. *Movies Released per year* - This will help us understand the dataset and may influence where we can search for more data when needed. 


In [None]:
# Work on an independent copy so the raw frame `df` is never modified.
data = df.copy(deep=True)

In [None]:
# Language distribution.

# Break multi-language entries apart so that every row carries one language.
language_lists = data['Language'].str.split(' / |, | ')
data = data.assign(Language=language_lists).explode('Language')

# Rows whose language is anything other than English
non_english = data[data['Language'] != 'English']

# Row counts for the English / non-English split
english_count = len(data[data['Language'] == 'English'])
non_english_count = len(non_english)

# How often each non-English language occurs (most frequent first)
language_counts = non_english['Language'].value_counts()

# The ten most frequent non-English languages
top_10_languages = language_counts.iloc[:10]

# The ten least frequent non-English languages
least_popular_languages = language_counts.iloc[-10:]

# Bar charts: the 10 most popular and the 10 least popular languages.
for counts, chart_title in (
    (top_10_languages, 'Top 10 Most Popular Languages (Excluding English)'),
    (least_popular_languages, 'Least Popular Languages (Excluding English)'),
):
    plt.figure(figsize=(10, 5))
    counts.plot.bar(title=chart_title)
    plt.xlabel('Language')
    plt.ylabel('Count')
    plt.show()

# Inputs for the pie chart
labels = ['English', 'Other Languages']
sizes = [english_count, non_english_count]
colors = ['#ff9999', '#66b3ff']

# Customize the labels to include both percentage and actual count
def custom_autopct(pct, total=None):
    """Format a pie-wedge label as a percentage plus the absolute count.

    Args:
        pct: The wedge's share in percent, as handed over by the `autopct`
            hook of `plt.pie`.
        total: Sum of all wedge sizes. When omitted, it is computed from the
            notebook-level `sizes` list, which keeps the function usable
            directly as an `autopct` callback.

    Returns:
        A two-line string such as ``'48.7%\\n(1234)'``.
    """
    if total is None:
        total = sum(sizes)
    # Round instead of truncating: pct is a floating-point share, so
    # pct * total / 100 can land a hair below the true integer count, and
    # int() alone would then report one item too few.
    absolute = int(round(pct * total / 100.0))
    return f'{pct:.1f}%\n({absolute:d})'

# Render the English-vs-other split as a pie chart.
pie_options = dict(
    labels=labels,
    colors=colors,
    autopct=custom_autopct,
    startangle=140,
)
plt.figure(figsize=(8, 8))
plt.pie(sizes, **pie_options)
plt.title('Proportion of English Movies vs. Other Languages')
plt.show()

In [None]:
# Tally how many rows each language has
language_counts = data['Language'].value_counts()

# Keep only the languages that occur exactly once
languages_with_one_movie = language_counts.loc[language_counts.eq(1)]
count_of_languages_with_one_movie = len(languages_with_one_movie)

print(f"Number of languages with only one movie: {count_of_languages_with_one_movie}")
print(languages_with_one_movie)


In [None]:
# Start again from a fresh copy of the raw frame (discarding the exploded one).
data = df.copy(deep=True)

# Convert IMDb Score to numeric values, replacing non-numeric data with -1
def clean_imdb_score(score):
    """Convert a raw `IMDb Score` value to a number, or -1 when it is unusable.

    Accepted inputs:
      * ``"<numerator>/<denominator>"`` strings: the numerator is returned as
        a float (e.g. ``"7.5/10"`` -> ``7.5``).
      * Plain non-negative decimal strings (surrounding whitespace ignored).
      * Real numbers (int / float), which are treated like decimal strings
        instead of being discarded.

    Plain values below 1 are multiplied by 10 to bring fractional scores
    (presumably Rotten Tomatoes-style ratios, per the notebook's findings)
    onto the same scale as the rest.

    Args:
        score: The raw cell value; may be a string, a number, or missing.

    Returns:
        The score as a float, or -1 for missing / invalid data.
    """
    # bool is a subclass of int, but True/False are never real scores.
    if isinstance(score, bool):
        return -1

    if isinstance(score, str):
        text = score.strip()
        if '/' in text:
            # Get Numerator
            try:
                return float(text.split('/')[0])
            except ValueError:
                return -1
        # Only digits with at most one decimal point count as a plain score.
        if not text.replace('.', '', 1).isdigit():
            return -1
        raw_value = text
    else:
        raw_value = score

    try:
        numeric_score = float(raw_value)
    except (TypeError, ValueError, OverflowError):
        return -1  # None, containers, non-ASCII "digits", oversized ints, ...

    # NaN fails every comparison, so this rejects NaN, negatives and infinity.
    if not 0 <= numeric_score < float('inf'):
        return -1

    if numeric_score < 1:
        return numeric_score * 10  # Multiply by 10 to normalize
    return numeric_score
# Normalize every raw score through the cleaner defined above.
cleaned_scores = data['IMDb Score'].apply(clean_imdb_score)
data['IMDb Score'] = cleaned_scores

# Clean the 'Release Year' column: anything that is not a valid number becomes
# NaN first and is then replaced by the placeholder year 100.
data['Release Year'] = pd.to_numeric(data['Release Year'], errors='coerce').fillna(100)

def _plot_release_counts(counts, color, title, xlabel):
    """Draw and show one bar chart of release counts.

    Args:
        counts: Series of counts indexed by the period being plotted.
        color: Fill color of the bars.
        title: Chart title.
        xlabel: Label for the x axis.
    """
    plt.figure(figsize=(10, 6))
    counts.plot(kind='bar', color=color, edgecolor='black')
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel('Number of Releases')
    plt.show()


# Releases per year
release_years = data['Release Year'].value_counts().sort_index()
_plot_release_counts(release_years, 'blue', 'Number of Releases Per Year', 'Year')

# Releases per 5 year period (year floored to the start of its 5-year bucket)
data['Release Period'] = data['Release Year'].floordiv(5).mul(5)
release_periods = data['Release Period'].value_counts().sort_index()
_plot_release_counts(
    release_periods, 'orange', 'Number of Releases Per 5 Year Period', '5-Year Period'
)

# Releases per decade (year floored to the start of its decade)
data['Decade'] = data['Release Year'].floordiv(10).mul(10)
release_decades = data['Decade'].value_counts().sort_index()
_plot_release_counts(release_decades, 'green', 'Number of Releases Per Decade', 'Decade')


In [None]:
# Display top 10 IMDB Scores
# Keep only rows with a positive IMDb Score (this drops the -1 placeholders)
has_score = data['IMDb Score'] > 0
df_valid_score = data.loc[has_score]
df_sorted_by_rating = df_valid_score.sort_values('IMDb Score', ascending=False)
top_10_rows = df_sorted_by_rating.iloc[:10]
print("Top 10 Rows Sorted by Rating:")
print(top_10_rows)

In [None]:

# Lowest IMDb scores: the frame is sorted descending, so they sit at the end.
lowest_10_rows = df_sorted_by_rating.iloc[-10:]
print("Lowest 10 Rows Sorted by Rating (with IMDb Score > 0):")
print(lowest_10_rows)