In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from collections import defaultdict
import pickle
import gc

from helpers.readers import prepare_dataframes, read_dataframe_parquet, read_dataframe
from helpers.readers import save_parquet_to_generated, save_dict_to_generated, load_dict_from_generated
from helpers.utils import PALETTE_D, PALETTE_C

# Loading dataframes

In [None]:
# Load the merged movies, directors and awards tables through the project's
# `read_dataframe_parquet` helper (read in that order).
movies, directors, awards = map(
    read_dataframe_parquet,
    ("merged/movies", "merged/directors", "merged/awards"),
)

In [None]:
# Build the sorted list of distinct country names found in the comma-separated
# `countries` column of `movies`.
all_countries = movies["countries"]
split_countries = all_countries.str.split(',')

flat_list = []
for sublist in split_countries:
    # Strip the whitespace that may surround each name after splitting on ','.
    flat_list.extend(name.strip() for name in sublist)

countries = sorted(set(flat_list))
print(f'We have movies from {len(countries)} countries in the world.')

In [None]:
# For each country: draw the rating-vs-votes distribution of all movies as a hexbin
# jointplot (log-scaled votes axis), then overlay in red the movies of that country alone.
for country in ['Greece', "Egypt"]:  # other candidates: 'Argentina', 'Pakistan', 'Philippines'
    g = sns.jointplot(
        x=movies.rating,
        y=movies.votes,
        kind='hex',
        color="#4CB391",
        joint_kws={'yscale': 'log'},
    )
    g.fig.set_figwidth(3)
    g.fig.set_figheight(3)

    # Keep rows whose `countries` string matches the country and holds no comma,
    # i.e. rows listing a single country.
    mentions_country = movies.countries.str.contains(country)
    lists_several_countries = movies.countries.str.contains(',')
    df = movies[mentions_country & ~lists_several_countries]

    # Scatter and density contours share the same data, colour and target axes.
    overlay_kwargs = dict(x=df.rating, y=df.votes, color='r', alpha=.4, ax=g.ax_joint)
    sns.scatterplot(s=5, label=country, **overlay_kwargs)
    sns.kdeplot(**overlay_kwargs)
    g.ax_joint.legend()

### Movie score

In [None]:
%%time

def compute_score_vectorized(df):
    """Add a `score` column equal to ``log10(votes) * rating`` and return the frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the numeric columns `votes` and `rating`.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; the `score` column is written in place.

    Notes
    -----
    Rows whose `votes` value is zero, negative or missing get a missing score.
    Taking the logarithm of such values directly would yield ``-inf``/NaN and
    emit a NumPy RuntimeWarning, and ``-inf`` scores would poison later means.
    """
    # Mask non-positive vote counts so log10 is only evaluated on valid input.
    positive_votes = df['votes'].where(df['votes'] > 0)
    df['score'] = np.log10(positive_votes) * df['rating']
    return df

# Add the `score` column to `movies`; the helper writes into the frame it receives and returns it.
movies = compute_score_vectorized(movies)

In [None]:
def list_top_movies(country: str, top=5):
    """Display the `top` highest-scoring movies whose `countries` field matches `country`.

    Parameters
    ----------
    country : str
        Pattern searched in the `countries` column with `Series.str.contains`
        (pandas interprets it as a regular expression by default).
    top : int, default 5
        Number of rows to display.

    Notes
    -----
    Rows with a missing `countries` value are treated as non-matching
    (`na=False`), the same convention `list_top_directors` uses. Without it,
    missing values propagate NaN into the boolean mask used for filtering.
    """
    cols = ['title', 'score', 'rating', 'votes', 'revenue', 'release', 'genres']
    in_country = movies.countries.str.contains(country, na=False)
    # Sort best-first, then keep only the leading `top` rows and the display columns.
    display(movies[in_country].sort_values(by='score', ascending=False)[:top][cols])

list_top_movies('Iran')

### Score for directors

In [None]:
# Some movies list several directors in one comma-separated string; count those rows
# (missing `directors` values count as single-director, via na=False).
has_several_directors = movies["directors"].str.contains(",", na=False)
len(movies[has_several_directors])

In [None]:
# Build a one-row-per-(movie, director) table: move the index into a column, split the
# comma-separated `directors` string into a list, then explode the list into rows.
movies_exploded = (
    movies
    .reset_index(drop=False)
    .assign(directors=lambda frame: frame['directors'].str.split(','))
    .explode('directors')
)

# Sanity check: after the explode no row should still hold a comma-separated list.
still_combined = movies_exploded["directors"].str.contains(",", na=False)
len(movies_exploded[still_combined])

```python
%%time
# Turn the current index of `directors` into a regular column (mutates the frame in place).
# NOTE(review): presumably the index is `nconst`, which the later merges use as key — confirm.
directors.reset_index(inplace=True)


def hits_metric(data, threshold):
    """Count the rows of `data` whose `score` is strictly greater than `threshold`."""
    above_threshold = data['score'] > threshold
    return above_threshold.sum()

def rate_metric(data, rate_threshold, vote_threshold):
    """Count the rows of `data` with `rating` >= `rate_threshold` and `votes` >= `vote_threshold`."""
    well_rated = data['rating'] >= rate_threshold
    enough_votes = data['votes'] >= vote_threshold
    return len(data[well_rated & enough_votes])

def avg_top_n_scores(data, n):
    """Return the mean of the `n` largest `score` values in `data`.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a numeric `score` column.
    n : int
        Number of top-scoring rows to average.

    Returns
    -------
    float
        Mean of the top-`n` scores, or `np.nan` when `data` has fewer than `n` rows.

    Notes
    -----
    The "not enough rows" marker is a float NaN rather than `pd.NA`: mixing
    `pd.NA` with floats turns the collected metric column into object dtype,
    which numeric aggregations such as a groupby mean do not handle reliably.
    """
    if len(data) < n:
        return np.nan
    return data.nlargest(n, 'score')['score'].mean()

# Group the exploded movie table by director id; every metric is evaluated per group.
grouped = movies_exploded.groupby('directors')
director_metrics = pd.DataFrame(index=grouped.groups.keys())

# Number of movies whose score exceeds each threshold.
for score_threshold in (30, 40, 45, 50):
    director_metrics[f'hits-{score_threshold}'] = grouped.apply(hits_metric, threshold=score_threshold)

# Number of movies reaching each rating with at least 1000 votes.
for rating_threshold in (7.0, 7.5, 8.0, 8.5):
    director_metrics[f'rate-{rating_threshold}'] = grouped.apply(
        rate_metric, rate_threshold=rating_threshold, vote_threshold=1000
    )

# Mean score over each director's n best movies.
for n in (3, 5, 10):
    director_metrics[f'avg-{n}'] = grouped.apply(avg_top_n_scores, n=n)

# Expose the director id as a regular `nconst` column instead of the (unnamed) index.
director_metrics = director_metrics.reset_index().rename(columns={'index': 'nconst'})

# There is probably room to optimize further, but going from ~20 min to ~3 min is sufficient for now.
# A faster version would likely replace apply() with a smart grouping and mapping.
```

In [None]:
# Shortcut: read the director metrics from "directors/metrics" instead of waiting 2-3 min
# for them to be recomputed.
# TODO: remove for the final run.
# The index of `directors` is moved into a regular column, in place.
directors.reset_index(inplace=True)
director_metrics = read_dataframe_parquet("directors/metrics")

In [None]:
# Attach the `countries` of every credited movie to each director-metrics row.
# Left join: a director without any matching movie row is kept, with missing countries.
director_country_pairs = movies_exploded[['directors', 'countries']]
directors_with_country = director_metrics.merge(
    director_country_pairs,
    how='left',
    left_on='nconst',
    right_on='directors',
)

In [None]:
# minimal version (shadowed by the fuller `list_top_directors` defined in a later cell)
def list_top_directors(country: str, top=5, score='avg-3'):
    """Return the `top` directors linked to `country`, ranked by the `score` metric.

    `country` is searched in the `countries` column of `directors_with_country`
    (rows with missing countries never match). The matching rows are averaged per
    `nconst` on the chosen metric and sorted from highest to lowest.
    The result is a DataFrame indexed by `nconst` with the single column `score`.
    """
    country_mask = directors_with_country['countries'].str.contains(country, na=False)
    per_director = (
        directors_with_country[country_mask]
        .groupby('nconst')
        .agg({score: 'mean'})
    )
    ranked = per_director.sort_values(by=score, ascending=False)
    return ranked.head(top)

list_top_directors('Iran', top=5)

In [None]:
# as in scores-sepehr
def list_top_directors(country: str, top=5, score='avg-3'):
    """Return a detailed table of the `top` directors linked to `country`.

    Rows of `directors_with_country` whose `countries` field matches `country`
    (missing values never match) are averaged per `nconst` over every metric
    column, ranked by `score` in descending order and cut to `top` rows. The
    result is then left-joined with the identity columns of `directors`, and
    returned with identity columns first and metric columns after.
    """
    identity_columns = ['nconst', 'primaryName', 'birthYear', 'deathYear', 'awardsNominated', 'awardsWon']
    metric_columns = [
        'hits-30', 'hits-40', 'hits-45', 'hits-50',
        'rate-7.0', 'rate-7.5', 'rate-8.0', 'rate-8.5',
        'avg-3', 'avg-5', 'avg-10',
    ]

    in_country = directors_with_country['countries'].str.contains(country, na=False)
    per_director = (
        directors_with_country[in_country]
        .groupby('nconst')
        .agg({metric: 'mean' for metric in metric_columns})
    )
    best = per_director.sort_values(by=score, ascending=False).head(top)

    # Bring in name, birth/death year and award counts for the selected directors.
    detailed = pd.merge(best, directors[identity_columns], on='nconst', how='left')
    return detailed[identity_columns + metric_columns]

list_top_directors('Iran', top=5)

In [None]:
# Enrich the `directors` table with the per-director metric columns (left join on `nconst`,
# so directors without metrics are kept).
directors = directors.merge(
    director_metrics,
    how="left",
    left_on="nconst",
    right_on="nconst",
)

In [None]:
# Set `nconst` as the index of `directors` (in place). It was kept as a regular column
# until now because the merge inside list_top_directors needs `nconst` as a column.
directors.set_index('nconst', inplace=True)