In [None]:
import os
# List every file available under the Kaggle input directory.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
import seaborn as sns

import json 

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

# Data Preprocessing and Data Exploration

Possible analysis (at a later stage): use the Jaccard similarity to check whether there is a difference in the 10 to 20 most popular genres and tags between dramas from South Korea, Thailand, and Japan.

Competing against the formidable opponent known as Korean drama, Thai lakorn and Japanese drama cannot match it in terms of overall viewership. We argue that Thai and Japanese dramas are niche categories that deserve to be classified as the "Hidden Gems of Asia": they are niche, but they are outstanding.

Given this context and the theory we propose, we argue that total viewership alone is not an appropriate basis for judging a drama's performance. Hence, we came up with a metric to help with the comparison.

Metric: `scored_signif = total user score * (number of users who cast a vote / number of people who watched the drama)`

In [None]:
def split_data(df: pd.DataFrame) -> pd.DataFrame:
    """Turn the comma-separated text columns into lists of clean labels.

    The ``genres``, ``tags`` and ``aired_on`` cells are split on commas and
    each piece is stripped of surrounding whitespace. Splitting on commas
    (rather than on whitespace) keeps multi-word labels such as
    "Slice of Life" together as a single item.

    Missing cells become an empty list, and cells that already hold a
    list/tuple are passed through unchanged, so re-running this step on an
    already processed frame is harmless.

    Args:
        df: Frame containing ``genres``, ``tags`` and ``aired_on`` columns.

    Returns:
        A copy of ``df`` with the three columns converted to lists of strings.
        The input frame is not modified.
    """
    def _to_labels(value) -> list:
        # Already split (e.g. the cell was re-run): keep as is.
        if isinstance(value, (list, tuple)):
            return list(value)
        # Missing value: no labels, rather than the literal string 'nan'.
        if pd.isna(value):
            return []
        pieces = (piece.strip() for piece in str(value).split(','))
        return [piece for piece in pieces if piece]

    result = df.copy()
    for column in ('genres', 'tags', 'aired_on'):
        result[column] = result[column].apply(_to_labels)
    return result

def custom_rating_generator(df: pd.DataFrame) -> pd.DataFrame:
    """Add the ``watched_ratio`` and ``scored_signif`` metric columns.

    ``watched_ratio`` is ``tot_num_user / tot_watched`` rounded to 3 decimals,
    and ``scored_signif`` is ``tot_user_score * watched_ratio``.

    Rows whose ``tot_watched`` is 0 get NaN for both columns instead of
    +/-inf, so they can be handled by an ordinary missing-value fill and do
    not distort means or quantiles computed on the metric.

    Args:
        df: Frame with ``tot_num_user``, ``tot_watched`` and
            ``tot_user_score`` columns.

    Returns:
        A copy of ``df`` with the two metric columns added. The input frame
        is not modified.
    """
    result = df.copy()

    # Mask zero denominators: x / NaN is NaN, whereas x / 0 would be inf.
    watchers = result['tot_watched'].where(result['tot_watched'] != 0)

    result['watched_ratio'] = (result['tot_num_user'] / watchers).round(3)
    result['scored_signif'] = result['tot_user_score'] * result['watched_ratio']
    return result

def fillna_numerical_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Replace missing values in every numeric column with 0.0.

    Numeric columns are selected by dtype. (Selecting them through
    ``df.describe()`` is unreliable: when a frame has no numeric column,
    ``describe`` summarises the object columns instead, which would cause
    text columns to be filled with 0.0.)

    Args:
        df: Any frame; non-numeric columns are left untouched.

    Returns:
        A copy of ``df`` with NaN in numeric columns replaced by 0.0. The
        input frame is not modified.
    """
    result = df.copy()

    numerical_columns = result.select_dtypes(include='number').columns.tolist()
    for col in numerical_columns:
        result[col] = result[col].fillna(0.0)

    return result

def drama_process_pipeline(df: pd.DataFrame) -> pd.DataFrame:
    """Filter and enrich one country's raw drama metadata frame.

    Steps:
      1. Keep rows whose ``type`` is 'Drama' or 'Movie' and whose ``year`` is
         2021, 2022 or 2023.
      2. Store ``year`` as a string (e.g. '2021').
      3. Run ``split_data``, ``custom_rating_generator`` and
         ``fillna_numerical_columns`` in that order.
      4. Drop the identifier / free-text / website-ranking columns listed in
         ``removed_features``.

    Args:
        df: Raw drama metadata frame (not modified).

    Returns:
        The filtered, enriched frame.
    """
    # Compare years numerically: if the column was read as float (which
    # happens when any year is missing), str(2021.0) is '2021.0' and a string
    # comparison against '2021' would silently drop every row.
    year_numeric = pd.to_numeric(df['year'], errors='coerce')

    # Filter for the preferred data.
    keep = (
        df['type'].isin(['Drama', 'Movie'])
        & year_numeric.isin([2021, 2022, 2023])
    )

    # Explicit copy so the column assignments below act on an independent
    # frame rather than on a slice of the caller's data.
    df = df.loc[keep].copy()
    df['year'] = year_numeric.loc[keep].astype(int).astype(str)

    df = split_data(df)
    df = custom_rating_generator(df)
    df = fillna_numerical_columns(df)

    removed_features = [
        'drama_id', 'synopsis', 'rank', 'popularity',
        'director', 'sc_writer', 'start_dt', 'end_dt'
    ]
    df = df.drop(columns=removed_features)

    return df

In [None]:
# Preprocess Drama Metadata Dataset
DATA_DIR = '/kaggle/input/thai-and-japanese-drama-vs-korean-drama-dominance'

dtha_df = drama_process_pipeline(pd.read_csv(f'{DATA_DIR}/tha_drama.csv'))
dkor_df = drama_process_pipeline(pd.read_csv(f'{DATA_DIR}/kor_drama.csv'))
djap_df = drama_process_pipeline(pd.read_csv(f'{DATA_DIR}/jap_drama.csv'))

# Extract Drama Name
tha_dname = dtha_df.loc[dtha_df['type'] == 'Drama', 'drama_name'].tolist()
kor_dname = dkor_df.loc[dkor_df['type'] == 'Drama', 'drama_name'].tolist()
jap_dname = djap_df.loc[djap_df['type'] == 'Drama', 'drama_name'].tolist()

print(f'Thai: {dtha_df.shape[0]}, #drama := {len(tha_dname)}')
print(f'South Korea: {dkor_df.shape[0]}, #drama := {len(kor_dname)}')
print(f'Japan: {djap_df.shape[0]}, #drama := {len(jap_dname)}')

# TODO: Valid Drama Dicts should be placed after the preprocessing task (remove outliers etc. )
# A name assigned at the top level of a cell is already global, so no
# `global` statement is needed; review_preprocess_pipeline looks this
# mapping up by name.
valid_drama_dict = {
    'Thailand': tha_dname,
    'South Korea': kor_dname,
    'Japan': jap_dname
}

# # Load the data: ignoring drama_id, rank, and pop(ularity) because we won't be using the website ranking system
# dtha_df = pd.read_csv('/kaggle/input/thai-and-japanese-drama-vs-korean-drama-dominance/tha_drama.csv').iloc[:,1:-2]
# dkor_df = pd.read_csv('/kaggle/input/thai-and-japanese-drama-vs-korean-drama-dominance/kor_drama.csv').iloc[:,1:-2]
# djap_df = pd.read_csv('/kaggle/input/thai-and-japanese-drama-vs-korean-drama-dominance/jap_drama.csv').iloc[:,1:-2]


# # Extract Drama Name
# tha_dname = dtha_df[dtha_df['type'] == 'Drama']['drama_name'].tolist()
# kor_dname = dkor_df[dkor_df['type'] == 'Drama']['drama_name'].tolist()
# jap_dname = djap_df[djap_df['type'] == 'Drama']['drama_name'].tolist()

In [None]:
# Keep only the rows typed as 'Drama' in each country's frame.
dtha_df, dkor_df, djap_df = (
    frame.loc[frame['type'] == 'Drama']
    for frame in (dtha_df, dkor_df, djap_df)
)

# Report how many dramas remain per country.
print(f'Thai: {dtha_df.shape[0]}')
print(f'South Korea: {dkor_df.shape[0]}')
print(f'Japan: {djap_df.shape[0]}')

Removing dramas that fall below the 25th percentile of viewership, since we want to focus on the more popular, more representative dramas of each region. An alternative would be to keep only the dramas whose overall viewership is greater than the average viewership; however, that approach would make the remaining dataset very small.

In [None]:
# NOTE: probably don't need this function given the current analysis.
def process_percentile_watched_pipeline(
    data_frame: pd.DataFrame,
    column_name: str,
    percentile: float = 0.25,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Split a frame at a quantile of one column.

    Args:
        data_frame: Frame to split (not modified).
        column_name: Numeric column whose quantile defines the threshold.
        percentile: Quantile in [0, 1] used as the lower threshold.

    Returns:
        ``(filtered_data, removed_data)``: the rows whose value is at or above
        the threshold, and the rows strictly below it. Both are independent
        copies, so columns can be added to them without chained-assignment
        warnings. Rows with NaN in ``column_name`` satisfy neither comparison
        and therefore appear in neither frame.
    """
    values = data_frame[column_name]
    lower_percentile = values.quantile(percentile)

    filtered_data = data_frame[values >= lower_percentile].copy()
    removed_data = data_frame[values < lower_percentile].copy()

    print(f"Output report:\nFiltered df: {filtered_data.shape[0]}\nRemoved: {removed_data.shape[0]}")
    return filtered_data, removed_data

In [None]:
def process_outliers_df_pipeline(
    data_frame: pd.DataFrame,
    column_name: str,
    iqr_multiplier: float = 1.5
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Separate IQR outliers of one column from the rest of the frame.

    The bounds are ``Q1 - iqr_multiplier * IQR`` and
    ``Q3 + iqr_multiplier * IQR``; values equal to a bound are kept.

    Args:
        data_frame: Frame to split (not modified).
        column_name: Numeric column used to detect outliers.
        iqr_multiplier: Width of the whiskers in IQR units.

    Returns:
        ``(outliers_df, filtered_df)``: rows outside the bounds, and rows
        within them. Both are independent copies, so columns can be added to
        them without chained-assignment warnings. Rows with NaN in
        ``column_name`` satisfy neither comparison and appear in neither frame.
    """
    values = data_frame[column_name]

    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - iqr_multiplier * iqr
    upper_bound = q3 + iqr_multiplier * iqr

    is_outlier = (values < lower_bound) | (values > upper_bound)
    is_inlier = (values >= lower_bound) & (values <= upper_bound)

    outliers_df = data_frame[is_outlier].copy()
    filtered_df = data_frame[is_inlier].copy()

    print(f"Pipeline Report:\nOriginal df: {data_frame.shape[0]}\nOutliers df: {outliers_df.shape[0]}\nCleaned df: {filtered_df.shape[0]}",end="\n\n")

    return outliers_df, filtered_df

In [None]:
# Split each country's dramas into IQR outliers and the cleaned remainder,
# based on the score-significance metric.
print("Thailand:")
dtha_df_outliers, dtha_df_filtered = process_outliers_df_pipeline(
    data_frame=dtha_df, column_name='scored_signif', iqr_multiplier=1.5
)

print("Korea:")
dkor_df_outliers, dkor_df_filtered = process_outliers_df_pipeline(
    data_frame=dkor_df, column_name='scored_signif', iqr_multiplier=1.5
)

print("Japan:")
djap_df_outliers, djap_df_filtered = process_outliers_df_pipeline(
    data_frame=djap_df, column_name='scored_signif', iqr_multiplier=1.5
)

In [None]:
def _with_above_mean_flag(frame: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``frame`` with an ``above_mean`` column.

    The value is 'High' where ``scored_signif`` is at or above the frame's
    own mean of that column, and 'Low' otherwise.
    """
    is_high = frame['scored_signif'] >= frame['scored_signif'].mean()
    # .assign builds a new frame, so nothing is written onto a filtered slice.
    return frame.assign(above_mean=np.where(is_high, 'High', 'Low'))


# The mean is computed per country, so each frame is flagged before combining.
dkor_df_filtered = _with_above_mean_flag(dkor_df_filtered)
dtha_df_filtered = _with_above_mean_flag(dtha_df_filtered)
djap_df_filtered = _with_above_mean_flag(djap_df_filtered)

combined_drama_df = pd.concat([dkor_df_filtered, dtha_df_filtered, djap_df_filtered], axis=0, ignore_index=True)

combined_outliers_df = pd.concat([dkor_df_outliers, dtha_df_outliers, djap_df_outliers], axis=0, ignore_index=True)

In [None]:
# Fixed seed so the preview is the same on every run; the size is capped so
# the call does not fail if fewer than 3 rows are available.
combined_drama_df.sample(n=min(3, len(combined_drama_df)), random_state=42)

In [None]:
# Bee swarm plot showing the distribution of score significance per country.
# Points at or above their country's mean are drawn with the 'flare' palette
# (shaded by tot_watched); points below the mean are drawn in blue; the
# outliers removed earlier are drawn as black crosses.
sns.set(style="whitegrid")

plt.figure(figsize=(10,6))

# Below-mean dramas ('Low'): the label now matches the data being plotted.
sns.swarmplot(x='scored_signif',y='country',
              data=combined_drama_df[combined_drama_df['above_mean'] == 'Low'], legend=False,
              color='blue', alpha=0.6, label='Below Mean')

# Mean-and-above dramas ('High'). The automatic hue legend is disabled because
# the hand-built legend below replaces it anyway.
sns.swarmplot(x='scored_signif',y='country',hue='tot_watched',
              data=combined_drama_df[combined_drama_df['above_mean'] == 'High'], legend=False,
              palette='flare',alpha=1.0, label='Mean & Above')

# Explicit black so the markers agree with the black 'x' legend entry.
sns.swarmplot(x='scored_signif',y='country',
              data=combined_outliers_df, legend=False,
              color='black', marker="x", linewidth=2.0, label='(Removed) Outliers')

plt.xlabel("Score Significance")
plt.ylabel("Country")

# Hand-built legend: one entry per point category.
legend_handles = [
    mlines.Line2D([], [], color='red', label='Mean & Above', marker='o', markersize=4, linestyle='None'),
    mlines.Line2D([], [], color='blue', label='Below Mean', marker='o', markersize=4, linestyle='None'),
    mlines.Line2D([], [], color='black', label='(Removed) Outliers', marker='x', markersize=4, linestyle='None')
]

plt.legend(handles=legend_handles, loc='upper right', bbox_to_anchor=(1.0, 0.75), fontsize='small')

title_text = r'$\bf{Score}$' + ' ' + r'$\bf{Significance}$' + '\nDistribution by Country (from 2021-23)'
plt.title(title_text, fontsize=12)
plt.tight_layout()
plt.show()

In [None]:
# Per-country density of score significance. common_norm=False normalises
# each country's curve on its own, so the shapes are comparable even though
# the countries have different numbers of dramas.
fig, ax = plt.subplots(figsize=(10, 6))

sns.kdeplot(
    data=combined_drama_df, x='scored_signif', hue='country',
    fill=True, common_norm=False,
    palette='Set2', alpha=0.5, linewidth=0,
    ax=ax,
)

# A figure should stand alone when the notebook is skimmed.
ax.set_xlabel("Score Significance")
ax.set_ylabel("Density")
ax.set_title("Score Significance Density by Country (from 2021-23)")

plt.show()

# Hypothesis Testing:


In [None]:
# Work on a copy of the combined frame and pull out the Korean scores.
df = combined_drama_df.copy()
is_korean = df['country'] == 'South Korea'
korean_data = df.loc[is_korean, 'scored_signif']
korean_data

In [None]:
from scipy import stats

hypothesis_text = """Null Hypothesis (H0): The average score significance of Korean drama is equal to or less than that of Thai drama.
Alternative Hypothesis (H1): The average score significance of Korean drama is greater than that of Thai drama.
"""
print(hypothesis_text)

df = combined_drama_df.copy()
korean_data = df.loc[df['country'] == 'South Korea', 'scored_signif']
thai_data = df.loc[df['country'] == 'Thailand', 'scored_signif']

# One-sided two-sample t-test. equal_var=False selects Welch's t-test, which
# does not assume the two countries share the same variance; the groups have
# different sizes, so the pooled-variance (Student) test is not a safe default.
t_statistic, p_value = stats.ttest_ind(
    korean_data, thai_data, equal_var=False, alternative='greater'
)

# Set the significance level (alpha)
alpha = 0.05

# Print the results
print(f'T-Statistic: {t_statistic}')
print(f'P-Value: {p_value}')

if p_value < alpha:
    print("Reject the null hypothesis: The average score significance of Korean drama is greater than that of Thai drama.")
else:
    print("Fail to reject the null hypothesis: There is not enough evidence to conclude that the average score significance of Korean drama is greater than that of Thai drama.")


In [None]:
# Separate the scores into one group per country. Grouping the combined frame
# directly (instead of a `df` left over from another cell) keeps this cell
# independent of execution order, and groupby skips rows with a missing
# country, so no empty group is passed to the test.
grouped_data = [
    scores.to_numpy()
    for _country, scores in combined_drama_df.groupby('country')['scored_signif']
]

# Perform one-way ANOVA
f_statistic, p_value = stats.f_oneway(*grouped_data)

# Set the significance level (alpha)
alpha = 0.05

# Print the results
print(f'F-Statistic: {f_statistic}')
print(f'P-Value: {p_value}')

if p_value < alpha:
    print("Reject the null hypothesis: There is a significant difference in the means of at least two groups.")
else:
    print("Fail to reject the null hypothesis: There is no significant difference in the means of the groups.")

# Reviews

Reviews can give a more in-depth analysis of why these dramas are so great. However, not every drama has the same number of reviews. In this case, we will just base the analysis on the existing reviews.

For each review, the score is separated into multiple sub-scores; hence, each field provides more insight!

#### Preprocessing Task
We will preprocess the data. First, we define a function called `review_preprocess_pipeline` that extracts useful information such as the number of episodes the reviewer watched, which we use to calculate the `ep_watched_ratio`. This ratio is used as a weighting factor to compute the weighted average rating score, `wrat_avg_score`, that reviewers gave to the drama.

#### Why should we consider using a weighted score in our analysis? 
To illustrate this, let's draw a parallel with shopping on an e-commerce website. Imagine you come across two identical products being sold by two different vendors. In most cases, you would likely opt to purchase from the vendor with a stronger reputation. This reputation can manifest in various ways: a higher number of followers, likes, or exceptionally high average review scores, such as a perfect 5.0 stars.

Now, let's apply a similar line of thinking to our analysis. When it comes to reviews of a particular drama, we value input from individuals with greater credibility. This credibility comes from those who have already watched the entire drama, as they can provide a more comprehensive and informed perspective on it. Therefore, we may choose to assign more weight to reviews from viewers with a proven track record of completing the drama, just as we would preferentially buy from a trusted vendor when shopping online.

### Probability Problem: 
Given that we pick one Thai lakorn at random, what is the probability that it is top rated and belongs to Horror, Romance, or … *(TODO: complete the list of genres)*?

In [None]:
# Inspect the processed genre lists of the Thai dramas.
dtha_df.loc[:, 'genres']

In [None]:
def review_preprocess_pipeline(
    df: pd.DataFrame,
    country: str
) -> pd.DataFrame:
    """Filter one country's user reviews and derive the weighted score.

    Steps:
      1. Keep reviews whose ``title`` is listed in
         ``valid_drama_dict[country]`` and drop the first column.
      2. Add a ``country`` column.
      3. Parse ``ep_watched`` (text of the form "<watched> of <total> ...")
         into the integer columns ``tot_watched`` and ``tot_ep``; a missing
         value is treated as "0 of 0 episodes seen".
      4. ``ep_watched_ratio = tot_watched / tot_ep`` rounded to 3 decimals,
         with 0.0 whenever ``tot_ep`` is 0 (never NaN or inf).
      5. ``avg_score`` is the row mean of ``story``, ``acting_cast``,
         ``music`` and ``rewatch_value``; ``wrat_avg_score`` is
         ``avg_score * ep_watched_ratio``.
      6. Drop the ``ep_watched`` and ``text`` columns.

    Args:
        df: Raw review frame (not modified).
        country: Key into the module-level ``valid_drama_dict``.

    Returns:
        The processed review frame.

    Raises:
        KeyError: If ``country`` is not a key of ``valid_drama_dict``.
        ValueError: If a non-missing ``ep_watched`` value cannot be parsed.
    """
    # Keep reviews of valid dramas only. The explicit copy makes the column
    # assignments below act on an independent frame rather than on a slice.
    df = df[df['title'].isin(valid_drama_dict[country])].iloc[:, 1:].copy()

    # Key identifying which country's dramas these reviews belong to.
    df['country'] = country

    # Extract the number of episodes watched and the total episode count:
    # first token = watched, third token = total.
    episodes_text = df['ep_watched'].fillna('0 of 0 episodes seen').astype(str)
    counts = episodes_text.str.extract(r'^\s*(\d+)\s+\S+\s+(\d+)(?:\s|$)')
    unparsed = counts.isna().any(axis=1)
    if unparsed.any():
        raise ValueError(
            f"Cannot parse ep_watched value: {episodes_text[unparsed].iloc[0]!r}"
        )
    df['tot_watched'] = counts[0].astype(int)
    df['tot_ep'] = counts[1].astype(int)

    # Feature engineering. Masking a zero episode total makes the division
    # yield NaN (then 0.0) instead of inf, which would otherwise propagate
    # into wrat_avg_score.
    total_episodes = df['tot_ep'].where(df['tot_ep'] != 0)
    df['ep_watched_ratio'] = df['tot_watched'] / total_episodes
    df['ep_watched_ratio'] = df['ep_watched_ratio'].fillna(0.0).apply(lambda x: round(x, 3))

    df['avg_score'] = df[['story', 'acting_cast', 'music', 'rewatch_value']].mean(axis=1)
    df['wrat_avg_score'] = df['avg_score'] * df['ep_watched_ratio']

    # Remove features that are not part of the analysis.
    df = df.drop(columns=['ep_watched', 'text'])

    return df

In [None]:
# Load the raw reviews as-is. review_preprocess_pipeline already drops the
# first column itself, so slicing here as well (`.iloc[:, 1:]`) would remove
# a second column and disagree with the later "Drama Review Processing" cell,
# which loads the same files without slicing.
rtha_df = pd.read_csv('/kaggle/input/thai-and-japanese-drama-vs-korean-drama-dominance/tha_user_reviews.csv')
rkor_df = pd.read_csv('/kaggle/input/thai-and-japanese-drama-vs-korean-drama-dominance/kor_user_reviews.csv')
rjap_df = pd.read_csv('/kaggle/input/thai-and-japanese-drama-vs-korean-drama-dominance/jap_user_reviews.csv')


rtha_df_f = review_preprocess_pipeline(rtha_df,'Thailand')
rkor_df_f = review_preprocess_pipeline(rkor_df, 'South Korea')
rjap_df_f = review_preprocess_pipeline(rjap_df, 'Japan')

rtha_df_f.shape, rkor_df_f.shape, rjap_df_f.shape

In [None]:
# Preview the first five processed Thai reviews.
rtha_df_f.head(n=5)

## Actors Processing

In [None]:
# Load the actor tables for Thailand, South Korea and Japan (in that order).
atha_df, akor_df, ajap_df = (
    pd.read_csv(actor_csv)
    for actor_csv in (
        '/kaggle/input/thai-and-japanese-drama-vs-korean-drama-dominance/tha_tha_actors.csv',
        '/kaggle/input/thai-and-japanese-drama-vs-korean-drama-dominance/kor_kor_actors.csv',
        '/kaggle/input/thai-and-japanese-drama-vs-korean-drama-dominance/jap_jap_actors.csv',
    )
)

atha_df.head()

In [None]:
# Create a function to check uniqueness of the actor name
def check_actor_name_uniqueness(df: pd.DataFrame) -> None:
    """Print whether every actor name maps to exactly one actor id.

    Prints "Actor name is UNIQUE!" when each distinct ``actor_name`` is
    associated with a single distinct ``actor_id``, and
    "Actor name isn't unique!" as soon as any name has more than one id.

    A missing name is treated as one group and a missing id as one distinct
    value, so a name paired with both a real id and a missing id is reported
    as not unique.

    Args:
        df: Frame with ``actor_id`` and ``actor_name`` columns.
    """
    # One vectorised pass instead of a Python loop over every row.
    ids_per_name = (
        df.groupby('actor_name', dropna=False)['actor_id'].nunique(dropna=False)
    )

    if (ids_per_name != 1).any():
        print("Actor name isn't unique!")
        return

    print("Actor name is UNIQUE!")
    return

# Actor names are used as keys later on, so verify they are unique in each
# country's actor table (Thailand, South Korea, Japan).
for actor_table in (atha_df, akor_df, ajap_df):
    check_actor_name_uniqueness(actor_table)

TODO: Write a function to merge the actor and drama information (this might have to use a new dataframe created from the processed reviews) to get the performance of each drama an actor appeared in. If the processed review data is needed, move the actor-related code blocks to after the user-review sections.

In [None]:
df = atha_df
# Might change this to the processed reviews dataset?

drama_df = dtha_df
country = 'Thailand'

# Keep only the actor rows that belong to a valid drama of this country.
df = df.drop(columns=['actor_id', 'character_name'])
df = df[df['drama_name'].isin(valid_drama_dict[country])]

# Left join: every remaining actor row is kept and receives the metadata of
# the drama it appears in.
merged_df = df.merge(drama_df, on='drama_name', how='left')

# 'tags' used to be listed twice, which produced a duplicated column and left
# out 'genres'; each list-valued column is now selected exactly once.
features = [
    'actor_name', 'drama_name', 'role', 'year', 'genres', 'tags',
    'country', 'tot_user_score', 'tot_num_user', 'tot_watched',
    'content_rt', 'watched_ratio', 'scored_signif'
]

merged_df = merged_df[features]
merged_df.head()


## Drama Review Processing

In [None]:
# Load the raw user reviews for Thailand, South Korea and Japan (in that order).
rtha_df, rkor_df, rjap_df = (
    pd.read_csv(review_csv)
    for review_csv in (
        '/kaggle/input/thai-and-japanese-drama-vs-korean-drama-dominance/tha_user_reviews.csv',
        '/kaggle/input/thai-and-japanese-drama-vs-korean-drama-dominance/kor_user_reviews.csv',
        '/kaggle/input/thai-and-japanese-drama-vs-korean-drama-dominance/jap_user_reviews.csv',
    )
)

# Run the review pipeline once per country.
rtha_df_f = review_preprocess_pipeline(df=rtha_df, country='Thailand')
rkor_df_f = review_preprocess_pipeline(df=rkor_df, country='South Korea')
rjap_df_f = review_preprocess_pipeline(df=rjap_df, country='Japan')

rtha_df_f.shape, rkor_df_f.shape, rjap_df_f.shape

In [None]:
def draw_boxplots_with_annotations(
    dataframe: pd.DataFrame,
    country: str = '',
) -> None:
    """Draw one annotated horizontal boxplot per column, three per row.

    Each subplot is annotated with the column's mean, population standard
    deviation, number of IQR outliers (1.5 * IQR rule, matching the box
    whiskers), minimum and maximum. Missing values are ignored for both the
    box and the statistics. Calling this resets seaborn's styling via
    ``sns.reset_orig()``.

    Args:
        dataframe: Frame whose columns are all numeric.
        country: Label used in the figure title.
    """
    sns.reset_orig()

    num_columns = len(dataframe.columns)
    if num_columns == 0:
        # plt.subplots cannot build a grid with zero rows; nothing to draw.
        return

    num_rows = (num_columns + 2) // 3

    fig, axes = plt.subplots(nrows=num_rows, ncols=3, figsize=(16, 6 * num_rows // 2))
    axes = axes.flatten()

    for ax, col in zip(axes, dataframe.columns):
        ax.set_title(col)

        # A single NaN would turn every percentile into NaN and leave the box
        # empty, so statistics and box are computed on the observed values.
        values = dataframe[col].dropna()
        if values.empty:
            ax.text(0.5, 0.5, "No data", transform=ax.transAxes, fontsize=12,
                    horizontalalignment='center', verticalalignment='center')
            continue

        # Create a boxplot for the current column in the corresponding subplot
        ax.boxplot(values, vert=False, sym='b.',
                   whis=1.5, patch_artist=True, boxprops=dict(facecolor='lightblue'))

        # Outlier calculation using the IQR method
        q1, q3 = np.percentile(values, [25, 75])
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        num_outliers = int(((values < lower_bound) | (values > upper_bound)).sum())

        mean = np.mean(values)
        std_dev = np.std(values)
        annotation = f"Mean: {mean:.2f}\nStd Dev: {std_dev:.2f}\nOutliers: {num_outliers}"
        ax.text(0.62, 0.80, annotation, transform=ax.transAxes, fontsize=12,
                verticalalignment='center')

        min_score = np.min(values)
        max_score = np.max(values)
        minmax_annotation = f"Min: {min_score:.2f}\nMax: {max_score:.2f}"
        ax.text(0.62, 0.25, minmax_annotation, transform=ax.transAxes, fontsize=12,
                verticalalignment='center')

    big_title = f"{country} Boxplots"
    fig.suptitle(big_title, fontsize=16, y=1.02)

    # Remove the unused cells of the last grid row.
    for j in range(num_columns, len(axes)):
        fig.delaxes(axes[j])

    plt.tight_layout()
    plt.show()

# Numeric review columns to visualise.
features = [
    'story', 'acting_cast', 'music', 'rewatch_value', 'overall',
    'n_helpful', 'tot_watched', 'tot_ep', 'ep_watched_ratio',
    'avg_score', 'wrat_avg_score'
]

# One figure per country, in the order Thailand, South Korea, Japan.
for review_frame, country_label in (
    (rtha_df_f, 'Thailand'),
    (rkor_df_f, 'South Korea'),
    (rjap_df_f, 'Japan'),
):
    draw_boxplots_with_annotations(review_frame[features], country_label)

Whether to remove the outliers before or after performing feature engineering such as the weighted average score depends on the specific goals and requirements of this analysis. Upon observing the score-related fields, we notice some outliers. However, this is something we would like to capture. The minimum and maximum values for these fields lie between 1 and 10; hence, there is no hidden error in the data extraction or in the scoring process on the website. They are valuable insights even if they are outliers; therefore, we won't remove them for the moment.


TODO: Address the other features and how outliers in those features affect the final weighted score.
