In [None]:
import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(folder, file_name)
        print(full_path)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import json 

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

# Data Preprocessing and Data Exploration

Possible analysis (at a later stage): use the Jaccard similarity to check whether there is a difference in the top 10 to 20 most popular genres and tags between dramas from South Korea, Thailand, and Japan.

In [None]:
def split_data(df: pd.DataFrame) -> pd.DataFrame:
    """Convert the comma-separated text columns into lists of labels.

    The ``genres``, ``tags`` and ``aired_on`` columns are split on commas and
    each label is stripped of surrounding whitespace, so multi-word labels
    such as "Slice of Life" stay in one piece.

    Args:
        df: Frame containing the ``genres``, ``tags`` and ``aired_on`` columns.

    Returns:
        A copy of ``df`` in which those three columns hold ``list[str]``
        values. Missing cells become an empty list. The input is not modified.
    """
    def _to_label_list(cell) -> list:
        # Already-processed cells are passed through so re-running is harmless.
        if isinstance(cell, list):
            return cell
        # Missing values become [] rather than the literal string 'nan'.
        if pd.isna(cell):
            return []
        pieces = (piece.strip() for piece in str(cell).split(','))
        return [piece for piece in pieces if piece]

    result = df.copy()
    for column in ('genres', 'tags', 'aired_on'):
        result[column] = result[column].apply(_to_label_list)
    return result

def custom_rating_generator(df: pd.DataFrame) -> pd.DataFrame:
    """Add the ``watched_ratio`` and ``scored_signif`` columns.

    ``watched_ratio`` is ``tot_num_user / tot_watched`` rounded to 3 decimals,
    and ``scored_signif`` is ``tot_user_score * watched_ratio``.

    Args:
        df: Frame with ``tot_num_user``, ``tot_watched`` and
            ``tot_user_score`` columns.

    Returns:
        A copy of ``df`` with the two extra columns. Rows whose
        ``tot_watched`` is zero or missing get NaN in both columns (never
        ``inf``), so a later NaN fill can handle them. The input is not
        modified.
    """
    result = df.copy()

    # Mask zero denominators: x / 0 would otherwise yield inf, which is not
    # caught by fillna and would distort every statistic computed afterwards.
    watched = result['tot_watched'].where(result['tot_watched'] != 0)

    result['watched_ratio'] = (result['tot_num_user'] / watched).round(3)
    result['scored_signif'] = result['tot_user_score'] * result['watched_ratio']
    return result

def fillna_numerical_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Replace missing values in every numeric column with ``0.0``.

    Args:
        df: Any frame.

    Returns:
        A copy of ``df`` whose numeric columns have NaN replaced by ``0.0``.
        Non-numeric columns are left untouched, and the input is not modified.
    """
    result = df.copy()

    # select_dtypes picks numeric columns only. describe() would silently fall
    # back to the object columns when the frame has no numeric column at all,
    # which would write 0.0 into text columns.
    numeric_columns = result.select_dtypes(include='number').columns

    for column in numeric_columns:
        result[column] = result[column].fillna(0.0)
    return result

def drama_process_pipeline(df: pd.DataFrame) -> pd.DataFrame:
    """Filter and feature-engineer one country's drama metadata frame.

    Steps:
        1. Keep rows whose ``type`` is 'Drama' or 'Movie' and whose ``year``
           is 2021, 2022 or 2023.
        2. Split the label columns into lists (``split_data``).
        3. Add ``watched_ratio`` / ``scored_signif``
           (``custom_rating_generator``).
        4. Fill numeric NaNs with 0.0 (``fillna_numerical_columns``).
        5. Drop the columns that are not used in the analysis.

    Args:
        df: Raw drama metadata as read from the CSV.

    Returns:
        A new processed frame; ``year`` is stored as a string such as '2021'.
        The input frame is not modified.
    """
    kept_types = ['Drama', 'Movie']
    kept_years = [2021, 2022, 2023]

    # Compare years numerically: if the CSV column was parsed as float (for
    # example because of missing values), str(2021.0) is '2021.0' and a
    # string comparison against '2021' would discard every row.
    year_numeric = pd.to_numeric(df['year'], errors='coerce')
    keep_mask = df['type'].isin(kept_types) & year_numeric.isin(kept_years)

    # assign() returns a new frame, so the steps below never write into a
    # slice of the caller's data.
    df = df.loc[keep_mask].assign(
        year=year_numeric.loc[keep_mask].astype(int).astype(str)
    )

    df = split_data(df)
    df = custom_rating_generator(df)
    df = fillna_numerical_columns(df)

    removed_features = [
        'drama_id', 'synopsis', 'rank', 'popularity',
        'director', 'sc_writer', 'start_dt', 'end_dt'
    ]
    df = df.drop(columns=removed_features)

    # TODO: decide whether extracting the drama names should happen inside
    # this pipeline, so that the `type` feature can be removed here and the
    # pipeline returns the list of valid dramas as well (or three outputs:
    # df, df_drama, drama_name_list).

    return df

In [None]:
# Preprocess Drama Metadata Dataset
dtha_df = drama_process_pipeline(pd.read_csv('/kaggle/input/thai-and-japanese-drama-vs-korean-drama-dominance/tha_drama.csv'))
dkor_df = drama_process_pipeline(pd.read_csv('/kaggle/input/thai-and-japanese-drama-vs-korean-drama-dominance/kor_drama.csv'))
djap_df = drama_process_pipeline(pd.read_csv('/kaggle/input/thai-and-japanese-drama-vs-korean-drama-dominance/jap_drama.csv'))

# Extract Drama Name (single .loc lookup instead of chained indexing)
tha_dname = dtha_df.loc[dtha_df['type'] == 'Drama', 'drama_name'].tolist()
kor_dname = dkor_df.loc[dkor_df['type'] == 'Drama', 'drama_name'].tolist()
jap_dname = djap_df.loc[djap_df['type'] == 'Drama', 'drama_name'].tolist()

print(f'Thai: {dtha_df.shape[0]}, #drama := {len(tha_dname)}')
print(f'South Korea: {dkor_df.shape[0]}, #drama := {len(kor_dname)}')
print(f'Japan: {djap_df.shape[0]}, #drama := {len(jap_dname)}')

# Country -> list of drama titles that survived the metadata filtering.
# Defined at cell (module) level, so it is already global; a `global`
# statement has no effect here and is therefore omitted.
valid_drama_dict = {
    'Thailand': tha_dname,
    'South Korea': kor_dname,
    'Japan': jap_dname
}

# # Load the data: ignoring drama_id, rank, and pop(ularity) because we won't be using the website ranking system
# dtha_df = pd.read_csv('/kaggle/input/thai-and-japanese-drama-vs-korean-drama-dominance/tha_drama.csv').iloc[:,1:-2]
# dkor_df = pd.read_csv('/kaggle/input/thai-and-japanese-drama-vs-korean-drama-dominance/kor_drama.csv').iloc[:,1:-2]
# djap_df = pd.read_csv('/kaggle/input/thai-and-japanese-drama-vs-korean-drama-dominance/jap_drama.csv').iloc[:,1:-2]


# # Extract Drama Name
# tha_dname = dtha_df[dtha_df['type'] == 'Drama']['drama_name'].tolist()
# kor_dname = dkor_df[dkor_df['type'] == 'Drama']['drama_name'].tolist()
# jap_dname = djap_df[djap_df['type'] == 'Drama']['drama_name'].tolist()

In [None]:
# Restrict each metadata frame to rows of type 'Drama'.
dtha_df = dtha_df.loc[dtha_df['type'] == 'Drama']
dkor_df = dkor_df.loc[dkor_df['type'] == 'Drama']
djap_df = djap_df.loc[djap_df['type'] == 'Drama']

# Report how many dramas remain per country.
print('Thai: {}'.format(len(dtha_df.index)))
print('South Korea: {}'.format(len(dkor_df.index)))
print('Japan: {}'.format(len(djap_df.index)))

In [None]:
# Distribution of `scored_signif` for the Korean dramas, drawn on an explicit Axes.
fig, ax = plt.subplots()
ax.hist(dkor_df['scored_signif'], bins=25, edgecolor='k')
ax.set_xlabel('scored_signif')
ax.set_ylabel('Frequency')
ax.set_title('Histogram of Values')

# Render the figure
plt.show()

In [None]:
# Summary statistics of dkor_df (Korean drama metadata, 'Drama' rows only).
dkor_df.describe()

In [None]:
# Korean dramas ordered by ascending `scored_signif`, with zero scores removed.
test = (
    dkor_df
    .sort_values(by='scored_signif', ascending=True)
    .loc[lambda frame: frame['scored_signif'] != 0.0]
)
test
# dkor_df[dkor_df['tot_watched'] < dkor_df['tot_num_user']]

In [None]:
# Summary statistics of dkor_df. The preceding cell only builds `test` and
# does not modify dkor_df, so this repeats the earlier describe() output.
dkor_df.describe()

In [None]:
def process_outliers_df_pipeline(
    data_frame: pd.DataFrame,
    column_name: str,
    iqr_multiplier: float = 1.5
) -> "tuple[pd.DataFrame, pd.DataFrame]":
    """Split a frame into IQR outliers and inliers for one column.

    A row is an outlier when its value lies strictly below
    ``Q1 - iqr_multiplier * IQR`` or strictly above
    ``Q3 + iqr_multiplier * IQR``. Values exactly on a bound are inliers.

    Args:
        data_frame: Frame to split.
        column_name: Numeric column the bounds are computed on.
        iqr_multiplier: Width of the fences in IQR units (1.5 by default).

    Returns:
        ``(outliers_df, filtered_df)``. Rows whose value is NaN satisfy
        neither condition and therefore appear in neither frame.
    """
    values = data_frame[column_name]

    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    fence = iqr_multiplier * (q3 - q1)
    lower_bound, upper_bound = q1 - fence, q3 + fence

    is_outlier = (values < lower_bound) | (values > upper_bound)
    # between() is inclusive on both ends, matching `>= lower & <= upper`.
    is_inlier = values.between(lower_bound, upper_bound)

    return data_frame[is_outlier], data_frame[is_inlier]

# Separate the Korean dramas into IQR outliers / inliers on `scored_signif`.
dkor_df_outliers, dkor_df_filtered = process_outliers_df_pipeline(
    data_frame=dkor_df,
    column_name='scored_signif',
    iqr_multiplier=1.5,
)
(dkor_df_outliers.shape, dkor_df_filtered.shape)

In [None]:
# Explicit Axes so both swarm layers are guaranteed to land on the same plot.
fig, ax = plt.subplots(figsize=(10, 6))  # Adjust the figure size as needed

# Rows flagged as outliers on `scored_signif`, drawn as red circles.
sns.swarmplot(x='tot_watched', data=dkor_df_outliers, marker='o', color='red', label='Outliers', ax=ax)

# Remaining rows, drawn as blue squares.
sns.swarmplot(x='tot_watched', data=dkor_df_filtered, marker='s', color='blue', label='Data Points', ax=ax)

# Titles and labels. The vertical axis of a single-variable swarm plot only
# spreads the points apart, so it gets no label (the previous text was a
# template placeholder).
ax.set_title('Beeswarm Plot with Distinct Outlier Markers')
ax.set_xlabel('Total Number of Watchers')
ax.set_ylabel('')
ax.grid(True)  # Add grid lines if desired

# Add a legend to distinguish outliers from data points
ax.legend()

# Show the plot
plt.show()


Next, we preprocess the review data. We define a function called `review_preprocess_pipeline` that extracts useful information such as the number of episodes the reviewer watched, which we use to calculate the `watched_ratio`. This ratio is used as a weighting factor to compute the weighted average rating score, or `wrat_avg_score`, that each reviewer gave to the drama.

#### Why should we consider using a weighted score in our analysis? 
To illustrate this, let's draw a parallel with shopping on an e-commerce website. Imagine you come across two identical products being sold by two different vendors. In most cases, you would likely opt to purchase from the vendor with a stronger reputation. This reputation can manifest in various ways: a higher number of followers, likes, or exceptionally high average review scores, such as a perfect 5.0 stars.

Now, let's apply a similar line of thinking to our analysis. When it comes to reviews of a particular drama, we value input from individuals with greater credibility. This credibility comes from those who have already watched the entire drama, as they can provide a more comprehensive and informed perspective on it. Therefore, we may choose to assign more weight to reviews from viewers with a proven track record of completing the drama, just as we would preferentially buy from a trusted vendor when shopping online.

In [None]:
# Load each country's user reviews without the first CSV column, and tag every
# row with a `Country` key identifying the origin of the reviewed drama.
rtha_df = (
    pd.read_csv('/kaggle/input/thai-and-japanese-drama-vs-korean-drama-dominance/tha_user_reviews.csv')
    .iloc[:, 1:]
    .assign(Country='Thailand')
)
rkor_df = (
    pd.read_csv('/kaggle/input/thai-and-japanese-drama-vs-korean-drama-dominance/kor_user_reviews.csv')
    .iloc[:, 1:]
    .assign(Country='South Korea')
)
rjap_df = (
    pd.read_csv('/kaggle/input/thai-and-japanese-drama-vs-korean-drama-dominance/jap_user_reviews.csv')
    .iloc[:, 1:]
    .assign(Country='Japan')
)

In [None]:
def review_preprocess_pipeline(
    df: pd.DataFrame,
    country: str,
    valid_titles=None,
) -> pd.DataFrame:
    """Filter one country's reviews and derive the weighted score features.

    Args:
        df: Raw review frame. It must contain ``title``, ``ep_watched``
            (text such as '8 of 16 episodes seen'), ``story``,
            ``acting_cast``, ``music``, ``rewatch_value`` and ``text``.
            Its first column is discarded.
        country: Country label stored in the new ``country`` column and,
            when ``valid_titles`` is not given, used as the key into the
            module-level ``valid_drama_dict``.
        valid_titles: Optional collection of drama titles to keep. Defaults
            to ``valid_drama_dict[country]``.

    Returns:
        A new frame with the added columns ``country``, ``tot_watched``,
        ``tot_ep``, ``watched_ratio`` (episodes watched / total episodes,
        rounded to 3 decimals, 0.0 when the total is zero), ``avg_score``
        (mean of the four sub-scores) and ``wrat_avg_score``
        (``avg_score * watched_ratio``). ``ep_watched`` and ``text`` are
        dropped. The input frame is not modified.

    Raises:
        ValueError: If an ``ep_watched`` entry does not start with
            '<int> of <int>'.
    """
    if valid_titles is None:
        valid_titles = valid_drama_dict[country]

    # Keep reviews of valid dramas only; .copy() so the column assignments
    # below never write into a slice of the caller's frame.
    df = df[df['title'].isin(valid_titles)].iloc[:, 1:].copy()

    # Key identifying the country of the drama being reviewed.
    df['country'] = country

    # 'A of B episodes seen' -> tokens ['A', 'of', 'B', ...]; missing entries
    # count as '0 of 0'.
    ep_tokens = (
        df['ep_watched']
        .fillna('0 of 0 episodes seen')
        .astype(str)
        .str.split(' ')
    )
    df['tot_watched'] = ep_tokens.str[0].astype(int)
    df['tot_ep'] = ep_tokens.str[2].astype(int)

    # Feature engineering. A zero episode total would give NaN (0/0) or inf
    # (x/0); both are mapped to 0.0 so the weighted score stays finite.
    has_episodes = df['tot_ep'] != 0
    df['watched_ratio'] = (
        (df['tot_watched'] / df['tot_ep'])
        .where(has_episodes, 0.0)
        .round(3)
    )

    df['avg_score'] = df[['story', 'acting_cast', 'music', 'rewatch_value']].mean(axis=1)
    df['wrat_avg_score'] = df['avg_score'] * df['watched_ratio']

    # Remove features that are not part of the analysis.
    df = df.drop(columns=['ep_watched', 'text'])

    return df

## Actors Processing

In [None]:
# Read the three actor CSVs (Thailand, South Korea, Japan) in one pass.
atha_df, akor_df, ajap_df = (
    pd.read_csv(actor_csv)
    for actor_csv in (
        '/kaggle/input/thai-and-japanese-drama-vs-korean-drama-dominance/tha_tha_actors.csv',
        '/kaggle/input/thai-and-japanese-drama-vs-korean-drama-dominance/kor_kor_actors.csv',
        '/kaggle/input/thai-and-japanese-drama-vs-korean-drama-dominance/jap_jap_actors.csv',
    )
)

# Preview the Thai actor frame.
atha_df.head()

In [None]:
# Create a function to check uniqueness of the actor name
def check_actor_name_uniqueness(df: pd.DataFrame) -> None:
    """Print whether every actor name maps to exactly one actor id.

    Args:
        df: Frame with ``actor_id`` and ``actor_name`` columns.

    Returns:
        None. Prints "Actor name isn't unique!" when at least one name is
        associated with more than one distinct ``actor_id``, otherwise prints
        "Actor name is UNIQUE!". Rows with a missing name or id are ignored
        by the grouping.
    """
    # One vectorised pass instead of iterrows() plus list membership tests.
    ids_per_name = df.groupby('actor_name')['actor_id'].nunique()

    if (ids_per_name > 1).any():
        print("Actor name isn't unique!")
        return

    print("Actor name is UNIQUE!")
    return

# Actor names are used as keys later on, so verify their uniqueness per
# country (Thailand, South Korea, Japan - in that order).
for _actor_frame in (atha_df, akor_df, ajap_df):
    check_actor_name_uniqueness(_actor_frame)

**TODO:** Write a function that merges the actor and drama information (the drama side might have to be the new dataframe created from the processed reviews) to obtain the performance of the dramas each actor appeared in. If the processed review data has to be used, move the actor-related code blocks so that they come after the user review sections.

In [None]:
df = atha_df
# Might change this to the processed reviews dataset?

drama_df = dtha_df
country = 'Thailand'

# Keep only the actor columns needed for the merge, restricted to valid dramas.
df = df.drop(columns=['actor_id', 'character_name'])
df = df[df['drama_name'].isin(valid_drama_dict[country])]

# Left join: every actor row is kept, drama metadata is attached by title.
merged_df = df.merge(drama_df, on='drama_name', how='left')

# 'tags' used to be listed twice, which duplicated that column in the output;
# the second label column produced by the drama pipeline is 'genres'.
# NOTE(review): 'country' must be present in the merged columns - confirm.
features = [
    'actor_name', 'drama_name', 'role', 'year', 'genres', 'tags',
    'country', 'tot_user_score', 'tot_num_user', 'tot_watched',
    'content_rt', 'watched_ratio', 'scored_signif'
]

merged_df = merged_df[features]
merged_df.head()


## Drama Review Processing

In [None]:
# Read the raw review CSVs for Thailand, South Korea and Japan (all columns kept).
rtha_df, rkor_df, rjap_df = (
    pd.read_csv(review_csv)
    for review_csv in (
        '/kaggle/input/thai-and-japanese-drama-vs-korean-drama-dominance/tha_user_reviews.csv',
        '/kaggle/input/thai-and-japanese-drama-vs-korean-drama-dominance/kor_user_reviews.csv',
        '/kaggle/input/thai-and-japanese-drama-vs-korean-drama-dominance/jap_user_reviews.csv',
    )
)

# Run the review pipeline once per country.
rtha_df_f = review_preprocess_pipeline(df=rtha_df, country='Thailand')
rkor_df_f = review_preprocess_pipeline(df=rkor_df, country='South Korea')
rjap_df_f = review_preprocess_pipeline(df=rjap_df, country='Japan')

(rtha_df_f.shape, rkor_df_f.shape, rjap_df_f.shape)

In [None]:
def draw_boxplots_with_annotations(
    dataframe: pd.DataFrame,
    country: str = '',
) -> None:
    """Draw one horizontal boxplot per column, annotated with basic statistics.

    The plots are laid out on a grid with three columns. Each subplot shows
    the mean, the population standard deviation, the number of IQR outliers
    (1.5 x IQR fences, the same rule the boxplot whiskers use), and the
    minimum and maximum. Missing values are ignored per column.

    Args:
        dataframe: Frame whose columns are all numeric.
        country: Label used in the figure title ("<country> Boxplots").

    Returns:
        None. The figure is shown with ``plt.show()``. Nothing is drawn when
        ``dataframe`` has no columns.
    """
    whisker = 1.5

    num_columns = len(dataframe.columns)
    if num_columns == 0:
        # plt.subplots cannot create a grid with zero rows.
        return

    num_rows = (num_columns + 2) // 3

    fig, axes = plt.subplots(nrows=num_rows, ncols=3, figsize=(16, 6 * num_rows // 2))
    axes = axes.flatten()

    for ax, col in zip(axes, dataframe.columns):
        ax.set_title(col)

        # NaNs would turn the percentiles into NaN and blank out the box.
        values = dataframe[col].dropna().to_numpy()
        if values.size == 0:
            ax.text(0.5, 0.5, 'No data', transform=ax.transAxes, fontsize=12,
                    horizontalalignment='center', verticalalignment='center')
            continue

        ax.boxplot(values, vert=False, sym='b.',
                   whis=whisker, patch_artist=True, boxprops=dict(facecolor='lightblue'))

        # Outlier count using the IQR method (vectorised).
        q1, q3 = np.percentile(values, [25, 75])
        iqr = q3 - q1
        lower_bound = q1 - whisker * iqr
        upper_bound = q3 + whisker * iqr
        num_outliers = int(((values < lower_bound) | (values > upper_bound)).sum())

        mean = np.mean(values)
        std_dev = np.std(values)
        annotation = f"Mean: {mean:.2f}\nStd Dev: {std_dev:.2f}\nOutliers: {num_outliers}"
        ax.text(0.62, 0.80, annotation, transform=ax.transAxes, fontsize=12,
                verticalalignment='center')

        minmax_annotation = f"Min: {np.min(values):.2f}\nMax: {np.max(values):.2f}"
        ax.text(0.62, 0.25, minmax_annotation, transform=ax.transAxes, fontsize=12,
                verticalalignment='center')

    big_title = f"{country} Boxplots"
    fig.suptitle(big_title, fontsize=16, y=1.02)

    # Remove the unused cells of the last grid row.
    for unused_ax in axes[num_columns:]:
        fig.delaxes(unused_ax)

    plt.tight_layout()
    plt.show()

# Numeric review columns to visualise.
features = [
    'story', 'acting_cast', 'music', 'rewatch_value', 'overall',
    'n_helpful', 'tot_watched', 'tot_ep', 'watched_ratio',
    'avg_score', 'wrat_avg_score'
]

# One boxplot grid per country, in the order Thailand, South Korea, Japan.
for _reviews, _country_label in (
    (rtha_df_f, 'Thailand'),
    (rkor_df_f, 'South Korea'),
    (rjap_df_f, 'Japan'),
):
    draw_boxplots_with_annotations(_reviews[features], _country_label)

Whether to remove the outliers before or after performing feature engineering such as the weighted average score depends on the specific goals and requirements of this analysis. Looking at the score-related fields, we notice some outliers. However, these are something we would like to capture. The minimum and maximum values of these fields lie between 1 and 10; hence, there is no hidden error in the data extraction or in the scoring process on the website. They are valuable insights even though they are outliers; therefore, we won't remove them for the moment.


**TODO:** Address the other features and how outliers in those features affect the final weighted score.
