In [None]:
import os
# Walk the Kaggle input directory and print the full path of every file found,
# so the dataset paths hard-coded in later cells can be checked against it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

# Data Preprocessing and Data Exploration

In [None]:
def split_genres_tags(df: pd.DataFrame) -> pd.DataFrame:
    """Turn the comma-separated ``genres`` and ``tags`` cells into lists of labels.

    Cells are split on commas rather than on whitespace, so multi-word labels
    such as "Strong Female Lead" stay in one piece. Missing cells become an
    empty list instead of ``['nan']``. Cells that already hold a list are only
    tidied, which makes the function safe to run twice on the same data.

    Args:
        df: Frame containing ``genres`` and ``tags`` columns.

    Returns:
        A copy of ``df`` whose ``genres`` and ``tags`` columns hold lists of
        stripped, non-empty strings. The input frame is not modified.
    """
    def _to_labels(cell) -> list:
        if isinstance(cell, (list, tuple)):
            # Already split (e.g. on a re-run): just normalise the items.
            parts = [str(item) for item in cell]
        elif pd.isna(cell):
            # Missing value: no labels, rather than the literal string 'nan'.
            return []
        else:
            parts = str(cell).split(',')
        return [part.strip() for part in parts if part.strip()]

    # Copy first so neither the caller's frame nor a filtered slice is mutated.
    df = df.copy()
    for column in ('genres', 'tags'):
        df[column] = df[column].apply(_to_labels)
    return df

def custom_rating_generator(df: pd.DataFrame) -> pd.DataFrame:
    """Add the ``watched_ratio`` and ``scored_signif`` columns to ``df`` in place.

    ``watched_ratio`` is ``tot_num_user / tot_watched`` rounded to three
    decimals; ``scored_signif`` is ``tot_user_score`` multiplied by that
    rounded ratio.

    Args:
        df: Frame with ``tot_num_user``, ``tot_watched`` and ``tot_user_score``.

    Returns:
        The same frame object, with the two columns added.
    """
    unrounded_ratio = df['tot_num_user'].div(df['tot_watched'])
    df['watched_ratio'] = unrounded_ratio.apply(lambda ratio: round(ratio, 3))

    # The score is weighted by the rounded ratio, not the raw quotient.
    df['scored_signif'] = df['tot_user_score'].mul(df['watched_ratio'])
    return df

def drama_process_pipeline(
    df: pd.DataFrame,
    types=('Drama', 'Movie'),
    years=('2021', '2022', '2023'),
) -> pd.DataFrame:
    """Filter a drama metadata frame and add the derived columns.

    Steps:
      1. Keep rows whose ``type`` is in ``types`` and whose ``year`` (compared
         as text) is in ``years``.
      2. Split ``genres`` / ``tags`` via ``split_genres_tags``.
      3. Add the rating columns via ``custom_rating_generator``.
      4. Drop ``drama_id``, ``synopsis``, ``rank`` and ``popularity``.

    Args:
        df: Raw drama metadata. It is not modified.
        types: Values of ``type`` to keep.
        years: Release years to keep; each entry is converted with ``str``.

    Returns:
        A new frame with the filtered rows, ``year`` stored as a string, and
        the derived columns.
    """
    # Years are compared as text, exactly as the filter values are given.
    year_text = df['year'].apply(lambda x: str(x))

    # Filtered for preferred data
    wanted = (
        df['type'].isin(list(types)) &
        year_text.isin([str(year) for year in years])
    )

    # .loc + .assign yields a fresh frame: the caller's `year` column is left
    # alone and the helpers below do not write into a filtered slice
    # (which is what used to trigger SettingWithCopyWarning).
    df = df.loc[wanted].assign(year=year_text[wanted])

    df = split_genres_tags(df)
    df = custom_rating_generator(df)

    # `columns=` already names the axis, so no separate `axis` argument is needed.
    return df.drop(columns=['drama_id','synopsis','rank','popularity'])

In [None]:
# Preprocess Drama Metadata Dataset
tha_meta_csv = '/kaggle/input/thai-and-japanese-drama-vs-korean-drama-dominance/tha_drama.csv'
kor_meta_csv = '/kaggle/input/thai-and-japanese-drama-vs-korean-drama-dominance/kor_drama.csv'
jap_meta_csv = '/kaggle/input/thai-and-japanese-drama-vs-korean-drama-dominance/jap_drama.csv'

dtha_df = drama_process_pipeline(pd.read_csv(tha_meta_csv))
dkor_df = drama_process_pipeline(pd.read_csv(kor_meta_csv))
djap_df = drama_process_pipeline(pd.read_csv(jap_meta_csv))

# Extract Drama Name (rows typed 'Drama' only; movies are excluded)
tha_dname = dtha_df.loc[dtha_df['type'] == 'Drama', 'drama_name'].tolist()
kor_dname = dkor_df.loc[dkor_df['type'] == 'Drama', 'drama_name'].tolist()
jap_dname = djap_df.loc[djap_df['type'] == 'Drama', 'drama_name'].tolist()

# Total rows kept per country, and how many of those are dramas
print(f'Thai: {dtha_df.shape[0]}, #drama := {len(tha_dname)}')
print(f'South Korea: {dkor_df.shape[0]}, #drama := {len(kor_dname)}')
print(f'Japan: {djap_df.shape[0]}, #drama := {len(jap_dname)}')

# # Load the data: ignoring drama_id, rank, and pop(ularity) because we won't be using the website ranking system
# dtha_df = pd.read_csv('/kaggle/input/thai-and-japanese-drama-vs-korean-drama-dominance/tha_drama.csv').iloc[:,1:-2]
# dkor_df = pd.read_csv('/kaggle/input/thai-and-japanese-drama-vs-korean-drama-dominance/kor_drama.csv').iloc[:,1:-2]
# djap_df = pd.read_csv('/kaggle/input/thai-and-japanese-drama-vs-korean-drama-dominance/jap_drama.csv').iloc[:,1:-2]


# # Extract Drama Name
# tha_dname = dtha_df[dtha_df['type'] == 'Drama']['drama_name'].tolist()
# kor_dname = dkor_df[dkor_df['type'] == 'Drama']['drama_name'].tolist()
# jap_dname = djap_df[djap_df['type'] == 'Drama']['drama_name'].tolist()

In [None]:
# (rows, columns) of the processed Thai, South Korean and Japanese drama frames
dtha_df.shape, dkor_df.shape, djap_df.shape

Next, we preprocess the review data. We define a function called `review_preprocess_pipeline` that extracts useful information such as the number of episodes the reviewer watched and the total number of episodes, which we use to calculate the `watched_ratio`. This ratio is used as a weighting factor to compute the weighted average rating score, `wrat_avg_score`, that each reviewer gave to the drama.

#### Why should we consider using a weighted score in our analysis? 
To illustrate this, let's draw a parallel with shopping on an e-commerce website. Imagine you come across two identical products being sold by two different vendors. In most cases, you would likely opt to purchase from the vendor with a stronger reputation. This reputation can manifest in various ways: a higher number of followers, likes, or exceptionally high average review scores, such as a perfect 5.0 stars.

Now, let's apply a similar line of thinking to our analysis. When it comes to reviews of a particular drama, we value input from individuals with greater credibility. This credibility comes from those who have already watched the entire drama, as they can provide a more comprehensive and informed perspective on it. Therefore, we may choose to assign more weight to reviews from viewers with a proven track record of completing the drama, just as we would preferentially buy from a trusted vendor when shopping online.

In [None]:
tha_reviews_csv = '/kaggle/input/thai-and-japanese-drama-vs-korean-drama-dominance/tha_user_reviews.csv'
kor_reviews_csv = '/kaggle/input/thai-and-japanese-drama-vs-korean-drama-dominance/kor_user_reviews.csv'
jap_reviews_csv = '/kaggle/input/thai-and-japanese-drama-vs-korean-drama-dominance/jap_user_reviews.csv'

# The first column of each file is skipped when loading.
rtha_df = pd.read_csv(tha_reviews_csv).iloc[:, 1:]
rkor_df = pd.read_csv(kor_reviews_csv).iloc[:, 1:]
rjap_df = pd.read_csv(jap_reviews_csv).iloc[:, 1:]

# Create Additional Features into each Dataset as a Key to determine the associated Country of the Drama being commented
for reviews_frame, country_name in (
    (rtha_df, 'Thailand'),
    (rkor_df, 'South Korea'),
    (rjap_df, 'Japan'),
):
    reviews_frame['Country'] = country_name

In [None]:
def review_preprocess_pipeline(
    df: pd.DataFrame
) -> pd.DataFrame:
    """Derive per-review watch-progress and weighted-score features.

    ``ep_watched`` is expected to look like ``"<watched> of <total> episodes
    seen"``; missing values are treated as ``"0 of 0 episodes seen"``.

    Columns added:
      * ``tot_watched`` / ``tot_ep``: the two integers parsed from ``ep_watched``.
      * ``watched_ratio``: ``tot_watched / tot_ep`` rounded to three decimals,
        or 0.0 when ``tot_ep`` is 0 (so neither NaN nor inf can leak through).
      * ``avg_score``: row mean of ``story``, ``acting_cast``, ``music`` and
        ``rewatch_value``.
      * ``wrat_avg_score``: ``avg_score * watched_ratio``.

    Args:
        df: Raw review frame. It is not modified.

    Returns:
        A new frame with the columns above added and ``ep_watched`` and
        ``text`` removed.

    Raises:
        IndexError / ValueError: if an ``ep_watched`` value does not follow
            the expected format.
    """
    # Work on a copy so the caller's raw frame keeps its original columns
    # and the function gives the same result however often it is re-run.
    df = df.copy()

    # Extract Number of Episodes watched and Total Episode Number
    progress = df['ep_watched'].fillna('0 of 0 episodes seen')
    df['tot_watched'] = progress.apply(lambda x: int(str(x).split(' ')[0]))
    df['tot_ep'] = progress.apply(lambda x: int(str(x).split(' ')[2]))

    # Feature Engineering
    # A zero episode total gives 0/0 (NaN) or n/0 (inf); both mean "no usable
    # progress information", so the ratio is pinned to 0.0 in either case.
    ratio = (df['tot_watched'] / df['tot_ep']).where(df['tot_ep'] > 0, 0.0)
    df['watched_ratio'] = ratio.apply(lambda x: round(x, 3))

    df['avg_score'] = df[['story','acting_cast','music','rewatch_value']].mean(axis=1)
    df['wrat_avg_score'] = df['avg_score'] * df['watched_ratio']

    # Remove Features that wouldn't be part of the analysis
    return df.drop(columns=['ep_watched', 'text'])

In [None]:
# Run the review pipeline on the Thai reviews and preview the first rows of the result.
rtha_df_filtered = review_preprocess_pipeline(rtha_df)
rtha_df_filtered.head()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

def draw_boxplots_with_annotations(dataframe):
    """Draw one horizontal boxplot per column, annotated with summary statistics.

    The plots are laid out three per row. Each panel shows the mean, the
    population standard deviation, the number of IQR outliers (values outside
    ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]``) and the min/max of the column.

    Missing values are dropped once per column, so the box, the quartiles and
    the annotated statistics are all computed from the same observations.

    Args:
        dataframe: Frame whose columns are all numeric.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    num_columns = len(dataframe.columns)
    if num_columns == 0:
        # plt.subplots cannot build a grid with zero rows; nothing to draw.
        return
    num_rows = (num_columns + 2) // 3

    fig, axes = plt.subplots(nrows=num_rows, ncols=3, figsize=(16, 6 * num_rows // 2))
    axes = axes.flatten()

    for ax, col in zip(axes, dataframe.columns):
        ax.set_title(col)

        # np.percentile and Axes.boxplot do not skip NaN, so remove them up front.
        values = dataframe[col].dropna().to_numpy(dtype=float)
        if values.size == 0:
            # Column has no observations: leave the panel empty but titled.
            continue

        # Create a boxplot for the current column in the corresponding subplot
        ax.boxplot(values, vert=False, sym='b.',
                   whis=1.5, patch_artist=True, boxprops=dict(facecolor='lightblue'))

        # Outlier Calculation using IQR Method
        q1, q3 = np.percentile(values, [25, 75])
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        num_outliers = int(((values < lower_bound) | (values > upper_bound)).sum())

        mean = values.mean()
        std_dev = values.std()  # population std (ddof=0)
        annotation = f"Mean: {mean:.2f}\nStd Dev: {std_dev:.2f}\nOutliers: {num_outliers}"
        ax.text(0.62, 0.80, annotation, transform=ax.transAxes, fontsize=12,
                verticalalignment='center')

        min_score = values.min()
        max_score = values.max()
        minmax_annotation = f"Min: {min_score:.2f}\nMax: {max_score:.2f}"
        ax.text(0.62, 0.25, minmax_annotation, transform=ax.transAxes, fontsize=12,
                verticalalignment='center')

    # Remove the unused panels in the last row of the grid.
    for unused_ax in axes[num_columns:]:
        fig.delaxes(unused_ax)

    plt.tight_layout()
    plt.show()

# Numeric review columns to inspect: the raw score fields, the helpful-vote
# count, and the features added by review_preprocess_pipeline.
features = [
    'story', 'acting_cast', 'music', 'rewatch_value', 'overall',
    'n_helpful', 'tot_watched', 'tot_ep', 'watched_ratio',
    'avg_score', 'wrat_avg_score'
]

# One annotated boxplot per selected column of the processed Thai reviews.
draw_boxplots_with_annotations(rtha_df_filtered[features])


Whether to remove outliers before or after a feature-engineering step such as computing the weighted average score depends on the specific goals and requirements of this analysis. Looking at the score-related fields, we notice some outliers. However, these are values we want to capture. The minimum and maximum values of these fields lie between 1 and 10, so there is no sign of a hidden error in the data extraction or in the website's scoring process. These reviews are valuable insights even if they are outliers; therefore, we will not remove them for now.


**TODO:** Address the other features and how outliers in those features affect the final weighted score.
