# Análisis de datos de los dramas coreanos

In [1]:
import pandas as pd
from rich import print
import numpy as np

In [2]:
k_dramas: pd.DataFrame = pd.read_csv('./../data/csv/korean_drama.csv')
k_dramas.set_index('kdrama_id', inplace=True)
k_dramas

Unnamed: 0_level_0,drama_name,year,director,screenwriter,country,type,tot_eps,duration,start_dt,end_dt,aired_on,org_net,content_rt,synopsis,rank,pop
kdrama_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
661d4193916c4e71a2c70473ab11e9e8,Sing My Crush,2023,['So Joon Moon'],,South Korea,Drama,8,1500.0,"Aug 2, 2023","Aug 2, 2023",Wednesday,,Not Yet Rated,Follow the story of acquaintances Ba Ram and H...,1484,2238
5ffcbeaa17114714af1959129984274c,D.P. Season 2,2023,,['Kim Bo Tong'],South Korea,Drama,6,3000.0,"Jul 28, 2023","Jul 28, 2023",Friday,Netflix,15+ - Teens 15 or older,This unfolding story ensues when military dese...,164,1084
65075cb9c1a54be4a441cee6f16c9fdf,Shadow Detective Season 2,2023,['Han Dong Hwa'],"['Song Jung Woo', 'Hwang Seol Hun']",South Korea,Drama,8,3300.0,2023-07-05,2023-07-26,Wednesday,Disney+ Hulu,15+ - Teens 15 or older,Unfolds the ultimate counterattack of veteran ...,2443,6915
df0f0ac4b3ff4b15afa26f5a7a53a328,To Be Honest,2023,,,South Korea,Drama,3,600.0,2023-06-30,2023-07-14,Friday,,Not Yet Rated,Don't you have those days where the whole univ...,49895,99999
04c1fe41948e464fb440001831d74d41,Celebrity,2023,['Kim Chul Gyu'],['Kim Yi Young'],South Korea,Drama,12,2700.0,"Jun 30, 2023","Jun 30, 2023",Friday,Netflix,18+ Restricted (violence & profanity),Fame. Money. Power. One young woman fights to ...,826,547
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
e8c09f0f7d0d4b75a4642eb06d76d811,"Kill Me, Heal Me",2015,"['Kim Jin Man', 'Kim Dae Jin']",['Jin Soo Wan'],South Korea,Drama,20,3720.0,2015-01-07,2015-03-12,"Wednesday, Thursday",MBC,15+ - Teens 15 or older,A traumatic childhood experience leaves Cha Do...,232,29
1e7f7f2fa45d4240b8bb585ca13040f0,Enchanting Neighbor,2015,['Park Kyung Ryul'],['Park Hye Ryun'],South Korea,Drama,120,2100.0,2015-01-05,2015-06-22,"Monday, Tuesday, Wednesday, Thursday, Friday",SBS,15+ - Teens 15 or older,"Gong Soo Rae only love her husband, Seo Bong G...",47882,10759
142a09741fdd4daa878d6ae007201e43,Iron Lady Cha,2015,"['Jang Joon Ho', 'Oh Hyun Chang']",['Seol Kyung Eun'],South Korea,Drama,111,2400.0,2015-01-05,2015-06-12,"Monday, Tuesday, Wednesday, Thursday, Friday",MBC,15+ - Teens 15 or older,People born as a member of the baby boomer gen...,58155,14841
9170c31a02014b56bb4cec2105086f00,The Family Is Coming,2015,['Joo Dong Min'],['Kim Shin Hye'],South Korea,Drama,20,3600.0,2015-01-03,2015-03-15,"Saturday, Sunday",SBS,15+ - Teens 15 or older,A grandma has abandoned her family 50 years ag...,7995,4289


In [3]:
print(f'k_dramas has {k_dramas.shape[0]} rows and {k_dramas.shape[1]} columns')

In [4]:
print(f'The columns of k_dramas dataframe are {k_dramas.columns}')

## Análisis Exploratorio de Datos Faltantes

### Resumen general de datos faltantes

In [5]:
is_na_table = k_dramas.isna()

#### Valores sueltos para describir los datos faltantes

In [6]:
na_count_per_column = is_na_table.sum()
print(f'[underline]NA count per column[/underline]: \n\n{na_count_per_column}')

In [7]:
na_tot_count = na_count_per_column.sum()
print(f'Total NA count: {na_tot_count}')

In [8]:
no_na_tot_count = k_dramas.size - na_tot_count
print(f'No NA count: {no_na_tot_count}')

In [9]:
percentage_na = (na_tot_count / k_dramas.size) * 100
print(f'Percentage NA: {percentage_na:.2f}%')

#### Tablas pra describir los datos faltantes

In [10]:
percentage_na_per_column: pd.DataFrame = (
    na_count_per_column
    .pipe(
        lambda df: (
            df
            .reset_index(name='na_count')
            .rename(columns={'index': 'column_name'})
            .assign(
                num_cases=len(k_dramas),
                percentage_na=lambda x: (x.na_count / len(k_dramas)) * 100
            )
        )
    )
)
percentage_na_per_column

Unnamed: 0,column_name,na_count,num_cases,percentage_na
0,drama_name,0,1752,0.0
1,year,0,1752,0.0
2,director,716,1752,40.86758
3,screenwriter,793,1752,45.262557
4,country,0,1752,0.0
5,type,0,1752,0.0
6,tot_eps,0,1752,0.0
7,duration,24,1752,1.369863
8,start_dt,0,1752,0.0
9,end_dt,0,1752,0.0


In [11]:
(
    na_count_per_column
    .value_counts()
    .reset_index()
    .rename(columns={
        "index": "count_na_values",
        "count": "coincidences"
    })
    .assign(
        percentage=lambda df: (df.coincidences / df.coincidences.sum()) * 100
    )
    .sort_values('percentage', ascending=False)
)

Unnamed: 0,count_na_values,coincidences,percentage
0,0,10,62.5
1,716,1,6.25
2,793,1,6.25
3,24,1,6.25
4,232,1,6.25
5,408,1,6.25
6,168,1,6.25


In [12]:
percentage_na_per_row: pd.DataFrame = (
    is_na_table
    .assign(
        num_na_per_row=lambda df: df.apply(
            axis=1,
            func=lambda row: row.sum()
        ),
        percentage_na_per_row=lambda df: (df.num_na_per_row / df.shape[1]) * 100
    )
)[['num_na_per_row', 'percentage_na_per_row']]
percentage_na_per_row

Unnamed: 0_level_0,num_na_per_row,percentage_na_per_row
kdrama_id,Unnamed: 1_level_1,Unnamed: 2_level_1
661d4193916c4e71a2c70473ab11e9e8,2,11.764706
5ffcbeaa17114714af1959129984274c,1,5.882353
65075cb9c1a54be4a441cee6f16c9fdf,0,0.000000
df0f0ac4b3ff4b15afa26f5a7a53a328,3,17.647059
04c1fe41948e464fb440001831d74d41,0,0.000000
...,...,...
e8c09f0f7d0d4b75a4642eb06d76d811,0,0.000000
1e7f7f2fa45d4240b8bb585ca13040f0,0,0.000000
142a09741fdd4daa878d6ae007201e43,0,0.000000
9170c31a02014b56bb4cec2105086f00,0,0.000000


In [13]:
(
    percentage_na_per_row
    .value_counts('num_na_per_row')
    .reset_index()
    .rename(columns={
        'count': 'coincidences'
    })
    .assign(
        percentage=lambda df: (df.coincidences / df.coincidences.sum()) * 100
    )
)

Unnamed: 0,num_na_per_row,coincidences,percentage
0,0,859,49.02968
1,3,268,15.296804
2,2,257,14.66895
3,1,173,9.874429
4,4,125,7.134703
5,5,70,3.995434


In [14]:
def missing_variable_span(data: pd.DataFrame, variable: str, span_every: int) -> pd.DataFrame:
    return(
        data
        .assign(
            span_counter=lambda df: np.repeat(range(df.shape[0]), span_every)[:df.shape[0]] # Volver hacer pero no con un intervalo sino por número de secciones
        )
        .groupby('span_counter')
        .aggregate(
            num_in_span=(variable, 'size'),
            num_of_na=(variable, lambda x: x.isna().sum())
        )
        .assign(
            num_of_no_na=lambda df: df.num_in_span - df.num_of_na,
            percentage_of_na=lambda df: (df.num_of_na / df.num_in_span) * 100,
            percentage_of_no_na=lambda df: (df.num_of_no_na / df.num_in_span) * 100
        )
        .drop(columns=['num_in_span'])
    )

for column in k_dramas.columns:
    df = missing_variable_span(k_dramas, column, 50)
    if df.num_of_na.sum() > 0:
        print(f'[underline]Missing variable span for [purple]{column}[/purple][/underline]\n')
        print(df)
        print('============================================================')

In [23]:
def missing_variable_intervals(data: pd.DataFrame, variable: str, num_of_intervals: int) -> pd.DataFrame:
    return(
        data
        .assign(
            span_counter=lambda df: np.repeat(range(df.shape[0]), df.shape[0]/num_of_intervals)[:df.shape[0]]
        )
        .groupby('span_counter')
        .aggregate(
            num_in_span=(variable, 'size'),
            num_of_na=(variable, lambda x: x.isna().sum())
        )
        .assign(
            num_of_no_na=lambda df: df.num_in_span - df.num_of_na,
            percentage_of_na=lambda df: (df.num_of_na / df.num_in_span) * 100,
            percentage_of_no_na=lambda df: (df.num_of_no_na / df.num_in_span) * 100
        )
        .drop(columns=['num_in_span'])
    )

for column in k_dramas.columns:
    df = missing_variable_intervals(k_dramas, column, 13)
    if df.num_of_na.sum() > 0:
        print(f'[underline]Missing variable intervals for [purple]{column}[/purple][/underline]\n')
        print(df)
        print('============================================================')

In [30]:
import itertools


def missing_variable_sections(data: pd.DataFrame, variable: str) -> pd.DataFrame:
    rle_list = data[variable].pipe(
        lambda x: (
            [(len(list(group)), key) for key, group in itertools.groupby(x.isna())]
        )
    )

    return(
        pd.DataFrame(
            rle_list,
            columns=['num_in_section', 'value']
        )
        .replace({
            False: 'no_na',
            True: 'na'
        })
    )

for column in k_dramas.columns:
    df = missing_variable_sections(k_dramas, column)
    print(f'[underline]Missing variable sections for [purple]{column}[/purple][/underline]\n')
    print(df)

---

## Análisis Exploratorio de Datos

In [15]:
# k_dramas.info()

In [16]:
# k_dramas.describe()

Se descubrió que la duración puede que no sea por el dorama completo, sino por episodio. Por lo que se procede a hacer una nueva columna con la duración total del dorama.

In [17]:
# k_dramas['tot_duration'] = k_dramas['tot_eps'] * k_dramas['duration']

In [18]:
# k_dramas.plot(kind='scatter', x='tot_eps', y='tot_duration', figsize=(14, 6), grid=True)