# Análisis de datos de los dramas coreanos

In [68]:
import pandas as pd
from rich import print

In [69]:
# Execute the project's local pandas-extension script inside this kernel.
# The cells below use the `missing_data` DataFrame accessor, which is
# presumably registered by this script (its source is not part of the
# notebook) — confirm.
# NOTE(review): the saved output echoes the `register_dataframe_accessor`
# decorator line, which looks like the warning pandas emits when an accessor
# is registered again over an existing one, i.e. this cell was probably run
# more than once in the same kernel — confirm.
%run "../pandas_extensions/missing.py"

  @pd.api.extensions.register_dataframe_accessor("missing_data")


In [101]:
# Load the K-drama catalogue and use `kdrama_id` as the row index directly at
# read time. Building the indexed frame in one expression (instead of a
# follow-up `set_index(..., inplace=True)`) avoids mutating the frame in
# place after it has been created.
k_dramas: pd.DataFrame = pd.read_csv(
    './../data/csv/korean_drama.csv',
    index_col='kdrama_id',
)
# Preview the first rows to check that the file was parsed as expected.
k_dramas.head()

Unnamed: 0_level_0,drama_name,year,director,screenwriter,country,type,tot_eps,duration,start_dt,end_dt,aired_on,org_net,content_rt,synopsis,rank,pop
kdrama_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
661d4193916c4e71a2c70473ab11e9e8,Sing My Crush,2023,['So Joon Moon'],,South Korea,Drama,8,1500.0,"Aug 2, 2023","Aug 2, 2023",Wednesday,,Not Yet Rated,Follow the story of acquaintances Ba Ram and H...,1484,2238
5ffcbeaa17114714af1959129984274c,D.P. Season 2,2023,,['Kim Bo Tong'],South Korea,Drama,6,3000.0,"Jul 28, 2023","Jul 28, 2023",Friday,Netflix,15+ - Teens 15 or older,This unfolding story ensues when military dese...,164,1084
65075cb9c1a54be4a441cee6f16c9fdf,Shadow Detective Season 2,2023,['Han Dong Hwa'],"['Song Jung Woo', 'Hwang Seol Hun']",South Korea,Drama,8,3300.0,2023-07-05,2023-07-26,Wednesday,Disney+ Hulu,15+ - Teens 15 or older,Unfolds the ultimate counterattack of veteran ...,2443,6915
df0f0ac4b3ff4b15afa26f5a7a53a328,To Be Honest,2023,,,South Korea,Drama,3,600.0,2023-06-30,2023-07-14,Friday,,Not Yet Rated,Don't you have those days where the whole univ...,49895,99999
04c1fe41948e464fb440001831d74d41,Celebrity,2023,['Kim Chul Gyu'],['Kim Yi Young'],South Korea,Drama,12,2700.0,"Jun 30, 2023","Jun 30, 2023",Friday,Netflix,18+ Restricted (violence & profanity),Fame. Money. Power. One young woman fights to ...,826,547


In [71]:
# Report the frame's dimensions. `kdrama_id` is the index, so it is not
# counted among the columns.
print(f'k-dramas has {k_dramas.shape[0]} rows and {k_dramas.shape[1]} columns')

In [72]:
# List the column labels (the `kdrama_id` index is not part of `columns`).
print(f'The columns of k-dramas dataframe are {list(k_dramas.columns)}')

## Análisis Exploratorio de Datos Faltantes

### Resumen general de datos faltantes

In [73]:
# Manual computation of the overall missing-data figures with plain pandas.
# None of these variables is read again by the cells visible below, which
# obtain their numbers through the `missing_data` accessor instead.
is_na_table = k_dramas.isna()  # boolean mask, True where a cell is missing
na_count_per_column = is_na_table.sum()  # missing cells per column
na_tot_count = na_count_per_column.sum()  # missing cells in the whole frame
no_na_tot_count = k_dramas.size - na_tot_count  # `size` = rows * columns
percentage_na = (na_tot_count / k_dramas.size) * 100  # share of missing cells, in percent

#### Valores sueltos para describir los datos faltantes

Se puede ver que la mayoría de las columnas (10 de 16) no tiene ningún valor faltante.

Las dos columnas con mayor número de valores faltantes son 'screenwriter' (793) y 'director' (716). Las cantidades de valores faltantes en estas dos columnas son similares, así que más adelante se analizará si tienen algún tipo de relación.

La siguiente columna con más valores faltantes es 'org_net' (408), con aproximadamente la mitad de valores faltantes que las dos anteriores.

Luego se encuentra la columna 'aired_on' (232), que de nuevo tiene cerca de la mitad de valores faltantes que la anterior.

Le sigue la columna 'synopsis', con 168 valores faltantes, una cantidad algo menor que la de 'aired_on'.

Finalmente, entre las columnas con datos faltantes, la que menos tiene es 'duration', con solo 24 valores faltantes (1.37 % de las filas). Por ello, es posible que las filas con valores faltantes en esta columna se puedan eliminar.

In [74]:
# `print` here is rich's `print` (imported at the top of the notebook), so
# `[underline]...[/underline]` is console markup rather than literal text.
print(f'[underline]Count na per column[/underline]: \n\n{k_dramas.missing_data.count_na_per_column}')

In [75]:
# Missing-value count per column, largest first.
k_dramas.missing_data.count_na_per_column.sort_values(ascending=False)

screenwriter    793
director        716
org_net         408
aired_on        232
synopsis        168
duration         24
drama_name        0
year              0
country           0
type              0
tot_eps           0
start_dt          0
end_dt            0
content_rt        0
rank              0
pop               0
dtype: int64

In [76]:
# Share of missing values in each column, expressed in percent of the rows.
k_dramas.missing_data.count_na_per_column_percentage

drama_name       0.000000
year             0.000000
director        40.867580
screenwriter    45.262557
country          0.000000
type             0.000000
tot_eps          0.000000
duration         1.369863
start_dt         0.000000
end_dt           0.000000
aired_on        13.242009
org_net         23.287671
content_rt       0.000000
synopsis         9.589041
rank             0.000000
pop              0.000000
dtype: float64

In [77]:
# Number of missing values in each row (one entry per `kdrama_id`).
k_dramas.missing_data.count_na_per_row

kdrama_id
661d4193916c4e71a2c70473ab11e9e8    2
5ffcbeaa17114714af1959129984274c    1
65075cb9c1a54be4a441cee6f16c9fdf    0
df0f0ac4b3ff4b15afa26f5a7a53a328    3
04c1fe41948e464fb440001831d74d41    0
                                   ..
e8c09f0f7d0d4b75a4642eb06d76d811    0
1e7f7f2fa45d4240b8bb585ca13040f0    0
142a09741fdd4daa878d6ae007201e43    0
9170c31a02014b56bb4cec2105086f00    0
c19353bdbc7a43ae98b27db202e7c908    5
Length: 1752, dtype: int64

In [78]:
# Share of each row's columns that are missing, in percent
# (e.g. 2 missing out of 16 columns = 12.5).
k_dramas.missing_data.count_na_per_row_percentage

kdrama_id
661d4193916c4e71a2c70473ab11e9e8    12.50
5ffcbeaa17114714af1959129984274c     6.25
65075cb9c1a54be4a441cee6f16c9fdf     0.00
df0f0ac4b3ff4b15afa26f5a7a53a328    18.75
04c1fe41948e464fb440001831d74d41     0.00
                                    ...  
e8c09f0f7d0d4b75a4642eb06d76d811     0.00
1e7f7f2fa45d4240b8bb585ca13040f0     0.00
142a09741fdd4daa878d6ae007201e43     0.00
9170c31a02014b56bb4cec2105086f00     0.00
c19353bdbc7a43ae98b27db202e7c908    31.25
Length: 1752, dtype: float64

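El total de datos faltantes es de 2341, lo que corresponde al 8.35 % de los datos, por lo que se puede decir que la cantidad global de datos faltantes es relativamente baja. Sin embargo, estos se concentran en pocas columnas: 'screenwriter' y 'director' superan el 40 % de valores faltantes.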

In [79]:
# Total number of missing cells in the whole frame.
print(f'Total NA count: {k_dramas.missing_data.total_count_na}')

In [80]:
# Total number of cells that do hold a value.
print(f'No NA count: {k_dramas.missing_data.total_count_not_na}')

In [81]:
# Overall share of missing cells, shown with two decimals.
print(f'Percentage NA: {k_dramas.missing_data.total_count_na_percentage:.2f}%')

In [82]:
# Overall share of non-missing cells, shown with two decimals.
print(f'Percentage no NA: {k_dramas.missing_data.total_count_not_na_percentage:.2f}%')

#### Tablas para describir los datos faltantes

In [83]:
# Per-row missing-value count and percentage, side by side in one table.
k_dramas.missing_data.na_count_and_percentage_df("row")

Unnamed: 0_level_0,count,percentage
kdrama_id,Unnamed: 1_level_1,Unnamed: 2_level_1
661d4193916c4e71a2c70473ab11e9e8,2,12.50
5ffcbeaa17114714af1959129984274c,1,6.25
65075cb9c1a54be4a441cee6f16c9fdf,0,0.00
df0f0ac4b3ff4b15afa26f5a7a53a328,3,18.75
04c1fe41948e464fb440001831d74d41,0,0.00
...,...,...
e8c09f0f7d0d4b75a4642eb06d76d811,0,0.00
1e7f7f2fa45d4240b8bb585ca13040f0,0,0.00
142a09741fdd4daa878d6ae007201e43,0,0.00
9170c31a02014b56bb4cec2105086f00,0,0.00


In [85]:
# Frequency of missing-value counts across columns: the index is a number of
# NAs and the value is how many columns have exactly that many.
k_dramas.missing_data.coincidence_count_na_per("column")

0      10
24      1
168     1
232     1
408     1
716     1
793     1
Name: count, dtype: int64

In [86]:
# Same column-wise frequency table, extended with the share of columns in
# each group (10 of 16 columns, 62.5 %, have no missing values).
k_dramas.missing_data.coincidence_and_percentage_count_na_per("column")

Unnamed: 0,count,percentage
0,10,62.5
24,1,6.25
168,1,6.25
232,1,6.25
408,1,6.25
716,1,6.25
793,1,6.25


In [87]:
# Row-wise frequency table: how many rows have 0, 1, 2, ... missing values
# and what share of all rows that is (859 rows, about 49 %, are complete).
k_dramas.missing_data.coincidence_and_percentage_count_na_per("row")

Unnamed: 0,count,percentage
0,859,49.02968
1,173,9.874429
2,257,14.66895
3,268,15.296804
4,125,7.134703
5,70,3.995434


In [91]:
# Missing vs. present `duration` values counted over groups of 50 rows
# (presumably consecutive rows in frame order — confirm in missing.py).
k_dramas.missing_data.na_count_by_intervals(50, "duration")

Unnamed: 0_level_0,count_na,count_of_not_na,percentage_of_not_na,percentage_of_na
groupby_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0,50,100.0,0.0
1,1,49,98.0,2.0
2,2,48,96.0,4.0
3,0,50,100.0,0.0
4,0,50,100.0,0.0
5,1,49,98.0,2.0
6,0,50,100.0,0.0
7,0,50,100.0,0.0
8,1,49,98.0,2.0
9,1,49,98.0,2.0


In [92]:
# Same breakdown, but the first argument appears to be the number of groups
# rather than the group size: 73 bins over 1752 rows gives the 24 rows per
# group seen in the output — confirm in missing.py.
k_dramas.missing_data.na_count_by_bins(73, "duration")

Unnamed: 0_level_0,count_na,count_of_not_na,percentage_of_not_na,percentage_of_na
groupby_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0,24,100.000000,0.000000
1,0,24,100.000000,0.000000
2,0,24,100.000000,0.000000
3,0,24,100.000000,0.000000
4,3,21,87.500000,12.500000
...,...,...,...,...
68,1,23,95.833333,4.166667
69,0,24,100.000000,0.000000
70,0,24,100.000000,0.000000
71,0,24,100.000000,0.000000


In [95]:
# Run-length style view of `duration`: sizes of alternating stretches of
# present (`not_na`) and missing (`na`) values, presumably in row order —
# confirm in missing.py.
k_dramas.missing_data.size_of_sections_of_na_and_not_na("duration")

Unnamed: 0,num_in_section,value
0,97,not_na
1,1,na
2,4,not_na
3,1,na
4,5,not_na
5,1,na
6,157,not_na
7,1,na
8,177,not_na
9,1,na


### Visualizaciones de datos faltantes

---

## Análisis Exploratorio de Datos

In [97]:
# k_dramas.info()

In [98]:
# k_dramas.describe()

Se descubrió que la duración posiblemente no corresponde al dorama completo, sino a cada episodio. Por ello, se procede a crear una nueva columna con la duración total del dorama (número de episodios × duración por episodio).

In [99]:
# NOTE(review): this cell is currently disabled, although the markdown above
# says the total-duration column is created. If it is re-enabled, note that
# `duration` has missing values, so `tot_duration` would be NaN for those rows.
# k_dramas['tot_duration'] = k_dramas['tot_eps'] * k_dramas['duration']

In [100]:
# k_dramas.plot(kind='scatter', x='tot_eps', y='tot_duration', figsize=(14, 6), grid=True)