$$
ETL
$$

#### ETL Peliculas

In [38]:
# importar librerias necesarias
import pandas as pd
import numpy as np

In [39]:
# Cargar los datasets
df_netflix = pd.read_csv('src/csv_peliculas/netflix_titles.csv')
df_amazon = pd.read_csv('src/csv_peliculas/amazon_prime_titles.csv')
df_hulu = pd.read_csv('src/csv_peliculas/hulu_titles.csv')
df_disney = pd.read_csv('src/csv_peliculas/disney_plus_titles.csv')

#### Transformaciones y normalizacion

In [40]:
# Generar campo ID
df_netflix['id'] = 'n' + df_netflix['show_id'].astype(str)
df_amazon['id'] = 'a' + df_amazon['show_id'].astype(str)
df_hulu['id'] = 'h' + df_hulu['show_id'].astype(str)
df_disney['id'] = 'h' + df_disney['show_id'].astype(str)

In [41]:
# Reemplazar valores nulos del campo rating
df_netflix['rating'] = df_netflix['rating'].fillna('G')
df_amazon['rating'] = df_amazon['rating'].fillna('G')
df_hulu['rating'] = df_hulu['rating'].fillna('G')
df_disney['rating'] = df_disney['rating'].fillna('G')

In [42]:
# Cambiar el formato de las fechas
df_netflix['date_added'] = pd.to_datetime(df_netflix['date_added'], errors='coerce')
df_netflix['date_added'] = df_netflix['date_added'].dt.strftime('%Y-%m-%d')

df_amazon['date_added'] = pd.to_datetime(df_amazon['date_added'], errors='coerce')
df_amazon['date_added'] = df_amazon['date_added'].dt.strftime('%Y-%m-%d')

df_hulu['date_added'] = pd.to_datetime(df_hulu['date_added'], errors='coerce')
df_hulu['date_added'] = df_hulu['date_added'].dt.strftime('%Y-%m-%d')

df_disney['date_added'] = pd.to_datetime(df_disney['date_added'], errors='coerce')
df_disney['date_added'] = df_disney['date_added'].dt.strftime('%Y-%m-%d')

In [43]:
# Convertir el campo duration en dos campos: duration_int y duration_type
df_netflix[['duration_int', 'duration_type']] = df_netflix['duration'].str.extract('(\d+) (\w+)')
df_netflix['duration_int'] = pd.to_numeric(df_netflix['duration_int'])

df_amazon[['duration_int', 'duration_type']] = df_amazon['duration'].str.extract('(\d+) (\w+)')
df_amazon['duration_int'] = pd.to_numeric(df_amazon['duration_int'])

df_hulu[['duration_int', 'duration_type']] = df_hulu['duration'].str.extract('(\d+) (\w+)')
df_hulu['duration_int'] = pd.to_numeric(df_hulu['duration_int'])

df_disney[['duration_int', 'duration_type']] = df_disney['duration'].str.extract('(\d+) (\w+)')
df_disney['duration_int'] = pd.to_numeric(df_disney['duration_int'])

In [44]:
# Convertir los campos de texto a minúsculas
df_netflix = df_netflix.apply(lambda x: x.str.lower() if x.dtype == "object" else x)
df_amazon = df_amazon.apply(lambda x: x.str.lower() if x.dtype == "object" else x)
df_hulu = df_hulu.apply(lambda x: x.str.lower() if x.dtype == "object" else x)
df_disney = df_disney.apply(lambda x: x.str.lower() if x.dtype == "object" else x)

In [45]:
# Concatenar los dataframes
df_concat = pd.concat([df_netflix, df_hulu, df_disney, df_amazon], axis=0, ignore_index=True)

# Mostrar el resultado
print(df_concat.head())


  show_id     type                  title         director  \
0      s1    movie   dick johnson is dead  kirsten johnson   
1      s2  tv show          blood & water              NaN   
2      s3  tv show              ganglands  julien leclercq   
3      s4  tv show  jailbirds new orleans              NaN   
4      s5  tv show           kota factory              NaN   

                                                cast        country  \
0                                                NaN  united states   
1  ama qamata, khosi ngema, gail mabalane, thaban...   south africa   
2  sami bouajila, tracy gotoas, samuel jouy, nabi...            NaN   
3                                                NaN            NaN   
4  mayur more, jitendra kumar, ranjan raj, alam k...          india   

   date_added  release_year rating   duration  \
0  2021-09-25          2020  pg-13     90 min   
1  2021-09-24          2021  tv-ma  2 seasons   
2  2021-09-24          2021  tv-ma   1 season   
3  2

In [46]:
df_concat.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,id,duration_int,duration_type
0,s1,movie,dick johnson is dead,kirsten johnson,,united states,2021-09-25,2020,pg-13,90 min,documentaries,"as her father nears the end of his life, filmm...",ns1,90.0,min
1,s2,tv show,blood & water,,"ama qamata, khosi ngema, gail mabalane, thaban...",south africa,2021-09-24,2021,tv-ma,2 seasons,"international tv shows, tv dramas, tv mysteries","after crossing paths at a party, a cape town t...",ns2,2.0,seasons
2,s3,tv show,ganglands,julien leclercq,"sami bouajila, tracy gotoas, samuel jouy, nabi...",,2021-09-24,2021,tv-ma,1 season,"crime tv shows, international tv shows, tv act...",to protect his family from a powerful drug lor...,ns3,1.0,season
3,s4,tv show,jailbirds new orleans,,,,2021-09-24,2021,tv-ma,1 season,"docuseries, reality tv","feuds, flirtations and toilet talk go down amo...",ns4,1.0,season
4,s5,tv show,kota factory,,"mayur more, jitendra kumar, ranjan raj, alam k...",india,2021-09-24,2021,tv-ma,2 seasons,"international tv shows, romantic tv shows, tv ...",in a city of coaching centers known to train i...,ns5,2.0,seasons


#### Importar DataFrame de ratings

In [47]:
ratings = pd.read_csv('src/csv_ratings/ratings_final.csv')
ratings.head()

Unnamed: 0,userId,score,timestamp,movieId
0,1,1.0,2015-03-09,as680
1,1,4.5,2015-03-09,ns2186
2,1,5.0,2015-03-09,hs2381
3,1,5.0,2015-03-09,ns3663
4,1,5.0,2015-03-09,as9500


In [48]:
# Agrupamos por 'movieid' y calculamos el promedio de las calificaciones
ratings = ratings.groupby('movieId').agg(np.mean).score.to_dict()

  ratings = ratings.groupby('movieId').agg(np.mean).score.to_dict()


In [49]:
# Unimos el dataframe de películas con el de ratings promedio, usando el campo 'id' como clave
df_concat['score'] = df_concat['id'].map(ratings)

In [50]:
df_concat['score']

0        3.611111
1        3.552632
2        3.597938
3        3.561616
4        3.593023
           ...   
22993    3.620915
22994    3.553215
22995    3.541750
22996    3.555102
22997    3.521739
Name: score, Length: 22998, dtype: float64

In [51]:
df_concat.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,id,duration_int,duration_type,score
0,s1,movie,dick johnson is dead,kirsten johnson,,united states,2021-09-25,2020,pg-13,90 min,documentaries,"as her father nears the end of his life, filmm...",ns1,90.0,min,3.611111
1,s2,tv show,blood & water,,"ama qamata, khosi ngema, gail mabalane, thaban...",south africa,2021-09-24,2021,tv-ma,2 seasons,"international tv shows, tv dramas, tv mysteries","after crossing paths at a party, a cape town t...",ns2,2.0,seasons,3.552632
2,s3,tv show,ganglands,julien leclercq,"sami bouajila, tracy gotoas, samuel jouy, nabi...",,2021-09-24,2021,tv-ma,1 season,"crime tv shows, international tv shows, tv act...",to protect his family from a powerful drug lor...,ns3,1.0,season,3.597938
3,s4,tv show,jailbirds new orleans,,,,2021-09-24,2021,tv-ma,1 season,"docuseries, reality tv","feuds, flirtations and toilet talk go down amo...",ns4,1.0,season,3.561616
4,s5,tv show,kota factory,,"mayur more, jitendra kumar, ranjan raj, alam k...",india,2021-09-24,2021,tv-ma,2 seasons,"international tv shows, romantic tv shows, tv ...",in a city of coaching centers known to train i...,ns5,2.0,seasons,3.593023


In [53]:
df_concat.to_csv('src/csv_peliculas/peliculas_final.csv', index=False)