<a href="https://colab.research.google.com/github/fowardelcac/Tp2_sem/blob/main/proccess_df_to_ddb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [70]:
import sqlite3
import pandas as pd
import numpy as np

In [71]:
def process(df: pd.DataFrame):
    """Clean the raw frame before it is split into tables.

    The work is done on new objects, so the caller's frame is not modified:

    1. fully duplicated rows are dropped;
    2. rows with a missing "country" or "designation" are dropped;
    3. missing "price" values are filled with random draws from the prices
       observed in the remaining rows.

    Args:
        df: Frame with at least "country", "designation" and "price" columns.

    Returns:
        The cleaned DataFrame. Surviving rows keep their original index
        labels (the index is not reset).
    """

    def random_imputer(df_filter: pd.DataFrame, col: str):
        """Fill NaNs in `col` by sampling, with replacement, from its non-null values."""
        missing = df_filter[col].isna()
        n_missing = int(missing.sum())
        pool = df_filter.loc[~missing, col].to_numpy()

        # np.random.choice raises ValueError on an empty pool, so when there
        # is nothing to sample from the NaNs are left in place.
        if n_missing == 0 or pool.size == 0:
            return df_filter

        # One vectorised draw instead of one np.random.choice call per row.
        df_filter.loc[missing, col] = np.random.choice(pool, size=n_missing)
        return df_filter

    deduplicated = df.drop_duplicates()
    # Explicit copy: the imputer assigns into this frame in place.
    df_filter = deduplicated.dropna(subset=["country", "designation"]).copy()
    return random_imputer(df_filter, "price")

In [72]:
def build_country(df: pd.DataFrame):
    """Build the country lookup table.

    Returns a frame with one row per distinct non-null value of
    df["country"], in order of first appearance, plus a 1-based
    "country_id" column.
    """
    names = df["country"].dropna().unique()
    table = pd.DataFrame({"country": names})
    table["country_id"] = table.index + 1
    return table

def build_prov(df: pd.DataFrame, country_df: pd.DataFrame):
    """Build the province lookup table.

    Args:
        df: Frame with "province" and "country" columns.
        country_df: Country table with "country" and "country_id" columns.

    Returns:
        A frame with columns "province", "country_id" and a 1-based
        "province_id", one row per distinct (province, country) pair where
        both values are present. "country_id" is NaN for a country that
        does not appear in `country_df` (left merge).
    """
    # Distinct provinces together with the country they belong to.
    pairs = df[["province", "country"]].dropna().drop_duplicates()
    # Swap the country name for its id.
    merged = pairs.merge(
        country_df[["country_id", "country"]], on="country", how="left"
    )
    # Copy before adding a column so the assignment does not target a
    # selection of `merged` (avoids SettingWithCopyWarning).
    provinces_df = merged[["province", "country_id"]].copy()
    # merge() returns a fresh 0..n-1 index, so this yields ids 1..n.
    provinces_df["province_id"] = provinces_df.index + 1
    return provinces_df

def build_region(df: pd.DataFrame, province_df: pd.DataFrame):
    """Build the region lookup table.

    Args:
        df: Frame with "province" and "region_1" columns.
        province_df: Province table with "province" and "province_id" columns.

    Returns:
        A frame with columns "region", "province_id" and a 1-based
        "region_id", built from the distinct (province, region_1) pairs
        where both values are present.
    """
    pairs = df[["province", "region_1"]].dropna().drop_duplicates()
    # NOTE(review): the join key is the province name only. If `province_df`
    # lists the same name more than once (e.g. under different countries),
    # each matching region is repeated once per match — confirm against the
    # data; resolving it would need the country on both sides.
    merged = pairs.merge(
        province_df[["province_id", "province"]], on="province", how="left"
    )
    # Copy before mutating so the assignments below do not target a
    # selection of `merged` (avoids SettingWithCopyWarning).
    region_df = merged[["region_1", "province_id"]].copy()
    region_df.columns = ["region", "province_id"]
    # merge() returns a fresh 0..n-1 index, so this yields ids 1..n.
    region_df["region_id"] = region_df.index + 1
    return region_df

def build_variety(df: pd.DataFrame):
    """Build the variety lookup table.

    Returns a frame with one row per distinct non-null value of
    df["variety"], in order of first appearance, plus a 1-based
    "variety_id" column.
    """
    names = df["variety"].dropna().unique()
    table = pd.DataFrame({"variety": names})
    table["variety_id"] = table.index + 1
    return table

def build_winery(df: pd.DataFrame):
    """Build the winery lookup table.

    Returns a frame with one row per distinct non-null value of
    df["winery"], in order of first appearance, plus a 1-based
    "winery_id" column.
    """
    names = df["winery"].dropna().unique()
    table = pd.DataFrame({"winery": names})
    table["winery_id"] = table.index + 1
    return table

def build_taster(df: pd.DataFrame):
    """Build the taster lookup table.

    Args:
        df: Frame with "taster_name" and "taster_twitter_handle" columns.

    Returns:
        A frame with columns "taster_name", "taster_twitter_handle" and a
        1-based "taster_id", with exactly one row per distinct non-null
        taster name. The handle is the first non-null one seen for that
        taster, or NaN when none is recorded.
    """
    names = df["taster_name"].dropna().unique()
    taster_df = pd.DataFrame({"taster_name": names})

    # Keep a single handle per taster. Without subset=..., a taster with two
    # different handles would appear twice after the merge and receive two ids.
    handles = (
        df[["taster_name", "taster_twitter_handle"]]
        .dropna()
        .drop_duplicates(subset="taster_name")
    )
    taster_df = taster_df.merge(handles, on="taster_name", how="left")
    taster_df["taster_id"] = taster_df.index + 1
    return taster_df

def build_wine(df: pd.DataFrame, country_table: pd.DataFrame, province_table: pd.DataFrame, region_table: pd.DataFrame, variety_table: pd.DataFrame, winery_table: pd.DataFrame):
    """Build the wine table, replacing names with lookup-table ids.

    One output row is produced per row of `df` (given unique names in the
    lookup tables). `province_table` and `region_table` are accepted but
    not used.

    Returns:
        A frame with columns "title", "designation", "points", "price",
        "variety_id", "winery_id", "country_id" and a 1-based "wine_id".
    """
    # (lookup table, id column, name column), resolved in this order.
    lookups = (
        (variety_table, "variety_id", "variety"),
        (winery_table, "winery_id", "winery"),
        (country_table, "country_id", "country"),
    )

    wine_df = df[['title', 'designation', 'points', 'price', 'variety', 'winery', 'country']]
    for table, id_col, name_col in lookups:
        wine_df = wine_df.merge(table[[id_col, name_col]], on=name_col, how='left')

    # The names are now represented by their ids.
    wine_df = wine_df.drop(columns=['variety', 'winery', 'country'])
    wine_df['wine_id'] = wine_df.index + 1
    return wine_df

def build_review(df: pd.DataFrame, taster_table: pd.DataFrame, wine_table: pd.DataFrame):
    """Build the review table: one row per row of `df`.

    Args:
        df: Frame with "taster_name", "title" and "description" columns.
        taster_table: Taster table with "taster_id" and "taster_name" columns.
        wine_table: Wine table with "wine_id" and "title" columns.

    Returns:
        A frame with columns "description", "taster_id", "wine_id" and a
        1-based "review_id". Reviews whose taster is missing or unknown get
        taster_id 0. When several wines share a title, the review is linked
        to the first of them.
    """
    # Reduce each lookup to one row per join key. Merging against a
    # non-unique key would emit one output row per match and multiply
    # the reviews.
    taster_keys = taster_table[["taster_id", "taster_name"]].drop_duplicates(
        subset="taster_name"
    )
    wine_keys = wine_table[["wine_id", "title"]].drop_duplicates(subset="title")

    review_table = df[["taster_name", "title", "description"]]
    review_table = review_table.merge(taster_keys, on="taster_name", how="left")
    # Unmatched tasters come back as NaN from the left merge; store them as id 0.
    review_table["taster_id"] = review_table["taster_id"].fillna(0).astype(int)

    review_table = review_table.merge(wine_keys, on="title", how="left")
    review_table = review_table.drop(columns=["title", "taster_name"])
    review_table["review_id"] = review_table.index + 1
    return review_table

In [73]:
def processor(source: str = "https://media.githubusercontent.com/media/fowardelcac/Tp2_sem/refs/heads/main/winemag-data-130k-v2.csv"):
    """Load the reviews CSV, clean it and split it into eight tables.

    Args:
        source: Path or URL passed to `pd.read_csv`. Defaults to the CSV
            hosted in the project's repository. The file is expected to
            contain an "Unnamed: 0" column, which is dropped.

    Returns:
        A tuple of eight DataFrames, each indexed by its own id column, in
        this order: country, province, region, variety, winery, taster,
        wine, review.
    """
    data = pd.read_csv(source).drop(columns="Unnamed: 0")
    df = process(data)

    country_table = build_country(df)
    province_table = build_prov(df, country_table)
    region_table = build_region(df, province_table)
    variety_table = build_variety(df)
    winery_table = build_winery(df)
    taster_table = build_taster(df)
    # Placeholder taster with id 0, the id build_review assigns to reviews
    # without a matching taster. Values follow the table's column order:
    # taster_name, taster_twitter_handle, taster_id.
    taster_table.loc[len(taster_table)] = ['unknown', 'NaN', 0]

    wine_table = build_wine(df, country_table, province_table, region_table, variety_table, winery_table)
    review_table = build_review(df, taster_table, wine_table)

    return (
        country_table.set_index("country_id"),
        province_table.set_index("province_id"),
        region_table.set_index("region_id"),
        variety_table.set_index("variety_id"),
        winery_table.set_index("winery_id"),
        taster_table.set_index("taster_id"),
        wine_table.set_index("wine_id"),
        review_table.set_index("review_id"),
    )


In [74]:
# Run the full pipeline once and bind each resulting table to its own name.
(
    country_table,
    province_table,
    region_table,
    variety_table,
    winery_table,
    taster_table,
    wine_table,
    review_table,
) = processor()

In [79]:
# Print the first row of every table, each followed by a line of 100 asterisks.
l = [
    country_table,
    province_table,
    region_table,
    variety_table,
    winery_table,
    taster_table,
    wine_table,
    review_table,
]
for i in l:
    print(i.head(1))
    print("*" * 100)

           country
country_id        
1            Italy
****************************************************************************************************
                      province  country_id
province_id                               
1            Sicily & Sardinia           1
****************************************************************************************************
          region  province_id
region_id                    
1           Etna            1
****************************************************************************************************
                variety
variety_id             
1           White Blend
****************************************************************************************************
            winery
winery_id         
1          Nicosia
****************************************************************************************************
             taster_name taster_twitter_handle
taster_id                                  