<a href="https://colab.research.google.com/github/fowardelcac/Tp2_sem/blob/main/proccess_df_to_ddb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import sqlite3
import pandas as pd
import numpy as np

# Preprocesamiento de datos con Python

In [2]:
def process(df: pd.DataFrame):
    # Función para imputar valores faltantes de forma aleatoria
    def random_imputer(df_filter: pd.DataFrame, col: str):
        # Seleccionar valores no nulos de la columna para la imputación
        non_null_values = df_filter[col].dropna().values

        # Aplicar un valor aleatorio de non_null_values donde hay NaN en la columna
        df_filter.loc[:, col] = df_filter[col].apply(
            lambda x: np.random.choice(non_null_values) if pd.isnull(x) else x
        )
        return df_filter

    # Crear una copia del DataFrame original
    df = df.copy()
    df.drop_duplicates(inplace=True)  # Eliminar duplicados
    df_filter = df.dropna(
        subset=["country", "designation"]
    )  # Filtrar filas con valores nulos en las columnas clave
    df_filter = random_imputer(
        df_filter, "price"
    )  # Imputar valores faltantes en la columna "price"
    return df_filter

In [3]:
def build_country(df: pd.DataFrame):
  unique_countries = df['country'].dropna().unique()
  countries_df = pd.DataFrame(unique_countries, columns=['country'])
  countries_df['country_id'] = countries_df.index + 1
  return countries_df

def build_prov(df: pd.DataFrame, country_df: pd.DataFrame):
  unique_provinces = df[['province', 'country']].dropna().drop_duplicates()  # Obtener provincias con su país
  unique_provinces = unique_provinces.merge(country_df[['country_id', 'country']], on='country', how='left')  # Unir con la tabla de países
  provinces_df = unique_provinces[['province', 'country_id']]  # Seleccionar las columnas necesarias
  provinces_df['province_id'] = provinces_df.index + 1
  provinces_df.columns = ['province', 'country_id', 'province_id',]
  return provinces_df # Renombrar las columnas para mantener consistencia

def build_region(df: pd.DataFrame, province_df: pd.DataFrame):
  unique_region = df[['province', 'region_1']].dropna().drop_duplicates()
  unique_region = unique_region.merge(province_df[['province_id', 'province']], on='province', how='left')
  region_df = unique_region[['region_1', 'province_id']]
  region_df['region_id'] = region_df.index + 1
  region_df.columns = ['region', 'province_id', 'region_id']
  return region_df

def build_variety(df: pd.DataFrame):
  unique_varieties = df['variety'].dropna().unique()
  var_df = pd.DataFrame(unique_varieties, columns=['variety'])
  var_df['variety_id'] = var_df.index + 1
  return var_df

def build_winery(df: pd.DataFrame):
  unique_winery = df['winery'].dropna().unique()
  win_var = pd.DataFrame(unique_winery, columns=['winery'])
  win_var['winery_id'] = win_var.index + 1
  return win_var

def build_taster(df: pd.DataFrame):
  unique_taster = df['taster_name'].dropna().unique()
  taster_df = pd.DataFrame(unique_taster, columns=['taster_name'])

  taster_handles = df[['taster_name', 'taster_twitter_handle']].dropna().drop_duplicates()
  taster_df = taster_df.merge(taster_handles, on='taster_name', how='left')
  taster_df['taster_id'] = taster_df.index + 1
  return taster_df

def build_wine(df: pd.DataFrame, country_table: pd.DataFrame, province_table: pd.DataFrame, region_table: pd.DataFrame, variety_table: pd.DataFrame, winery_table: pd.DataFrame):
  wine_df = df[['title', 'designation', 'points', 'price', 'variety', 'winery', 'country']]
  wine_df = wine_df.merge(variety_table[["variety_id", "variety"]], on='variety', how='left')
  wine_df = wine_df.merge(winery_table[["winery_id", "winery"]], on='winery', how='left')
  wine_df = wine_df.merge(country_table[["country_id", "country"]], on='country', how='left')
  wine_df.drop(['variety', 'winery', 'country'], axis=1, inplace=True)
  wine_df['wine_id'] = wine_df.index + 1
  return wine_df

def build_review(df: pd.DataFrame, taster_table: pd.DataFrame, wine_table: pd.DataFrame):
  review_table = df[['taster_name', 'title', 'description']]
  review_table = review_table.merge(taster_table[["taster_id", "taster_name"]], on='taster_name', how='left')
  review_table['taster_id'] = review_table['taster_id'].fillna(0).astype(int)

  review_table = review_table.merge(wine_table[["wine_id", "title"]], on='title', how='left')
  review_table.drop(["title", "taster_name"], axis=1, inplace=True)
  review_table['review_id'] = review_table.index + 1
  return review_table

In [4]:
def processor():
  data = pd.read_csv("https://media.githubusercontent.com/media/fowardelcac/Tp2_sem/refs/heads/main/winemag-data-130k-v2.csv").drop('Unnamed: 0', axis = 1)
  df = process(data)

  country_table = build_country(df)
  province_table = build_prov(df, country_table)
  region_table = build_region(df, province_table)
  variety_table = build_variety(df)
  winery_table = build_winery(df)
  taster_table = build_taster(df)
  taster_table.loc[len(taster_table)] = ['unknown', 'NaN', 0]

  wine_table = build_wine(df, country_table, province_table, region_table, variety_table, winery_table)
  review_table = build_review(df, taster_table, wine_table)
  return country_table.set_index("country_id"), province_table.set_index("province_id"), region_table.set_index("region_id"), variety_table.set_index("variety_id"), winery_table.set_index("winery_id"), taster_table.set_index("taster_id"), wine_table.set_index("wine_id"), review_table.set_index("review_id")


In [5]:
country_table, province_table, region_table, variety_table, winery_table, taster_table, wine_table, review_table = processor()

In [7]:
tables_list = [country_table, province_table, region_table, variety_table, winery_table, taster_table, wine_table, review_table]
for i in tables_list:
  print(i.head(1))
  print("*"* 100)

           country
country_id        
1            Italy
****************************************************************************************************
                      province  country_id
province_id                               
1            Sicily & Sardinia           1
****************************************************************************************************
          region  province_id
region_id                    
1           Etna            1
****************************************************************************************************
                variety
variety_id             
1           White Blend
****************************************************************************************************
            winery
winery_id         
1          Nicosia
****************************************************************************************************
             taster_name taster_twitter_handle
taster_id                                  

# SQL

In [8]:
table_names = ['country', 'province', 'region', 'variety', 'winery', 'taster', 'wine', 'review']

conexion = sqlite3.connect('WINE_DDB.db')
with conexion:
  for i, table_name in enumerate(table_names):
    tables_list[i].to_sql(name=table_name, con=conexion, if_exists='replace', index=True)

In [9]:
pd.read_sql(f"SELECT * FROM country LIMIT 3;", conexion)

Unnamed: 0,country_id,country
0,1,Italy
1,2,Portugal
2,3,US


In [None]:
pd.read_sql(f"SELECT * FROM {table_names[1]} LIMIT 3;", conexion)

Unnamed: 0,province_id,province,country_id
0,1,Sicily & Sardinia,1
1,2,Douro,2
2,3,Michigan,3


In [None]:
pd.read_sql(f"SELECT * FROM {table_names[2]} LIMIT 3;", conexion)

Unnamed: 0,region_id,region,province_id
0,1,Etna,1
1,2,Lake Michigan Shore,3
2,3,Willamette Valley,4


In [None]:
pd.read_sql(f"SELECT * FROM {table_names[3]} LIMIT 3;", conexion)

Unnamed: 0,variety_id,variety
0,1,White Blend
1,2,Portuguese Red
2,3,Riesling


In [None]:
pd.read_sql(f"SELECT * FROM {table_names[4]} LIMIT 3;", conexion)

Unnamed: 0,winery_id,winery
0,1,Nicosia
1,2,Quinta dos Avidagos
2,3,St. Julian


In [None]:
pd.read_sql(f"SELECT * FROM {table_names[5]} LIMIT 3;", conexion)

Unnamed: 0,taster_id,taster_name,taster_twitter_handle
0,1,Kerin O’Keefe,@kerinokeefe
1,2,Roger Voss,@vossroger
2,3,Alexander Peartree,


In [None]:
pd.read_sql(f"SELECT * FROM {table_names[6]} LIMIT 3;", conexion)

Unnamed: 0,wine_id,title,designation,points,price,variety_id,winery_id,country_id
0,1,Nicosia 2013 Vulkà Bianco (Etna),Vulkà Bianco,87,80.0,1,1,1
1,2,Quinta dos Avidagos 2011 Avidagos Red (Douro),Avidagos,87,15.0,2,2,2
2,3,St. Julian 2013 Reserve Late Harvest Riesling ...,Reserve Late Harvest,87,13.0,3,3,3


In [None]:
pd.read_sql(f"SELECT * FROM {table_names[7]} LIMIT 3;", conexion)

Unnamed: 0,review_id,description,taster_id,wine_id
0,1,"Aromas include tropical fruit, broom, brimston...",1,1
1,2,"This is ripe and fruity, a wine that is smooth...",2,2
2,3,"Pineapple rind, lemon pith and orange blossom ...",3,3


# Consultas SQL

### Obtener la cantidad de vinos por pais.

In [10]:
query1 = """
SELECT country, COUNT(wine_id) AS total_wines
FROM wine
JOIN country  ON wine.country_id = country.country_id
GROUP BY country
ORDER BY total_wines DESC LIMIT 10;
"""
pd.read_sql(query1, conexion)

Unnamed: 0,country,total_wines
0,US,34170
1,France,13386
2,Italy,12798
3,Spain,4891
4,Portugal,4715
5,Chile,3627
6,Argentina,2693
7,Austria,2582
8,Germany,1847
9,Australia,1539


### Precio promedio por vino

In [11]:
query2 = """
SELECT points, AVG(price) AS avg_price
FROM wine
GROUP BY points
ORDER BY points DESC;
"""
pd.read_sql(query2, conexion)

Unnamed: 0,points,avg_price
0,100,359.692308
1,99,254.172414
2,98,202.207547
3,97,147.556213
4,96,124.158273
5,95,93.951338
6,94,76.135976
7,93,61.394488
8,92,51.196448
9,91,44.116705


# Los 10 sommelier con mas reseñas

In [26]:
query = """
SELECT taster_name, COUNT(*) AS total_reviews
FROM taster
ORDER BY total_reviews DESC
"""
pd.read_sql(query, conexion)

Unnamed: 0,taster_name,total_reviews
0,Kerin O’Keefe,20


In [27]:
query = """
SELECT *
FROM taster
"""
pd.read_sql(query, conexion)

Unnamed: 0,taster_id,taster_name,taster_twitter_handle
0,1,Kerin O’Keefe,@kerinokeefe
1,2,Roger Voss,@vossroger
2,3,Alexander Peartree,
3,4,Paul Gregutt,@paulgwine
4,5,Michael Schachner,@wineschach
5,6,Anna Lee C. Iijima,
6,7,Virginie Boone,@vboone
7,8,Matt Kettmann,@mattkettmann
8,9,Sean P. Sullivan,@wawinereport
9,10,Joe Czerwinski,@JoeCz


In [29]:
query = """
SELECT taster_id, COUNT(*) AS total_reviews
FROM review
WHERE taster_id != 0
GROUP BY taster_id
ORDER BY total_reviews DESC
LIMIT 10;
"""
pd.read_sql(query, conexion)


Unnamed: 0,taster_id,total_reviews
0,2,17738
1,5,11693
2,1,6904
3,7,6304
4,4,6080
5,8,4195
6,10,3219
7,9,3215
8,6,3196
9,16,2872


In [31]:
query = """
SELECT
    t.taster_name,
    COUNT(*) AS total_reviews
FROM review r
JOIN taster t ON r.taster_id = t.taster_id
WHERE r.taster_id != 0
GROUP BY t.taster_name
ORDER BY total_reviews DESC
LIMIT 10;

"""
pd.read_sql(query, conexion)


Unnamed: 0,taster_name,total_reviews
0,Roger Voss,17738
1,Michael Schachner,11693
2,Kerin O’Keefe,6904
3,Virginie Boone,6304
4,Paul Gregutt,6080
5,Matt Kettmann,4195
6,Joe Czerwinski,3219
7,Sean P. Sullivan,3215
8,Anna Lee C. Iijima,3196
9,Anne Krebiehl MW,2872


#Variedad de uva mas comun

In [40]:
query = """
SELECT
    variety,
    COUNT(*) AS variety_count
FROM variety
GROUP BY variety
ORDER BY variety_count DESC
LIMIT 1;

"""
pd.read_sql(query, conexion)

Unnamed: 0,variety,variety_count
0,Çalkarası,1
