# Actividad #5. Normalización

## Parte 1. COVID

In [3]:
import pandas as pd

URL = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv"

df = pd.read_csv(URL)

display(df.head())

print(df.dtypes)

print("\nNulos por columna:")
nulos = df.isna().sum()
nulos = nulos[nulos > 0]
print(nulos if not nulos.empty else 'No hay columnas con nulos.')

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,2/28/23,3/1/23,3/2/23,3/3/23,3/4/23,3/5/23,3/6/23,3/7/23,3/8/23,3/9/23
0,,Afghanistan,33.93911,67.709953,0,0,0,0,0,0,...,209322,209340,209358,209362,209369,209390,209406,209436,209451,209451
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,334391,334408,334408,334427,334427,334427,334427,334427,334443,334457
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,271441,271448,271463,271469,271469,271477,271477,271490,271494,271496
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,47866,47875,47875,47875,47875,47875,47875,47875,47890,47890
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,105255,105277,105277,105277,105277,105277,105277,105277,105288,105288


Province/State     object
Country/Region     object
Lat               float64
Long              float64
1/22/20             int64
                   ...   
3/5/23              int64
3/6/23              int64
3/7/23              int64
3/8/23              int64
3/9/23              int64
Length: 1147, dtype: object

Nulos por columna:
Province/State    198
Lat                 2
Long                2
dtype: int64


In [4]:
# Renombrar columnas a snake_case

df = df.rename(columns={
    'Province/State': 'province_state',
    'Country/Region': 'country_region',
    'Lat': 'lat',
    'Long': 'long'
})

display(df.head())

Unnamed: 0,province_state,country_region,lat,long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,2/28/23,3/1/23,3/2/23,3/3/23,3/4/23,3/5/23,3/6/23,3/7/23,3/8/23,3/9/23
0,,Afghanistan,33.93911,67.709953,0,0,0,0,0,0,...,209322,209340,209358,209362,209369,209390,209406,209436,209451,209451
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,334391,334408,334408,334427,334427,334427,334427,334427,334443,334457
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,271441,271448,271463,271469,271469,271477,271477,271490,271494,271496
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,47866,47875,47875,47875,47875,47875,47875,47875,47890,47890
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,105255,105277,105277,105277,105277,105277,105277,105277,105288,105288


In [7]:
# Pivoteo wide a long
date_cols = df.columns[df.columns.get_loc('long') + 1:]

df_long = df.melt(
    id_vars=['province_state', 'country_region', 'lat', 'long'],
    value_vars=date_cols,
    var_name='date',
    value_name='confirmed'
)

# converitr a formato de fecha yyyy-mm-dd
df_long['date'] = pd.to_datetime(df_long['date'], format='%m/%d/%y')
display(df_long.head())

Unnamed: 0,province_state,country_region,lat,long,date,confirmed
0,,Afghanistan,33.93911,67.709953,2020-01-22,0
1,,Albania,41.1533,20.1683,2020-01-22,0
2,,Algeria,28.0339,1.6596,2020-01-22,0
3,,Andorra,42.5063,1.5218,2020-01-22,0
4,,Angola,-11.2027,17.8739,2020-01-22,0


In [8]:
# Normalización
df_long['lat'] = pd.to_numeric(df_long['lat'], errors='coerce')
df_long['long'] = pd.to_numeric(df_long['long'], errors='coerce')

# Confirmed a no negativos
df_long['confirmed'] = pd.to_numeric(df_long['confirmed'], errors='coerce')
df_long.loc[df_long['confirmed'] < 0, 'confirmed'] = pd.NA

# los datos de COVID no deberían ser negativos y NA implicaria que no hay casos reportados
df_long['confirmed'] = df_long['confirmed'].fillna(0).astype('int64')

# Verificación
display(df_long.head())
display(df_long.dtypes)


Unnamed: 0,province_state,country_region,lat,long,date,confirmed
0,,Afghanistan,33.93911,67.709953,2020-01-22,0
1,,Albania,41.1533,20.1683,2020-01-22,0
2,,Algeria,28.0339,1.6596,2020-01-22,0
3,,Andorra,42.5063,1.5218,2020-01-22,0
4,,Angola,-11.2027,17.8739,2020-01-22,0


province_state            object
country_region            object
lat                      float64
long                     float64
date              datetime64[ns]
confirmed                  int64
dtype: object

In [9]:
print("\nNulos por columna:")
nulos = df_long.isna().sum()
nulos = nulos[nulos > 0]
print(nulos if not nulos.empty else 'No hay columnas con nulos en df_long.')


Nulos por columna:
province_state    226314
lat                 2286
long                2286
dtype: int64


In [10]:
import pycountry

def get_iso3(country):
    try:
        return pycountry.countries.lookup(country).alpha_3
    except Exception:
        print(f"Error al obtener el ISO3 para {country}")
        return None

df_long['iso3'] = df_long['country_region'].apply(get_iso3)

display(df_long[['country_region', 'iso3']].drop_duplicates().head(20))

Error al obtener el ISO3 para Brunei
Error al obtener el ISO3 para Burma
Error al obtener el ISO3 para Congo (Brazzaville)
Error al obtener el ISO3 para Congo (Kinshasa)
Error al obtener el ISO3 para Cote d'Ivoire
Error al obtener el ISO3 para Diamond Princess
Error al obtener el ISO3 para Holy See
Error al obtener el ISO3 para Korea, North
Error al obtener el ISO3 para Korea, South
Error al obtener el ISO3 para Kosovo
Error al obtener el ISO3 para MS Zaandam
Error al obtener el ISO3 para Micronesia
Error al obtener el ISO3 para Russia
Error al obtener el ISO3 para Summer Olympics 2020
Error al obtener el ISO3 para Taiwan*
Error al obtener el ISO3 para Turkey
Error al obtener el ISO3 para West Bank and Gaza
Error al obtener el ISO3 para Winter Olympics 2022
Error al obtener el ISO3 para Brunei
Error al obtener el ISO3 para Burma
Error al obtener el ISO3 para Congo (Brazzaville)
Error al obtener el ISO3 para Congo (Kinshasa)
Error al obtener el ISO3 para Cote d'Ivoire
Error al obtener e

Unnamed: 0,country_region,iso3
0,Afghanistan,AFG
1,Albania,ALB
2,Algeria,DZA
3,Andorra,AND
4,Angola,AGO
5,Antarctica,ATA
6,Antigua and Barbuda,ATG
7,Argentina,ARG
8,Armenia,ARM
9,Australia,AUS


In [None]:
# Validar que lat[-90, 90] y long[-180, 180]
lat_not_valid = ~df_long['lat'].between(-90, 90) & df_long['lat'].notna()
long_not_valid = ~df_long['long'].between(-180, 180) & df_long['long'].notna()

print(f"Filas fuera de rango latitud (-90 a 90): {lat_not_valid.sum()}")
print(f"Filas fuera de rango longitud (-180 a 180): {long_not_valid.sum()}")

Filas fuera de rango latitud (-90 a 90): 0
Filas fuera de rango longitud (-180 a 180): 0


# Parte 2. Chipotle

In [13]:
import pandas as pd

data_url = "https://raw.githubusercontent.com/justmarkham/DAT8/master/data/chipotle.tsv"

chipotle = pd.read_csv(data_url, sep='\t')

display(chipotle.head())

print(chipotle.dtypes)

# Nulos por columna
print("\nNulos por columna:")
nulos = chipotle.isnull().sum()
nulos = nulos[nulos > 0]
print(nulos if not nulos.empty else 'No hay columnas con nulos.')

# Ejemplos de item_price
print("\nEjemplos de item_price:")
print(chipotle['item_price'].head(10).tolist())

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,$2.39
1,1,1,Izze,[Clementine],$3.39
2,1,1,Nantucket Nectar,[Apple],$3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,$2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",$16.98


order_id               int64
quantity               int64
item_name             object
choice_description    object
item_price            object
dtype: object

Nulos por columna:
choice_description    1246
dtype: int64

Ejemplos de item_price:
['$2.39 ', '$3.39 ', '$3.39 ', '$2.39 ', '$16.98 ', '$10.98 ', '$1.69 ', '$11.75 ', '$9.25 ', '$9.25 ']


In [18]:
# Normalización de precios y tipos

# Quitar $ y convertir item_price a float (unit_price)
chipotle['unit_price'] = (
    chipotle['item_price']
    .replace(r'[\$,]', '', regex=True)
    .astype(float)
)

# Verificar que quantity es entero positivo
chipotle['quantity'] = pd.to_numeric(chipotle['quantity'], errors="raise")

# Calcular line_total = unit_price * quantity
chipotle['line_total'] = round(chipotle['unit_price'] * chipotle['quantity'], 2)

display(chipotle[['item_price', 'unit_price', 'quantity', 'line_total']].head(10))

Unnamed: 0,item_price,unit_price,quantity,line_total
0,$2.39,2.39,1,2.39
1,$3.39,3.39,1,3.39
2,$3.39,3.39,1,3.39
3,$2.39,2.39,1,2.39
4,$16.98,16.98,2,33.96
5,$10.98,10.98,1,10.98
6,$1.69,1.69,1,1.69
7,$11.75,11.75,1,11.75
8,$9.25,9.25,1,9.25
9,$9.25,9.25,1,9.25


In [20]:
# Normalización de item_name
chipotle['item_name_norm'] = (
    chipotle['item_name']
    .astype(str)
    .str.strip()
    .str.lower()
    .str.replace(r"\s+", " ", regex=True)
)

display(chipotle[['item_name', 'item_name_norm']].drop_duplicates().head(10))

Unnamed: 0,item_name,item_name_norm
0,Chips and Fresh Tomato Salsa,chips and fresh tomato salsa
1,Izze,izze
2,Nantucket Nectar,nantucket nectar
3,Chips and Tomatillo-Green Chili Salsa,chips and tomatillo-green chili salsa
4,Chicken Bowl,chicken bowl
6,Side of Chips,side of chips
7,Steak Burrito,steak burrito
8,Steak Soft Tacos,steak soft tacos
10,Chips and Guacamole,chips and guacamole
11,Chicken Crispy Tacos,chicken crispy tacos


In [23]:
# Validación de rangos
assert (chipotle['unit_price'] >= 0).all(), "unit_price contiene valores negativos"
assert (chipotle['quantity'] >= 1).all(), "quantity contiene valores menores a 1"