In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import numpy as np
import os

In [2]:
# Data loading
path = r'../data/raw/data_raw.csv'
alzheimer_df_raw = pd.read_csv(path)

## Modeling the Data Frame

In [3]:
alzheimer_df = alzheimer_df_raw.copy()

#Filtering only the rows about alzheimers disease
alzheimer_df = alzheimer_df[alzheimer_df["Padecimiento"].str.startswith("Enfermedad de Alzheimer", na=False)]

#cleaning innecesary columns
alzheimer_df = alzheimer_df.drop(["Pag.", "Cuadro ", "No_01", "No_02", "No_03", "No_04", "Ax_002", "Padecimiento"], axis=1)

# replacing "-" by "0"
alzheimer_df["Valor"] = alzheimer_df["Valor"].replace("-", "0")

# converting to numeric data type
alzheimer_df["Valor"] = pd.to_numeric(alzheimer_df["Valor"], errors="coerce")

# Converting "Semana" to a numeric data type
alzheimer_df["Semana"] = alzheimer_df["Semana"].str.replace("sem", "", regex=False).astype(int)

# replacing Entidad name 
alzheimer_df["Entidad"] = alzheimer_df["Entidad"].replace("Distrito Federal", "Ciudad de México")

# Removing redundant data

# Removing rows with "Acum." in "Ax_003" column
alzheimer_df = alzheimer_df[alzheimer_df["Ax_003"] != "Acum."]

# Removing rows with "TOTAL" in "Entidad" column
alzheimer_df = alzheimer_df[alzheimer_df["Entidad"] != "TOTAL"]

alzheimer_df = alzheimer_df.rename(columns={'Ax_001': 'Año_Epi'})

In [4]:
alzheimer_df.head()

Unnamed: 0,Año,Semana,Entidad,Año_Epi,Ax_003,Valor
6,2014,2,Aguascalientes,2014,Sem.,0.0
7,2014,2,Aguascalientes,2014,H,0.0
8,2014,2,Aguascalientes,2014,M,0.0
15,2014,2,Baja California,2014,Sem.,0.0
16,2014,2,Baja California,2014,H,0.0


In [5]:
# Missing values
nan_counts = alzheimer_df.isna().sum()
columnas_con_nan = nan_counts[nan_counts > 0]
columnas_con_nan_ordenadas = columnas_con_nan.sort_values(ascending=False)
print(columnas_con_nan_ordenadas)

Valor    1
dtype: int64


In [6]:
alzheimer_df.isna().sum()

Año        0
Semana     0
Entidad    0
Año_Epi    0
Ax_003     0
Valor      1
dtype: int64

In [7]:
alzheimer_df[alzheimer_df['Valor'].isna()]

Unnamed: 0,Año,Semana,Entidad,Año_Epi,Ax_003,Valor
55601,2016,50,Querétaro,2016,Sem.,


In [8]:
alzheimer_df.loc[55601, "Valor"] = 0 # Imputing missing value with 0 manually infered by the week data

In [9]:
# Creating a new column "Fecha" based on "Año" and "Semana" according to ISO 8601 standard
def crear_fecha_lunes(row):
    fecha_str = f"{int(row['Año'])}-W{int(row['Semana']):02d}-1"
    
    try:
        return pd.to_datetime(fecha_str, format='%Y-W%W-%w')
    except ValueError:
        return pd.NaT

alzheimer_df['Fecha'] = alzheimer_df.apply(crear_fecha_lunes, axis=1)

fecha_col = alzheimer_df.pop('Fecha')
alzheimer_df.insert(2, 'Fecha', fecha_col)


alzheimer_df.head()

Unnamed: 0,Año,Semana,Fecha,Entidad,Año_Epi,Ax_003,Valor
6,2014,2,2014-01-13,Aguascalientes,2014,Sem.,0.0
7,2014,2,2014-01-13,Aguascalientes,2014,H,0.0
8,2014,2,2014-01-13,Aguascalientes,2014,M,0.0
15,2014,2,2014-01-13,Baja California,2014,Sem.,0.0
16,2014,2,2014-01-13,Baja California,2014,H,0.0


In [10]:
# Pivot the DataFrame to have separate columns for each type of count in 'Ax_003'
df_wide = alzheimer_df.pivot_table(
    index=['Año', 'Semana', 'Fecha', 'Entidad', 'Año_Epi'],
    columns='Ax_003',
    values='Valor'
).reset_index()

df_wide.columns.name = None

df_wide.sort_values(by=['Entidad', 'Año', 'Semana'], inplace=True)
df_wide['Fecha'] = pd.to_datetime(df_wide['Fecha'])

df_wide.head(10)

Unnamed: 0,Año,Semana,Fecha,Entidad,Año_Epi,H,M,Sem.
0,2014,2,2014-01-13,Aguascalientes,2014,0.0,0.0,0.0
32,2014,3,2014-01-20,Aguascalientes,2014,0.0,0.0,0.0
64,2014,4,2014-01-27,Aguascalientes,2014,0.0,0.0,0.0
96,2014,5,2014-02-03,Aguascalientes,2014,0.0,0.0,0.0
128,2014,6,2014-02-10,Aguascalientes,2014,0.0,0.0,0.0
160,2014,7,2014-02-17,Aguascalientes,2014,0.0,0.0,0.0
192,2014,8,2014-02-24,Aguascalientes,2014,0.0,0.0,0.0
224,2014,9,2014-03-03,Aguascalientes,2014,0.0,0.0,0.0
256,2014,10,2014-03-10,Aguascalientes,2014,0.0,0.0,0.0
288,2014,11,2014-03-17,Aguascalientes,2014,0.0,0.0,0.0


In [None]:
# output_folder = '../data/interim/'
# filename = 'data_v1.csv'
# route = os.path.join(output_folder, filename)


# df_wide.to_csv(
#     route,
#     index=False,
#     encoding='utf-8'
# )