In [36]:
import pandas as pd
import os

In [37]:
# Data loading
path = r'../data/raw/data_raw.csv'
alzheimer_df_raw = pd.read_csv(path)

## Modeling the Data Frame

In [38]:
alzheimer_df = alzheimer_df_raw.copy()

#Filtering only the rows about alzheimers disease
alzheimer_df = alzheimer_df[alzheimer_df["Padecimiento"].str.startswith("Enfermedad de Alzheimer", na=False)]

#cleaning innecesary columns
alzheimer_df = alzheimer_df.drop(["Pag.", "Cuadro ", "No_01", "No_02", "No_03", "No_04", "Ax_002", "Padecimiento"], axis=1)

# replacing "-" by "0"
alzheimer_df["Valor"] = alzheimer_df["Valor"].replace("-", "0")

# converting to numeric data type
alzheimer_df["Valor"] = pd.to_numeric(alzheimer_df["Valor"], errors="coerce")

# Converting "Semana" to a numeric data type
alzheimer_df["Semana"] = alzheimer_df["Semana"].str.replace("sem", "", regex=False).astype(int)

# replacing Entidad name 
alzheimer_df["Entidad"] = alzheimer_df["Entidad"].replace("Distrito Federal", "Ciudad de México")

# Removing redundant data

# Removing rows with "Acum." in "Ax_003" column
alzheimer_df = alzheimer_df[alzheimer_df["Ax_003"] != "Acum."]

# Removing rows with "TOTAL" in "Entidad" column
alzheimer_df = alzheimer_df[alzheimer_df["Entidad"] != "TOTAL"]

# Renaming columns
alzheimer_df = alzheimer_df.rename(columns={'Año': 'Year', 'Semana': 'Week', 'Entidad': 'Entity', 'Ax_001': 'Epi_Year', 'Valor': 'Value',})

In [41]:
alzheimer_df.head()

Unnamed: 0,Year,Week,Entity,Epi_Year,Ax_003,Value
6,2014,2,Aguascalientes,2014,Sem.,0.0
7,2014,2,Aguascalientes,2014,H,0.0
8,2014,2,Aguascalientes,2014,M,0.0
15,2014,2,Baja California,2014,Sem.,0.0
16,2014,2,Baja California,2014,H,0.0


In [42]:
# Missing values
nan_counts = alzheimer_df.isna().sum()
columnas_con_nan = nan_counts[nan_counts > 0]
columnas_con_nan_ordenadas = columnas_con_nan.sort_values(ascending=False)
print(columnas_con_nan_ordenadas)

Value    1
dtype: int64


In [43]:
alzheimer_df.isna().sum()

Year        0
Week        0
Entity      0
Epi_Year    0
Ax_003      0
Value       1
dtype: int64

In [44]:
alzheimer_df[alzheimer_df['Value'].isna()]

Unnamed: 0,Year,Week,Entity,Epi_Year,Ax_003,Value
55601,2016,50,Querétaro,2016,Sem.,


In [45]:
alzheimer_df.loc[55601, "Value"] = 0 # Imputing missing value with 0 manually infered by the week data

In [46]:
# Creating a new column "Fecha" based on "Año" and "Semana" according to ISO 8601 standard
def create_date_ISO8601(row):
    date_str = f"{int(row['Year'])}-W{int(row['Week']):02d}-1"
    
    try:
        return pd.to_datetime(date_str, format='%Y-W%W-%w')
    except ValueError:
        return pd.NaT

alzheimer_df['Date'] = alzheimer_df.apply(create_date_ISO8601, axis=1)

date_col = alzheimer_df.pop('Date')
alzheimer_df.insert(2, 'Date', date_col)


alzheimer_df.head()

Unnamed: 0,Year,Week,Date,Entity,Epi_Year,Ax_003,Value
6,2014,2,2014-01-13,Aguascalientes,2014,Sem.,0.0
7,2014,2,2014-01-13,Aguascalientes,2014,H,0.0
8,2014,2,2014-01-13,Aguascalientes,2014,M,0.0
15,2014,2,2014-01-13,Baja California,2014,Sem.,0.0
16,2014,2,2014-01-13,Baja California,2014,H,0.0


In [47]:
# Pivot the DataFrame to have separate columns for each type of count in 'Ax_003'
df_wide = alzheimer_df.pivot_table(
    index=['Year', 'Week', 'Date', 'Entity', 'Epi_Year'],
    columns='Ax_003',
    values='Value'
).reset_index()

df_wide.columns.name = None

df_wide.sort_values(by=['Entity', 'Year', 'Week'], inplace=True)
df_wide['Date'] = pd.to_datetime(df_wide['Date'])
df_wide = df_wide.rename(columns={'H':'M', 'M':'F', 'Sem.':'New_Cases_Week'})

In [48]:
# output_folder = '../data/interim/'
# filename = 'data_v2_English.csv'
# route = os.path.join(output_folder, filename)


# df_wide.to_csv(
#     route,
#     index=False,
#     encoding='utf-8'
# )

print(df_wide.info())
df_wide.head()

<class 'pandas.core.frame.DataFrame'>
Index: 18304 entries, 0 to 18303
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   Year            18304 non-null  int64         
 1   Week            18304 non-null  int64         
 2   Date            18304 non-null  datetime64[ns]
 3   Entity          18304 non-null  object        
 4   Epi_Year        18304 non-null  int64         
 5   M               18304 non-null  float64       
 6   F               18304 non-null  float64       
 7   New_Cases_Week  18304 non-null  float64       
dtypes: datetime64[ns](1), float64(3), int64(3), object(1)
memory usage: 1.3+ MB
None


Unnamed: 0,Year,Week,Date,Entity,Epi_Year,M,F,New_Cases_Week
0,2014,2,2014-01-13,Aguascalientes,2014,0.0,0.0,0.0
32,2014,3,2014-01-20,Aguascalientes,2014,0.0,0.0,0.0
64,2014,4,2014-01-27,Aguascalientes,2014,0.0,0.0,0.0
96,2014,5,2014-02-03,Aguascalientes,2014,0.0,0.0,0.0
128,2014,6,2014-02-10,Aguascalientes,2014,0.0,0.0,0.0
