In [1]:
import pandas as pd
import os

In [2]:
# Data loading: read the processed CSV into a DataFrame.
path = r'../data/processed/data_processed_v3.csv'
alzheimer_df = pd.read_csv(path)

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() appended a stray "None" line to the output.
alzheimer_df.info()
alzheimer_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18336 entries, 0 to 18335
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Year      18336 non-null  int64  
 1   Week      18336 non-null  int64  
 2   Date      18336 non-null  object 
 3   Entity    18336 non-null  object 
 4   Epi_Year  18336 non-null  int64  
 5   M         18336 non-null  float64
 6   F         18336 non-null  float64
dtypes: float64(2), int64(3), object(2)
memory usage: 1002.9+ KB
None


Unnamed: 0,Year,Week,Date,Entity,Epi_Year,M,F
0,2014,2,2014-01-13,Aguascalientes,2014,0.0,0.0
1,2014,3,2014-01-20,Aguascalientes,2014,0.0,0.0
2,2014,4,2014-01-27,Aguascalientes,2014,0.0,0.0
3,2014,5,2014-02-03,Aguascalientes,2014,0.0,0.0
4,2014,6,2014-02-10,Aguascalientes,2014,0.0,0.0


# Non-Cumulative Data by Male and Female
Transform cumulative case columns into weekly new cases per entity and epidemiological year.
For each group, replaces each value with the increment from the previous week 
(and sets the first week to the original value).

In [3]:
# Work on a copy so the cumulative source frame stays untouched.
df_NoAcum_MF = alzheimer_df.copy()

# De-accumulate both sex columns in a single grouped pass: within every
# (Entity, Epi_Year) group each row becomes the change from the previous row.
# NOTE(review): diff() follows the existing row order, so this presumably relies
# on rows already being in chronological order within each group — confirm.
sex_columns = ['M', 'F']
weekly_increments = df_NoAcum_MF.groupby(['Entity', 'Epi_Year'])[sex_columns].diff()

# The first row of each group has no predecessor (diff yields NaN there), so it
# keeps its original value.
df_NoAcum_MF[sex_columns] = weekly_increments.fillna(df_NoAcum_MF[sex_columns])


In [None]:
# Export is currently disabled; uncomment the lines below to write the
# per-sex weekly table to disk.
# output_folder = '../data/processed/'
# filename = 'data_processed_v3_NoAcum_MF.csv'
# route = os.path.join(output_folder, filename)

# df_NoAcum_MF.to_csv(
#     route,
#     index=False,
#     encoding='utf-8'
# )

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() appended a stray "None" line to the output.
df_NoAcum_MF.info()

# Show rows 47-51 by position (same rows as the plain integer slice).
df_NoAcum_MF.iloc[47:52]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18336 entries, 0 to 18335
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Year      18336 non-null  int64  
 1   Week      18336 non-null  int64  
 2   Date      18336 non-null  object 
 3   Entity    18336 non-null  object 
 4   Epi_Year  18336 non-null  int64  
 5   M         18336 non-null  float64
 6   F         18336 non-null  float64
dtypes: float64(2), int64(3), object(2)
memory usage: 1002.9+ KB
None


Unnamed: 0,Year,Week,Date,Entity,Epi_Year,M,F
47,2014,49,2014-12-08,Aguascalientes,2014,0.0,0.0
48,2014,50,2014-12-15,Aguascalientes,2014,0.0,1.0
49,2014,51,2014-12-22,Aguascalientes,2014,0.0,0.0
50,2014,52,2014-12-29,Aguascalientes,2014,0.0,0.0
51,2014,53,2015-01-05,Aguascalientes,2014,0.0,1.0


# Non-Cumulative Data by Total Cases
Calculate total cases per row by adding male and female counts, then drop the original gender columns.

In [5]:
# Build the totals frame in one chained expression: add the combined M + F
# count, then drop the per-sex columns. The source frame is left unmodified.
df_NoAcum_Total = (
    df_NoAcum_MF
    .assign(total_cases=lambda frame: frame['M'] + frame['F'])
    .drop(columns=['M', 'F'])
)

In [None]:
# Export is currently disabled; uncomment the lines below to write the
# total-cases weekly table to disk.
# output_folder = '../data/processed/'
# filename = 'data_processed_v3_NoAcum_Total.csv'
# route = os.path.join(output_folder, filename)

# df_NoAcum_Total.to_csv(
#     route,
#     index=False,
#     encoding='utf-8'
# )

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() appended a stray "None" line to the output.
df_NoAcum_Total.info()

# Show rows 47-51 by position (same rows as the plain integer slice).
df_NoAcum_Total.iloc[47:52]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18336 entries, 0 to 18335
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Year         18336 non-null  int64  
 1   Week         18336 non-null  int64  
 2   Date         18336 non-null  object 
 3   Entity       18336 non-null  object 
 4   Epi_Year     18336 non-null  int64  
 5   total_cases  18336 non-null  float64
dtypes: float64(1), int64(3), object(2)
memory usage: 859.6+ KB
None


Unnamed: 0,Year,Week,Date,Entity,Epi_Year,total_cases
47,2014,49,2014-12-08,Aguascalientes,2014,0.0
48,2014,50,2014-12-15,Aguascalientes,2014,1.0
49,2014,51,2014-12-22,Aguascalientes,2014,0.0
50,2014,52,2014-12-29,Aguascalientes,2014,0.0
51,2014,53,2015-01-05,Aguascalientes,2014,1.0


# National Non-Cumulative Data by Total Cases

In [7]:
# Sum total_cases over all entities for each distinct
# (Year, Week, Date, Epi_Year) combination.
national_keys = ['Year', 'Week', 'Date', 'Epi_Year']
national_groups = df_NoAcum_Total.groupby(national_keys, as_index=False)
df_national = national_groups['total_cases'].sum()

# Report the shapes before and after aggregating, plus a preview.
print(f"Original df shape: {df_NoAcum_Total.shape}")
print(f"National df shape: {df_national.shape}")
print("\nFirst rows of national df:")
print(df_national.head())

# Quick validation: the number of distinct Year-Week pairs in the source is
# printed next to the national row count so the two can be compared by eye.
year_week_pairs = df_NoAcum_Total[['Year', 'Week']].drop_duplicates()
print("\nValidation:")
print(f"Original unique Year-Week combinations: {len(year_week_pairs)}")
print(f"National df rows: {df_national.shape[0]}")

Original df shape: (18336, 6)
National df shape: (573, 5)

First rows of national df:
   Year  Week        Date  Epi_Year  total_cases
0  2014     2  2014-01-13      2014          4.0
1  2014     3  2014-01-20      2014         29.0
2  2014     4  2014-01-27      2014         47.0
3  2014     5  2014-02-03      2014         36.0
4  2014     6  2014-02-10      2014         42.0

Validation:
Original unique Year-Week combinations: 573
National df rows: 573


In [None]:
# Export is currently disabled; uncomment the lines below to write the
# national weekly table to disk.
# output_folder = '../data/processed/'
# filename = 'data_processed_v3_National_NoAcum_Total.csv'
# route = os.path.join(output_folder, filename)

# df_national.to_csv(
#     route,
#     index=False,
#     encoding='utf-8'
# )

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() appended a stray "None" line to the output.
df_national.info()
df_national.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 573 entries, 0 to 572
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Year         573 non-null    int64  
 1   Week         573 non-null    int64  
 2   Date         573 non-null    object 
 3   Epi_Year     573 non-null    int64  
 4   total_cases  573 non-null    float64
dtypes: float64(1), int64(3), object(1)
memory usage: 22.5+ KB
None


Unnamed: 0,Year,Week,Date,Epi_Year,total_cases
0,2014,2,2014-01-13,2014,4.0
1,2014,3,2014-01-20,2014,29.0
2,2014,4,2014-01-27,2014,47.0
3,2014,5,2014-02-03,2014,36.0
4,2014,6,2014-02-10,2014,42.0
