In [1]:
import pandas as pd
import os

In [2]:
# Read the interim CSV (data_v2_English.csv) into a DataFrame and preview it.
path = r'../data/interim/data_v2_English.csv'
alzheimer_df = pd.read_csv(filepath_or_buffer=path)
alzheimer_df.head(n=5)

Unnamed: 0,Year,Week,Date,Entity,Epi_Year,M,F,New_Cases_Week
0,2014,2,2014-01-06,Aguascalientes,2014,0.0,0.0,0.0
1,2014,3,2014-01-13,Aguascalientes,2014,0.0,0.0,0.0
2,2014,4,2014-01-20,Aguascalientes,2014,0.0,0.0,0.0
3,2014,5,2014-01-27,Aguascalientes,2014,0.0,0.0,0.0
4,2014,6,2014-02-03,Aguascalientes,2014,0.0,0.0,0.0


In [3]:
# All checks below run on a copy, so the frame loaded from disk stays untouched.
alzheimer_check_df = alzheimer_df.copy()

# Mark every row whose (Year, Week, Entity) key repeats an earlier row.
duplicates = alzheimer_check_df.duplicated(
    subset=['Year', 'Week', 'Entity'],
    keep='first',
)

# Each True counts as 1, so the sum is the number of repeated rows.
num_duplicates = duplicates.sum()
print(f"Number of duplicate records: {num_duplicates}")

Number of duplicate records: 0


In [4]:
def find_missing_combinations(df, entity_col='Entity', year_col='Year', week_col='Week',
                              weeks_by_year=None, default_weeks=range(1, 53)):
    """Report which (year, week, entity) combinations are absent from ``df``.

    Every entity found in ``df`` is expected to have one row per week for every
    year between the smallest and largest value of ``year_col`` (inclusive).
    The result is printed (full listing plus a summary) and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding at least ``entity_col``, ``year_col`` and ``week_col``.
    entity_col, year_col, week_col : str
        Names of the columns that form the combination key.
    weeks_by_year : dict, optional
        Maps a year to the iterable of week numbers expected for that year.
        When omitted, the rules are: 2014 -> weeks 2-53, 2020 -> weeks 1-53.
    default_weeks : iterable of int, optional
        Week numbers expected for any year not listed in ``weeks_by_year``
        (weeks 1-52 by default).

    Returns
    -------
    set of tuple
        ``(year, week, entity)`` tuples that were expected but not found.
        Empty when ``df`` has no usable year values.
    """
    if weeks_by_year is None:
        # Default calendar exceptions; every other year falls back to default_weeks.
        weeks_by_year = {
            2014: range(2, 54),   # Weeks 2-53 (52 weeks)
            2020: range(1, 54),   # Weeks 1-53 (53 weeks)
        }

    # Get all unique entities and the span of years to check
    entities = df[entity_col].unique()
    valid_years = df[year_col].dropna()
    if valid_years.empty:
        # min()/max() would be NaN here and range() cannot be built from NaN.
        print("No rows with a valid year; nothing to check.")
        return set()
    # int() lets a float-typed year column (e.g. 2014.0) be used with range().
    years = range(int(valid_years.min()), int(valid_years.max()) + 1)

    # Build expected combinations
    expected_combinations = {
        (year, week, entity)
        for entity in entities
        for year in years
        for week in weeks_by_year.get(year, default_weeks)
    }

    # Get existing combinations
    existing_combinations = set(zip(df[year_col], df[week_col], df[entity_col]))
    missing = expected_combinations - existing_combinations

    # Results and summary
    print(f"Found {len(missing)} missing combinations!")
    if missing:
        print("\nMissing combinations:")
        for year, week, entity in sorted(missing):
            print(f"Year: {year}, Week: {week}, Entity: {entity}")
    else:
        print("✓ No missing combinations. All data is complete.")

    print(f"\n--- Summary ---")
    print(f"Total entities: {len(entities)}")
    print(f"Total expected combinations: {len(expected_combinations)}")
    print(f"Total existing combinations: {len(existing_combinations)}")
    print(f"Missing combinations: {len(missing)}")

    return missing

# Run the completeness check on the copy of the interim data.
missing_combos = find_missing_combinations(df=alzheimer_check_df)

Found 64 missing combinations!

Missing combinations:
Year: 2015, Week: 1, Entity: Aguascalientes
Year: 2015, Week: 1, Entity: Baja California
Year: 2015, Week: 1, Entity: Baja California Sur
Year: 2015, Week: 1, Entity: Campeche
Year: 2015, Week: 1, Entity: Chiapas
Year: 2015, Week: 1, Entity: Chihuahua
Year: 2015, Week: 1, Entity: Ciudad de México
Year: 2015, Week: 1, Entity: Coahuila
Year: 2015, Week: 1, Entity: Colima
Year: 2015, Week: 1, Entity: Durango
Year: 2015, Week: 1, Entity: Guanajuato
Year: 2015, Week: 1, Entity: Guerrero
Year: 2015, Week: 1, Entity: Hidalgo
Year: 2015, Week: 1, Entity: Jalisco
Year: 2015, Week: 1, Entity: Michoacán
Year: 2015, Week: 1, Entity: Morelos
Year: 2015, Week: 1, Entity: México
Year: 2015, Week: 1, Entity: Nayarit
Year: 2015, Week: 1, Entity: Nuevo León
Year: 2015, Week: 1, Entity: Oaxaca
Year: 2015, Week: 1, Entity: Puebla
Year: 2015, Week: 1, Entity: Querétaro
Year: 2015, Week: 1, Entity: Quintana Roo
Year: 2015, Week: 1, Entity: San Luis Potos

In [5]:
# Read the raw 2024 week-10 CSV into a DataFrame and preview it.
path = r'../data/raw/2024_week_10.csv'
week10_df = pd.read_csv(filepath_or_buffer=path)
week10_df.head(n=5)

Unnamed: 0,Entity,Sem.,H,M,Acum.
0,Aguascalientes,1,2,3,4
1,Baja California,3,7,9,17
2,Baja California Sur,0,2,1,3
3,Campeche,0,3,2,7
4,Coahuila,2,4,14,15


In [6]:
# Bring the raw week-10 table into the interim schema.
# The raw 'H' column becomes 'M' and the raw 'M' column becomes 'F'
# (presumably Hombres/Mujeres -- confirm against the source bulletin).
# The guard keeps this cell safe to re-run: 'Acum.' is only present before
# the translation. Renaming a second time would relabel the new 'M' column
# as a second 'F', and drop() would then fail on the missing 'Acum.'.
if 'Acum.' in week10_df.columns:
    week10_df = (
        week10_df
        .rename(columns={'M': 'F', 'H': 'M', 'Sem.': 'New_Cases_Week'})
        .drop(columns='Acum.')
    )

# Constant metadata for this file: every row belongs to 2024, week 10.
week10_df = week10_df.assign(Year=2024, Epi_Year=2024, Week=10, Date='2024-03-04')

# Same column order as the interim table.
week10_ready = week10_df[['Year', 'Week', 'Date', 'Entity', 'Epi_Year', 'M', 'F', 'New_Cases_Week']]

week10_ready.head()

Unnamed: 0,Year,Week,Date,Entity,Epi_Year,M,F,New_Cases_Week
0,2024,10,2024-03-04,Aguascalientes,2024,2,3,1
1,2024,10,2024-03-04,Baja California,2024,7,9,3
2,2024,10,2024-03-04,Baja California Sur,2024,2,1,0
3,2024,10,2024-03-04,Campeche,2024,3,2,0
4,2024,10,2024-03-04,Coahuila,2024,4,14,2


In [7]:
# Stack the week-10 rows under the interim rows and renumber the index from 0.
combined_df = pd.concat(
    objs=[alzheimer_check_df, week10_ready],
    axis=0,
    ignore_index=True,
)

In [8]:
# Repeat the completeness check now that the week-10 rows have been appended.
missing_combos = find_missing_combinations(df=combined_df)

Found 32 missing combinations!

Missing combinations:
Year: 2015, Week: 1, Entity: Aguascalientes
Year: 2015, Week: 1, Entity: Baja California
Year: 2015, Week: 1, Entity: Baja California Sur
Year: 2015, Week: 1, Entity: Campeche
Year: 2015, Week: 1, Entity: Chiapas
Year: 2015, Week: 1, Entity: Chihuahua
Year: 2015, Week: 1, Entity: Ciudad de México
Year: 2015, Week: 1, Entity: Coahuila
Year: 2015, Week: 1, Entity: Colima
Year: 2015, Week: 1, Entity: Durango
Year: 2015, Week: 1, Entity: Guanajuato
Year: 2015, Week: 1, Entity: Guerrero
Year: 2015, Week: 1, Entity: Hidalgo
Year: 2015, Week: 1, Entity: Jalisco
Year: 2015, Week: 1, Entity: Michoacán
Year: 2015, Week: 1, Entity: Morelos
Year: 2015, Week: 1, Entity: México
Year: 2015, Week: 1, Entity: Nayarit
Year: 2015, Week: 1, Entity: Nuevo León
Year: 2015, Week: 1, Entity: Oaxaca
Year: 2015, Week: 1, Entity: Puebla
Year: 2015, Week: 1, Entity: Querétaro
Year: 2015, Week: 1, Entity: Quintana Roo
Year: 2015, Week: 1, Entity: San Luis Potos

In [9]:
# Columns that are checked for values below zero
cols_to_check = ['M', 'F', 'New_Cases_Week']

# Keep only the rows where at least one of those columns is negative
negatives = combined_df.loc[(combined_df[cols_to_check] < 0).any(axis=1)]

# Report the outcome
if not negatives.empty:
    print("⚠️ Found negative values in the following rows:")
    print(negatives)
else:
    print("✓ No negative values found in M, F or New_Cases_Week")

✓ No negative values found in M, F or New_Cases_Week


In [None]:
# Order the combined table by year, week and entity, with a fresh 0..n-1 index.
final_df = combined_df.sort_values(['Year', 'Week', 'Entity']).reset_index(drop=True)

# Writing the v3 file is opt-in, so a plain "Run All" never overwrites it.
# Set SAVE_OUTPUT to True to export.
SAVE_OUTPUT = False
if SAVE_OUTPUT:
    output_folder = '../data/interim/'
    filename = 'data_v3_English.csv'
    route = os.path.join(output_folder, filename)
    final_df.to_csv(
        route,
        index=False,
        encoding='utf-8'
    )

# info() prints its report itself and returns None; wrapping it in print()
# only added a stray "None" line to the output.
final_df.info()
final_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18304 entries, 0 to 18303
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Year            18304 non-null  int64  
 1   Week            18304 non-null  int64  
 2   Date            18304 non-null  object 
 3   Entity          18304 non-null  object 
 4   Epi_Year        18304 non-null  int64  
 5   M               18304 non-null  float64
 6   F               18304 non-null  float64
 7   New_Cases_Week  18304 non-null  float64
dtypes: float64(3), int64(3), object(2)
memory usage: 1.1+ MB
None


Unnamed: 0,Year,Week,Date,Entity,Epi_Year,M,F,New_Cases_Week
0,2014,2,2014-01-06,Aguascalientes,2014,0.0,0.0,0.0
1,2014,2,2014-01-06,Baja California,2014,0.0,0.0,0.0
2,2014,2,2014-01-06,Baja California Sur,2014,0.0,0.0,0.0
3,2014,2,2014-01-06,Campeche,2014,0.0,0.0,0.0
4,2014,2,2014-01-06,Chiapas,2014,0.0,0.0,0.0
