In [1]:
import pandas as pd
import os
from utils import *
from aggregations import *

In [2]:
# Catalogue of the source tables used in this notebook.
#
# Each entry describes one CSV file:
#   path       - location of the CSV, built with os.path.join
#   gran       - free-text note on the table's granularity (not read by the
#                code visible in this notebook)
#   date_range - [start, end] period labels (not read by the visible code;
#                they look like YYYYMM or YYYY strings - TODO confirm)
#   nom        - table name, same as the dictionary key
#   delimiter  - field separator handed to load_table
#   select     - mapping {raw CSV column: column name used in the notebook},
#                handed to select_rename_columns
data_dict = {
    "pib_par_industrie": {
        "path": os.path.join("data", "stats_can", "pib_par_industrie.csv"),
        "gran": "industrie and monthly",
        "date_range": ["199701", "202312"],
        "nom": "pib_par_industrie",
        "delimiter": ",",
        "select": {
            "REF_DATE": "CalendarMonth",
            "North American Industry Classification System (NAICS)": "industry",
            "VALUE": "PIB_par_industrie",
        },
    },
    "investissement_construction": {
        "path": os.path.join("data", "stats_can", "investissement_construction.csv"),
        "gran": "monthly",
        "date_range": ["201701", "202312"],
        "nom": "investissement_construction",
        "delimiter": ";",
        # NOTE(review): the recorded run of the aggregation loop failed on this
        # table with KeyError "['work_type'] not found in axis". Presumably
        # agg_investissement expects a 'work_type' column (perhaps the raw
        # "Type de travaux" column) that is not selected here - TODO confirm.
        "select": {
            "PÉRIODE DE RÉFÉRENCE": "CalendarMonth",
            "Type de structure": "structure_type",
            # The output name is misspelled ("inverstissement") but is kept
            # as-is because other code refers to the column by this name.
            "VALEUR": "inverstissement_construction",
        },
    },
    "construction_par_region": {
        "path": os.path.join("data", "stats_can", "construction_par_region.csv"),
        "gran": "yearly, par region",
        "date_range": ["200501", "202312"],
        "nom": "construction_par_region",
        "delimiter": ";",
        "select": {
            "PÉRIODE DE RÉFÉRENCE": "CalendarYear",
            "Estimations de logement": "construction_status",
            "Type d'unité": "construction_unit_type",
            "VALEUR": "unit_count",
        },
    },
    "indice_de_prix_logements": {
        "path": os.path.join("data", "stats_can", "indice_de_prix_logements.csv"),
        "gran": "monthly",
        "date_range": ["200501", "202312"],
        "nom": "indice_de_prix_logements",
        "delimiter": ";",
        "select": {
            "PÉRIODE DE RÉFÉRENCE": "CalendarMonth",
            "Indices des prix des logements neufs": "new_housing_price_index",
            "VALEUR": "indice_de_prix_logements",
        },
    },
    "taux_hypothecaire_terme_5ans": {
        "path": os.path.join("data", "stats_can", "taux_hypothecaire_terme_5ans.csv"),
        "gran": "monthly",
        "date_range": ["200501", "202312"],
        "nom": "taux_hypothecaire_terme_5ans",
        "delimiter": ";",
        "select": {
            "PÉRIODE DE RÉFÉRENCE": "CalendarMonth",
            "VALEUR": "taux_hypothecaire_terme_5ans",
        },
    },
    "demographic_growth": {
        "path": os.path.join("data", "stats_can", "demographic_growth.csv"),
        "gran": "yearly",
        "date_range": ["1998", "2023"],
        "nom": "demographic_growth",
        "delimiter": ",",
        # Columns are selected without being renamed for this table.
        "select": {
            "REF_DATE": "REF_DATE",
            "Components of population growth": "Components of population growth",
            "VALUE": "VALUE",
        },
    },
}

In [3]:
# Load every table described in data_dict, keep/rename its columns, run the
# table-specific aggregation (when one exists) and collect the result in
# output_dict, keyed by table name.

# Aggregation function per table. Tables without an entry
# (taux_hypothecaire_terme_5ans) skip the aggregation step.
aggregators = {
    "pib_par_industrie": agg_pib,
    "investissement_construction": agg_investissement,
    "construction_par_region": agg_construction,
    "indice_de_prix_logements": agg_price_index,
    "demographic_growth": agg_demo,
}

output_dict = {}
for table, table_info in data_dict.items():
    df = load_table(table_info["path"], table_info["delimiter"])
    df = select_rename_columns(df, table_info["select"])
    # Store the un-aggregated frame right away so it is still available in
    # output_dict if the aggregation below raises.
    output_dict[table] = df

    # Visual separator (a line of 100 spaces) followed by the table name.
    print(" " * 100)
    print(table)

    aggregate = aggregators.get(table)
    if aggregate is not None:
        # Columns before aggregation.
        print(df.columns)
        df = extract_year(df)
        df = aggregate(df)

    # Columns after aggregation (or the selected columns when no aggregation ran).
    print(df.columns)
    # NOTE(review): for aggregated tables extract_year has already been applied
    # above, so this is a second call on the aggregated frame; kept because its
    # effect is not visible from this notebook - TODO confirm it is needed.
    df = extract_year(df)

    output_dict[table] = df

                                                                                                    
pib_par_industrie
Index(['CalendarMonth', 'industry', 'PIB_par_industrie'], dtype='object')
Index(['CalendarMonth', 'All industries [T001]', 'Construction [23]',
       'Energy sector [T016]', 'Finance and insurance [52]',
       'Public Sector [T018]', 'Real estate and rental and leasing [53]',
       'Transportation and warehousing [48-49]'],
      dtype='object', name='industry')
                                                                                                    
investissement_construction
Index(['CalendarMonth', 'structure_type', 'inverstissement_construction'], dtype='object')


KeyError: "['work_type'] not found in axis"

In [None]:

# Build one wide frame out of all prepared tables with successive outer merges.

# Start from the first two tables, joined on their shared CalendarMonth key.
joined_df = output_dict["pib_par_industrie"].merge(
    output_dict["investissement_construction"],
    on="CalendarMonth",
    how="outer",
)

# Integer year key taken from the first four characters of CalendarMonth
# (assumes the CalendarMonth values are strings); the CalendarYear merges
# below join on it.
joined_df["CalendarYear"] = joined_df["CalendarMonth"].str[:4].astype(int)

# Remaining tables in merge order, each paired with the key it is joined on.
remaining_merges = [
    ("construction_par_region", "CalendarYear"),
    ("indice_de_prix_logements", "CalendarMonth"),
    ("taux_hypothecaire_terme_5ans", "CalendarMonth"),
    ("demographic_growth", "CalendarYear"),
]
for table_name, join_key in remaining_merges:
    joined_df = joined_df.merge(output_dict[table_name], on=join_key, how="outer")

# (current column label, final column label) pairs.
rename_list = [
    ("All industries [T001]", "PIB_all_industries"),
    ("Construction [23]", "PIB_construction"),
    ("Energy sector [T016]", "PIB_energy"),
    ("Finance and insurance [52]", "PIB_finance_insurance"),
    ("Public Sector [T018]", "PIB_public_sector"),
    ("Real estate and rental and leasing [53]", "PIB_real_estate"),
    ("Transportation and warehousing [48-49]", "PIB_transportation"),
    ("inverstissement_construction", "inverstissement_construction"),
    ("unit_count", "construction_unit_count"),
    ("Maison seulement", "house_only_price_index"),
    ("Terrain seulement", "land_only_price_index"),
    ("Total (maison et terrain)", "price_index_total"),
]
# No final label is also a source label of another pair, so a single rename
# with the whole mapping gives the same result as renaming pair by pair.
joined_df = joined_df.rename(columns=dict(rename_list))

In [None]:
# Report the merged frame's column labels, then its number of rows.
print(joined_df.columns, len(joined_df), sep="\n")

In [None]:
# Show the merged frame as this cell's output.
joined_df

In [None]:
# Hand the merged frame to save_table under the name "joined_data".
# NOTE(review): presumably this writes a CSV named joined_data.csv into a
# cleaned_data directory (a later cell reads that path) - confirm in utils.
save_table(joined_df, "joined_data")

In [None]:
# Read the saved CSV back in and display it as a round-trip check.
# The path is built with os.path.join (as for the source tables) instead of a
# hard-coded backslash: "\j" is not a valid string escape and the backslash
# separator only works on Windows.
# NOTE(review): assumes save_table wrote cleaned_data/joined_data.csv - confirm.
test_df = load_table(os.path.join("cleaned_data", "joined_data.csv"), ",")
test_df

In [None]:
# Pull the frame stored for the "investissement_construction" table out of
# output_dict for closer inspection.
inv_df = output_dict["investissement_construction"]

In [None]:
# Show the investissement_construction frame as this cell's output.
inv_df

In [None]:
# Reload the raw investissement_construction CSV (no column selection or
# renaming) to inspect its original columns. Path and delimiter come from
# data_dict so this cell cannot drift from the table's configuration.
inv_source = data_dict["investissement_construction"]
test_inv_df = load_table(inv_source["path"], inv_source["delimiter"])

In [None]:
# Exploration of the raw investissement_construction file.
# Earlier checks (the raw column list and the distinct "Type de structure"
# values) are no longer run; only the count below is still printed.
# Number of distinct values in "Type de travaux"; dropna=False keeps a missing
# value counted as one distinct value, as len(Series.unique()) does.
print(test_inv_df["Type de travaux"].nunique(dropna=False))