## Preliminaries

In [47]:
# Standard imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from warnings import filterwarnings

# Important functionality for this notebook
from scipy import stats
import statsmodels.api as sm
import os
from pathlib import Path # For working with file paths
import re # For regex

In [3]:
# Set directory
os.chdir("C:/Users/emshe/Desktop/BRAINSTATION/CAPSTONE/GIT_REPO/DATA/ECONOMIC")

## Define base data processor class

In [77]:
# Define base data folder processor class
class BaseDataFolderProcessor:
    def __init__(self, folder_path, file_prefix, year_range = range(2016, 2025) ):
        self.folder_path = Path(folder_path)
        self.file_prefix = file_prefix
        self.year_range = year_range # Our default year range for this project is 2016 to 2024 so we make this argument optional
        self.data_by_year = self.load_data()
        
    def reset_data(self):
        pass
    
    def is_clean(self):
        print(f'Data is clean: {self.cleaned}')
        return self
    
    def clean_data(self):
        pass

    def preprocess_data(self):
        pass

    def load_data(self): # Load CSV files into a dictionary keyed by year.
        data_dict = {}
        for year in self.year_range:
            file_name = f"{self.file_prefix}_{year}.csv"
            file_path = self.folder_path / file_name
            if file_path.exists():
                try:
                    df = pd.read_csv(file_path, skiprows=2, encoding='latin1', na_values = '**',usecols=range(11))
                    df = self.clean_dataframe(df,file_name)
                    data_dict[year] = df
                except Exception as e:
                    print(f"Error loading {file_name}: {e}")
            else:
                print(f"Warning: {file_name} not found.")
        return data_dict
    def clean_dataframe(self, df, file_name):
        # Extract suffix after 'by_'
        match = re.search(r'by_(\w+)_\d{4}', file_name)
        if match:
            area_type = match.group(1).capitalize()
            df.columns.values[0] = area_type  # Rename first column
        else:
            df.columns.values[0] = 'Area'  # Fallback name

        # Rename data quality columns based on their relative positions
        col_renames = {
            df.columns[2]: 'Bachelor DQ',
            df.columns[4]: '1 Bedroom DQ',
            df.columns[6]: '2 Bedroom DQ',
            df.columns[8]: '3 Bedroom + DQ',
            df.columns[10]: 'Total DQ'
        }
        df = df.rename(columns=col_renames)

        # Find the index of the row where area == 'Vancouver'
        area_col = df.columns[0]
        vancouver_index = df[df[area_col].str.strip().str.lower() == 'vancouver'].index.min()

        # Drop that row and all rows after it in order to exclude unnecessary rows
        if pd.notnull(vancouver_index):
            df = df.loc[:vancouver_index - 1]
        return df

    def get_data_for_year(self, year): # Retrieve the data for a specific year
        return self.data_by_year.get(year)
    def save_processed_data(self):
        pass

In [78]:
# Define multifolder data folder processor class
class MultiFolderDataProcessor:
    def __init__(self, base_folder_path, folder_prefix_pairs, year_range=range(2016, 2025)):
        self.base_folder_path = Path(base_folder_path)
        self.folder_prefix_pairs = folder_prefix_pairs
        self.year_range = year_range
        self.data = self.load_all_data()

    def load_all_data(self): # Loads data from all subfolders and prefixes into a nested dictionary.
        all_data = {}
        for subfolder, prefix in self.folder_prefix_pairs:
            full_path = self.base_folder_path / subfolder
            processor = BaseDataFolderProcessor(full_path, prefix, self.year_range)
            all_data[subfolder] = processor.data_by_year
        return all_data

    def get_data(self, subfolder, year): # Retrieve DataFrame for a specific subfolder and year.
        return self.data.get(subfolder, {}).get(year, None)

## Testing function: test_sequence()

In [69]:
# Define function to streamline testing
def test_sequence():
    # Instantiate avg rent by census tract class
    avg_rent_processor = BaseDataFolderProcessor("AVG_RENTS/BY_TRACT", "avg_rent_by_tract")

    # Extract 2020 specific data
    df_2020 = avg_rent_processor.get_data_for_year(2020)

    # Examine dataframe head
    display(df_2020.tail(5))
    display(df_2020.info())

In [70]:
# Run test sequence
test_sequence()

Unnamed: 0,Tract,Bachelor,Bachelor DQ,1 Bedroom,1 Bedroom DQ,2 Bedroom,2 Bedroom DQ,3 Bedroom +,3 Bedroom + DQ,Total,Total DQ
280,The following letter codes are used to indicat...,,,,,,,,,,
281,**  Data suppressed to protect confidentialit...,,,,,,,,,,
282,++  Change in rent is not statistically signi...,,,,,,,,,,
283,"CMA, CA and CSD definitions are based on 2016 ...",,,,,,,,,,
284,Source,CMHC Rental Market Survey,,,,,,,,,


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 285 entries, 0 to 284
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Tract           285 non-null    object
 1   Bachelor        83 non-null     object
 2   Bachelor DQ     82 non-null     object
 3   1 Bedroom       129 non-null    object
 4   1 Bedroom DQ    129 non-null    object
 5   2 Bedroom       127 non-null    object
 6   2 Bedroom DQ    127 non-null    object
 7   3 Bedroom +     27 non-null     object
 8   3 Bedroom + DQ  27 non-null     object
 9   Total           139 non-null    object
 10  Total DQ        139 non-null    object
dtypes: object(11)
memory usage: 24.6+ KB


None

## Miscellaneous

In [79]:
# Define your subfolders and prefixes
folder_prefixes = [
    ("BY_TRACT", "avg_rent_by_tract"),
    ("BY_NEIGHBORHOOD", "avg_rent_by_neigh")
]

# Initialize
multi_proc = MultiFolderDataProcessor("AVG_RENTS", folder_prefixes)

# Get 2020 data from BY_NEIGHBORHOOD
df_neigh_2020 = multi_proc.get_data("BY_NEIGHBORHOOD", 2020)

# Get 2017 data from BY_TRACT
df_tract_2017 = multi_proc.get_data("BY_TRACT", 2017)


display(df_neigh_2020.tail(5))
display(df_tract_2017.tail(5))

Unnamed: 0,Neigh,Bachelor,Bachelor DQ,1 Bedroom,1 Bedroom DQ,2 Bedroom,2 Bedroom DQ,3 Bedroom +,3 Bedroom + DQ,Total,Total DQ
63,West End/Stanley Park South,1202.0,a,1493,a,2196,a,,,1499,a
64,Westside Heights,1020.0,a,1253,a,1482,a,1742.0,a,1284,a
65,Westside/Kerrisdale Remainder,,,1609,a,2291,a,2999.0,a,1919,a
66,Whalley,715.0,b,1185,a,1207,a,1535.0,a,1242,a
67,White Rock,946.0,b,1091,a,1422,a,,,1191,a


Unnamed: 0,Tract,Bachelor,Bachelor DQ,1 Bedroom,1 Bedroom DQ,2 Bedroom,2 Bedroom DQ,3 Bedroom +,3 Bedroom + DQ,Total,Total DQ
271,504.11,,,,,,,,,,
272,504.12,,,,,,,,,,
273,505.01,,,,,,,,,,
274,506.01,,,,,,,,,,
275,506.02,,,,,,,,,,


In [76]:
# Define your subfolders and prefixes
folder_prefixes = [
    ("BY_TRACT", "avg_rent_by_tract"),
    ("BY_NEIGHBORHOOD", "avg_rent_by_neigh")
]

# Initialize
multi_proc = MultiFolderDataProcessor("AVG_RENTS", folder_prefixes)

# Get 2020 data from BY_NEIGHBORHOOD
df_neigh_2016 = multi_proc.get_data("BY_NEIGHBORHOOD", 2020)

# Get 2017 data from BY_TRACT
df_tract_2020 = multi_proc.get_data("BY_TRACT", 2020)


display(df_neigh_2016.tail(5))
display(df_tract_2020.tail(5))

Unnamed: 0,Neigh,Bachelor,Bachelor DQ,1 Bedroom,1 Bedroom DQ,2 Bedroom,2 Bedroom DQ,3 Bedroom +,3 Bedroom + DQ,Total,Total DQ
63,West End/Stanley Park South,1202.0,a,1493,a,2196,a,,,1499,a
64,Westside Heights,1020.0,a,1253,a,1482,a,1742.0,a,1284,a
65,Westside/Kerrisdale Remainder,,,1609,a,2291,a,2999.0,a,1919,a
66,Whalley,715.0,b,1185,a,1207,a,1535.0,a,1242,a
67,White Rock,946.0,b,1091,a,1422,a,,,1191,a


Unnamed: 0,Tract,Bachelor,Bachelor DQ,1 Bedroom,1 Bedroom DQ,2 Bedroom,2 Bedroom DQ,3 Bedroom +,3 Bedroom + DQ,Total,Total DQ
273,504.11,,,,,,,,,,
274,504.12,,,,,,,,,,
275,505.01,,,,,,,,,,
276,506.01,,,,,,,,,,
277,506.02,,,,,,,,,,


In [80]:
# Examine data dictionary
data_dict = avg_rent_processor.data_by_year
data_dict[2024]

Unnamed: 0,Tract,Bachelor,Bachelor DQ,1 Bedroom,1 Bedroom DQ,2 Bedroom,2 Bedroom DQ,3 Bedroom +,3 Bedroom + DQ,Total,Total DQ,Unnamed: 11
0,0001.01,,,,,947,a,1090,a,1020,a,
1,0001.02,,,2599,b,2758,d,,,2697,c,
2,0002.01,,,,,1766,a,,,1623,a,
3,0002.03,,,,,,,,,,,
4,0003.01,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
311,The following letter codes are used to indicat...,,,,,,,,,,,
312,**  Data suppressed to protect confidentialit...,,,,,,,,,,,
313,++  Change in rent is not statistically signi...,,,,,,,,,,,
314,"CMA, CA and CSD definitions are based on 2021 ...",,,,,,,,,,,
