In [None]:
import pandas as pd
import numpy as np
import os


In [None]:
# Input CSV locations, given as absolute paths on the local filesystem.
# NOTE(review): the first path points inside `.ipynb_checkpoints`, i.e. at a
# Jupyter autosave copy ("combined_data-checkpoint.csv") — presumably the
# real `processed_data/combined_data.csv` was intended; confirm before relying on it.
path_directory = "/home/oem/PycharmProjects/RealEstate_Data_Pipeline/processed_data/.ipynb_checkpoints/combined_data-checkpoint.csv"
path_direct2 = "/home/oem/PycharmProjects/RealEstate_Data_Pipeline/spark_query/dataframe1.csv"

In [None]:
# Loading function
def load_data(path_directory, path_direct2):
    """Read two CSV files and report every column that holds mixed Python types.

    Args:
        path_directory: Path of the CSV loaded as the first frame.
        path_direct2: Path of the CSV loaded as the second frame.

    Returns:
        tuple: ``(df1, df2)`` — the two DataFrames, in argument order.
    """
    frames = {
        "df1": pd.read_csv(path_directory, low_memory=False),
        "df2": pd.read_csv(path_direct2, low_memory=False),
    }

    for label, frame in frames.items():
        for name in frame.columns:
            # More than one distinct Python type among the cells => mixed column.
            distinct_types = frame[name].apply(type).nunique()
            if distinct_types > 1:
                print(f"These col: {name} has mixed data type in {label}")

    return frames["df1"], frames["df2"]

In [None]:
# Load both CSVs; load_data also prints the name of any column whose cells
# hold more than one Python type.
df1, df2 = load_data(path_directory, path_direct2)

In [None]:
# Display the dtype pandas inferred for each column of df1.
df1.dtypes

In [None]:
# Correcting the dtype of columns
def correcting_dtypes(df1, df2):
    """Cast the location/label columns of both frames to ``str``.

    The previous implementation tested ``isinstance(series, object)``, which
    is true for every Python value, so only ``df1`` was ever converted and
    ``df2`` was silently left untouched. Both frames are now processed
    independently, and a column that is absent from a frame is skipped
    instead of raising ``KeyError``.

    Args:
        df1: First DataFrame; its matching columns are converted in place.
        df2: Second DataFrame; its matching columns are converted in place.

    Returns:
        tuple: ``(df1, df2)`` — the same two objects that were passed in.

    Note:
        ``astype(str)`` on a classic object-dtype column turns missing values
        into the literal string ``"nan"``; a later ``fillna`` will not see them.
    """
    cols_affected = [
        "State",
        "City",
        "Metro",
        "CountyName",
        "StateName"
    ]

    for frame in (df1, df2):
        for col in cols_affected:
            # Each frame is handled on its own; skip columns it does not have.
            if col in frame.columns:
                frame[col] = frame[col].astype(str)

    return df1, df2

In [None]:
# correcting_dtypes requires both frames and returns them as a pair, so pass
# both and unpack both (the single-argument call raised TypeError).
df1, df2 = correcting_dtypes(df1, df2)
df1.head(10)

In [None]:
# Dropping abnormal columns
def clean_data(df):
    """Drop the unwanted columns, print the duplicate-row count, and zero-fill gaps.

    Args:
        df: DataFrame containing the ``City``, ``StateCodeFIPS`` and
            ``MunicipalCodeFIPS`` columns (a missing one raises ``KeyError``).

    Returns:
        A new DataFrame without those three columns and with every missing
        value replaced by ``0``. Duplicate rows are only counted, not removed.
    """
    unwanted = ['City', 'StateCodeFIPS', "MunicipalCodeFIPS"]
    trimmed = df.drop(columns=unwanted)

    # Report how many rows repeat an earlier row; the rows themselves are kept.
    duplicate_count = trimmed.duplicated(keep='first').sum()
    print(duplicate_count)

    return trimmed.fillna(0)

In [None]:
# Rebind df1 to the cleaned frame.
# NOTE: this cell is not re-runnable — on a second run the columns that
# clean_data drops are already gone, so the drop raises KeyError.
df1 = clean_data(df1)

In [None]:
def calculate_quarter_prices(df, quarter, year_start=2000, year_end=2024):
    """Add one ``<year>_<quarter>_qtr_prices`` column per year to a copy of ``df``.

    Each new column is the row-wise mean (rounded to 2 decimals) of the three
    month-end columns of that quarter, whose names follow ``YYYY-MM-DD``
    (e.g. ``2000-01-31``). Years for which any of the three columns is
    missing are reported with a warning and skipped.

    Args:
        df: DataFrame holding the month-end price columns. It is not modified.
        quarter: One of ``"first"``, ``"second"``, ``"third"``, ``"fourth"``.
        year_start: First year to process (inclusive).
        year_end: Last year to process (inclusive).

    Returns:
        A copy of ``df`` with the quarterly columns of every processed year
        appended. If no year could be processed, the copy is returned unchanged.

    Raises:
        ValueError: If ``quarter`` is not one of the four supported names.
    """
    quarter_months = {
        "first": ("01-31", "02-28", "03-31"),
        "second": ("04-30", "05-31", "06-30"),
        "third": ("07-31", "08-31", "09-30"),
        "fourth": ("10-31", "11-30", "12-31")
    }

    if quarter not in quarter_months:
        raise ValueError(f"Invalid quarter name: {quarter}. Must be one of: {list(quarter_months.keys())}")

    # Copy once, up front, so the columns of every year accumulate on the same
    # frame and the caller's DataFrame is never touched.
    result = df.copy()

    for year in range(year_start, year_end + 1):
        # Fresh list per year: the February adjustment must neither leak into
        # other years nor overwrite the middle month of the other quarters.
        months = list(quarter_months[quarter])
        if quarter == "first" and pd.Timestamp(year=year, month=2, day=1).is_leap_year:
            months[1] = "02-29"

        # Generate the column names for the quarter
        date_columns = [f"{year}-{month}" for month in months]

        # Check for missing columns
        missing_cols = [col for col in date_columns if col not in result.columns]
        if missing_cols:
            print(f"Warning: Missing columns for {year} {quarter}: {missing_cols}")
            continue

        # Calculate the average and add the new column
        column_name = f"{year}_{quarter}_qtr_prices"
        result[column_name] = result[date_columns].mean(axis=1).round(2)

    return result

# `texas_df` is never defined in this notebook (and `tx_df` is only created in
# a later cell), so select the Texas rows from df1 directly. `.head()` keeps
# the cell output to a preview instead of the whole frame.
calculate_quarter_prices(df1[df1["State"] == "TX"], quarter="first").head()

In [None]:
def create_state_dfs(df, states=None):
    """Split ``df`` into one DataFrame per state code.

    Args:
        df: DataFrame with a ``State`` column holding state codes.
        states: Optional iterable of state codes to extract. When omitted,
            the notebook's default list of 21 states is used.

    Returns:
        dict: Maps each requested state code to the rows of ``df`` whose
        ``State`` equals that code. A code with no matching rows maps to an
        empty DataFrame.
    """
    if states is None:
        # Default list of states
        states = [
            "TX", "CA", "NY", "FL", "IL",
            "OH", "GA", "MA", "VA", "WA",
            "PA", "NC", "CO", "MN", "IN",
            "MI", "IA", "MD", "KS", "UT", "OR"
        ]

    # Create a dictionary of DataFrames for each state
    state_dfs = {state: df[df["State"] == state] for state in states}

    return state_dfs


In [None]:
# Split the cleaned frame into one DataFrame per supported state.
state_dfs = create_state_dfs(df1)

# Pull out the per-state frames used further down.
tx_df, fl_df, oh_df = (state_dfs[code] for code in ("TX", "FL", "OH"))

In [None]:
# Preview the first rows of the Texas subset.
tx_df.head()