In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


  from scipy.stats import gaussian_kde


In [3]:
# Locations of the raw per-city CSV files, kept together so they are easy to change.
NYC_RAW_CSV = "data/NYC_data_raw.csv"
CHI_RAW_CSV = "data/CHI_data_raw.csv"

# Read each city's raw file into its own DataFrame.
nyc_df = pd.read_csv(NYC_RAW_CSV)
chicago_df = pd.read_csv(CHI_RAW_CSV)

In [5]:
def data_check(df, missing_thresh=0, variance_thresh=.1):
    """Print a quick data-quality report for ``df``.

    Three sections are written to stdout, in this order:

    1. Columns whose fraction of missing values is strictly greater than
       ``missing_thresh``, sorted from most to least missing.
    2. Numeric columns whose variance (pandas ``DataFrame.var`` default) is
       strictly below ``variance_thresh``.
    3. Columns holding at most one distinct value, where NaN counts as a
       value of its own.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is only read, never modified.
    missing_thresh : float, default 0
        Fraction of missing values (0-1) a column must exceed to be listed.
    variance_thresh : float, default 0.1
        Variance a numeric column must fall below to be listed.

    Returns
    -------
    None
        The report is printed; nothing is returned.
    """
    # Missing Values
    missing_frac = df.isna().mean().sort_values(ascending=False)
    print("Missing value columns:")
    print(missing_frac[missing_frac > missing_thresh])

    # Low Variance (numeric only)
    # Compute the variances once and reuse them for both the mask and the values.
    variances = df.select_dtypes(include='number').var()
    print("Low variance columns:")
    print(variances[variances < variance_thresh])

    # Constant Columns
    # Column-wise nunique avoids label-based lookups, so frames with repeated
    # column labels are handled as well.
    distinct_counts = df.nunique(dropna=False)
    constant_cols = distinct_counts[distinct_counts <= 1].index.tolist()
    print("Constant-value columns:")
    print(constant_cols)

In [7]:
# Print the missing-value, low-variance and constant-column report for nyc_df
# using data_check's default thresholds.
data_check(nyc_df)

Missing value columns:
Series([], dtype: float64)
Low variance columns:
rain_max        0.042366
snowfall_sum    0.064662
snowfall_max    0.003356
dtype: float64
Constant-value columns:
['state']


In [9]:
# Print the missing-value, low-variance and constant-column report for chicago_df
# using data_check's default thresholds.
data_check(chicago_df)

Missing value columns:
Series([], dtype: float64)
Low variance columns:
rain_sum        0.058780
rain_max        0.031328
snowfall_sum    0.047946
snowfall_max    0.004886
dtype: float64
Constant-value columns:
['state']


In [11]:
# Correlation Matrix for NYC
# List every pair of numeric columns whose absolute correlation is strictly
# above the threshold below.
corr_threshold = 0.95

numeric_cols = nyc_df.select_dtypes(include='number')
corr_matrix = numeric_cols.corr()

# Keep only the strict upper triangle: each pair appears once and the diagonal
# (a column against itself) is excluded. Everything else becomes NaN.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# NaN compares as False, so masked-out and undefined correlations drop out.
# np.where walks the mask row by row, so pairs come out in (row, column) order.
strong_mask = (upper.abs() > corr_threshold).to_numpy()
row_pos, col_pos = np.where(strong_mask)
labels = upper.columns
high_corr = [
    (labels[r], labels[c], upper.iat[r, c])
    for r, c in zip(row_pos, col_pos)
]

for first_col, second_col, corr_value in high_corr:
    print(f"{first_col} \n{second_col} \ncorr = {corr_value:.2f}\n")


apparent_temperature_min 
apparent_temperature_max 
corr = 0.96

apparent_temperature_min_lag 
apparent_temperature_max_lag 
corr = 0.96

us_aqi_pm2_5_min 
us_aqi_min 
corr = 0.97

us_aqi_pm2_5_min 
us_aqi_pm10_min 
corr = 0.96

us_aqi_pm2_5_max 
us_aqi_pm10_max 
corr = 0.96

us_aqi_pm2_5_mean 
us_aqi_mean 
corr = 0.96

us_aqi_pm2_5_mean 
us_aqi_pm10_mean 
corr = 0.96

us_aqi_min 
us_aqi_pm10_min 
corr = 0.96

us_aqi_pm10_max 
us_aqi_pm10_mean 
corr = 0.95

us_aqi_carbon_monoxide_max 
us_aqi_carbon_monoxide_mean 
corr = 0.96



In [13]:
# Correlation Matrix for Chicago
# List every pair of numeric columns whose absolute correlation is strictly
# above the threshold below.
corr_threshold = 0.90

numeric_cols = chicago_df.select_dtypes(include='number')
corr_matrix = numeric_cols.corr()

# Keep only the strict upper triangle: each pair appears once and the diagonal
# (a column against itself) is excluded. Everything else becomes NaN.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# NaN compares as False, so masked-out and undefined correlations drop out.
# np.where walks the mask row by row, so pairs come out in (row, column) order.
strong_mask = (upper.abs() > corr_threshold).to_numpy()
row_pos, col_pos = np.where(strong_mask)
labels = upper.columns
high_corr = [
    (labels[r], labels[c], upper.iat[r, c])
    for r, c in zip(row_pos, col_pos)
]

for first_col, second_col, corr_value in high_corr:
    print(f"{first_col} \n{second_col} \ncorr = {corr_value:.2f}\n")


rain_sum 
rain_max 
corr = 0.91

apparent_temperature_min 
apparent_temperature_max 
corr = 0.95

apparent_temperature_min 
apparent_temperature_min_lag 
corr = 0.93

apparent_temperature_min 
apparent_temperature_max_lag 
corr = 0.93

apparent_temperature_max 
apparent_temperature_max_lag 
corr = 0.92

apparent_temperature_min_lag 
apparent_temperature_max_lag 
corr = 0.95

us_aqi_pm2_5_min 
us_aqi_pm2_5_mean 
corr = 0.95

us_aqi_pm2_5_min 
us_aqi_min 
corr = 0.96

us_aqi_pm2_5_min 
us_aqi_mean 
corr = 0.91

us_aqi_pm2_5_min 
us_aqi_pm10_min 
corr = 0.96

us_aqi_pm2_5_min 
us_aqi_pm10_mean 
corr = 0.92

us_aqi_pm2_5_max 
us_aqi_pm2_5_mean 
corr = 0.95

us_aqi_pm2_5_max 
us_aqi_mean 
corr = 0.91

us_aqi_pm2_5_max 
us_aqi_pm10_max 
corr = 0.95

us_aqi_pm2_5_max 
us_aqi_pm10_mean 
corr = 0.92

us_aqi_pm2_5_mean 
us_aqi_min 
corr = 0.92

us_aqi_pm2_5_mean 
us_aqi_mean 
corr = 0.95

us_aqi_pm2_5_mean 
us_aqi_pm10_min 
corr = 0.92

us_aqi_pm2_5_mean 
us_aqi_pm10_max 
corr = 0.92

us_aqi_pm2

In [89]:
# Summary statistics for the us_aqi_min / us_aqi_max / us_aqi_mean columns of chicago_df.
aqi_summary_cols = ['us_aqi_min', 'us_aqi_max', 'us_aqi_mean']
chicago_df[aqi_summary_cols].describe()

Unnamed: 0,us_aqi_min,us_aqi_max,us_aqi_mean
count,864.0,864.0,864.0
mean,39.635812,58.808404,47.394055
std,14.781023,25.344428,16.796872
min,13.624768,23.249071,21.082153
25%,28.398832,41.438353,34.965097
50%,36.232642,53.195922,44.444055
75%,49.227432,67.367016,56.1806
max,128.53127,187.5,148.74384
