In [66]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import gaussian_kde


In [3]:
# Load the raw per-city CSV files. The paths are relative, so they resolve
# against the kernel's current working directory.
nyc_df = pd.read_csv("NYC_data_raw.csv")
chicago_df = pd.read_csv("Chicago_data_raw.csv")

In [57]:
def data_check(df, missing_thresh=0.1, variance_thresh=.025):
    """Print a quick data-quality report for ``df``.

    Three sections are printed, in this order:

    1. "Missing value columns:" - columns whose fraction of missing values is
       strictly greater than ``missing_thresh``, most-missing first.
    2. "Low variance columns:" - numeric columns whose variance (pandas
       default, i.e. sample variance) is strictly below ``variance_thresh``.
       A column whose variance is NaN (for example an all-NaN column) is not
       listed, because NaN fails the ``<`` comparison.
    3. "Constant-value columns:" - columns holding at most one distinct value,
       where NaN counts as a value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is not modified.
    missing_thresh : float, default 0.1
        Fraction of missing values (0-1) above which a column is reported.
    variance_thresh : float, default 0.025
        Variance below which a numeric column is reported. Variance is not
        scale-normalised, so this is in squared units of each column.

    Returns
    -------
    None
        The report is written to stdout only.
    """
    # Missing Values
    missing = df.isna().mean().sort_values(ascending=False)
    missing_cols = missing[missing > missing_thresh]
    print("Missing value columns:")
    print(missing_cols)

    # Low Variance (numeric only)
    # Compute the per-column variance once and reuse it for both the mask and
    # the selection instead of evaluating ``.var()`` twice.
    variances = df.select_dtypes(include='number').var()
    low_var = variances[variances < variance_thresh]
    print("Low variance columns:")
    print(low_var)

    # Constant Columns
    # dropna=False makes NaN count as a distinct value, so an all-NaN column
    # is reported as constant while a column mixing NaN with one value is not.
    constant_cols = [col for col in df.columns if df[col].nunique(dropna=False) <= 1]
    print("Constant-value columns:")
    print(constant_cols)

In [59]:
# Print the missing-value, low-variance and constant-column report for NYC.
data_check(nyc_df)

Missing value columns:
ammonia_min                       1.000000
ammonia_max                       1.000000
ammonia_mean                      1.000000
carbon_dioxide_min                0.768519
methane_max                       0.768519
methane_mean                      0.768519
carbon_dioxide_max                0.768519
methane_min                       0.768519
carbon_dioxide_mean               0.768519
precipitation_probability_min     0.747685
precipitation_probability_mean    0.747685
precipitation_probability_max     0.747685
dtype: float64
Low variance columns:
precipitation_min          0.000000e+00
precipitation_mean         1.930629e-04
rain_min                   0.000000e+00
rain_mean                  1.886383e-04
showers_min                0.000000e+00
showers_max                0.000000e+00
showers_mean               0.000000e+00
evapotranspiration_min     0.000000e+00
evapotranspiration_max     9.529035e-08
evapotranspiration_mean    1.641852e-09
uv_index_min            

In [61]:
# Print the missing-value, low-variance and constant-column report for Chicago.
data_check(chicago_df)

Missing value columns:
ammonia_min                       1.000000
ammonia_max                       1.000000
ammonia_mean                      1.000000
carbon_dioxide_min                0.768519
methane_max                       0.768519
methane_mean                      0.768519
carbon_dioxide_max                0.768519
methane_min                       0.768519
carbon_dioxide_mean               0.768519
precipitation_probability_min     0.747685
precipitation_probability_mean    0.747685
precipitation_probability_max     0.747685
dtype: float64
Low variance columns:
vapour_pressure_deficit_min    1.915701e-02
precipitation_min              0.000000e+00
precipitation_mean             1.081761e-04
rain_min                       0.000000e+00
rain_mean                      1.042199e-04
showers_min                    0.000000e+00
showers_max                    0.000000e+00
showers_mean                   0.000000e+00
evapotranspiration_min         0.000000e+00
evapotranspiration_max      

In [63]:
# Columns whose contents match exactly between the two city frames.
# ``Series.equals`` treats NaNs in the same positions as equal, so columns
# that are entirely NaN in both frames are reported as identical too.
identical_cols = [
    col
    for col in chicago_df.columns
    # A column that is absent from the NYC frame cannot be identical; checking
    # membership first avoids a KeyError when the two schemas differ.
    if col in nyc_df.columns and chicago_df[col].equals(nyc_df[col])
]
print("Identical columns:", identical_cols)

Identical columns: ['date', 'precipitation_min', 'rain_min', 'showers_min', 'showers_max', 'showers_mean', 'evapotranspiration_min', 'uv_index_min', 'ammonia_min', 'ammonia_max', 'ammonia_mean']


Columns to drop because they are entirely or mostly NaN, or have near-zero variance in both cities, which suggests the reading is regional or infrequently updated:
 - Ammonia
 - Carbon Dioxide
 - Methane
 - Showers
 - Evapotranspiration
 - precipitation_probability
 - various min cols that were all 0

In [68]:
cols_to_drop = [
    # Entirely missing in both cities.
    'ammonia_min', 'ammonia_max', 'ammonia_mean',
    # Roughly three quarters missing in both cities.
    'methane_min', 'methane_max', 'methane_mean',
    'carbon_dioxide_min', 'carbon_dioxide_max', 'carbon_dioxide_mean',
    'precipitation_probability_min',
    'precipitation_probability_max',
    'precipitation_probability_mean',
    # Reported as identical between the two city frames.
    'precipitation_min',
    'rain_min',
    'showers_min', 'showers_max', 'showers_mean',
    'evapotranspiration_min',
    'uv_index_min',
    # Location labels (presumably constant within each city's file).
    'city', 'state',
]

# errors='ignore' lets the same list be applied to both frames even if one of
# them lacks some of the listed columns; the raw frames are left untouched.
nyc_df_clean, chicago_df_clean = (
    raw_frame.drop(columns=cols_to_drop, errors='ignore')
    for raw_frame in (nyc_df, chicago_df)
)

In [84]:
# Correlation Matrix for NYC
numeric_cols = nyc_df_clean.select_dtypes(include='number')
corr_matrix = numeric_cols.corr()

# Keep only the cells strictly above the diagonal so each pair of columns is
# considered once and self-correlations are excluded; other cells become NaN.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))

# Collect (row label, column label, r) for every pair with |r| > 0.95.
# NaN cells fail the comparison, and np.nonzero returns positions in
# row-major order, so pairs are listed row by row.
high_corr = [
    (upper.index[i], upper.columns[j], upper.iat[i, j])
    for i, j in zip(*np.nonzero(upper.abs().gt(0.95).to_numpy()))
]

for first, second, corr_value in high_corr:
    print(f"{first} \n{second} \ncorr = {corr_value:.2f}\n")


dew_point_2m_min 
dew_point_2m_mean 
corr = 0.98

dew_point_2m_max 
dew_point_2m_mean 
corr = 0.98

dew_point_2m_mean 
temperature_2m_min 
corr = 0.95

dew_point_2m_mean 
apparent_temperature_min 
corr = 0.96

wind_speed_10m_mean 
wind_speed_80m_mean 
corr = 0.96

wind_speed_80m_mean 
wind_speed_120m_mean 
corr = 0.96

precipitation_max 
rain_max 
corr = 1.00

precipitation_mean 
rain_mean 
corr = 0.99

temperature_2m_min 
temperature_2m_mean 
corr = 0.98

temperature_2m_min 
apparent_temperature_min 
corr = 0.99

temperature_2m_min 
apparent_temperature_max 
corr = 0.95

temperature_2m_min 
apparent_temperature_mean 
corr = 0.99

temperature_2m_max 
temperature_2m_mean 
corr = 0.98

temperature_2m_max 
apparent_temperature_max 
corr = 0.99

temperature_2m_max 
apparent_temperature_mean 
corr = 0.97

temperature_2m_mean 
apparent_temperature_min 
corr = 0.98

temperature_2m_mean 
apparent_temperature_max 
corr = 0.98

temperature_2m_mean 
apparent_temperature_mean 
corr = 0.99

apparen

'\nplt.figure(figsize=(15, 12))\nsns.heatmap(corr_matrix, cmap="coolwarm", center=0, annot=False)\nplt.title("Correlation Matrix (NYC)")\nplt.show()\n'

In [86]:
# Correlation Matrix for Chicago
numeric_cols = chicago_df_clean.select_dtypes(include='number')
corr_matrix = numeric_cols.corr()

# Keep only the cells strictly above the diagonal so each pair of columns is
# considered once and self-correlations are excluded; other cells become NaN.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))

# Collect (row label, column label, r) for every pair with |r| > 0.95.
# NaN cells fail the comparison, and np.nonzero returns positions in
# row-major order, so pairs are listed row by row.
high_corr = [
    (upper.index[i], upper.columns[j], upper.iat[i, j])
    for i, j in zip(*np.nonzero(upper.abs().gt(0.95).to_numpy()))
]

for first, second, corr_value in high_corr:
    print(f"{first} \n{second} \ncorr = {corr_value:.2f}\n")


dew_point_2m_min 
dew_point_2m_mean 
corr = 0.99

dew_point_2m_min 
temperature_2m_min 
corr = 0.96

dew_point_2m_min 
apparent_temperature_min 
corr = 0.96

dew_point_2m_max 
dew_point_2m_mean 
corr = 0.98

dew_point_2m_max 
temperature_2m_min 
corr = 0.96

dew_point_2m_max 
temperature_2m_mean 
corr = 0.96

dew_point_2m_max 
apparent_temperature_min 
corr = 0.96

dew_point_2m_max 
apparent_temperature_mean 
corr = 0.97

dew_point_2m_mean 
temperature_2m_min 
corr = 0.98

dew_point_2m_mean 
temperature_2m_mean 
corr = 0.96

dew_point_2m_mean 
apparent_temperature_min 
corr = 0.98

dew_point_2m_mean 
apparent_temperature_mean 
corr = 0.97

wind_speed_80m_mean 
wind_speed_120m_mean 
corr = 0.96

precipitation_max 
rain_max 
corr = 1.00

precipitation_mean 
rain_mean 
corr = 0.99

temperature_2m_min 
temperature_2m_mean 
corr = 0.98

temperature_2m_min 
apparent_temperature_min 
corr = 0.99

temperature_2m_min 
apparent_temperature_mean 
corr = 0.98

temperature_2m_max 
temperature_2m_me