In [1]:
import pandas as pd
import numpy as np

# Load the pre-filtered parcel table that the rest of this cell enriches.
input_file = 'dsci_filter_log_4sigma.csv'
df = pd.read_csv(input_file)

# House age relative to 2016 (presumably the reference year of the data —
# TODO confirm). Build years after 2016 are floored to an age of 0 instead of
# going negative; missing build years stay missing.
df['house_age'] = (2016 - df['yearbuilt']).clip(lower=0)

# Ratio features. Bedroom / room counts that are not strictly positive are
# replaced by 1 in the denominator so the divisions never hit zero.
# NOTE(review): a missing (NaN) count also fails the `> 0` test and therefore
# falls back to a divisor of 1 as well — confirm that is intended.
bedrooms = df['bedroomcnt']
living_area = df['finishedsquarefeet12']
bed_divisor = np.where(bedrooms > 0, bedrooms, 1)

df['area_per_bed'] = living_area / bed_divisor
df['bath_per_bed'] = df['bathroomcnt'] / bed_divisor

# total_rooms is only a helper column; this cell drops it again before export.
df['total_rooms'] = bedrooms + df['bathroomcnt']
room_divisor = np.where(df['total_rooms'] > 0, df['total_rooms'], 1)
df['area_per_room'] = living_area / room_divisor

# Living area weighted by (13 - quality id): lower ids receive the larger
# weight. Presumably the id scale runs 1..12 — TODO confirm.
df['area_x_quality'] = living_area * (13 - df['buildingqualitytypeid'])

# Per-ZIP medians, broadcast back onto every parcel that shares the ZIP code.
# NOTE(review): zip_median_value is derived from taxvaluedollarcnt over the
# whole table; if that column is the modelling target this leaks it into the
# features — confirm, and compute it on the training split only if so.
zip_aggregations = {
    'zip_median_value': ('taxvaluedollarcnt', 'median'),
    'zip_median_area': ('finishedsquarefeet12', 'median'),
    'zip_median_year': ('yearbuilt', 'median'),
}
zip_stats = df.groupby('ZIP', as_index=False).agg(**zip_aggregations)

# Left join keeps every parcel row; zip_stats holds one row per ZIP, so the
# merge cannot multiply rows.
df = df.merge(zip_stats, how='left', on='ZIP')

# Population density per unit of SQMI. Areas that are zero, negative or missing
# are masked to NaN first so the division yields NaN instead of +/-inf, in line
# with the guarded denominators used for the other ratio features in this cell.
valid_sqmi = df['SQMI'].where(df['SQMI'] > 0)
df['pop_density'] = df['POPULATION'] / valid_sqmi

# 1 when SEA_DIST is below 5280 (one mile if the distance is stored in feet —
# TODO confirm the unit); missing distances compare False and become 0.
df['coastal_flag'] = (df['SEA_DIST'] < 5280).astype(int)

# Binary indicators for two land-use codes; any other (or missing) code gives 0
# in both columns. Presumably 261 = single-family residential and
# 266 = condominium — verify against the data dictionary.
df['type_SFR'] = (df['propertylandusetypeid'] == 261.0).astype(int)
df['type_Condo'] = (df['propertylandusetypeid'] == 266.0).astype(int)

# The raw land-use code and the total_rooms helper column are no longer needed.
df = df.drop(columns=['propertylandusetypeid', 'total_rooms'])

# Column groups that make up the exported feature table.

# Environmental / census-style columns taken as-is from the input table.
env_cols = [
    'PM2_5',
    'Drinking_Water',
    'Pesticides',
    'Traffic',
    'Groundwater_Threats',
    'Imp__Water_Bodies',
    'Solid_Waste',
    'Pollution_Burden',
    'POPULATION',
    'SQMI',
]

# Parcel attributes from the input table plus the two land-use indicators.
structural_base_cols = [
    'finishedsquarefeet12',
    'bedroomcnt',
    'bathroomcnt',
    'buildingqualitytypeid',
    'lotsizesquarefeet',
    'latitude',
    'longitude',
    'SEA_DIST',
    'ratio',
    'yearbuilt',
    'type_SFR',
    'type_Condo',
]

# Row-level features engineered earlier in this cell.
new_structural_cols = [
    'house_age',
    'area_per_bed',
    'bath_per_bed',
    'area_per_room',
    'area_x_quality',
    'coastal_flag',
    'pop_density',
]

# ZIP-level aggregates merged back onto each row.
new_group_cols = [
    'zip_median_value',
    'zip_median_area',
    'zip_median_year',
]

# Final column order: taxvaluedollarcnt first, then each group in turn.
features = [
    'taxvaluedollarcnt',
    *structural_base_cols,
    *env_cols,
    *new_structural_cols,
    *new_group_cols,
]

# Persist only the selected columns, in the order given by `features`,
# without writing the DataFrame index.
output_file = 'feature_enhanced.csv'
export_frame = df.loc[:, features]
export_frame.to_csv(output_file, index=False)
print(f"Saved at: {output_file}")

Saved at: feature_enhanced.csv
