In [6]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Load dataset. index_col=0 consumes the first CSV column as the row index,
# so it does not show up as a data column.
df = pd.read_csv("train_data.csv", index_col=0)  # replace with your actual file name

# --- STEP 1: Create gentrification label based on changes from 2012 to 2023 ---

# Columns compared between the two endpoint years.
label_cols = ["income", "per_college_educated", "median_contract_rent"]

# One frame per endpoint year, keyed by zip_code.
df_2012 = df[df["year"] == 2012].set_index("zip_code")
df_2023 = df[df["year"] == 2023].set_index("zip_code")

# Side-by-side 2012/2023 values. The inner join keeps only zip codes that
# have a row in both years.
df_growth = df_2012[label_cols].add_suffix("_2012").join(
    df_2023[label_cols].add_suffix("_2023"),
    how="inner",
)

# Absolute change from 2012 to 2023.
# rent_growth is kept on the frame for inspection; the label rule below does
# not use it.
df_growth["income_growth"] = df_growth["income_2023"] - df_growth["income_2012"]
df_growth["edu_growth"] = df_growth["per_college_educated_2023"] - df_growth["per_college_educated_2012"]
df_growth["rent_growth"] = df_growth["median_contract_rent_2023"] - df_growth["median_contract_rent_2012"]

# Thresholds: bottom 30% of 2012 income (over every 2012 zip code, not just
# the joined ones) and top 30% of income / education growth.
low_income_cutoff = df_2012["income"].quantile(0.3)
high_income_growth = df_growth["income_growth"].quantile(0.7)
high_edu_growth = df_growth["edu_growth"].quantile(0.7)

# Rule-based label: started low-income AND saw high income growth AND high
# education growth.
# Every operand is taken from df_growth so all three masks share one index.
# Using df_2012["income"] here would depend on implicit index alignment with
# a differently shaped Series, which breaks when zip codes are missing from
# the join or duplicated.
df_growth["gentrified"] = (
    (df_growth["income_2012"] < low_income_cutoff)
    & (df_growth["income_growth"] > high_income_growth)
    & (df_growth["edu_growth"] > high_edu_growth)
)

# Attach the per-zip label to every yearly row. Zip codes that were dropped
# by the inner join get NaN here.
df = df.merge(df_growth["gentrified"].reset_index(), on="zip_code", how="left")

# --- STEP 2: Train Random Forest Classifier on pre-2022 data ---

# Keep rows from years before 2022 that received a label.
train_df = df[(df["year"] < 2022) & (df["gentrified"].notnull())].copy()

# Convert the True/False label to integer (0/1).
train_df["gentrified"] = train_df["gentrified"].astype(int)

# Features and target
features = ["income", "per_college_educated", "median_contract_rent"]
X = train_df[features]
y = train_df["gentrified"]

# Train/test split, stratified on the label.
# NOTE(review): the label is constant per zip_code, and this row-level random
# split can place different years of the same zip code on both sides, so the
# reported scores may be optimistic. Consider a group-aware split keyed on
# zip_code.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

# Train model
# NOTE(review): class_weight is left at its default; if positives are rare,
# recall on class 1 may be very low. Confirm against the report below.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Evaluate on the held-out rows.
print("Model Performance:\n")
print(classification_report(y_test, rf.predict(X_test)))

# --- STEP 3: Predict gentrification probability for 2022 ---

df_2022 = df[df["year"] == 2022].copy()
X_2022 = df_2022[features]

# Column 1 of predict_proba is the probability of the positive class (label 1).
df_2022["gentrification_probability"] = rf.predict_proba(X_2022)[:, 1]

# Save predictions
df_2022[["zip_code", "gentrification_probability"]].to_csv("gentrification_predictions_2022.csv", index=False)

print("\nPredictions for 2022 saved to 'gentrification_predictions_2022.csv'")

Model Performance:

              precision    recall  f1-score   support

           0       0.98      1.00      0.99     39647
           1       0.42      0.02      0.04       856

    accuracy                           0.98     40503
   macro avg       0.70      0.51      0.51     40503
weighted avg       0.97      0.98      0.97     40503


Predictions for 2022 saved to 'gentrification_predictions_2022.csv'
