In [None]:
# Colab-only setup: everything below runs only when the IPython shell reports
# that it is a google.colab session. Outside Colab nothing here executes, so
# `basepath` is NOT defined in that case.
if 'google.colab' in str(get_ipython()):
  from google.colab import drive
  # Mount Google Drive so files stored there are reachable under /content/drive
  drive.mount('/content/drive')

  # Install the torchinfo package quietly (-qq suppresses pip's output)

  !pip install torchinfo -qq

  # Drive folder that holds the hackathon files
  basepath = '/content/drive/MyDrive/HACKATHON_2025'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Top 3 Locations to Build Tissue Mills:
                                  Metro Area  Composite Score
  Chicago-Naperville-Elgin, IL-IN Metro Area         0.501847
Atlanta-Sandy Springs-Roswell, GA Metro Area         0.472369
                     Logan, UT-ID Metro Area         0.468306


Top 3 Locations to Build Tissue Mills:
                                  Metro Area  Composite Score
  Chicago-Naperville-Elgin, IL-IN Metro Area         0.491113
Atlanta-Sandy Springs-Roswell, GA Metro Area         0.462931
                     Logan, UT-ID Metro Area         0.450862


Missing value counts before filtering:
Labor Force Potential     0
Labor Cost Index          0
Infrastructure Score      0
Stability Factor          0
Timber Availability       0
Tissue Demand            10
dtype: int64
Top 3 Locations to Build Tissue Mills (Final Model):
                                    Metro Area  Composite Score
  Atlanta-Sandy Springs-Roswell, GA Metro Area         0.659275
Portland-Vancouver-Hillsboro, OR-WA Metro Area         0.527160
                        Jackson, MS Metro Area         0.524247


  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))


In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# ------------------------------
# Purpose
# ------------------------------
# Rank metro areas as candidate tissue-mill sites. ACS demographic tables are
# joined with timber, tissue-demand, disaster-risk and forest-fire lookups,
# a few composite features are derived, every feature is min-max scaled,
# cost/risk features are inverted, and the mean of the scaled features is the
# Composite Score. The three best-scoring metros are printed.

# ------------------------------
# Configuration
# ------------------------------
# Folder that holds every input CSV; change this one constant to relocate them.
DATA_DIR = "/content/drive/MyDrive/HACKATHON_2025"

# A row is kept only if at most this many of the final features are missing.
# NOTE(review): 9 of 14 is very permissive; tighten once the lookup merges are
# confirmed to populate the risk/fire columns.
MAX_MISSING_FEATURES = 9

# Disaster-risk columns taken from the water-risk lookup file.
risk_cols = ["WNTW_RISKV", "WNTW_EALS", "WNTW_ALRB", "WNTW_ALRP", "WNTW_ALRA", "WNTW_ALR_NPCTL"]

# ------------------------------
# Load ACS and other datasets
# ------------------------------
dp03 = pd.read_csv(f"{DATA_DIR}/ACSDP1Y2023.DP03-Data.csv")
dp02 = pd.read_csv(f"{DATA_DIR}/ACSDP1Y2023.DP02-Data.csv")
dp05 = pd.read_csv(f"{DATA_DIR}/ACSDP1Y2023.DP05-Data.csv")
timber_df = pd.read_csv(f"{DATA_DIR}/TimberdataUS.csv")
tissue_df = pd.read_csv(f"{DATA_DIR}/TissueConsumptionUS.csv")
disaster_df = pd.read_csv(f"{DATA_DIR}/water_risk_with_msa_geo_id.csv")
fire_df = pd.read_csv(f"{DATA_DIR}/ForestFires20-24.csv")

# Filter forest fire data to 2023 only and collapse it to one row per GEO_ID.
# The key is cast to str so it has the same dtype as the ACS GEO_ID it is
# merged against further down.
fire_2023 = fire_df[fire_df["Year"] == 2023]
fire_2023_agg = (
    fire_2023
    .dropna(subset=["GEO_ID"])
    .assign(GEO_ID=lambda frame: frame["GEO_ID"].astype(str))
    .groupby("GEO_ID")
    .agg({
        "Fires": "sum",
        "Acres Burned": "sum",
        "Forest Acres Est.": "mean"
    })
    .reset_index()
)
fire_2023_agg.columns = ["GEO_ID", "Fires (2023)", "Acres Burned (2023)", "Avg Forest Acres (2023)"]

# ------------------------------
# ACS column selections (friendly name -> ACS column code)
# ------------------------------
dp03_cols = {
    # GEO_ID is carried along so the disaster and fire lookups can be joined
    # on an identifier instead of the metro display name.
    # assumes the DP03 export has a GEO_ID column — TODO confirm
    "GEO_ID": "GEO_ID",
    "Metro Area": "NAME",
    "Employed %": "DP03_0004PE",
    "Unemployment Rate %": "DP03_0009PE",
    "Employed Population": "DP03_0026E",
    "Median Household Income": "DP03_0062E",
    "Median Earnings for Workers": "DP03_0092E",
    "% Manufacturing": "DP03_0035PE",
    "% Forestry and Mining": "DP03_0033PE",
    "% Transportation & Utilities": "DP03_0038PE",
    "Mean Travel Time to Work": "DP03_0025E",
    "Poverty Rate %": "DP03_0119PE"
}

dp02_cols = {
    "High School Graduate %": "DP02_0067PE",
    "Married-couple Household %": "DP02_0002PE",
    "Civilian Veterans %": "DP02_0070PE",
    "Non-English Language %": "DP02_0114PE",
    "Broadband Subscription %": "DP02_0154PE",
    # Selected here (rather than assigned onto the merged frame afterwards) so
    # they are joined by metro name and cannot be misaligned by row position.
    "Recent Movers %": "DP02_0086PE",
    "Computer Access %": "DP02_0153PE"
}

dp05_cols = {
    "Total Population": "DP05_0001E",
    "Population Margin of Error": "DP05_0001M",
    "Population Age Insight": "DP05_0021E"
}

# ------------------------------
# Process ACS files
# ------------------------------
df_dp03 = dp03[list(dp03_cols.values())].copy()
df_dp03.columns = list(dp03_cols.keys())

df_dp02 = dp02[list(dp02_cols.values()) + ["NAME"]].copy()
df_dp02.columns = list(dp02_cols.keys()) + ["Metro Area"]

df_dp05 = dp05[list(dp05_cols.values()) + ["NAME"]].copy()
df_dp05.columns = list(dp05_cols.keys()) + ["Metro Area"]

# Merge base demographic data
df = pd.merge(df_dp03, df_dp02, on="Metro Area")
df = pd.merge(df, df_dp05, on="Metro Area")

# Convert every non-identifier column to numeric (unparseable values become
# NaN) and drop rows with any missing value. GEO_ID and Metro Area are text
# keys and must not be coerced.
id_cols = ["GEO_ID", "Metro Area"]
numeric_cols = df.columns.drop(id_cols)
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors="coerce")
df = df.dropna()

# ------------------------------
# Merge Timber availability
# ------------------------------
# NOTE(review): drop_duplicates only removes exact repeats; an MSA listed with
# several different capacities would still duplicate that metro's row.
timber_df = timber_df[["MSA_Name", "Capacity_BBF"]].drop_duplicates()
timber_df.columns = ["Metro Area", "Timber Availability"]
df = pd.merge(df, timber_df, on="Metro Area", how="left")

# ------------------------------
# Merge Tissue demand
# ------------------------------
# Region is the first two-letter upper-case code that follows a comma in the
# metro name.
df["Region"] = df["Metro Area"].str.extract(r',\s*([A-Z]{2})')
# Keep only the columns that are needed before renaming them.
tissue_df = tissue_df[["Region", "Consumption_Metric_Tons"]].copy()
tissue_df.columns = ["Region", "Tissue Demand"]
df = pd.merge(df, tissue_df, on="Region", how="left")

# ------------------------------
# Merge Disaster risk data by GEO_ID
# ------------------------------
# The lookup is keyed by GEO_ID, so it is joined on df's GEO_ID (a join against
# the metro display name can never match). The risk columns are averaged per
# GEO_ID first so a lookup with several rows per metro cannot duplicate rows;
# with one row per GEO_ID the mean is a no-op.
# assumes the lookup's GEO_ID uses the same format as the ACS GEO_ID — TODO confirm
disaster_risk = disaster_df[["GEO_ID"] + risk_cols].copy()
disaster_risk["GEO_ID"] = disaster_risk["GEO_ID"].astype(str)
disaster_risk[risk_cols] = disaster_risk[risk_cols].apply(pd.to_numeric, errors="coerce")
disaster_risk = disaster_risk.groupby("GEO_ID", as_index=False)[risk_cols].mean()
df = pd.merge(df, disaster_risk, on="GEO_ID", how="left")

# Merge Forest Fire metrics (2023 only)
df = pd.merge(df, fire_2023_agg, on="GEO_ID", how="left")

# ------------------------------
# Create Composite Feature Metrics
# ------------------------------
max_income = df["Median Household Income"].max()

df["Labor Force Potential"] = (
    df["Employed Population"] *
    df["High School Graduate %"] *
    df["Civilian Veterans %"]
)

# ACS percent estimates are on a 0-100 scale. They are converted to fractions
# here so the three terms share the 0-1 range of the income ratio; otherwise
# the income term is swamped by the percentages.
df["Labor Cost Index"] = (
    (1 - df["Median Household Income"] / max_income) +
    df["Poverty Rate %"] / 100 +
    df["Non-English Language %"] / 100
)

df["Infrastructure Score"] = (
    df["Computer Access %"] +
    df["Broadband Subscription %"]
)

# Share of non-movers must be computed from a fraction: `1 - percent` on a
# 0-100 value is negative and flips the ordering of this feature.
df["Stability Factor"] = (
    df["Married-couple Household %"] *
    (1 - df["Recent Movers %"] / 100)
)

# ------------------------------
# Final Features for Composite Score
# ------------------------------
final_features = [
    "Labor Force Potential", "Labor Cost Index", "Infrastructure Score", "Stability Factor",
    "Timber Availability", "Tissue Demand",
    "WNTW_RISKV", "WNTW_EALS", "WNTW_ALRB", "WNTW_ALRP", "WNTW_ALRA", "WNTW_ALR_NPCTL",
    "Fires (2023)", "Acres Burned (2023)"
]

# Drop rows with more than MAX_MISSING_FEATURES missing values in final features
missing_per_row = df[final_features].isnull().sum(axis=1)
df = df[missing_per_row <= MAX_MISSING_FEATURES]

# ------------------------------
# Normalize and Score
# ------------------------------
# MinMaxScaler ignores NaN when fitting and leaves it in place on transform,
# so remaining gaps survive into df_scaled and are skipped by mean() below.
scaler = MinMaxScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df[final_features]), columns=final_features)
df_scaled["Metro Area"] = df["Metro Area"].values

# Invert cost/risk features so that a higher scaled value is always better
invert = ["Labor Cost Index", "WNTW_RISKV", "WNTW_EALS", "WNTW_ALRB", "WNTW_ALRP",
          "WNTW_ALRA", "WNTW_ALR_NPCTL", "Fires (2023)", "Acres Burned (2023)"]
df_scaled[invert] = 1 - df_scaled[invert]

# Composite Score: unweighted average of the normalized features (NaN skipped)
df_scaled["Composite Score"] = df_scaled[final_features].mean(axis=1)

# ------------------------------
# Top 3 Results
# ------------------------------
top_3 = df_scaled.sort_values(by="Composite Score", ascending=False)[["Metro Area", "Composite Score"]].head(3)

print("Top 3 Recommended Tissue Mill Locations:")
print(top_3.to_string(index=False))

Top 3 Recommended Tissue Mill Locations:
                                    Metro Area  Composite Score
  Atlanta-Sandy Springs-Roswell, GA Metro Area         0.659275
Portland-Vancouver-Hillsboro, OR-WA Metro Area         0.526327
                        Jackson, MS Metro Area         0.524247


  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))


Top 3 Recommended Tissue Mill Locations:
                                    Metro Area  Composite Score
  Atlanta-Sandy Springs-Roswell, GA Metro Area         0.630342
Portland-Vancouver-Hillsboro, OR-WA Metro Area         0.466921
Portland-Vancouver-Hillsboro, OR-WA Metro Area         0.445276


  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))
