### 90803 Final Project
# Classification of cities into Growth Categories

**Team 14**

Chi-Shiun Tsai & Colton Lapp

### Import Modules

In [54]:
import os
import numpy as np
import pandas as pd
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import math
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import RobustScaler
import geopandas as gpd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_iris
import warnings

# Allow up to 500 columns to be shown when a wide DataFrame is displayed
pd.set_option('display.max_columns', 500)

#### Set Seed

In [15]:
# Single random seed passed as random_state to the split and the models below
seed = 0

### Read in Data

In [8]:
#Set Base Dir: the parent of the current working directory.
# os.path is used instead of splitting on '/' so the path also resolves on
# platforms whose separator is not '/' (e.g. Windows).
base_dir = os.path.dirname(os.getcwd())
data_path = os.path.join(base_dir, 'Team14_CL_CT_MG', 'data', 'data_all.csv')

# low_memory=False makes pandas read the file in one pass rather than in chunks
df = pd.read_csv(data_path, low_memory=False)
df.head()

Unnamed: 0,Name,Population 2020,Median Household Income,Median Home Value,Median Gross Rent,Renter Household,Total Housing Units,Race: White Alone,Hispanic or Latino,Population Age 25 or Older: Bachelor`s Degree,Population Age 25 or Older: Doctorate Degree,Pop 5-17 Speak only English at Home,Population Age 15 or Older Never Married,Population Age 15 or Older Never Married 2,Population Age 65 or more,Pop 65 or more Living in Nonfamily Households,Females 20-64 w Own Children 0-17,Workers Age 16 or more,Workers 16 or more who Took Public Transportation to work,Workers 16 or more who Worked at Home,Workers 16 or more who Drove Alone to Work,Employed in Professional/Scientific/Technical Services,Employed in Accommodation/Food Services,Employed in Health Care/Social Assistance,Employed in Arts/Entertainment/Recreation,Male 16+: Private For-Profit Self-Emp Inc Bus,Male 16+: Private Not-For-Profit Worker,Civilian Population 18+: Veteran,Population 35-64: Medicaid Coverage Only,Population 35-64: No Health Insurance Coverage,Households with No Internet Acces,Households Receiving Food Stamps/SNAP,Households Below the Poverty Leve,Gross Rent 50% + of Household Income,Owner Households by Year Moved In: 2017 or Later,Total Vacant Housing Units,Housing Units Built in 2014 or Later,state,GEOID,Population 2019,Pop Chng %,land_area,water_area,cases_per_100K,deaths_per_100K
0,"New York city, New York",8379552,67046.0,635200.0,1489.0,2143429.0,42.002186,41.330205,28.925997,16.000569,1.125072,7.664813,17.610035,18.069295,14.947255,4.979001,1.908419,46.830821,24.736883,3.465794,10.448387,4.848242,3.531358,8.708186,1.403011,1.143271,1.902047,1.705306,8.383276,3.427486,5.030627,7.244218,6.55683,6.763679,0.208245,3.913145,0.719466,36,3651000,8419316,-0.472295,777982727,434642000,5913.616647,220.640533
1,"Los Angeles city, California",3973278,65290.0,670700.0,1523.0,884176.0,38.099297,48.93048,48.066307,16.225771,1.007329,5.68367,19.986897,17.982834,12.855557,3.889962,1.788574,49.252959,4.056122,4.517605,33.341664,4.703748,4.288298,6.098239,2.086111,1.660996,1.278365,2.068393,7.988391,5.480588,3.824172,3.640193,5.838303,6.734288,0.256715,2.800433,0.887101,6,644000,3966936,0.159871,1214591312,87468461,6178.068445,67.066531
2,"Chicago city, Illinois",2699347,62097.0,267600.0,1154.0,590848.0,45.110392,47.731581,28.628813,16.758127,1.200438,8.816058,20.245897,19.983537,12.676177,4.820647,1.80025,49.38261,12.91905,4.19457,23.775417,6.130186,4.004228,7.006176,1.199698,1.064406,2.110918,2.475895,5.359926,4.764597,5.445095,7.012289,6.614933,5.035996,0.455777,5.058372,0.868655,17,1714000,2709534,-0.375969,588881854,17621529,7499.869841,133.847542
3,"Houston city, Texas",2313238,53600.0,186800.0,1086.0,499608.0,42.481318,51.539271,44.471948,13.576078,1.126473,8.04647,17.088946,15.180669,10.985813,3.587785,2.314159,47.686144,1.748501,2.939646,36.20704,4.086998,3.904484,5.408134,0.727076,0.958916,1.097639,2.983826,2.628869,10.073672,5.207333,5.392571,6.563527,5.208932,0.410939,4.663031,2.628869,48,4835000,2310432,0.121449,1658093679,81248080,4991.616106,57.677759
4,"Phoenix city, Arizona",1658422,60914.0,250800.0,1100.0,258077.0,37.805637,68.233176,42.647046,11.954557,0.754512,11.002447,16.492967,14.118542,10.974408,3.490125,2.569792,48.881527,1.325778,4.060667,35.624769,3.587989,4.214126,6.030914,0.925096,1.144582,1.225804,4.415402,5.078864,6.408743,3.93989,4.255793,4.791905,3.29102,0.706636,2.782283,1.114433,4,455000,1633017,1.555709,1340766401,3236736,7307.164034,116.139965


## Set Up Data

### Adjust Row Names
Note: If we want to include state effects, we could keep the `state` column and create boolean (one-hot) indicator columns out of it.

In [9]:
#Set row label equal to the city name.
# Guarded so that re-running this cell does not raise a KeyError once 'Name'
# has already been moved into the index.
if 'Name' in df.columns:
    df = df.set_index('Name')

#Drop GEOID and state columns.
# errors='ignore' keeps the cell re-runnable after the columns are already gone.
df = df.drop(columns=['GEOID', 'state'], errors='ignore')
df.head(1)

Unnamed: 0_level_0,Population 2020,Median Household Income,Median Home Value,Median Gross Rent,Renter Household,Total Housing Units,Race: White Alone,Hispanic or Latino,Population Age 25 or Older: Bachelor`s Degree,Population Age 25 or Older: Doctorate Degree,Pop 5-17 Speak only English at Home,Population Age 15 or Older Never Married,Population Age 15 or Older Never Married 2,Population Age 65 or more,Pop 65 or more Living in Nonfamily Households,Females 20-64 w Own Children 0-17,Workers Age 16 or more,Workers 16 or more who Took Public Transportation to work,Workers 16 or more who Worked at Home,Workers 16 or more who Drove Alone to Work,Employed in Professional/Scientific/Technical Services,Employed in Accommodation/Food Services,Employed in Health Care/Social Assistance,Employed in Arts/Entertainment/Recreation,Male 16+: Private For-Profit Self-Emp Inc Bus,Male 16+: Private Not-For-Profit Worker,Civilian Population 18+: Veteran,Population 35-64: Medicaid Coverage Only,Population 35-64: No Health Insurance Coverage,Households with No Internet Acces,Households Receiving Food Stamps/SNAP,Households Below the Poverty Leve,Gross Rent 50% + of Household Income,Owner Households by Year Moved In: 2017 or Later,Total Vacant Housing Units,Housing Units Built in 2014 or Later,state,Population 2019,Pop Chng %,land_area,water_area,cases_per_100K,deaths_per_100K
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1
"New York city, New York",8379552,67046.0,635200.0,1489.0,2143429.0,42.002186,41.330205,28.925997,16.000569,1.125072,7.664813,17.610035,18.069295,14.947255,4.979001,1.908419,46.830821,24.736883,3.465794,10.448387,4.848242,3.531358,8.708186,1.403011,1.143271,1.902047,1.705306,8.383276,3.427486,5.030627,7.244218,6.55683,6.763679,0.208245,3.913145,0.719466,36,8419316,-0.472295,777982727,434642000,5913.616647,220.640533


### Create Boolean Target Variable

In [14]:
# label a single row from its 'Pop Chng %' value
def growth_function(row):
    """Return 1 if the row's 'Pop Chng %' is strictly positive, otherwise 0.

    `row` is anything indexable by the key 'Pop Chng %' (e.g. a DataFrame row).
    Zero, negative and NaN values all map to 0.
    """
    return 1 if row['Pop Chng %'] > 0 else 0

# create new 'Growing' column with a vectorized comparison rather than a
# row-by-row apply: 1 where 'Pop Chng %' is strictly positive, else 0.
# NaN compares as False, so missing values are labelled 0, matching growth_function.
df['Growing'] = (df['Pop Chng %'] > 0).astype('int64')
df[['Pop Chng %', 'Growing']].head(5)

Unnamed: 0_level_0,Pop Chng %,Growing
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
"New York city, New York",-0.472295,0
"Los Angeles city, California",0.159871,1
"Chicago city, Illinois",-0.375969,0
"Houston city, Texas",0.121449,1
"Phoenix city, Arizona",1.555709,1


## Train Test Split
Should we keep Population as a feature? I could see arguments for and against

In [20]:
# Target is kept as a one-column DataFrame
y = df[['Growing']]

# Features: everything except the target, the change column the target is
# computed from, and 'Population 2019'
non_feature_cols = ['Growing', 'Pop Chng %', 'Population 2019']
X = df.drop(columns=non_feature_cols)

# Hold out 25% of the rows for testing; the split is fixed by the seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=seed,
)
X_train.head(3)

Unnamed: 0_level_0,Population 2020,Median Household Income,Median Home Value,Median Gross Rent,Renter Household,Total Housing Units,Race: White Alone,Hispanic or Latino,Population Age 25 or Older: Bachelor`s Degree,Population Age 25 or Older: Doctorate Degree,Pop 5-17 Speak only English at Home,Population Age 15 or Older Never Married,Population Age 15 or Older Never Married 2,Population Age 65 or more,Pop 65 or more Living in Nonfamily Households,Females 20-64 w Own Children 0-17,Workers Age 16 or more,Workers 16 or more who Took Public Transportation to work,Workers 16 or more who Worked at Home,Workers 16 or more who Drove Alone to Work,Employed in Professional/Scientific/Technical Services,Employed in Accommodation/Food Services,Employed in Health Care/Social Assistance,Employed in Arts/Entertainment/Recreation,Male 16+: Private For-Profit Self-Emp Inc Bus,Male 16+: Private Not-For-Profit Worker,Civilian Population 18+: Veteran,Population 35-64: Medicaid Coverage Only,Population 35-64: No Health Insurance Coverage,Households with No Internet Acces,Households Receiving Food Stamps/SNAP,Households Below the Poverty Leve,Gross Rent 50% + of Household Income,Owner Households by Year Moved In: 2017 or Later,Total Vacant Housing Units,Housing Units Built in 2014 or Later,state,land_area,water_area,cases_per_100K,deaths_per_100K
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1
"Deltona city, Florida",91847,56760.0,170000.0,1189.0,6883.0,36.063236,70.350692,36.911385,8.532668,0.124119,14.470805,13.63572,11.849053,16.122465,3.712696,1.709364,43.882762,0.107788,2.962536,36.378978,1.976112,3.192265,6.445502,1.210709,1.009287,0.765403,7.142313,3.340338,5.348024,3.353403,6.073143,3.72685,1.959781,0.687012,2.763291,0.182913,12,96537108,9151400,4081.800819,82.21832
"Encampment town, Wyoming",432,43170.0,184000.0,1031.0,63.0,78.703704,100.0,0.0,14.351852,1.62037,24.305556,11.342593,7.87037,18.287037,9.722222,0.462963,46.296296,0.0,6.25,34.259259,2.083333,2.546296,9.490741,2.314815,0.0,1.157407,7.87037,0.0,1.851852,9.722222,0.0,3.703704,0.0,4.861111,28.472222,3.009259,56,4136211,0,6276.122869,99.51569
"Lamberton city, Minnesota",831,50357.0,64100.0,600.0,79.0,51.98556,93.742479,3.008424,6.859206,0.240674,17.689531,11.79302,7.220217,25.631769,9.386282,2.647413,42.839952,0.0,2.166065,31.768953,0.481348,0.0,8.182912,3.249097,0.722022,1.925391,6.979543,2.647413,0.601685,10.348977,1.805054,3.850782,0.240674,1.684717,10.348977,1.083032,27,1979346,0,8609.227934,171.131442


## Check for Class Imbalances

In [22]:
# Count how many training rows fall into each 'Growing' class
y_train.value_counts()

Growing
1          7732
0          6889
dtype: int64

**Results: Looks pretty balanced!**


# Scale Datasets

In [32]:
# Fit the RobustScaler on the training features only
scaler = RobustScaler()
scaler.fit(X_train)

# Apply the fitted scaler to both splits and rebuild DataFrames that keep
# the original column names and row index
X_train_RS = pd.DataFrame(
    scaler.transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
X_test_RS = pd.DataFrame(
    scaler.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

X_train_RS.head(3)

Unnamed: 0_level_0,Population 2020,Median Household Income,Median Home Value,Median Gross Rent,Renter Household,Total Housing Units,Race: White Alone,Hispanic or Latino,Population Age 25 or Older: Bachelor`s Degree,Population Age 25 or Older: Doctorate Degree,Pop 5-17 Speak only English at Home,Population Age 15 or Older Never Married,Population Age 15 or Older Never Married 2,Population Age 65 or more,Pop 65 or more Living in Nonfamily Households,Females 20-64 w Own Children 0-17,Workers Age 16 or more,Workers 16 or more who Took Public Transportation to work,Workers 16 or more who Worked at Home,Workers 16 or more who Drove Alone to Work,Employed in Professional/Scientific/Technical Services,Employed in Accommodation/Food Services,Employed in Health Care/Social Assistance,Employed in Arts/Entertainment/Recreation,Male 16+: Private For-Profit Self-Emp Inc Bus,Male 16+: Private Not-For-Profit Worker,Civilian Population 18+: Veteran,Population 35-64: Medicaid Coverage Only,Population 35-64: No Health Insurance Coverage,Households with No Internet Acces,Households Receiving Food Stamps/SNAP,Households Below the Poverty Leve,Gross Rent 50% + of Household Income,Owner Households by Year Moved In: 2017 or Later,Total Vacant Housing Units,Housing Units Built in 2014 or Later,state,land_area,water_area,cases_per_100K,deaths_per_100K
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1
"Deltona city, Florida",20.223561,0.23049,0.573267,1.347826,11.584906,-0.797428,-1.072642,3.836635,0.026752,-0.002509,-0.05503,0.228548,0.338377,-0.157014,-0.425781,-0.102909,-0.106138,0.414025,0.473271,0.049265,0.3414,0.238139,0.065091,0.748613,0.287473,-0.053789,0.344711,0.113736,0.518272,-0.500498,0.294229,-0.25644,0.124794,0.542316,-0.37982,-0.066037,-0.708333,9.03334,67.11971,-0.899299,-0.24008
"Encampment town, Wyoming",-0.172468,-0.305965,0.711881,0.889855,-0.113208,2.583286,0.488792,-0.391156,0.771238,2.339123,1.312698,-0.145871,-0.354898,0.075197,0.941565,-0.785334,0.105449,0.0,1.716299,-0.148573,0.388896,-0.001828,0.856183,1.796509,-0.467118,0.241875,0.53537,-0.720882,-0.32741,0.613236,-0.828715,-0.261054,-0.605695,5.989515,3.223304,2.331647,1.125,0.01498,-0.056882,-0.146699,-0.033962
"Lamberton city, Minnesota",-0.083445,-0.022263,-0.475248,-0.35942,-0.085763,0.46496,0.15925,-0.046574,-0.187345,0.179898,0.392601,-0.072326,-0.468186,0.863125,0.865129,0.410687,-0.197559,0.0,0.172117,-0.381,-0.320731,-0.947736,0.516435,2.683229,0.072699,0.821116,0.302086,-0.059398,-0.629811,0.722838,-0.494954,-0.231741,-0.515986,1.844321,0.683319,0.697564,-0.083333,-0.19553,-0.056882,0.6535,0.81942


# Baseline Classification Using Logistic Regression

In [46]:
# Baseline: logistic regression with default settings, scored by
# 10-fold cross-validated F1 on the scaled training data
LR_baseline = LogisticRegression(random_state=seed)
CVs_LR_baseline = cross_val_score(
    estimator=LR_baseline,
    X=X_train_RS,
    y=y_train.squeeze(),
    scoring='f1',
    cv=10,
)

print("Mean CV F1 Score: ", CVs_LR_baseline.mean())
print("Std Dev CV F1 Score: ", CVs_LR_baseline.std())

Mean CV F1 Score:  0.5409586220298395
Std Dev CV F1 Score:  0.04316028366748058


## Tune Parameters of Logistic Regression 
### Use GridSearch CV

In [56]:
#Define new LogReg
LR_GS = LogisticRegression(random_state=seed)

# Parameter grid for the search, split by penalty so that every candidate is a
# combination the solver actually supports:
#   - 'l1' is supported by 'liblinear' but not by 'newton-cg' or 'lbfgs'
#   - penalty=None is not supported by 'liblinear', and C has no effect
#     when there is no penalty, so it is not swept there
param_grid = [
    {
        'penalty': ['l1'],
        'C': [0.1, 1, 10],
        'solver': ['liblinear'],
        'max_iter': [100, 300],
    },
    {
        'penalty': [None],
        'solver': ['newton-cg', 'lbfgs'],
        'max_iter': [100, 300],
    },
]

#GridSearchCV object
# scoring='f1' makes the search select on the same metric that is reported
# below (the default would select on accuracy)
GS = GridSearchCV(estimator=LR_GS, param_grid=param_grid, cv=5, scoring='f1')

# Silence warnings only while fitting; catch_warnings restores whatever
# filters were active before, instead of forcing them to 'default'
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    GS.fit(X_train_RS, y_train.squeeze())

# Warnings were silenced above, so report explicitly if any candidate
# could not be fitted (failed fits are scored as NaN)
n_failed = int(np.isnan(GS.cv_results_['mean_test_score']).sum())
if n_failed:
    print("Candidates that failed to fit:", n_failed)

#Print Optimal Parameters
best_params = GS.best_params_
print("Optimal parameters:",best_params)

#Set Best Parameters
LR_GS.set_params(**best_params)

#Get CV Score
CVs_LR_GS = cross_val_score(LR_GS, X_train_RS, y_train.squeeze(), cv=10, scoring = 'f1')
print("Mean CV F1 Score: ", np.mean(CVs_LR_GS))
print("Std Dev CV F1 Score: ", np.std(CVs_LR_GS))

Optimal parameters: {'C': 0.1, 'max_iter': 100, 'penalty': 'l1', 'solver': 'liblinear'}
Mean CV F1 Score:  0.6417809055741956
Std Dev CV F1 Score:  0.016309604491181393


# References:
- https://stackoverflow.com/questions/26886653/create-new-column-based-on-values-from-other-columns-apply-a-function-of-multi
- https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
- https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
- https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.set_index.html
- https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_score.html
- https://stackoverflow.com/questions/75220289/difference-between-gridsearch-best-estimator-and-best-params

