In [1]:
import requests
import pandas as pd
import censusdis.data as ced
from censusdis import states
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from pycaret.regression import *

In [2]:
#variables of interest
download_variables=['NAME', 'B15003_022E', 'B25077_001E']


In [3]:
#download data
df_2022 = ced.download(
    
    # Data set: American Community Survey 5-Year
    dataset='acs/acs5',
    
    # Vintage: 2022
    vintage=2022, 
    
    # Variable: median household income
    download_variables=download_variables, 
    
    # Geography: Georgia State
    state="*",
    county="*",
    
    with_geometry=True
)

df_2022.head()


Unnamed: 0,STATE,COUNTY,NAME,B15003_022E,B25077_001E,geometry
0,1,1,"Autauga County, Alabama",6726,191800.0,"POLYGON ((-86.92120 32.65754, -86.92035 32.658..."
1,1,3,"Baldwin County, Alabama",33474,266000.0,"POLYGON ((-88.02858 30.22676, -88.02399 30.230..."
2,1,5,"Barbour County, Alabama",1167,102700.0,"POLYGON ((-85.74803 31.61918, -85.74544 31.618..."
3,1,7,"Bibb County, Alabama",1047,120100.0,"POLYGON ((-87.42194 33.00338, -87.31854 33.006..."
4,1,9,"Blount County, Alabama",3840,159800.0,"POLYGON ((-86.96336 33.85822, -86.95967 33.857..."


In [4]:
#add year column with value 2022 for the entire dataset
df_2022['year'] = 2022

In [5]:
#change variable name to more descriptive names B15003_022E = median_bachelor, B25077_001E = median_income
df_2022.rename(columns={'B15003_022E': 'median_bachelor_edu', 'B25077_001E': 'median_home_value'}, inplace=True)

In [6]:
#Get data for 2018
df_2018 = ced.download(
    
    # Data set: American Community Survey 5-Year
    dataset='acs/acs5',
    
    # Vintage: 2018
    vintage=2018, 
    
    # Variable: median household income
    download_variables=download_variables, 
    
    # Geography: Georgia State
    state="*",
    county="*",
    
    with_geometry=True
)

#add year column with value 2018 for the entire dataset
df_2018['year'] = 2018

#rename variables
df_2018.rename(columns={'B15003_022E': 'median_bachelor_edu', 'B25077_001E': 'median_home_value'}, inplace=True)

df_2018.head()


Unnamed: 0,STATE,COUNTY,NAME,median_bachelor_edu,median_home_value,geometry,year
0,20,173,"Sedgwick County, Kansas",65704,135600.0,"POLYGON ((-97.80835 37.64679, -97.80672 37.654...",2018
1,20,157,"Republic County, Kansas",494,62800.0,"POLYGON ((-97.93342 39.82744, -97.93219 39.885...",2018
2,20,65,"Graham County, Kansas",385,70100.0,"POLYGON ((-100.16357 39.24778, -100.16354 39.2...",2018
3,20,45,"Douglas County, Kansas",17876,192800.0,"POLYGON ((-95.50140 39.02951, -95.50149 39.043...",2018
4,20,179,"Sheridan County, Kansas",303,106900.0,"POLYGON ((-100.72147 39.19128, -100.72128 39.2...",2018


In [7]:
#arrange both datasets with ascending order of TRACT variable
df_2018 = df_2018.sort_values(by='COUNTY')
df_2022 = df_2022.sort_values(by='COUNTY')


In [8]:
#create change_median_bachelor_edu and change_median_home_value
df_2022['change_median_bachelor_edu'] = (df_2022['median_bachelor_edu'] - df_2018['median_bachelor_edu']) / df_2018['median_bachelor_edu']
df_2022['change_median_home_value'] = (df_2022['median_home_value'] - df_2018['median_home_value']) / df_2018['median_home_value']


In [9]:
#shift all values in change_median_bachelor_edu and change_median_home_value columns to ensure positive values

#---change_median_bachelor_edu---
# Step 1: Find the minimum value in the change_median_bachelor_edu column
min_value_bachelor_edu = df_2022['change_median_bachelor_edu'].min()

# Step 2: Adjust the minimum value by adding 1
adjusted_min_value = abs(min_value_bachelor_edu) + 1

# Step 3: Add this adjusted value to all values in the column
df_2022['change_median_bachelor_edu'] += adjusted_min_value

#---change_median_home_value---
# Step 1: Find the minimum value in the change_median_home_value column
min_value_home_value = df_2022['change_median_home_value'].min()

# Step 2: Adjust the minimum value by adding 1
adjusted_min_value_home = abs(min_value_home_value) + 1

# Step 3: Add this adjusted value to all values in the column
df_2022['change_median_home_value'] += adjusted_min_value_home



In [10]:
#scale the change_median_bachelor_edu and change_median_home_value columns by dividing by the maximum value in the column
df_2022['change_median_bachelor_edu'] = df_2022['change_median_bachelor_edu'] / df_2022['change_median_bachelor_edu'].max()
df_2022['change_median_home_value'] = df_2022['change_median_home_value'] / df_2022['change_median_home_value'].max()

In [11]:
#create new column called gentrification_index where we will multiply the two change columns
df_2022['gentrification_index'] = df_2022['change_median_bachelor_edu'] * df_2022['change_median_home_value']



In [12]:
df_2022.head()

Unnamed: 0,STATE,COUNTY,NAME,median_bachelor_edu,median_home_value,geometry,year,change_median_bachelor_edu,change_median_home_value,gentrification_index
0,1,1,"Autauga County, Alabama",6726,191800.0,"POLYGON ((-86.92120 32.65754, -86.92035 32.658...",2022,0.0,0.151139,0.0
1797,35,1,"Bernalillo County, New Mexico",93393,247300.0,"POLYGON ((-107.19676 35.21946, -107.17087 35.2...",2022,0.0,0.160789,0.0
1830,36,1,"Albany County, New York",47712,263800.0,"POLYGON ((-74.26473 42.42013, -74.25602 42.437...",2022,0.0,0.1288,0.0
1892,37,1,"Alamance County, North Carolina",21972,197700.0,"POLYGON ((-79.54192 35.88335, -79.54173 35.899...",2022,0.0,0.10622,0.0
245,8,1,"Adams County, Colorado",62602,425000.0,"POLYGON ((-105.05329 39.85676, -105.05316 39.8...",2022,0.0,0.310845,0.0


In [17]:
#check for missing values
df_2022.isnull().sum()

#drop missing values
df_2022 = df_2022.dropna()

In [18]:
df_2022.isnull().sum()

STATE                         0
COUNTY                        0
NAME                          0
median_bachelor_edu           0
median_home_value             0
geometry                      0
year                          0
change_median_bachelor_edu    0
change_median_home_value      0
gentrification_index          0
dtype: int64

In [None]:
#export the

In [311]:
# One hot encoding for the NAME column and put in 0 if false and 1 if true
df_2022 = pd.get_dummies(df_2022, columns=['NAME'])

#rename new columns by removing the NAME_ prefix
df_2022.columns = df_2022.columns.str.replace('NAME_', '')

In [313]:
#drop columns that are not needed
df_2022 = df_2022.drop(columns=['geometry', 'STATE', 'COUNTY', 'year', 'median_bachelor_edu', 'median_home_value', 'change_median_bachelor_edu', 'change_median_home_value'])

In [314]:
#split the data in training and test

X = df_2022.drop(columns=['gentrification_index'])
y = df_2022['gentrification_index']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [315]:
#initialize the models
rf = RandomForestRegressor()
lr = LinearRegression()

#fit the models
rf.fit(X_train, y_train)
lr.fit(X_train, y_train)

#predict
rf_pred = rf.predict(X_test)
lr_pred = lr.predict(X_test)

In [316]:
#Evaluate the models

#calculate the percentage error
rf_error = abs(rf_pred - y_test) / y_test
lr_error = abs(lr_pred - y_test) / y_test

#calculate the mean percentage error
rf_mean_error = rf_error.mean()
lr_mean_error = lr_error.mean()

print(f'Random Forest Mean Percentage Error: {rf_mean_error}')

print(f'Linear Regression Mean Percentage Error: {lr_mean_error}')

Random Forest Mean Percentage Error: 0.566122175328132
Linear Regression Mean Percentage Error: 32.51564780596694


In [321]:
#identify the conuty with highest gentrification index in df_2022
df_2022[df_2022['gentrification_index'] == df_2022['gentrification_index'].max()]


Unnamed: 0,gentrification_index,"Appling County, Georgia","Atkinson County, Georgia","Bacon County, Georgia","Baker County, Georgia","Baldwin County, Georgia","Banks County, Georgia","Barrow County, Georgia","Bartow County, Georgia","Ben Hill County, Georgia",...,"Washington County, Georgia","Wayne County, Georgia","Webster County, Georgia","Wheeler County, Georgia","White County, Georgia","Whitfield County, Georgia","Wilcox County, Georgia","Wilkes County, Georgia","Wilkinson County, Georgia","Worth County, Georgia"
66,0.729943,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [325]:
#create a copy of the df_2022 dataset and remove the row with highest gentrification_index
df_2022_copy = df_2022.copy()
df_2022_copy = df_2022_copy.drop(df_2022_copy[df_2022_copy['gentrification_index'] == df_2022_copy['gentrification_index'].max()].index)

#provide now the colum with th ehighest gentrification_index
df_2022_copy[df_2022_copy['gentrification_index'] == df_2022_copy['gentrification_index'].max()]

Unnamed: 0,gentrification_index,"Appling County, Georgia","Atkinson County, Georgia","Bacon County, Georgia","Baker County, Georgia","Baldwin County, Georgia","Banks County, Georgia","Barrow County, Georgia","Bartow County, Georgia","Ben Hill County, Georgia",...,"Washington County, Georgia","Wayne County, Georgia","Webster County, Georgia","Wheeler County, Georgia","White County, Georgia","Whitfield County, Georgia","Wilcox County, Georgia","Wilkes County, Georgia","Wilkinson County, Georgia","Worth County, Georgia"
75,0.219944,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
