In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('../SimplyAnalytics Data/Data_by_year_cleaned.csv', header=0)

In [3]:
data.head(100)

Unnamed: 0,Id,neighborhood,larger_neighborhood,Name,Population,Median Household Income,Median Home Value,Percent Bachelors Degree,Median Rent,Percent Public Transportation to Work,Percent White Non-Hispanic,Housing Units,Percent Renter Occupied,Percent Occupied Structure w 3-4 Units,Percent Occupied Structure w 5+ Units,Percent Households w 3 Vehicles,Percent Households w 4+ Vehicles,"Percent Families, Married w Children Under 18",Total Households,Year
0,530330001001,Cedar Park,Lake City,BG0001001,1250,141810,660354,24.63,0,6.78,77.52,565,0.00,0.00,0.00,8.71,1.70,33.33,1663,2010
1,530330001002,Cedar Park,Lake City,BG0001002,1234,33316,338235,19.87,771,16.55,45.79,658,83.42,0.00,78.27,8.80,1.52,15.88,1573,2010
2,530330001003,Cedar Park,Lake City,BG0001003,1337,48703,417188,26.48,864,16.52,58.19,744,72.79,6.05,61.56,8.83,1.45,27.75,1416,2010
3,530330001004,Cedar Park,Lake City,BG0001004,1492,29392,0,12.81,694,32.69,35.66,838,100.00,3.70,89.50,8.84,1.56,12.74,1319,2010
4,530330001005,Cedar Park,Lake City,BG0001005,942,48882,415385,26.60,863,16.56,56.05,638,72.84,5.96,61.44,8.64,1.59,27.49,1293,2010
5,530330002001,Olympic Hills,Lake City,BG0002001,1086,61218,361151,26.42,1173,16.00,61.33,455,39.39,10.11,4.18,7.69,3.26,41.72,1277,2010
6,530330002002,Victory Heights,Lake City,BG0002002,1360,57863,354698,33.24,813,19.25,55.15,590,60.33,0.00,32.88,7.79,3.26,29.00,1212,2010
7,530330002003,Pinehurst,Northgate,BG0002003,1509,48071,334967,27.35,875,20.53,60.24,826,69.40,0.00,76.39,7.68,3.59,13.90,1204,2010
8,530330002004,Pinehurst,Northgate,BG0002004,1053,69413,330061,25.31,877,15.75,66.57,576,25.32,5.90,44.27,7.76,3.33,12.13,1187,2010
9,530330002005,Pinehurst,Northgate,BG0002005,1324,49187,349065,30.88,738,17.89,65.63,698,47.53,8.02,52.29,7.77,3.29,26.39,1131,2010


#### Get census tract Ids

In [4]:
def compute_tract_id(row):
    tid = row['Id']
    tract_id = np.int(tid/100)
    return tract_id 

data['tract_id'] = data.apply(compute_tract_id, axis = 1)

#### Missing values treatment

In [6]:
def add_col_for_imputing_missing_values_in_median_houseprice(row):    
    y = row['Year']
    tract = row['tract_id']
    if row['Median Home Value'] == 0:
        imputed_value = compute_missing_value_in_median_houseprice(y,tract)
        return imputed_value
    else:
        return row['Median Home Value']


def compute_missing_value_in_median_houseprice(y,tract):
    median_houseprice_array =np.array(data.loc[(data['Year'] == y) & (data['tract_id'] == tract), 'Median Home Value']. astype(int))
    mean_median_houseprice = np.mean(median_houseprice_array)
    return mean_median_houseprice 

data['median_houseprice_with_imputation'] = data.apply(add_col_for_imputing_missing_values_in_median_houseprice, axis = 1)

In [7]:
def add_col_for_imputing_missing_values_in_median_rent(row):    
    y = row['Year']
    tract = row['tract_id']
    if row['Median Rent'] == 0:
        imputed_value = compute_missing_value_in_median_rent(y,tract)
        return imputed_value
    else:
        return row['Median Rent']


def compute_missing_value_in_median_rent(y,tract):
    median_rent_array =np.array(data.loc[(data['Year'] == y) & (data['tract_id'] == tract), 'Median Rent']. astype(int))
    mean_median_rent = np.mean(median_rent_array)
    return mean_median_rent 

data['median_rent_with_imputation'] = data.apply(add_col_for_imputing_missing_values_in_median_rent, axis = 1)

### Definition 1: Sociology 
#### Eligible for Gentrification:
* The block group’s median household income was in the bottom 40th percentile when compared to all block groups in Seattle at the beginning of the time span.
* The block group’s median house value was in the bottom 40th percentile when compared to all block groups in Seattle at the beginning of the time span.

#### Have gentrified:
* An increase in a block group’s educational attainment, as measured by the percentage of residents age 25 and over holding bachelor’s degrees, was in the top third percentile of all block groups in Seattle.
* An increase in a block group’s median house value, as measured by inflation-adjusted median house value, was in the top third percentile of all block groups in Seattle.


#### Calculations for Part1: Eligible for gentrification

In [8]:
def add_col_for_median_income_by_year(row):    
    y = row['Year']
    median_income_for_row = compute_median_income_by_year(y)
    return median_income_for_row

def compute_median_income_by_year(y):
    median_income_of_one_year =np.array(data.loc[data['Year'] == y, 'Median Household Income']. astype(int))
    median_income = np.percentile(median_income_of_one_year, 50)
    return median_income

data['median_income_by_year'] = data.apply(add_col_for_median_income_by_year, axis = 1)

In [9]:
def add_col_for_median_housevalue_by_year(row):    
    y = row['Year']
    median_housevalue_for_row = compute_median_housevalue_by_year(y)
    return median_housevalue_for_row

def compute_median_housevalue_by_year(y):
    median_housevalue_of_one_year =np.array(data.loc[data['Year'] == y, 'median_houseprice_with_imputation']. astype(int))
    median_housevalue = np.percentile(median_housevalue_of_one_year, 50)
    return median_housevalue

data['median_housevalue_by_year'] = data.apply(add_col_for_median_housevalue_by_year, axis = 1)

In [10]:
def add_col_for_median_income_40th_perc(row):    
    y = row['Year']
    median_income_40th_perc = compute_median_income_40th_perc(y)
    return median_income_40th_perc

def compute_median_income_40th_perc(y):
    median_income_40th_perc_of_one_year =np.array(data.loc[data['Year'] == y, 'Median Household Income']. astype(int))
    median_income_40th_perc = np.percentile(median_income_40th_perc_of_one_year, 40)
    return median_income_40th_perc

data['median_income_40th_perc_by_year'] = data.apply(add_col_for_median_income_40th_perc, axis = 1)

In [11]:
def add_col_for_median_housevalue_40th_perc(row):    
    y = row['Year']
    median_housevalue_40th_perc = compute_median_housevalue_40th_perc(y)
    return median_housevalue_40th_perc

def compute_median_housevalue_40th_perc(y):
    median_housevalue_40th_perc_of_one_year =np.array(data.loc[data['Year'] == y, 'median_houseprice_with_imputation']. astype(int))
    median_housevalue_40th_perc = np.percentile(median_housevalue_40th_perc_of_one_year, 40)
    return median_housevalue_40th_perc

data['median_housevalue_40th_perc_by_year'] = data.apply(add_col_for_median_housevalue_40th_perc, axis = 1)

In [12]:
data.head()

Unnamed: 0,Id,neighborhood,larger_neighborhood,Name,Population,Median Household Income,Median Home Value,Percent Bachelors Degree,Median Rent,Percent Public Transportation to Work,...,"Percent Families, Married w Children Under 18",Total Households,Year,tract_id,median_houseprice_with_imputation,median_rent_with_imputation,median_income_by_year,median_housevalue_by_year,median_income_40th_perc_by_year,median_housevalue_40th_perc_by_year
0,530330001001,Cedar Park,Lake City,BG0001001,1250,141810,660354,24.63,0,6.78,...,33.33,1663,2010,5303300010,660354.0,638.4,69886.5,418602.0,62272.6,389955.6
1,530330001002,Cedar Park,Lake City,BG0001002,1234,33316,338235,19.87,771,16.55,...,15.88,1573,2010,5303300010,338235.0,771.0,69886.5,418602.0,62272.6,389955.6
2,530330001003,Cedar Park,Lake City,BG0001003,1337,48703,417188,26.48,864,16.52,...,27.75,1416,2010,5303300010,417188.0,864.0,69886.5,418602.0,62272.6,389955.6
3,530330001004,Cedar Park,Lake City,BG0001004,1492,29392,0,12.81,694,32.69,...,12.74,1319,2010,5303300010,366232.4,694.0,69886.5,418602.0,62272.6,389955.6
4,530330001005,Cedar Park,Lake City,BG0001005,942,48882,415385,26.6,863,16.56,...,27.49,1293,2010,5303300010,415385.0,863.0,69886.5,418602.0,62272.6,389955.6


#### Calculations for Part2: Have gentrified

In [13]:
data['Education_Attainment_Bachelors_Degree'] = round(data['Percent Bachelors Degree']*(data[' Population'])*0.01,0)

def add_col_for_increase_in_edu_attainment(row):    
    y = row['Year']
    bg = row['Name']
    increase_in_edu_attainment_for_row = compute_increase_in_edu_attainment(y,bg)
    return increase_in_edu_attainment_for_row

def compute_increase_in_edu_attainment(y,bg):
    if y==2010 or y==2020:
        return 0
    edu_attainment_by_year2 =np.array(data.loc[(data['Year'] == y) & (data['Name'] == bg), 'Education_Attainment_Bachelors_Degree']. astype(int))
    edu_attainment_by_year1 =np.array(data.loc[(data['Year'] == y-1) & (data['Name'] == bg), 'Education_Attainment_Bachelors_Degree']. astype(int))
    increase_in_edu_attainment_by_year = edu_attainment_by_year2 - edu_attainment_by_year1
    return increase_in_edu_attainment_by_year 

data['increase_in_edu_attainment'] = data.apply(add_col_for_increase_in_edu_attainment, axis = 1)

In [14]:
def add_col_for_edu_attainment_top_3rd_percentile_by_year(row):    
    y = row['Year']
    edu_attainment_top_3rd_percentile_for_row = compute_edu_attainmenttop_3rd_percentile_by_year(y)
    return edu_attainment_top_3rd_percentile_for_row

def compute_edu_attainmenttop_3rd_percentile_by_year(y):
    increase_edu_attainment_top_3rd_percentile_one_year =np.array(data.loc[data['Year'] == y, 'increase_in_edu_attainment']. astype(int))
    increase_edu_attainment_top_3rd_percentile_by_year = np.percentile(increase_edu_attainment_top_3rd_percentile_one_year, 66)
    return increase_edu_attainment_top_3rd_percentile_by_year

data['increase_in_edu_attainment_top_3rd_percentile_by_year'] = data.apply(add_col_for_edu_attainment_top_3rd_percentile_by_year, axis = 1)

In [15]:
data.head()

Unnamed: 0,Id,neighborhood,larger_neighborhood,Name,Population,Median Household Income,Median Home Value,Percent Bachelors Degree,Median Rent,Percent Public Transportation to Work,...,tract_id,median_houseprice_with_imputation,median_rent_with_imputation,median_income_by_year,median_housevalue_by_year,median_income_40th_perc_by_year,median_housevalue_40th_perc_by_year,Education_Attainment_Bachelors_Degree,increase_in_edu_attainment,increase_in_edu_attainment_top_3rd_percentile_by_year
0,530330001001,Cedar Park,Lake City,BG0001001,1250,141810,660354,24.63,0,6.78,...,5303300010,660354.0,638.4,69886.5,418602.0,62272.6,389955.6,308.0,0,0.0
1,530330001002,Cedar Park,Lake City,BG0001002,1234,33316,338235,19.87,771,16.55,...,5303300010,338235.0,771.0,69886.5,418602.0,62272.6,389955.6,245.0,0,0.0
2,530330001003,Cedar Park,Lake City,BG0001003,1337,48703,417188,26.48,864,16.52,...,5303300010,417188.0,864.0,69886.5,418602.0,62272.6,389955.6,354.0,0,0.0
3,530330001004,Cedar Park,Lake City,BG0001004,1492,29392,0,12.81,694,32.69,...,5303300010,366232.4,694.0,69886.5,418602.0,62272.6,389955.6,191.0,0,0.0
4,530330001005,Cedar Park,Lake City,BG0001005,942,48882,415385,26.6,863,16.56,...,5303300010,415385.0,863.0,69886.5,418602.0,62272.6,389955.6,251.0,0,0.0


In [16]:
def add_col_for_increase_in_median_housevalue(row):    
    y = row['Year']
    bg = row['Name']
    increase_in_median_housevalue_by_year = compute_increase_in_median_housevalue(y,bg)
    return increase_in_median_housevalue_by_year

def compute_increase_in_median_housevalue(y,bg):
    if y==2010 or y==2020:
        return 0
    median_housevalue_by_year2 =np.array(data.loc[(data['Year'] == y) & (data['Name'] == bg), 'median_houseprice_with_imputation']. astype(int))
    median_housevalue_by_year1 =np.array(data.loc[(data['Year'] == y-1) & (data['Name'] == bg), 'median_houseprice_with_imputation']. astype(int))
    increase_in_median_housevalue_by_year = median_housevalue_by_year2 - median_housevalue_by_year1
    return increase_in_median_housevalue_by_year 

data['increase_in_median_housevalue'] = data.apply(add_col_for_increase_in_median_housevalue, axis = 1)

In [17]:
def add_col_for_housevalue_top_3rd_percentile_by_year(row):    
    y = row['Year']
    housevlue_top_3rd_percentile_for_row = compute_housevalue_top_3rd_percentile_by_year(y)
    return housevlue_top_3rd_percentile_for_row

def compute_housevalue_top_3rd_percentile_by_year(y):
    increase_housevalue_top_3rd_percentile_one_year =np.array(data.loc[data['Year'] == y, 'increase_in_median_housevalue']. astype(int))
    increase_housevalue_top_3rd_percentile_by_year = np.percentile(increase_housevalue_top_3rd_percentile_one_year, 66)
    return increase_housevalue_top_3rd_percentile_by_year

data['increase_in_housevalue_top_3rd_percentile_by_year'] = data.apply(add_col_for_housevalue_top_3rd_percentile_by_year, axis = 1)

In [18]:
data.head()

Unnamed: 0,Id,neighborhood,larger_neighborhood,Name,Population,Median Household Income,Median Home Value,Percent Bachelors Degree,Median Rent,Percent Public Transportation to Work,...,median_rent_with_imputation,median_income_by_year,median_housevalue_by_year,median_income_40th_perc_by_year,median_housevalue_40th_perc_by_year,Education_Attainment_Bachelors_Degree,increase_in_edu_attainment,increase_in_edu_attainment_top_3rd_percentile_by_year,increase_in_median_housevalue,increase_in_housevalue_top_3rd_percentile_by_year
0,530330001001,Cedar Park,Lake City,BG0001001,1250,141810,660354,24.63,0,6.78,...,638.4,69886.5,418602.0,62272.6,389955.6,308.0,0,0.0,0,0.0
1,530330001002,Cedar Park,Lake City,BG0001002,1234,33316,338235,19.87,771,16.55,...,771.0,69886.5,418602.0,62272.6,389955.6,245.0,0,0.0,0,0.0
2,530330001003,Cedar Park,Lake City,BG0001003,1337,48703,417188,26.48,864,16.52,...,864.0,69886.5,418602.0,62272.6,389955.6,354.0,0,0.0,0,0.0
3,530330001004,Cedar Park,Lake City,BG0001004,1492,29392,0,12.81,694,32.69,...,694.0,69886.5,418602.0,62272.6,389955.6,191.0,0,0.0,0,0.0
4,530330001005,Cedar Park,Lake City,BG0001005,942,48882,415385,26.6,863,16.56,...,863.0,69886.5,418602.0,62272.6,389955.6,251.0,0,0.0,0,0.0


#### Labelling the tracts per definition 1

In [19]:
def label_def1(row):
   if row['Year'] == 2010 or row['Year'] == 2020:
      return 'None'
   if row['Median Household Income'] < row['median_income_40th_perc_by_year'] and row['median_houseprice_with_imputation'] < row['median_housevalue_40th_perc_by_year']:
      return 'EligibleForGentrification'
   if row['increase_in_edu_attainment'] > row['increase_in_edu_attainment_top_3rd_percentile_by_year'] and row['increase_in_median_housevalue'] > row['increase_in_housevalue_top_3rd_percentile_by_year']:
      return 'HaveGentrified'
   else:
      return 'NotEligibleForGentrification'

In [20]:
data['def_1'] = data.apply(label_def1, axis=1)

### Definition 2: Financial & Economics
#### Currently gentrifying:
* The block group’s income levels below 40% of the median, and experienced rent increases greater than the median neighborhood did.

#### Have gentrified:
* The block group’s share of neighborhoods in a metro area that moved from the bottom half to the top half in the distribution of home prices.

#### Calculations for Part1: Currently gentrifying

In [21]:
def add_col_for_40_percent_median_income_by_year(row):    
    y = row['Year']
    median_income_40_percent_for_row = compute_median_income_40_percent_by_year(y)
    return median_income_40_percent_for_row

def compute_median_income_40_percent_by_year(y):
    median_income_40_percent_of_one_year =np.array(data.loc[data['Year'] == y, 'Median Household Income']. astype(int))
    median_income_40_percent_by_year = np.percentile(median_income_40_percent_of_one_year, 50)*0.4
    return median_income_40_percent_by_year

data['median_income_40_percent_by_year'] = data.apply(add_col_for_40_percent_median_income_by_year, axis = 1)

In [22]:
def add_col_for_increase_in_median_rent(row):    
    y = row['Year']
    bg = row['Name']
    increase_in_median_rent = compute_increase_in_median_rent(y,bg)
    return increase_in_median_rent

def compute_increase_in_median_rent(y,bg):
    if y==2010 or y==2020:
        return 0
    median_rent_by_year2 =np.array(data.loc[(data['Year'] == y) & (data['Name'] == bg), 'median_rent_with_imputation']. astype(int))
    median_rent_by_year1 =np.array(data.loc[(data['Year'] == y-1) & (data['Name'] == bg), 'median_rent_with_imputation']. astype(int))
    increase_in_median_rent_by_year = median_rent_by_year2 - median_rent_by_year1
    return increase_in_median_rent_by_year 

data['increase_in_median_rent'] = data.apply(add_col_for_increase_in_median_rent, axis = 1)

In [23]:
data.head()

Unnamed: 0,Id,neighborhood,larger_neighborhood,Name,Population,Median Household Income,Median Home Value,Percent Bachelors Degree,Median Rent,Percent Public Transportation to Work,...,median_income_40th_perc_by_year,median_housevalue_40th_perc_by_year,Education_Attainment_Bachelors_Degree,increase_in_edu_attainment,increase_in_edu_attainment_top_3rd_percentile_by_year,increase_in_median_housevalue,increase_in_housevalue_top_3rd_percentile_by_year,def_1,median_income_40_percent_by_year,increase_in_median_rent
0,530330001001,Cedar Park,Lake City,BG0001001,1250,141810,660354,24.63,0,6.78,...,62272.6,389955.6,308.0,0,0.0,0,0.0,,27954.6,0
1,530330001002,Cedar Park,Lake City,BG0001002,1234,33316,338235,19.87,771,16.55,...,62272.6,389955.6,245.0,0,0.0,0,0.0,,27954.6,0
2,530330001003,Cedar Park,Lake City,BG0001003,1337,48703,417188,26.48,864,16.52,...,62272.6,389955.6,354.0,0,0.0,0,0.0,,27954.6,0
3,530330001004,Cedar Park,Lake City,BG0001004,1492,29392,0,12.81,694,32.69,...,62272.6,389955.6,191.0,0,0.0,0,0.0,,27954.6,0
4,530330001005,Cedar Park,Lake City,BG0001005,942,48882,415385,26.6,863,16.56,...,62272.6,389955.6,251.0,0,0.0,0,0.0,,27954.6,0


In [24]:
def add_col_for_median_of_increase_in_rent(row):    
    y = row['Year']
    increase_in_rent_median = compute_median_of_increase_in_rent(y)
    return increase_in_rent_median

def compute_median_of_increase_in_rent(y):
    median_of_increase_in_rent_of_one_year =np.array(data.loc[data['Year'] == y, 'increase_in_median_rent']. astype(int))
    median_of_increase_in_rent_of_one_year_by_year = np.percentile(median_of_increase_in_rent_of_one_year, 50)
    return median_of_increase_in_rent_of_one_year_by_year

data['median_of_increase_in_rent_by_year'] = data.apply(add_col_for_median_of_increase_in_rent, axis = 1)

#### Calculations for Part2: Have gentrified

In [25]:
def add_col_for_housevalue_distribution_current_year(row):    
    y = row['Year']
    bg = row['Name']
    housevalue_distribution_this_year = compute_housevalue_distribution(y, bg)
    return housevalue_distribution_this_year

def add_col_for_housevalue_distribution_past_year(row):    
    y = row['Year']
    bg = row['Name']
    housevalue_distribution_past_year = compute_housevalue_distribution(y-1, bg)
    return housevalue_distribution_past_year

def compute_housevalue_distribution(y, bg):
    if y==2009 or y==2019:
        return 'None'
    housevalue = np.array(data.loc[(data['Year'] == y) & (data['Name'] == bg), 'median_houseprice_with_imputation']. astype(int))
    median_house_value_by_year = np.array(data.loc[(data['Year'] == y) & (data['Name'] == bg), 'median_housevalue_by_year']. astype(int))
    if housevalue > median_house_value_by_year:
        return 'topHalf'
    else:
        return 'bottomHalf'
    
data['housevalue_distribution_current_year'] = data.apply(add_col_for_housevalue_distribution_current_year, axis = 1)
data['housevalue_distribution_past_year'] = data.apply(add_col_for_housevalue_distribution_past_year, axis = 1)    

In [26]:
data.head()

Unnamed: 0,Id,neighborhood,larger_neighborhood,Name,Population,Median Household Income,Median Home Value,Percent Bachelors Degree,Median Rent,Percent Public Transportation to Work,...,increase_in_edu_attainment,increase_in_edu_attainment_top_3rd_percentile_by_year,increase_in_median_housevalue,increase_in_housevalue_top_3rd_percentile_by_year,def_1,median_income_40_percent_by_year,increase_in_median_rent,median_of_increase_in_rent_by_year,housevalue_distribution_current_year,housevalue_distribution_past_year
0,530330001001,Cedar Park,Lake City,BG0001001,1250,141810,660354,24.63,0,6.78,...,0,0.0,0,0.0,,27954.6,0,0.0,topHalf,
1,530330001002,Cedar Park,Lake City,BG0001002,1234,33316,338235,19.87,771,16.55,...,0,0.0,0,0.0,,27954.6,0,0.0,bottomHalf,
2,530330001003,Cedar Park,Lake City,BG0001003,1337,48703,417188,26.48,864,16.52,...,0,0.0,0,0.0,,27954.6,0,0.0,bottomHalf,
3,530330001004,Cedar Park,Lake City,BG0001004,1492,29392,0,12.81,694,32.69,...,0,0.0,0,0.0,,27954.6,0,0.0,bottomHalf,
4,530330001005,Cedar Park,Lake City,BG0001005,942,48882,415385,26.6,863,16.56,...,0,0.0,0,0.0,,27954.6,0,0.0,bottomHalf,


#### Labelling the block groups per definition 2

In [27]:
def label_def2(row):
   if row['Median Household Income'] < row['median_income_40_percent_by_year'] and row['increase_in_median_rent'] > row['median_of_increase_in_rent_by_year']:
      return 'CurrentlyGentrifying'
   if row['housevalue_distribution_current_year'] == 'topHalf' and row['housevalue_distribution_past_year'] == 'bottomHalf': 
      return 'HaveGentrified'
   else:
      return 'NotCurrentlyGentrifying'

In [28]:
data['def_2'] = data.apply(label_def2, axis=1)

### Definition 3: Urban Planning
#### In danger of gentrifying:
* The block group’s % of workers taking transit increased
* The block group’s youth facilities per 1000 residents increased
* The block group’s public space per 1000 residents increased
* The block group’s % non-family households increased
* The block group’s % dwelling units in building with 5+ units increased
* The block group’s % dwelling units in building with 3-4 units increased
* The block group’s % renter-occupied increased
* The block group’s income diversity increased
* The block group’s % of renters paying >35% of income increased
* The block group’s % of dwelling units with three or more cars available decreased
* The block group’s recreational facilities per 1000 residents decreased
* The block group’s % married couples with children decreased
* The block group’s % non-hispanic white decreased

In [29]:
#data.head()

In [30]:
data['workers_taking_transit'] = round(data['Percent Public Transportation to Work']*(data[' Population'])*0.01,0)

def add_col_for_increase_in_workers_taking_transit(row):    
    y = row['Year']
    bg = row['Name']
    increase_in_workers_taking_transit = compute_increase_in_workers_taking_transit(y,bg)
    return increase_in_workers_taking_transit

def compute_increase_in_workers_taking_transit(y, bg):
    if y==2010 or y==2020:
        return 'None'
    workers_taking_transit_year2 =np.array(data.loc[(data['Year'] == y) & (data['Name'] == bg), 'workers_taking_transit']. astype(int))
    workers_taking_transit_year1 =np.array(data.loc[(data['Year'] == y-1) & (data['Name'] == bg), 'workers_taking_transit']. astype(int))
    if(workers_taking_transit_year2 > workers_taking_transit_year1):
        return 'True'
    else:
        return 'False'
    
data['increase_in_workers_taking_transit'] = data.apply(add_col_for_increase_in_workers_taking_transit, axis=1)    

In [31]:
data['dwelling_units_with_5+_units'] = round(data['Percent Occupied Structure w 5+ Units']*(data[' Housing Units'])*0.01,0)

def add_col_for_increase_in_dwelling_units_with_5_plus_units(row):    
    y = row['Year']
    bg = row['Name']
    increase_in_dwelling_units_with_5_plus_units = compute_increase_in_dwelling_units_with_5_plus_units(y,bg)
    return increase_in_dwelling_units_with_5_plus_units

def compute_increase_in_dwelling_units_with_5_plus_units(y, bg):
    if y==2010 or y==2020:
        return 'None'
    dwelling_units_with_5_plus_units_year2 =np.array(data.loc[(data['Year'] == y) & (data['Name'] == bg), 'dwelling_units_with_5+_units']. astype(int))
    dwelling_units_with_5_plus_units_year1 =np.array(data.loc[(data['Year'] == y-1) & (data['Name'] == bg), 'dwelling_units_with_5+_units']. astype(int))
    if(dwelling_units_with_5_plus_units_year2 > dwelling_units_with_5_plus_units_year1):
        return 'True'
    else:
        return 'False'
    
data['increase_in_dwelling_units_with_5_plus_units'] = data.apply(add_col_for_increase_in_dwelling_units_with_5_plus_units, axis=1)    

In [32]:
data['dwelling_units_with_3-4_units'] = round(data['Percent Occupied Structure w 3-4 Units']*(data[' Housing Units'])*0.01,0)

def add_col_for_increase_in_dwelling_units_with_3_4_units(row):    
    y = row['Year']
    bg = row['Name']
    increase_in_dwelling_units_with_3_4_units = compute_increase_in_dwelling_units_with_3_4_units(y,bg)
    return increase_in_dwelling_units_with_3_4_units

def compute_increase_in_dwelling_units_with_3_4_units(y, bg):
    if y==2010 or y==2020:
        return 'None'
    dwelling_units_with_3_4_units_year2 =np.array(data.loc[(data['Year'] == y) & (data['Name'] == bg), 'dwelling_units_with_3-4_units']. astype(int))
    dwelling_units_with_3_4_units_year1 =np.array(data.loc[(data['Year'] == y-1) & (data['Name'] == bg), 'dwelling_units_with_3-4_units']. astype(int))
    if(dwelling_units_with_3_4_units_year2 > dwelling_units_with_3_4_units_year1):
        return 'True'
    else:
        return 'False'
    
data['increase_in_dwelling_units_with_3_4_units'] = data.apply(add_col_for_increase_in_dwelling_units_with_3_4_units, axis=1)   

In [33]:
data['renter_occupied'] = round(data['Percent Renter Occupied']*(data[' Housing Units'])*0.01,0)

def add_col_for_increase_in_renter_occupied(row):    
    y = row['Year']
    bg = row['Name']
    increase_in_renter_occupied = compute_increase_in_renter_occupied(y,bg)
    return increase_in_renter_occupied

def compute_increase_in_renter_occupied(y, bg):
    if y==2010 or y==2020:
        return 'None'
    renter_occupied_year2 =np.array(data.loc[(data['Year'] == y) & (data['Name'] == bg), 'renter_occupied']. astype(int))
    renter_occupied_year1 =np.array(data.loc[(data['Year'] == y-1) & (data['Name'] == bg), 'renter_occupied']. astype(int))
    if(renter_occupied_year2 > renter_occupied_year1):
        return 'True'
    else:
        return 'False'
    
data['increase_in_renter_occupied'] = data.apply(add_col_for_increase_in_renter_occupied, axis=1)   

In [34]:
data['households_with_3_plus_cars'] = round(data['Percent Households w 4+ Vehicles']*(data[' Total Households'])*0.01,0) + round(data['Percent Households w 3 Vehicles']*(data[' Total Households'])*0.01,0) 

def add_col_for_decrease_in_households_with_3_plus_cars(row):    
    y = row['Year']
    bg = row['Name']
    decrease_in_households_with_3_plus_cars = compute_decrease_in_households_with_3_plus_cars(y,bg)
    return decrease_in_households_with_3_plus_cars

def compute_decrease_in_households_with_3_plus_cars(y, bg):
    if y==2010 or y==2020:
        return 'None'
    households_with_3_plus_cars_year2 =np.array(data.loc[(data['Year'] == y) & (data['Name'] == bg), 'households_with_3_plus_cars']. astype(int))
    households_with_3_plus_cars_year1 =np.array(data.loc[(data['Year'] == y-1) & (data['Name'] == bg), 'households_with_3_plus_cars']. astype(int))
    if(households_with_3_plus_cars_year2 < households_with_3_plus_cars_year1):
        return 'True'
    else:
        return 'False'
    
data['decrease_in_households_with_3_plus_cars'] = data.apply(add_col_for_decrease_in_households_with_3_plus_cars, axis=1)   

In [35]:
data['married_couples_with_children'] = round(data['Percent Families, Married w Children Under 18']*(data[' Total Households'])*0.01,0)

def add_col_for_decrease_in_households_with_children(row):    
    y = row['Year']
    bg = row['Name']
    decrease_in_households_with_children = compute_decrease_in_households_with_children(y,bg)
    return decrease_in_households_with_children

def compute_decrease_in_households_with_children(y, bg):
    if y==2010 or y==2020:
        return 'None'
    households_with_children_year2 =np.array(data.loc[(data['Year'] == y) & (data['Name'] == bg), 'married_couples_with_children']. astype(int))
    households_with_children_year1 =np.array(data.loc[(data['Year'] == y-1) & (data['Name'] == bg), 'married_couples_with_children']. astype(int))
    if(households_with_children_year2 < households_with_children_year1):
        return 'True'
    else:
        return 'False'
    
data['decrease_in_households_with_children'] = data.apply(add_col_for_decrease_in_households_with_children, axis=1)   

In [36]:
data['non_hispanic_whites'] = round(data['Percent White Non-Hispanic']*(data[' Population'])*0.01,0)

def add_col_for_decrease_in_non_hispanic_whites(row):    
    y = row['Year']
    bg = row['Name']
    decrease_in_non_hispanic_whites = compute_decrease_in_non_hispanic_whites(y,bg)
    return decrease_in_non_hispanic_whites

def compute_decrease_in_non_hispanic_whites(y, bg):
    if y==2010 or y==2020:
        return 'None'
    non_hispanic_whites_year2 =np.array(data.loc[(data['Year'] == y) & (data['Name'] == bg), 'non_hispanic_whites']. astype(int))
    non_hispanic_whites_year1 =np.array(data.loc[(data['Year'] == y-1) & (data['Name'] == bg), 'non_hispanic_whites']. astype(int))
    if(non_hispanic_whites_year2 < non_hispanic_whites_year1):
        return 'True'
    else:
        return 'False'
    
data['decrease_in_non_hispanic_whites'] = data.apply(add_col_for_decrease_in_non_hispanic_whites, axis=1)   

In [37]:
data.head()

Unnamed: 0,Id,neighborhood,larger_neighborhood,Name,Population,Median Household Income,Median Home Value,Percent Bachelors Degree,Median Rent,Percent Public Transportation to Work,...,dwelling_units_with_3-4_units,increase_in_dwelling_units_with_3_4_units,renter_occupied,increase_in_renter_occupied,households_with_3_plus_cars,decrease_in_households_with_3_plus_cars,married_couples_with_children,decrease_in_households_with_children,non_hispanic_whites,decrease_in_non_hispanic_whites
0,530330001001,Cedar Park,Lake City,BG0001001,1250,141810,660354,24.63,0,6.78,...,0.0,,0.0,,173.0,,554.0,,969.0,
1,530330001002,Cedar Park,Lake City,BG0001002,1234,33316,338235,19.87,771,16.55,...,0.0,,549.0,,162.0,,250.0,,565.0,
2,530330001003,Cedar Park,Lake City,BG0001003,1337,48703,417188,26.48,864,16.52,...,45.0,,542.0,,146.0,,393.0,,778.0,
3,530330001004,Cedar Park,Lake City,BG0001004,1492,29392,0,12.81,694,32.69,...,31.0,,838.0,,138.0,,168.0,,532.0,
4,530330001005,Cedar Park,Lake City,BG0001005,942,48882,415385,26.6,863,16.56,...,38.0,,465.0,,133.0,,355.0,,528.0,


#### Labelling the tracts per definition 3

In [38]:
def label_def3(row):
   if row['Year'] == 2010 or row['Year'] == 2020:
        return 'None'
   if (row['increase_in_workers_taking_transit'] == 'True' and row['decrease_in_households_with_3_plus_cars'] == 'True' and row['increase_in_dwelling_units_with_5_plus_units'] == 'True' and row['increase_in_dwelling_units_with_3_4_units'] == 'True') or (row['increase_in_renter_occupied'] == 'True' and row['decrease_in_households_with_children'] == 'True' and row['decrease_in_non_hispanic_whites'] == 'True'):
        return 'InDangerOfGentrifying'
   else:
        return 'NotInDangerOfGentrifying'

In [39]:
data['def3'] = data.apply(label_def3, axis=1)

In [40]:
data.to_csv('../SimplyAnalytics Data/SimplyAnalytics_data_with_labels.csv')