In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv('Copy_of_wgbh.csv', index_col='DateEntered', parse_dates=True)

In [2]:
# convert index to yearly DateTime object
df.index = df.index.to_period('Y')

In [3]:
df.columns

Index(['Agency', 'ProjectName', 'ProjectAddress_1', 'Developer',
       'GeneralContractor', 'SubContractor', 'SubContractorAddress_1',
       'SubContractorAddress_2', 'Trade', 'SEX', 'MINOR', 'Race_Desc',
       'RESIDENT', 'TotalHours'],
      dtype='object')

In [4]:
# drop everything but race, residency, developer columns
df = df.drop(['Agency', 'ProjectName', 'ProjectAddress_1',
       'GeneralContractor', 'SubContractor', 'SubContractorAddress_1',
       'SubContractorAddress_2', 'Trade', 'MINOR', 'TotalHours'], axis=1)

In [5]:
df = df.drop('SEX', axis=1)

In [6]:
df = df.drop(df.index[0])
# race_df = df[['Developer', 'Race_Desc']]
res_df = df[['Developer', 'RESIDENT']].dropna()

In [7]:
# create a current workforce composition report by residency per developer
summary = pd.DataFrame(columns=['Developer', 'Y', 'N', 'TOTAL_PROJECTS'])
developers = res_df['Developer'].unique()
groups = res_df.groupby(res_df.Developer)
res = ['Y', 'N']
res_df = res_df.dropna()

# compute percentage of employees for each developer by residency
for i in range(len(developers)):
    temp = groups.get_group(developers[i])
    filt1 = temp['RESIDENT'] == 'Y'
    filt2 = temp['RESIDENT'] == 'y'
    filt3 = temp['RESIDENT'] == 'N'
    filt4 = temp['RESIDENT'] == 'n'
    Y = temp[filt1]
    y = temp[filt2]
    N = temp[filt3]
    n = temp[filt4]
    num_res = Y.shape[0] + y.shape[0]
    num_non_res = N.shape[0] + n.shape[0]
    summary.loc[len(summary.index)] = [developers[i], num_res, num_non_res, num_res + num_non_res]

In [8]:
summary.columns = ['Developer', 'Resident', 'Non-resident', 'Total']
summary.to_csv('Current_Developer_Workforce_Comp_By_Residency.csv')

In [9]:
summary2 = pd.DataFrame(index=developers, columns=['2016', '2017', '2018', '2019', '2020', '2021'], data=0)

In [10]:
# compute number of Boston residents employed per year for each developer
temp_years = ['2016', '2017', '2018', '2019', '2020', '2021']

for i in range(len(developers)):
    temp = groups.get_group(developers[i])
    temp_groups = temp.groupby(temp.index)
    more_years = temp.index.unique()
    for j in range(len(more_years)):
        curr_year_df = temp_groups.get_group(more_years[j])
        residencies = curr_year_df.RESIDENT.unique()
        curr_year_groups = curr_year_df.groupby(curr_year_df.RESIDENT)
        for k in range(len(residencies)):
            curr_group = curr_year_groups.get_group(residencies[k])
            if residencies[k] == 'Y' or residencies[k] == 'y':
                summary2.loc[developers[i], temp_years[j]] += curr_group.count().RESIDENT

In [11]:
summary2

Unnamed: 0,2016,2017,2018,2019,2020,2021
CITY OF BOSTON / PFD,328,1236,855,12,0,0
THE COMMUNITY BUILDERS,96,637,1137,309,11,0
CITY OF BOSTON / PFD,48,24,41,1114,1057,0
HYM INVESTMENT GROUP,261,1385,1734,465,67,0
Delware Limited Liability Co.,116,245,0,0,0,0
...,...,...,...,...,...,...
Trusties of Boston college,7,0,0,0,0,0
WEST NAPOLI PIZZA,0,0,0,0,0,0
"PARTNERS HEALTHCARE SYSTEMS, INC.",1,0,0,0,0,0
BRIGHAM AND WOMEN'S HOSPITAL,0,0,0,0,0,0


In [12]:
summary2.to_csv('Number_of_Boston_Residents_Employed_Per_Year_By_Contractor.csv')