In [44]:
### Economic Development Capstone

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import hashlib
from sklearn.model_selection import train_test_split
import scipy.stats as stats
import statistics

%matplotlib inline

pd.options.display.float_format = '{:.2f}'.format #Stop displaying scientific notation
pd.set_option('display.max_columns', None) #Display all columns

### Load Data:

In [45]:
df_ny = pd.read_csv("ny_data.csv")

### Rename Columns

In [46]:
df.columns

Index(['Project\r\n ID #', 'Recipient Name', 'Project Name',
       'Project Description', 'Project Address', 'County', 'Postal Code',
       'Region', 'Industry', 'Start Date', 'End Date', 'Assistance Type',
       'Total ESD Assistance Awarded', 'Total Public-Private Investment',
       'Disbursements\r\n To-Date', 'Project Status', 'Compliant?',
       'Reason for Non-Compliant Status', 'Penalties Applied',
       'Rationale for Termination', 'Job Creation Commitments',
       'Job Retention Commitments', 'Jobs Created To-Date',
       'Jobs Retained \r\nTo-Date', 'Project Hires'],
      dtype='object')

In [47]:
#Rename columns
df.rename({'Project\n ID #': 'project_id', 'Recipient Name': 'company_name', 'Project Address':'company_address',
               'Project Name': 'project_name', 'Project Description':'project_description', 'County':'county',
              'Postal Code':'zip_code', 'Industry':'business_industry', 'Region':'region',
              'Start Date':'start_date','End Date':'end_date',
              'Assistance Type':'assistance_type', 'Total ESD Assistance Awarded':'total_esd_award',
              'Total Public-Private Investment': 'total_project_cost','Disbursements\n To-Date':'disbursements_to_date',
              'Compliant?':'compliant', 'Reason for Non-Compliant Status':'reason_for_noncompliance',
              'Penalties Applied':'penalties_applied',
              'Rationale for Termination':'rationale_for_term',
              'Job Creation Commitments':'projected_jobs_created',
              'Job Retention Commitments':'projected_jobs_retained',
              'Jobs Created To-Date':'final_jobs_created', 'Jobs Retained \nTo-Date':'final_jobs_retained',
              'Project Hires':'project_hires', 'Project Status':'status'
              }, axis=1, inplace=True)

In [48]:
df

Unnamed: 0,Project\r\n ID #,company_name,project_name,project_description,company_address,county,zip_code,region,business_industry,start_date,end_date,assistance_type,total_esd_award,total_project_cost,Disbursements\r\n To-Date,status,compliant,reason_for_noncompliance,penalties_applied,rationale_for_term,projected_jobs_created,projected_jobs_retained,final_jobs_created,Jobs Retained \r\nTo-Date,project_hires
0,ET1820,"Flextrapower, Inc. (internship training)","Flextrapower, Inc. (internship training)","Tax Credit, Health Care and Social Assistance,...",29-10 Thomson Ave,Queens,11101,New York City,Health Care and Social Assistance,05/01/2020,06/03/2021,Tax Credit,6000,19650,6000,Closed,Yes,,,,,,,,
1,ET1020,"Evisagenics, Inc. (internship training)","Evisagenics, Inc. (internship training)","Tax Credit, Health Care and Social Assistance,...",101 6th Ave,New York,10013,New York City,Health Care and Social Assistance,06/01/2020,04/20/2021,Tax Credit,3000,6000,3000,Closed,Yes,,,,,,,,
2,ET1520,Redesign Science Inc. (internship training),Redesign Science Inc. (internship training),"Tax Credit, Health Care and Social Assistance,...",180 Varick St,New York,10014,New York City,Health Care and Social Assistance,06/01/2020,04/20/2021,Tax Credit,2329,4658,2329,Closed,Yes,,,,,,,,
3,ET1620,"Vant AI, Inc. (internship training)","Vant AI, Inc. (internship training)","Tax Credit, Health Care and Social Assistance,...",33 Irving Place,New York,10003,New York City,Health Care and Social Assistance,06/01/2020,04/20/2021,Tax Credit,15000,45851,15000,Closed,Yes,,,,,,,,
4,ET920,Cureatr Inc. (internship training),Cureatr Inc. (internship training),"Tax Credit, Health Care and Social Assistance,...",17 West 20th St,New York,10011,New York City,Health Care and Social Assistance,06/01/2020,04/20/2021,Tax Credit,6000,13240,6000,Closed,Yes,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1736,SN68321,"Ad Valorem Tax Appeals, Inc. DBA Nearest Neighbor","Ad Valorem Tax Appeals, Inc. DBA Nearest Neighbor","Tax Credit, Information, Western New York, Ad ...","701 Ellicott Street, CBLS, 2nd Floor, Rooms B2...",Erie,14203,Western New York,Information,06/03/2021,,Tax Credit,0,0,0,Active,Yes,,,,8.00,,,,
1737,SN68421,"GrabALatte, Inc.","GrabALatte, Inc.","Tax Credit, Professional, Scientific, and Tech...","1 Seneca Dr., Z80 Labs, 24th Floor, Room 2417A",Erie,14203,Western New York,"Professional, Scientific, and Technical Services",06/03/2021,,Tax Credit,0,0,0,Active,Yes,,,,9.00,,,,
1738,SN68621,"Aircards, Inc.","Aircards, Inc.","Tax Credit, Professional, Scientific, and Tech...","701 Ellicott Street, CBLS, 2nd floor, Room B2-138",Erie,14203,Western New York,"Professional, Scientific, and Technical Services",06/03/2021,,Tax Credit,0,0,0,Active,Yes,,,,4.00,,,,
1739,SN68721,"North American Sleep Management, Inc. DBA Ognomy","North American Sleep Management, Inc. DBA Ognomy","Tax Credit, Professional, Scientific, and Tech...","701 Ellicott St., CBLS, 2nd Floor, Room B2-266",Erie,14203,Western New York,"Professional, Scientific, and Technical Services",06/03/2021,,Tax Credit,0,0,0,Active,Yes,,,,11.00,,,,


In [49]:
## Examining Status Column

Now we need to rename these various statuses so they match up with our Iowa data.

In [50]:
# Success = 2
# Cancelled = 0
# WIP = 1

df['status'].replace({'Active':'1'}, inplace=True)
df['status'].replace({'Compliance Complete':'2'}, inplace=True)
df['status'].replace({'Out of Business- written off':'0'}, inplace=True)
df['status'].replace({'Out of Business- confirmed':'0'}, inplace=True)
df['status'].replace({'Out of Business- Not written off':'0'}, inplace=True)
df['status'].replace({'Out of Business- not written off':'0'}, inplace=True)
df['status'].replace({'Repaid':'2'}, inplace=True)
df['status'].replace({'Out of Business':'0'}, inplace=True)
df['status'].replace({'Out of Business- Loan repaid in full':'2'}, inplace=True)
df['status'].replace({'Released':'2'}, inplace=True)
df['status'].replace({'OOB':'0'}, inplace=True)
df['status'].replace({'Out of business- confirmed':'0'}, inplace=True)
df['status'].replace({'Repaid early- Released':'2'}, inplace=True)
df['status'].replace({'Out of business- written off':'0'}, inplace=True)
df['status'].replace({'Repaid early- Released 10/2018':'2'}, inplace=True)
df['status'].replace({'Released 10/2017':'2'}, inplace=True)
df['status'].replace({'Repaid 1/8/2015':'2'}, inplace=True)
df['status'].replace({'Repaid early released 6/18/2018':'2'}, inplace=True)
df['status'].replace({'Out of Business- repaid':'2'}, inplace=True)
df['status'].replace({'Released 12/20/2018':'2'}, inplace=True)
df['status'].replace({'Compliance complete':'2'}, inplace=True)
df['status'].replace({'Company sold to Leggett & Platt':'0'}, inplace=True)
df['status'].replace({'Repaid 11/3/2017':'2'}, inplace=True)
df['status'].replace({'Active- Loan repaid but still w/in 10 years':'2'}, inplace=True)
df['status'].replace({'Repaid 12/31/2017':'2'}, inplace=True)
df['status'].replace({'Released 9/2/2015':'2'}, inplace=True)
df['status'].replace({'Repaid early- Released 10/2013':'2'}, inplace=True)
df['status'].replace({'Out of Business- Confirmed':'0'}, inplace=True)

df['status'].value_counts()

1             1395
Closed         325
Terminated      21
Name: status, dtype: int64

In [51]:
df.isna().sum()

Project\r\n ID #                0
company_name                    0
project_name                    0
project_description             0
company_address                 0
county                          0
zip_code                        0
region                          0
business_industry               0
start_date                      0
end_date                     1389
assistance_type                 0
total_esd_award                 0
total_project_cost              0
Disbursements\r\n To-Date       0
status                          0
compliant                       0
reason_for_noncompliance     1741
penalties_applied            1741
rationale_for_term           1741
projected_jobs_created        605
projected_jobs_retained       764
final_jobs_created           1327
Jobs Retained \r\nTo-Date     764
project_hires                1624
dtype: int64

In [52]:
## Drop Unnecessary Columns

In [53]:
df.drop(columns=['Project\r\n ID #', 'company_name', 'project_name', 'project_description', 'company_address', 'region', 'end_date', 'reason_for_noncompliance', 'penalties_applied', 'rationale_for_term', 'projected_jobs_created', 'projected_jobs_retained', 'final_jobs_created', 'Jobs Retained \r\nTo-Date', 'project_hires', 'compliant', 'status', 'Disbursements\r\n To-Date', 'start_date'], inplace=True)
df.head(5)

Unnamed: 0,county,zip_code,business_industry,assistance_type,total_esd_award,total_project_cost
0,Queens,11101,Health Care and Social Assistance,Tax Credit,6000,19650
1,New York,10013,Health Care and Social Assistance,Tax Credit,3000,6000
2,New York,10014,Health Care and Social Assistance,Tax Credit,2329,4658
3,New York,10003,Health Care and Social Assistance,Tax Credit,15000,45851
4,New York,10011,Health Care and Social Assistance,Tax Credit,6000,13240


In [54]:
df.isna().sum()

county                0
zip_code              0
business_industry     0
assistance_type       0
total_esd_award       0
total_project_cost    0
dtype: int64

## Add new derived columns

In [55]:
#We have a few rows with this See EXP column. Need to remove those
df = df[df['p_ft_job_creation'].str.contains("See EXP")==False]
df = df[df['p_ft_job_creation'].str.contains("-")==False]

#Convert projected jobs to int so they can be added together
df['projected_jobs_total'] = df['p_ft_job_creation'].astype(float) + df['p_pt_job_creation'].astype(float)

KeyError: 'p_ft_job_creation'

In [None]:
df

In [None]:
#df['job_obligation_status'] = df['job_obligation_status'].replace(['Met'],'2')
# Adding new columns
df_suc['project_type'] = '2' #Success = 2
df_can['project_type'] = '0' #Cancelled = 0
df_wip['project_type'] = '1' #WIP = 1

In [None]:
# Make a copy of the df
X = df.copy()

#Drop our response column from the X df
X = X.drop(columns='project_type')

#Need this for labels later
X_labels = X

#Create our y/response df
y = df['project_type']

In [None]:
#One hot encode X
# create an object of the OneHotEncoder
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(sparse=False)

X = ohe.fit_transform(X)

In [None]:
#Scale data

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

In [None]:
#Create train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
#Remember to always work on the train set

In [None]:
#Create and fit model
from sklearn.tree import DecisionTreeClassifier

tree_clf = DecisionTreeClassifier()
tree_clf.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score

#Make prediction and check accuracy score
preds = tree_clf.predict(X_test)
acc_score = accuracy_score(preds, y_test)
print('Accuracy=%s' % (acc_score))

In [None]:
#Grid Search

from sklearn.model_selection import GridSearchCV

#Create dictionary
param_grid = {'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], 'max_leaf_nodes': [2, 3, 4, 5, 6, 7, 8, 9, 10], 'min_samples_split': [2,3,4,5, 6, 7, 8]}

#Create GridSearch
grid_search_cv = GridSearchCV(tree_clf, param_grid, verbose=1, cv=3)
grid_search_cv.fit(X_train, y_train)

#Print out best parameters
print(grid_search_cv.best_params_)

print("The best parameters are: ", "max_depth: 2, max_leaf_nodes: 2, min_samples_split: 2")

In [None]:
from sklearn.tree import export_text

tree_clf2 = DecisionTreeClassifier(max_depth=6, max_leaf_nodes = 7, min_samples_split = 8)
tree_clf2.fit(X, y)
cols = ohe.get_feature_names(input_features=X_labels.columns)
print(export_text(tree_clf2))