In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import pandas as pd

In [3]:
data = pd.read_excel('./Data/UAL_2015_to_2018_V2.xlsx')
test = data.loc[data['Audit Year'] == 2018].drop(['Audit Year'], axis = 1)
train = data.loc[data['Audit Year'] != 2018].drop(['Audit Year'], axis = 1)
out = test.loc[:,['UnitYear', 'Internal Controls', 'Financial Issues General Fund', 'Financial Issues Water Sewer Fund']]
names = test['UnitYear'].values

In [4]:
test = test.drop(['Financial Issues Water Sewer Fund', 'Internal Controls'], axis = 1)
train = train.drop(['Financial Issues Water Sewer Fund', 'Internal Controls'], axis = 1)

In [5]:
features = test.columns[1:-1].values
print(features)

Y = train['Financial Issues General Fund']
X = train[features]

x_test = test[features]
y_test = test['Financial Issues General Fund']

['FBA' 'FBA w/o Powell Bill' 'Total Expenditures' '<8% FBA/Expenditures'
 '<8% FBA w/o Powell/ Expenditures' 'WS Quick Ratio <1'
 'WS Working Capital' 'WS Cash Flow from ops less debt service'
 'WS-EF- Interest Expense']


In [6]:
clf = RandomForestClassifier(n_estimators=100)
model = clf.fit(X, Y)

In [7]:
print(sorted(zip(map(lambda x: round(x, 4), model.feature_importances_), features), reverse=True))

[(0.2603, '<8% FBA/Expenditures'), (0.1354, 'FBA'), (0.1119, 'FBA w/o Powell Bill'), (0.1078, '<8% FBA w/o Powell/ Expenditures'), (0.1062, 'Total Expenditures'), (0.0828, 'WS Working Capital'), (0.0758, 'WS Cash Flow from ops less debt service'), (0.0615, 'WS Quick Ratio <1'), (0.0583, 'WS-EF- Interest Expense')]


In [8]:
est = model.predict(x_test)

In [9]:
print('Accuracy : ',metrics.accuracy_score(y_test, est))

Accuracy :  0.9365609348914858


In [10]:
add_wWater = pd.DataFrame({'UnitYear':names, 'Genral Fund Prediction': est})
out = pd.merge(out, add_wWater, how = 'left', on = 'UnitYear')

Much like the SVM it is so accurate because it predicts the vast majority as not being on the UAL. Contrary to SVM, however, the rf classifier actually predicted some units being on the UAL. Great! 
Let's try the same model removing the water sewer data.

In [11]:
features = test.columns[1:6].values
print(features)

X = train[features]
x_test = test[features]

['FBA' 'FBA w/o Powell Bill' 'Total Expenditures' '<8% FBA/Expenditures'
 '<8% FBA w/o Powell/ Expenditures']


In [12]:
clf = RandomForestClassifier(n_estimators=100)
model = clf.fit(X, Y)

In [13]:
print(sorted(zip(map(lambda x: round(x, 4), model.feature_importances_), features), reverse=True))

[(0.3151, '<8% FBA/Expenditures'), (0.1865, 'FBA'), (0.1796, 'Total Expenditures'), (0.1696, '<8% FBA w/o Powell/ Expenditures'), (0.1492, 'FBA w/o Powell Bill')]


In [14]:
est = model.predict(x_test)

In [15]:
print('Accuracy : ',metrics.accuracy_score(y_test, est))

Accuracy :  0.9315525876460768


In [16]:
add_woWater = pd.DataFrame({'UnitYear':names, 'Genral Fund Prediction No Water': est})
out = pd.merge(out, add_woWater, how = 'left', on = 'UnitYear')

out.to_excel('./Data/RF_UAL_Predictions.xlsx')

In [17]:
from sklearn.preprocessing import Normalizer
import numpy as np

In [18]:
scaler = Normalizer().fit(X)  
scaled_X = scalx = scaler.transform(X)

In [19]:
model = clf.fit(scaled_X, Y)
scaler = Normalizer().fit(x_test) 
x_test_S = scaler.transform(x_test)
est = model.predict(x_test_S)

In [20]:
print('Accuracy : ',metrics.accuracy_score(y_test, est))
print(sorted(zip(map(lambda x: round(x, 4), model.feature_importances_), features), reverse=True))

Accuracy :  0.9198664440734557
[(0.3417, 'FBA'), (0.2758, 'Total Expenditures'), (0.2616, 'FBA w/o Powell Bill'), (0.0652, '<8% FBA/Expenditures'), (0.0558, '<8% FBA w/o Powell/ Expenditures')]


In [21]:
X['FBA/Exp 8%'] = np.where(X['<8% FBA/Expenditures']<0.08,1,0)
X['FBA/Exp w/o P 8%'] = np.where(X['<8% FBA w/o Powell/ Expenditures'] < 0.08,1,0)

x_test['FBA/Exp 8%'] = np.where(x_test['<8% FBA/Expenditures']<0.08,1,0)
x_test['FBA/Exp w/o P 8%'] = np.where(x_test['<8% FBA w/o Powell/ Expenditures'] < 0.08,1,0)

print(x_test['FBA/Exp 8%'].head(20))

10    1
12    0
18    0
21    0
24    0
27    0
33    0
34    0
35    0
38    0
45    0
49    0
52    0
63    0
68    0
69    0
71    0
73    0
74    0
75    0
Name: FBA/Exp 8%, dtype: int32


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://

In [22]:
sum(train['Financial Issues General Fund'])

77

In [23]:
features

array(['FBA', 'FBA w/o Powell Bill', 'Total Expenditures',
       '<8% FBA/Expenditures', '<8% FBA w/o Powell/ Expenditures'],
      dtype=object)

Now lets try a couple gradient boosting methods:

In [24]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier

In [25]:
abc = AdaBoostClassifier(n_estimators = 500, learning_rate = 1)
model = abc.fit(X, Y)

In [26]:
est_ada = model.predict(x_test)

In [27]:
print('Accuracy : ',metrics.accuracy_score(y_test, est_ada))
print(sorted(zip(map(lambda x: round(x, 4), model.feature_importances_), features), reverse=True))

Accuracy :  0.9298831385642737
[(0.264, '<8% FBA/Expenditures'), (0.196, '<8% FBA w/o Powell/ Expenditures'), (0.194, 'Total Expenditures'), (0.188, 'FBA w/o Powell Bill'), (0.158, 'FBA')]


In [29]:
sum(est_ada)

11

In [33]:
check = pd.DataFrame({'test':y_test, 'preds':est_ada})

In [32]:
check.loc[check['preds']==1]

Unnamed: 0,test,preds
10,1,1
21,1,1
24,1,1
27,1,1
35,1,1
49,1,1
105,1,1
122,0,1
513,0,1
535,0,1


It certainly appears that gradient boosting is a much better way to go here. We do actually get some predictions of municipalities on the UAL, and more are correct than not. Step in the right direction.