In [49]:
import pandas as pd
import numpy as np
from sklearn import preprocessing as pp
import matplotlib.pyplot as plt 


from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
import seaborn as sns
from __future__ import print_function
import statsmodels.api as sm
from scipy import stats


%matplotlib inline


plt.rc("font", size=14)



In [50]:
percentage_change_all_vars = pd.read_csv("percentage_change_all_vars.csv")
newdf=percentage_change_all_vars[['UNITID','TYPE','0809_GR','0809_EN','0809_ASSETS','0809_EXPENSES','0809_REVENUE','0809_RESEARCH','0809_ANYAIDN','0809_LOAN_T', '0809_FGRNT_T','0809_SGRNT_T','0809_IGRNT_T']]
newdf.head()

Unnamed: 0,UNITID,TYPE,0809_GR,0809_EN,0809_ASSETS,0809_EXPENSES,0809_REVENUE,0809_RESEARCH,0809_ANYAIDN,0809_LOAN_T,0809_FGRNT_T,0809_SGRNT_T,0809_IGRNT_T
0,105534,for-profit,29.049484,93.411996,61.508476,50.472747,44.947968,0.01,30.092119,34.640445,38.958976,-18.088062,196.595208
1,126605,for-profit,24.288107,4.347826,-16.670281,12.317907,12.629649,0.01,-28.076923,-48.822485,6.291402,-53.911985,13.659674
2,126702,for-profit,-8.277946,-1.377031,-4.354315,-1.478056,0.219253,0.01,-21.885522,-17.632936,28.905474,-85.551604,19.472967
3,132338,for-profit,-11.255411,-0.877706,-6.982836,-2.75362,-0.771812,0.01,-22.368421,-37.733554,-14.371541,-25.887703,-14.707282
4,138813,for-profit,-21.451744,21.061093,-3.311869,14.391678,9.940057,0.01,9.97191,27.382161,45.234526,64.646994,14.289328


In [51]:
newdf = newdf.sample(frac=1)

newdf = newdf.drop(['UNITID'], axis=1)

Dy = [1 if x > 0 else 0 for x in newdf['0809_EN'].tolist()]
Dy = np.array(Dy)

newdf = newdf.drop(['0809_EN'], axis=1)

newdf['is_for_profit']      = [(1 if x == 'for-profit'      else 0) for x in newdf['TYPE'].tolist()]
newdf['is_not_for_profit']  = [(1 if x == 'not-for-profit'  else 0) for x in newdf['TYPE'].tolist()]
newdf['is_public']          = [(1 if x == 'public'          else 0) for x in newdf['TYPE'].tolist()]
newdf['const']              = 1
newdf['0809_ANYAIDN_SQ']    = np.power(newdf['0809_ANYAIDN'], 2)
newdf['0809_EXPENSES_SQ']    = np.power(newdf['0809_EXPENSES'], 2)







newdf = newdf.drop(['TYPE'], axis=1)

for col in newdf.columns:
    print(col)

DX = newdf.values

DX = pp.normalize(DX, axis=0)



0809_GR
0809_ASSETS
0809_EXPENSES
0809_REVENUE
0809_RESEARCH
0809_ANYAIDN
0809_LOAN_T
0809_FGRNT_T
0809_SGRNT_T
0809_IGRNT_T
is_for_profit
is_not_for_profit
is_public
const
0809_ANYAIDN_SQ
0809_EXPENSES_SQ


In [52]:
import math
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.layers import Dropout
from keras.optimizers import RMSprop, Adam
from keras.models import load_model
from keras.wrappers.scikit_learn import KerasClassifier
from keras.constraints import maxnorm
import keras.backend as K

import datetime


#70% -> lr = 1e-05, epochs = 500,batch = 64,droputRate = 0.3,activation = 'relu',hidden_neurons_first_layer = 144, hidden_neurons_in_ratio_of_previous = 0.66

lr = 1e-05
epochs = 500
batch = 64
droputRate = 0.3

activation = 'relu'

hidden_neurons_first_layer = 144
hidden_neurons_in_ratio_of_previous = 0.66
hidden_layers = 3


def trainModel(X, y, activation, hidden_neurons_first_layer, hidden_neurons_in_ratio_of_previous, hidden_layers, lr, epochs, batch, model_name):
    # Create a Network using Keras package ------------------------------------
    model = Sequential()
    model.add(Dropout(0.4, input_shape=(len(X[0]),)))
    model.add(Dense(hidden_neurons_first_layer, init='lecun_normal', input_shape=(len(X[0]),), kernel_constraint=maxnorm(5)))
    model.add(Activation(activation))
    model.add(Dropout(droputRate, input_shape=(hidden_neurons_first_layer,)))

    neurons = hidden_neurons_first_layer
    for i in range(hidden_layers-1):
        neurons = math.ceil(neurons * hidden_neurons_in_ratio_of_previous)
        model.add(Dense(neurons, init='lecun_normal', kernel_constraint=maxnorm(5)))
        model.add(Activation(activation))
        if i < hidden_layers-1:
            model.add(Dropout(droputRate, input_shape=(hidden_neurons_first_layer,)))



    model.add(Dense(1, init='lecun_normal', activation='sigmoid'))
    
    adam = Adam(lr=lr)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    model.fit(X, y, epochs=epochs, verbose=2, batch_size=batch)

    model.save(model_name + '.h5')


def evaluateModel(X, y, model_name):
    model = load_model(model_name + '.h5')
    scores = model.evaluate(X_test, y_test, verbose=0)
    return(scores)


def makePrediction(x, model_name):
    model = load_model(model_name + '.h5')
    prediction = model.predict(x)
    return prediction


def preprocessData(X,y):
    numEx = X.shape[0]
    numTrain = int(math.ceil(numEx*0.7))
    return (X[:1107,:], y[:1107], X[1107:,:], y[1107:])


X,y,X_test,y_test = preprocessData(DX, Dy)

model_name = 'model_' + datetime.datetime.now().strftime("%Y%m%d%H%M%S")

trainModel(X, y, activation, hidden_neurons_first_layer, hidden_neurons_in_ratio_of_previous, hidden_layers, lr, epochs, batch, model_name)

file = open('result' + model_name + '.txt', 'w+')

file.write("#_of_training_samples,#_of_test_samples_,lr,epochs,batch,binary_crossentropy,accuracy\n")

result = evaluateModel(X_test,y_test, model_name)

file.write(str(len(X))+","+str(len(X_test))+","+str(lr)+","+str(epochs)+","+str(batch)+","+str(result[0])+","+str(result[1])+"\n")

file.close()

print("#_of_training_samples,#_of_test_samples_,lr,epochs,batch,binary_crossentropy,accuracy\n")
print(str(len(X))+","+str(len(X_test))+","+str(lr)+","+str(epochs)+","+str(batch)+","+str(result[0])+","+str(result[1])+"\n")





Epoch 1/500
 - 2s - loss: 0.6748 - acc: 0.6658
Epoch 2/500
 - 0s - loss: 0.6408 - acc: 0.6667
Epoch 3/500
 - 0s - loss: 0.6327 - acc: 0.6667
Epoch 4/500
 - 0s - loss: 0.6282 - acc: 0.6667
Epoch 5/500
 - 0s - loss: 0.6232 - acc: 0.6667
Epoch 6/500
 - 0s - loss: 0.6163 - acc: 0.6667
Epoch 7/500
 - 0s - loss: 0.6168 - acc: 0.6667
Epoch 8/500
 - 0s - loss: 0.6124 - acc: 0.6667
Epoch 9/500
 - 0s - loss: 0.6164 - acc: 0.6667
Epoch 10/500
 - 0s - loss: 0.6109 - acc: 0.6667
Epoch 11/500
 - 0s - loss: 0.6086 - acc: 0.6676
Epoch 12/500
 - 0s - loss: 0.6067 - acc: 0.6658
Epoch 13/500
 - 0s - loss: 0.6102 - acc: 0.6685
Epoch 14/500
 - 0s - loss: 0.6127 - acc: 0.6703
Epoch 15/500
 - 0s - loss: 0.6066 - acc: 0.6667
Epoch 16/500
 - 0s - loss: 0.6101 - acc: 0.6712
Epoch 17/500
 - 0s - loss: 0.6156 - acc: 0.6766
Epoch 18/500
 - 0s - loss: 0.6145 - acc: 0.6685
Epoch 19/500
 - 0s - loss: 0.6107 - acc: 0.6631
Epoch 20/500
 - 0s - loss: 0.6030 - acc: 0.6649
Epoch 21/500
 - 0s - loss: 0.6045 - acc: 0.6721
E

Epoch 171/500
 - 0s - loss: 0.5952 - acc: 0.6965
Epoch 172/500
 - 0s - loss: 0.5838 - acc: 0.6974
Epoch 173/500
 - 0s - loss: 0.5846 - acc: 0.6865
Epoch 174/500
 - 0s - loss: 0.5988 - acc: 0.6847
Epoch 175/500
 - 0s - loss: 0.5963 - acc: 0.6911
Epoch 176/500
 - 0s - loss: 0.5835 - acc: 0.7046
Epoch 177/500
 - 0s - loss: 0.5867 - acc: 0.6965
Epoch 178/500
 - 0s - loss: 0.5880 - acc: 0.6838
Epoch 179/500
 - 0s - loss: 0.5697 - acc: 0.7019
Epoch 180/500
 - 0s - loss: 0.5948 - acc: 0.6829
Epoch 181/500
 - 0s - loss: 0.5792 - acc: 0.7028
Epoch 182/500
 - 0s - loss: 0.5921 - acc: 0.6883
Epoch 183/500
 - 0s - loss: 0.5861 - acc: 0.6947
Epoch 184/500
 - 0s - loss: 0.5800 - acc: 0.7046
Epoch 185/500
 - 0s - loss: 0.5834 - acc: 0.6902
Epoch 186/500
 - 0s - loss: 0.5767 - acc: 0.6965
Epoch 187/500
 - 0s - loss: 0.5791 - acc: 0.6956
Epoch 188/500
 - 0s - loss: 0.5875 - acc: 0.6929
Epoch 189/500
 - 0s - loss: 0.5815 - acc: 0.7037
Epoch 190/500
 - 0s - loss: 0.5750 - acc: 0.7028
Epoch 191/500
 - 0s 

 - 0s - loss: 0.5765 - acc: 0.7118
Epoch 339/500
 - 0s - loss: 0.5779 - acc: 0.6865
Epoch 340/500
 - 0s - loss: 0.5785 - acc: 0.6883
Epoch 341/500
 - 0s - loss: 0.5772 - acc: 0.6947
Epoch 342/500
 - 0s - loss: 0.5755 - acc: 0.6938
Epoch 343/500
 - 0s - loss: 0.5700 - acc: 0.7082
Epoch 344/500
 - 0s - loss: 0.5820 - acc: 0.6974
Epoch 345/500
 - 0s - loss: 0.5772 - acc: 0.6893
Epoch 346/500
 - 0s - loss: 0.5814 - acc: 0.6965
Epoch 347/500
 - 0s - loss: 0.5706 - acc: 0.6883
Epoch 348/500
 - 0s - loss: 0.5732 - acc: 0.6956
Epoch 349/500
 - 0s - loss: 0.5797 - acc: 0.6992
Epoch 350/500
 - 0s - loss: 0.5905 - acc: 0.6883
Epoch 351/500
 - 0s - loss: 0.5701 - acc: 0.7055
Epoch 352/500
 - 0s - loss: 0.5827 - acc: 0.6947
Epoch 353/500
 - 0s - loss: 0.5808 - acc: 0.6811
Epoch 354/500
 - 0s - loss: 0.5758 - acc: 0.6956
Epoch 355/500
 - 0s - loss: 0.5775 - acc: 0.7019
Epoch 356/500
 - 0s - loss: 0.5829 - acc: 0.6911
Epoch 357/500
 - 0s - loss: 0.5746 - acc: 0.7064
Epoch 358/500
 - 0s - loss: 0.5629

In [44]:
print(sum(y_test))

327


In [20]:
len(X_test)

474

In [59]:
y_hat = []
y_hat = makePrediction(X_test, "model_20180427172630")

#for i in range(len(X_test)):
#    print(np.array(X_test[i,:]).shape)
#    y_hat.append(makePrediction(np.array(X_test[i,:]), "model_20180427172630"))

In [61]:
y_hat = [1 if x>0.5 else 0 for x in y_hat]
print(y_hat)

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [63]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_hat))

             precision    recall  f1-score   support

          0       0.53      0.22      0.31       144
          1       0.73      0.92      0.81       330

avg / total       0.67      0.70      0.66       474



In [None]:
from sklearn import datasets
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
rfe = RFE(logreg, 18)
y=['dummy_enrollment']
listofcol=data.columns.values.tolist()
X=[i for i in listofcol[1:] if i not in y]
print(X)
X_train, X_test, y_train, y_test = train_test_split(data[X],  data[y], test_size=0.3, random_state=0)


rfe = rfe.fit(X_train, y_train )
print(rfe.support_)
print(rfe.ranking_)

In [None]:
import statsmodels.api as sm
logit_model=sm.Logit(y_train,X_train)
result=logit_model.fit()
print(result.summary())

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

In [None]:
y_pred = logreg.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))

In [None]:
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
kfold = model_selection.KFold(n_splits=10, random_state=7)
modelCV = LogisticRegression()
scoring = 'accuracy'
results = model_selection.cross_val_score(modelCV, X_train, y_train, cv=kfold, scoring=scoring)
print("10-fold cross validation average accuracy: %.3f" % (results.mean()))

In [None]:
from sklearn.metrics import confusion_matrix
def tp(y_true, y_pred): return confusion_matrix(y_true, y_pred)[0, 0]
def tn(y_true, y_pred): return confusion_matrix(y_true, y_pred)[0, 0]
def fp(y_true, y_pred): return confusion_matrix(y_true, y_pred)[1, 0]
def fn(y_true, y_pred): return confusion_matrix(y_true, y_pred)[0, 1]

confusion_matrix = confusion_matrix(y_test, y_pred)
print(confusion_matrix)

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

In [None]:
glm_binom = sm.GLM(y_train, X_train, family=sm.families.Binomial())
res = glm_binom.fit()
print(res.summary())