<h2>Using weighted classes in SVC with emphasis on converted donors improves model significantly</h2>

In [1]:
from sqlalchemy import create_engine, inspect
import pandas as pd
import psycopg2
import re
from mcnulty import cost_column_to_float
from sklearn.preprocessing import StandardScaler, Imputer, LabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn_pandas import DataFrameMapper
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

  """)


In [2]:
cnx = create_engine('postgresql://user1:password@localhost/mcnulty', isolation_level="READ COMMITTED")

In [3]:
conn = cnx.connect()

In [4]:
query = '''
SELECT donations.donation_amount AS donation_amt,
        donations.donor_cart_sequence AS ord,
        donations.converts AS conv,
        donations.project_id,
        p.school_id,
        p.project_cost,
        p.project_grade_level_category AS grade_level,
        p.project_current_status AS funded,
        s.school_percentage_free_lunch AS perc_lunch,
        s.school_county AS county,
        s.school_metro_type AS metro,
        d.donor_is_teacher AS is_teacher,
        donations.donation_included_optional_donation AS incl_opt,
        t.teacher_prefix AS prefix
FROM donations
JOIN projects p ON donations.project_id = p.project_id
JOIN schools s ON p.school_id = s.school_id
JOIN donors d ON donations.donor_id = d.donor_id
JOIN teachers t ON p.teacher_id = t.teacher_id
WHERE donor_cart_sequence = 1 AND s.school_state = 'Washington'
'''

In [5]:
i4_df = pd.read_sql_query(query ,cnx)

In [6]:
i4_df = cost_column_to_float(i4_df)

In [8]:
X = i4_df[['project_cost', 'perc_lunch', 'donation_amt', 'is_teacher', 'incl_opt', 'prefix', 'grade_level', 'funded', 'county', 'metro']]
y = i4_df['conv']

In [9]:
mapper = DataFrameMapper([
    (['project_cost'], StandardScaler()),
    (['perc_lunch'], [Imputer(), StandardScaler()]),
    (['donation_amt'], StandardScaler()),
    (['is_teacher'], [LabelBinarizer(), StandardScaler()]),
    (['incl_opt'], [LabelBinarizer(), StandardScaler()]),
    (['prefix'], [LabelBinarizer(), StandardScaler()]),
    (['grade_level'], [LabelBinarizer(), StandardScaler()]),
    (['funded'], [LabelBinarizer(), StandardScaler()]),
    (['county'], [LabelBinarizer(), StandardScaler()]),
    (['metro'], [LabelBinarizer(), StandardScaler()])
])

In [9]:
X_columns = X.columns

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [11]:
Xtrain_rescaled = pd.DataFrame(mapper.fit_transform(X_train.copy()))
Xtest_rescaled = pd.DataFrame(mapper.transform(X_test.copy()))



In [12]:
RFclf = RandomForestClassifier(n_jobs=200)
RFclf.fit(Xtrain_rescaled,y_train)
y_RFpred = RFclf.predict(Xtest_rescaled)

In [13]:
f1_score(y_test, y_RFpred)

0.09570552147239265

In [14]:
accuracy_score(y_test, y_RFpred)

0.8457513604018417

In [15]:
recall_score(y_test, y_RFpred)

0.06543624161073826

In [16]:
pd.Series(y_RFpred).value_counts()

0    9118
1     438
dtype: int64

In [17]:
pd.Series(y_test).value_counts()

0    8364
1    1192
Name: conv, dtype: int64

In [22]:
SVmodel = SVC(C=10, class_weight={0: 1, 1:5})
SVmodel.fit(Xtrain_rescaled,y_train)
y_SVM = SVmodel.predict(Xtest_rescaled)

In [23]:
f1_score(y_test, y_SVM)

0.15990905646078063

In [24]:
pd.Series(y_SVM).value_counts()

0    8109
1    1447
dtype: int64

In [25]:
recall_score(y_test, y_SVM)

0.17701342281879195

In [27]:
accuracy_score(y_test, y_SVM)

0.7679991628296359

In [16]:
import pickle as pkl

In [17]:
with open('i4SVmodel.pkl', 'wb') as open_file:
    pkl.dump(SVmodel, open_file)

In [26]:
pd.Series(y_test).value_counts()

0    8364
1    1192
Name: conv, dtype: int64

In [36]:
precision_score(y_test, y_SVM)

0.14581893572909468

In [34]:
for i in SVmodel.decision_function(Xtest_rescaled):
    print(i)

-0.849045050815015
0.11183001188239461
-1.6083648217422732
-0.9969086736618985
-1.0352625746266173
-0.12828297758511675
0.20364576307846927
1.1238976482138359
-1.0017550445021421
-0.9063976442994867
-1.000981013139826
-0.9869988983918215
-2.076981131653363
-0.6474864135826816
-0.9908652634070098
-1.4651240518415114
-0.03372658820262353
-0.9770474385255542
-0.4912266133555544
-1.9453132660162011
1.0970828710851417
-1.003310451827525
-1.5655417729702754
0.6211400834113695
-0.9812491922156735
-1.0298047516727462
-0.8560394420288538
-1.000124726214843
-0.9539934226281129
-0.991984063158648
-0.8052676661698438
-1.0002090816835696
-0.9281976941561622
-1.0000283939425114
-0.9312714585461567
-0.9577706897995512
0.594904778349909
-0.5389119178667013
-0.9995000033463292
-1.1329531141704527
-0.9374360928940514
0.8003864358338305
0.06561262564537096
-1.1585555459957275
-0.9466337876744216
-1.0045746155286168
-0.8206913681753271
-0.838854848121806
-0.8629198369989476
-0.9877398401264944
-1.06568280

-0.9999666470440624
-1.1046939815454127
-0.995584283897199
1.1056609329011788
-0.9987990465449886
-1.0000071089064662
1.4477245172704416
-0.9986966650249366
0.3012587236039376
-0.9887300702826151
-1.0206917952136911
-0.9584835980116194
-1.029349902693037
-0.996096009066869
-1.0014334447255657
-0.9922302976188976
-0.9976804887829401
-1.0030081943621298
-0.9999212716157165
-0.992656500312599
-1.0115411895130575
0.001981066410834953
-0.9258059764138439
-1.2294575993668109
-1.0983199746335273
0.8965790176295997
-0.9750574633979573
-0.9221743890877055
-1.215304328912452
-0.9990123228496154
-0.9853465036294237
-0.8864784279472347
-0.998474299649469
-0.6525161652668036
-1.0291806751781256
-1.0280371410179783
0.767699354571188
-0.7341206941761346
-0.9991458134393868
-1.0061451158111216
-1.083995751833486
-0.3048732368633881
-0.7144812962337709
-0.918677794159719
-0.9876534190682665
0.7637188496733899
-0.9827216649753149
-1.0002659402667642
0.796937868133243
-1.0026548435647848
-0.9694418859627

-0.9961924128695013
-0.43389280973902034
-0.8885918758699337
-0.07264940384799468
-1.001589326347943
0.19110023260984177
-1.0141128844102907
0.9869558318606674
-0.9562009312045235
-0.9974312134292965
-1.0163509385427
-0.9831509638958424
-2.194483573050679
-1.0004275588656504
1.4814183283875617
-0.8459891762218791
-0.6825230738832578
-1.0000385597641603
0.95006714578797
-1.315289685649411
0.9940851577746234
0.933204958305668
0.8676726872195296
-0.9958538548974637
-1.00011701498256
-0.9746390535386013
-0.9959193435104993
-0.8512571401661403
-0.9761713249541819
-0.9840354165065065
-0.9999781890210738
-1.0065419289575366
-0.9824545888690869
0.8862239928905299
-1.001523767381514
-1.0158740451126564
-1.0105293236856157
-2.548509162715151
-0.8644408144554182
-1.0007083843329085
-0.9638135883050273
-0.9450455765236189
-0.9818561927351354
-0.317437174018829
-0.9754485145375024
0.7945752798985214
-2.605840876467358
-1.1012966220106999
-0.9939795181086034
-0.9759204704503402
-1.0044874151816598
-

-0.998720568382898
-1.0136015222064034
0.7707536865990329
-0.4637484310168215
-0.7909756910301459
-0.9947545397329497
-0.9684343464682511
0.4492822444943575
-1.0675884254949053
0.5006579611104215
-0.1806900895381348
-1.0279884126052914
-1.000305692461947
0.977605758005402
-0.9896222501395745
-0.46602772429522143
-1.0044008340305446
-0.9970217818656332
-1.0007142390497052
-0.9920778413796415
-0.9930363856872807
-1.6604997938802706
-0.3925433725116031
-1.0063762566149648
-0.9903492742021797
-0.957593205795712
-0.6007120324007204
-0.997036754268302
-0.9732353070785708
-0.9981547657743172
-1.3458676611453257
-0.9573416329543369
-1.000292808584462
-0.9413062811479022
-1.0037737393980524
-0.9950355235736484
-0.9873987011023914
-1.002908006068795
-1.1212097041885247
-1.0268916401163741
-0.9465495128930764
-1.8833905893638878
0.19294855673061984
-0.9008011693962578
-0.9566257520796035
-0.4031422657735849
-1.0005674200062722
0.03596741309585039
-0.9936933883565991
-0.9945054909663943
-0.0898426

In [35]:
from sklearn.model_selection import GridSearchCV