<h2>Using weighted classes in SVC with emphasis on converted donors improves model significantly</h2>

In [19]:
from sqlalchemy import create_engine, inspect
import pandas as pd
import psycopg2
import re
from mcnulty import cost_column_to_float
from sklearn.preprocessing import StandardScaler, Imputer, LabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn_pandas import DataFrameMapper
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [2]:
cnx = create_engine('postgresql://user1:password@localhost/mcnulty', isolation_level="READ COMMITTED")

In [3]:
conn = cnx.connect()

In [4]:
query = '''
SELECT donations.donation_amount AS donation_amt,
        donations.donor_cart_sequence AS ord,
        donations.converts AS conv,
        donations.project_id,
        p.school_id,
        p.project_cost,
        p.project_grade_level_category AS grade_level,
        p.project_current_status AS funded,
        s.school_percentage_free_lunch AS perc_lunch,
        s.school_county AS state,
        d.donor_is_teacher AS is_teacher,
        donations.donation_included_optional_donation AS incl_opt,
        t.teacher_prefix AS prefix
FROM donations
JOIN projects p ON donations.project_id = p.project_id
JOIN schools s ON p.school_id = s.school_id
JOIN donors d ON donations.donor_id = d.donor_id
JOIN teachers t ON p.teacher_id = t.teacher_id
WHERE donor_cart_sequence = 1 AND s.school_state = 'Washington'
'''

In [5]:
i3_df = pd.read_sql_query(query ,cnx)

In [6]:
i3_df = cost_column_to_float(i3_df)

In [7]:
X = i3_df[['project_cost', 'perc_lunch', 'donation_amt', 'is_teacher', 'incl_opt', 'prefix', 'grade_level', 'funded', 'state']]
y = i3_df['conv']

In [8]:
mapper = DataFrameMapper([
    (['project_cost'], StandardScaler()),
    (['perc_lunch'], [Imputer(), StandardScaler()]),
    (['donation_amt'], StandardScaler()),
    (['is_teacher'], [LabelBinarizer(), StandardScaler()]),
    (['incl_opt'], [LabelBinarizer(), StandardScaler()]),
    (['prefix'], [LabelBinarizer(), StandardScaler()]),
    (['grade_level'], [LabelBinarizer(), StandardScaler()]),
    (['funded'], [LabelBinarizer(), StandardScaler()]),
    (['state'], [LabelBinarizer(), StandardScaler()])
])

In [9]:
X_columns = X.columns

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [11]:
Xtrain_rescaled = pd.DataFrame(mapper.fit_transform(X_train.copy()))
Xtest_rescaled = pd.DataFrame(mapper.transform(X_test.copy()))



In [34]:
RFclf = RandomForestClassifier(n_jobs=200)
RFclf.fit(Xtrain_rescaled,y_train)
y_RFpred = RFclf.predict(Xtest_rescaled)

In [35]:
f1_score(y_test, y_RFpred)

0.093935790725327

In [36]:
accuracy_score(y_test, y_RFpred)

0.8405190456257848

In [37]:
recall_score(y_test, y_RFpred)

0.06633081444164568

In [38]:
pd.Series(y_RFpred).value_counts()

0    9065
1     491
dtype: int64

In [39]:
pd.Series(y_test).value_counts()

0    8365
1    1191
Name: conv, dtype: int64

In [12]:
SVmodel = SVC(C=10, class_weight={0: 1, 1:10})
SVmodel.fit(Xtrain_rescaled,y_train)
y_SVM = SVmodel.predict(Xtest_rescaled)

In [13]:
f1_score(y_test, y_SVM)

0.22648501362397816

In [14]:
pd.Series(y_SVM).value_counts()

1    7953
0    1603
dtype: int64

In [15]:
recall_score(y_test, y_SVM)

0.8502454991816694

In [16]:
import pickle as pkl

In [17]:
with open('i3SVmodel.pkl', 'wb') as open_file:
    pkl.dump(SVmodel, open_file)

In [21]:
pd.Series(y_test).value_counts()

0    8334
1    1222
Name: conv, dtype: int64

In [23]:
precision_score(y_test, y_SVM)

0.1306425248333962