Goal of this notebook is to create a model implementing is_teacher and included_optional_donation features 

In [23]:
from sqlalchemy import create_engine, inspect
import pandas as pd
import psycopg2
import re
from mcnulty import cost_column_to_float
from sklearn.preprocessing import StandardScaler, Imputer, LabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn_pandas import DataFrameMapper
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [2]:
cnx = create_engine('postgresql://user1:password@localhost/mcnulty', isolation_level="READ COMMITTED")

In [3]:
conn = cnx.connect()

In [4]:
query = '''
SELECT donations.donation_amount AS donation_amt,
        donations.donor_cart_sequence AS ord,
        donations.converts AS conv,
        donations.project_id,
        p.school_id,
        p.project_cost,
        s.school_percentage_free_lunch AS perc_lunch,
        s.school_state AS state,
        d.donor_is_teacher AS is_teacher,
        donations.donation_included_optional_donation AS incl_opt
FROM donations
JOIN projects p ON donations.project_id = p.project_id
JOIN schools s ON p.school_id = s.school_id
JOIN donors d ON donations.donor_id = d.donor_id
WHERE donor_cart_sequence = 1 AND s.school_state = 'Washington'
'''

In [5]:
i1_df = pd.read_sql_query(query ,cnx)

In [7]:
i1_df = cost_column_to_float(i1_df)

In [17]:
X = i1_df[['project_cost', 'perc_lunch', 'donation_amt', 'is_teacher', 'incl_opt']]
y = i1_df['conv']

In [31]:
mapper = DataFrameMapper([
    (['project_cost'], StandardScaler()),
    (['perc_lunch'], [Imputer(), StandardScaler()]),
    (['donation_amt'], StandardScaler()),
    (['is_teacher'], [LabelBinarizer(), StandardScaler()]),
    (['incl_opt'], [LabelBinarizer(), StandardScaler()])
])

In [28]:
X_columns = X.columns

In [51]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [52]:
Xtrain_rescaled = pd.DataFrame(mapper.fit_transform(X_train.copy()), columns=X_columns)
Xtest_rescaled = pd.DataFrame(mapper.transform(X_test.copy()), columns=X_columns)



In [69]:
RFclf = RandomForestClassifier(n_jobs=50)
RFclf.fit(Xtrain_rescaled,y_train)
y_RFpred = RFclf.predict(Xtest_rescaled)

In [70]:
f1_score(y_test, y_RFpred)

0.09621166566446182

In [71]:
accuracy_score(y_test, y_RFpred)

0.8427166178317288

In [72]:
pd.Series(y_RFpred).value_counts()

0    9102
1     454
dtype: int64

In [73]:
pd.Series(y_test).value_counts()

0    8347
1    1209
Name: conv, dtype: int64