# MACHINE LEARNING FOR PUBLIC POLICY
# Homework 3 - Cristina Mac Gregor Vanegas
### Due: May 17, 2018

#### PART 1.A:
     1. Generating auxiliary functions for feature manipulation and generation
     2. Generating functions for exploration of data
#### Part 1.B: 
     3. Building functions for applying classifiers  
#### Part 1.C:
     4. Building functions for evaluating classifiers 
#### Part 1.D:
     5. Preparation for loop. 
#### Part 2: 
     Running the pipeline
    

## PART 1.A


##### Set up and import of data

In [34]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.cbook as cbook
import geopandas as gpd
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from itertools import product
from sklearn import preprocessing, cross_validation, svm, metrics, tree, decomposition, svm
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier, OrthogonalMatchingPursuit, RandomizedLogisticRegression
from sklearn.neighbors.nearest_centroid import NearestCentroid
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import ParameterGrid
from sklearn.metrics import *
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from sklearn.dummy import DummyClassifier
import datetime
from datetime import timedelta
from datetime import datetime

In [2]:
def read_files(file_name):
    '''
    Reading in downloaded csv files.
    '''
    dframe = pd.read_csv(file_name)
    return dframe

In [3]:
def merge_frames(frame1, frame2, id1, id2) :
    frame2 = frame2.rename(columns={id2: id1})
    frame = pd.merge(frame1, frame2, on=id1)
    return frame

In [4]:
def get_geo(shape_file, frame, var, shp_name, level_str):
    '''
    Creates a geopandas file at the geographical level specified, given a pandas
    dataframe and a shape or geojson file.
    Inputs: shape_file: shapefile or geojson file
            frame: pandas frame
            shp_name: name of column in geojson file
            level_str: name of column in frame
    Outputs:
        Extended geo frame (geopandas object)

    '''
    frame.groupby()
    geo_df = gpd.read_file(shape_file)
    geo_df = geo_df.rename(columns={shp_name: level_str})
    geo_df_ext = geo_df.merge(frame, on=level_str, how = 'left')
    return geo_df_ext

##### 1. Pre-processing auxiliary data functions

In [5]:
def check_mv(frame, var):
    '''
    Prints out percentage of missing values for a given variable.
    '''
    frame["temp2"] = frame[var].apply(lambda x: 1 if pd.isnull(x) else 0)
    print("Missing values", frame["temp2"].value_counts(True))

In [6]:
def fill_mvals(frame, var, measure = "mean"):
    '''
    Setting missing values to a meassure of central distribution as specified by parameters.  
    '''
    if measure == "median":
        md = frame[var].median()
    elif measure == "mean":
        md = frame[var].mean()
    frame[var] = frame[var].apply(lambda x: md if pd.isnull(x) else x) 
    return frame, md

In [7]:
def winsorize(frame, var, level = .99):
    '''
    Winzorizing process: setting outliers to the value of the 99 percentile. 
    '''
    if frame[var].dtype == int:
        return frame, frame[var].max()
    cuttoff = frame[var].quantile(.99)
    frame[var] = frame[var].apply(lambda x: cuttoff if x > cuttoff else x) 
    return frame, cuttoff 

In [8]:
def generate_buckets(frame, var, nbuckets):
    '''
    Auxiliary function for discretization. Generates equally sized bins for a variable according 
    to the number of bins specified as a parameter. 
    Returns a list with the upper boundries for every bucket. 
    '''
    min_ = frame[var].min()
    max_ = frame[var].max()
    step = (max_  - min_ ) / nbuckets
    steps = []
    temp = min_
    while temp <= max_:
        temp += step 
        steps += temp
    return steps

In [9]:
def cats(row, buckets, var):
    '''
    Auxiliary variable for discretization. Returns category to which a variable corresponds 
    if its being classified into buckets. 
    '''
    last = 0 
    gr = 1
    for i in buckets: 
        if (row[var]<= i) and (row[var]> last): 
            return gr
        gr += 1
        last = i

In [10]:
def discretize(frame, var, buckets = None, quartiles = False, num_buckets = None):
    '''
    Returns a discrete variable. If buckets are specified, they are used as thresholds. If not, 
    quartiles can be used to discretize a continious variable. If neither buckets nor quartiles 
    are specified, number of equal sized buckets can be used.
    '''
    new_name = str(var) + "_discrete" 
    
    if quartiles:     
        x25 = frame[var].quantile(.25)    
        x50 = frame[var].quantile(.50)    
        x75 = frame[var].quantile(.75)
        x100 = frame[var].max()
        buckets = [x25, x50, x75, x100]
    
    if num_buckets: 
        buckets = generate_buckets(frame, var, num_buckers)
        
    frame[new_name] = frame.apply(cats, axis=1, args = (buckets, var)) 
    return frame, buckets, new_name

In [11]:
def dummify(frame, var, buckets_spec = None,  threshold = None):
    '''
    Makes dummy variables for each category of a discrete variable.
    '''
    if buckets_spec is not None: 
        for i in buckets_spec: 
            print(buckets_spec)
            new_name = str(var) + "_d_" + str(i)
            frame[new_name] = frame[var].apply(lambda x: 1 if x == i else 0)
        return frame
    
    elif threshold: 
        new_name = str(var) + "_d" 
        frame[new_name] = frame.apply(lambda x: 1 if x[var] < threshold else 0)
        return frame
    
    else:
        buckets = frame[var].unique()
        new_vars = []
        for i in buckets: 
            new_name = str(var) + "_d_" + str(i)
            frame[new_name] = frame[var].apply(lambda x: 1 if x == i else 0)
            new_vars = new_vars + [new_name]
            
        return frame, new_vars, buckets

In [12]:
def count_cases(row, frame, plusminus_range, var):
    '''
    Counts how many cases fit within a given range. 
    '''
    min_time = row[var] - plusminus_range 
    max_time = row[var] + plusminus_range 
    count = frame[(row[var] >= min_time) and (row[var] <= max_time)].count()
    return count

##### 2. Functions for data exploration

In [13]:
def get_stats(frame, target_var, group_vars = None):
    '''
    Prints general statistics for each variable, and if specified, also 
    means of grouped-by varibales, grouped by specified groups. 
    '''
    
    if not group_vars:
        print("\n", target_var, frame[target_var].describe())
        print(frame[target_var].value_counts(True))
    
    if group_vars: 
        print(frame.groupby(group_vars)[target_var].mean())

In [14]:
def print_map_byvar(frame, varbs):
    '''
    Plots a map of the geographic distribution of the variables we wish to see. 
    '''
    for i in varbs:
        geo_df.plot(column=i, cmap='OrRd')
        plt.title(i)
        plt.show()

In [15]:
def show_cor(frame):
    '''
    Prints spearman correlations from a complete dataframe
    '''
    return frame.corr("spearman")

In [16]:
def scat(frame, varbs, target_var):
    '''
    Prints scatter plots for all the possible features against the predicted variable. 
    '''
    pairs = []
    for i in varbs:
        plt.scatter(frame[target_var], frame[i])
        plt.title("{} vs {}".format(target_var, i))
        plt.xlabel(target_var)
        plt.ylabel(i)
        plt.show()

## PART 1.B 

##### 3. Functions for building classifiers


In [17]:
def keep_feats(varbs, frame):
    '''
    Keeps features we want to include as predicitve features
    '''
    f2 = frame[varbs]
    return f2

In [18]:
def drop_feats(varbs, frame):
    '''
    Deletes variables that we don't want to include as predicitve features
    '''
    f2 = frame.drop(varbs, axis=1)
    return f2

In [19]:
def split(frame, test_percentage, target_var):
    '''
    Splits data into train and test sections. 
    '''
    X = frame.drop(target_var, axis=1)
    Y = frame[target_var]
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=test_percentage)

    return  [x_train, x_test, y_train, y_test]

##### 4. Functions for evaluation

In [20]:
def conf_matrix(output_array, real_vals, threshold):
    '''
    Returns the confussion matrix according to the test
    predictions and true values, according to a given threshold. 
    Inputs:
        output_array: (array) predicted values from test fraction of the data
        real_vals: (array) true values from test fraction of data (y_test)
        threshold: (float) threshold for predicted probabilities. 
    Outputs: 
        list [TP, FP, TN, FN]
    '''
        
    test = {'pred': output_array, 'real': real_vals}
    test_f = pd.DataFrame(data=test)   
    TP, FP, TN, FN = 0, 0, 0, 0
    for indx, row in test_f.iterrows():
        status_predicted = 0
        if row["pred"] > threshold:
            status_predicted= 1 

        if (status_predicted == 1) and (row["real"]==1):
            TP += 1
        if (status_predicted == 1) and (row["real"]==0):
            FP += 1
        if (status_predicted == 0) and (row["real"]==0):
            TN += 1
        if (status_predicted == 0) and (row["real"]==1):
            FN += 1
    
    return [TP, FP, TN, FN]

In [21]:
def accuracy(conf_m):
    TP, FP, TN, FN = conf_m[0], conf_m[1], conf_m[2], conf_m[3]
    if (TP + TN + FP + FN) == 0: 
        return 0
    acc =  (TP + TN) / (TP + TN + FP + FN)
    return acc

In [22]:
def recall(conf_m):
    TP, FP, TN, FN = conf_m[0], conf_m[1], conf_m[2], conf_m[3]
    if (TP + FN) == 0: 
        return 0
    rec =  (TP) / (TP + FN)
    return rec

In [23]:
def precision(conf_m):
    TP, FP, TN, FN = conf_m[0], conf_m[1], conf_m[2], conf_m[3]
    if (TP + FP )== 0: 
        return 0
    prec = (TP) / (TP + FP )
    return prec 

In [24]:
def specificity(conf_m): 
    TP, FP, TN, FN = conf_m[0], conf_m[1], conf_m[2], conf_m[3]
    if (TN + FN) == 0:
        return  0
    spec = (TN) / (TN + FN)
    return spec 

In [25]:
def f1(prec, rec): 
    if (prec + rec) == 0:
        return 0
    return 2 * (prec * rec) / (prec + rec)

In [26]:
def get_main_metrics(output_array, real_vals, cuttoff):
    cm = conf_matrix(output_array, real_vals, cuttoff)
    recall_m = recall(cm)
    precision_m = precision(cm)
    accuracy_m = accuracy(cm)
    specificity_m = specificity(cm)
    f1_ = f1(precision_m, recall_m)
    return recall_m, precision_m, accuracy_m, specificity_m, f1_

In [27]:
def get_metrics_array(output_array, real_vals, steps = 20):
    recall_arr = []
    precision_arr = []
    acc_arr = []
    spec_arr = []
    for i in range (0, 100, steps):
        r, p, a, s, f = get_main_metrics(output_array, real_vals, i/100)
        recall_arr += [r]
        precision_arr += [p]
        acc_arr += [a]
        spec_arr  += [s]
    return recall_arr, precision_arr, acc_arr, spec_arr

In [28]:
def precision_rec(output_array, real_vals):
    r_arr, p_arr, a_arr, s_arr = get_metrics_array(output_array, real_vals)
    
    plt.step(r_arr, p_arr, color='a', alpha=0.2, where='post')
    plt.fill_between(r_arr, p_arr, step='post', alpha=0.2, color='a')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    plt.title('Precision-Recall curve')
    plt.show()

In [29]:
def prec_rec_auc(output_array, real_vals):
    r_arr, p_arr, a_arr, s_arr = get_metrics_array(output_array, real_vals)
    area =  metrics.auc(r_arr, p_arr)
    return area 

In [30]:
def roc_auc(output_array, real_vals):
    
    area = roc_auc_score(output_array, real_vals)
    return area 

In [31]:
# def baseline(mod, output_array,real_vals):
#     test = {'pred': output_array, 'real': real_vals}
#     test_f = pd.DataFrame(data=test)   
#     baseline_score = 0 
#     for indx, row in test_f.iterrows():
#         if row["pred"] == mod:
#             baseline_score += 1
            
#     return (baseline_score / test_f.shape[0])

##### Functions for creating models 

In [35]:
'''
From DSSG Magic loop - different values for each model. 
Reference: DSSG magic loop; source https://github.com/rayidghani/magicloops

'''
fn = {  'RF': RandomForestClassifier,
        'LR': LogisticRegression,
        'SVM': svm.SVC,
        'GB': GradientBoostingClassifier,
        'DT': DecisionTreeClassifier,
        'KNN': KNeighborsClassifier,
        'BL': DummyClassifier
            }
small_grid = { 
        'RF':{'n_estimators': [10,100], 'max_depth': [5,50], 'max_features': ['sqrt','log2'],'min_samples_split': [2,10], 'n_jobs': [-1]},
        'LR': { 'penalty': ['l1','l2'], 'C': [0.00001,0.001,0.1,1,10]},
        'GB': {'n_estimators': [10,100], 'learning_rate' : [0.001,0.1,0.5],'subsample' : [0.1,0.5,1.0], 'max_depth': [5,50]},
        'DT': {'criterion': ['gini', 'entropy'], 'max_depth': [1,5,10,20,50,100],'min_samples_split': [2,5,10]},
        'SVM' :{'C' :[0.00001,0.0001,0.001,0.01,0.1,1,10],'kernel':['linear']},
        'KNN' :{'n_neighbors': [1,5,10,25,50,100],'weights': ['uniform','distance'],'algorithm': ['auto','ball_tree','kd_tree']},
        'BL': {}
        }


test_grid = { 
    'RF':{'n_estimators': [1], 'max_depth': [1], 'max_features': ['sqrt'],'min_samples_split': [10]},
    'LR': { 'penalty': ['l1'], 'C': [0.01]},
    'SGD': { 'loss': ['perceptron'], 'penalty': ['l2']},
    'ET': { 'n_estimators': [1], 'criterion' : ['gini'] ,'max_depth': [1], 'max_features': ['sqrt'],'min_samples_split': [10]},
    'AB': { 'algorithm': ['SAMME'], 'n_estimators': [1]},
    'GB': {'n_estimators': [1], 'learning_rate' : [0.1],'subsample' : [0.5], 'max_depth': [1]},
    'NB' : {},
    'DT': {'criterion': ['gini'], 'max_depth': [1],'min_samples_split': [10]},
    'SVM' :{'C' :[0.01],'kernel':['linear']},
    'KNN' :{'n_neighbors': [5],'weights': ['uniform'],'algorithm': ['auto']}
           }

In [36]:
def split_time(frame, cuttoff, target_var, time_var):
    '''
    Splits data into train and test sections. 
    Reference: DSSG magic loop; source https://github.com/rayidghani/magicloops
    '''    
    train = frame[frame[time_var] <= cuttoff]
    train = train.drop(time_var, axis=1)
    x_train = train.drop(target_var, axis=1)
    y_train = train[target_var]
    
    test = frame[frame[time_var] > cuttoff]
    test = test.drop(time_var, axis=1)
    x_test = test.drop(target_var, axis=1)
    y_test = test[target_var]    
    
    return  [x_train, x_test, y_train, y_test]

In [37]:
def process_split(raw_split, trgt):
    '''
    Cleans data for train set, and test set according to manipulations done to that 
    '''
    xtr, xtst, ytr, ytst = raw_split[0], raw_split[1], raw_split[2], raw_split[3]
   
    for var in vars_categorical: 
        xtr, newvrs, bcks_d = dummify(xtr, var)
        xtst = dummify(xtst, var, bcks_d)
        
        xtr = drop_feats(var, xtr)
        xtst = drop_feats(var, xtst)
        
    for var in vars_cont: 
        xtr, buckts, newv = discretize(xtr, var, None, True)
        xtr, newvrs, bcks_d = dummify(xtr, newv)
        
        xtst, buckts2, newv = discretize(xtst, var, buckts)
        xtst = dummify(xtst, newv, bcks_d)
       
        xtr = drop_feats([var, newv], xtr)
        xtst = drop_feats([var, newv], xtst)

    for var in xtr.columns:
        mode_ = xtr[var].mode()[0]
        xtr[var].fillna(mode_, inplace = True) 
        xtst[var].fillna(mode_, inplace = True) 
        
    temp = ytr.mode()
    ytr.fillna(temp) 
    ytst.fillna(temp)
    
    return [xtr, xtst, ytr, ytst]

In [52]:
def complete_evaluation_loop(fn, models_to_run, model_grid, thresholds_given, split):
    '''
    Runs loop to evaluate models with different combinations from different parameters.
    '''
    d = {}
    
    for model, params in model_grid.items():
        if model in models_to_run: 
            combinations_vals  = {}
            count = 0
            parameters  = []
            for ind, val in params.items():
                count += 1
                parameters += [ind]
                combinations_temp = []
                for i in val: 
                    combinations_temp += [i]
                combinations_vals[count] = combinations_temp
                combinations_params = list(product(*combinations_vals.values())) 
            list_values = []
            model_d = {}
            for idx, item in enumerate(combinations_params):    
                specification = str(parameters) +' = '+ str(item)
                item = list(item)
                pred, real = apply_model(fn, model, parameters, item, split)
                temp = {}
                prec_array = []
                recall_array = []
                for i in thresholds_given: 
                    recall, precision, accuracy, specificity, f1_ = get_main_metrics(pred, real, i)
                    string = "precision at " + str(i)
                    prec_array += [precision]
                    temp[string] = precision
                    string2 = "recall at " + str(i)
                    recall_array += [recall]
                    temp[string2] = recall
                    string3 = "f1 at " + str(i)
                    temp[string3] = f1_ 
                temp['auc'] = metrics.auc(prec_array, recall_array)
                model_d[specification] = temp
  
    return d

In [39]:
def apply_model(fn, model, keys, values, split):
    '''
    Visited for information about ziping
    https://stackoverflow.com/questions/209840/map-two-lists-into-a-dictionary-in-python
    '''
    
    x_tr, x_tst, y_tr, y_tst = split[0], split[1], split[2], split[3]
    skfunction = fn[model]()
    params = dict(zip(keys, values))
    skfunction.set_params(**params)
    skfunction.fit(x_tr, y_tr)  
    preds = skfunction.predict_proba(x_tst)
    return preds[:,1], y_tst


## PART 2


In [40]:
'''
Processing inital data
'''
fr1 = read_files('data/projects.csv')
fr2 = read_files('data/outcomes.csv')
fr = merge_frames(fr1, fr2, 'projectid', 'projectid')
fr['Year'] = pd.to_datetime(fr['date_posted']).dt.year
fr['date_posted'] = pd.to_datetime(fr['date_posted'])

fr = fr[(fr['Year'] >= 2011 )] 
fr = fr[(fr['Year'] <= 2014 )] 

In [41]:
all_cols = list(fr.columns.values)

bool_vars = ['school_charter', 'school_magnet',
       'school_year_round', 'school_nlns', 'school_kipp',
       'school_charter_ready_promise', 'fully_funded', 'at_least_1_green_donation', 'great_chat',
       'three_or_more_non_teacher_referred_donors',
       'one_non_teacher_referred_donor_giving_100_plus',
       'donation_from_thoughtful_donor']

for var in (bool_vars): 
    newn = "bool_" +   str(var) 
    fr[newn] = fr[var].apply(lambda x: 0 if (x == 'f') else 1 if (x == 't') else None)


In [42]:
#Setting globals 
pred_vr = 'bool_fully_funded'
time_var =  'date_posted'
#models = ['RF', 'LR', 'SVM','GB','DT','KNN']
models = ['RF', 'LR', 'GB','DT']

#Features categories
vars_categorical = []
vars_bool = ['bool_one_non_teacher_referred_donor_giving_100_plus', 'bool_donation_from_thoughtful_donor', 'bool_school_charter', ]
vars_cont = ['teacher_referred_count', 'non_teacher_referred_count', 'great_messages_proportion', 'total_price_including_optional_support']


In [43]:
#Exploratory analysis
fr.corr()

Unnamed: 0,school_ncesid,school_latitude,school_longitude,school_zip,fulfillment_labor_materials,total_price_excluding_optional_support,total_price_including_optional_support,students_reached,great_messages_proportion,teacher_referred_count,...,bool_school_year_round,bool_school_nlns,bool_school_kipp,bool_school_charter_ready_promise,bool_fully_funded,bool_at_least_1_green_donation,bool_great_chat,bool_three_or_more_non_teacher_referred_donors,bool_one_non_teacher_referred_donor_giving_100_plus,bool_donation_from_thoughtful_donor
school_ncesid,1.0,0.243473,0.396794,-0.330108,-0.017939,-0.031868,-0.031868,0.02734,0.05364,-0.022759,...,-0.124195,0.019965,-0.021588,-0.082048,-0.023468,0.025767,0.03465,-0.020565,-0.038933,0.034014
school_latitude,0.243473,1.0,0.087219,-0.141947,0.000823,-0.01198,-0.01198,-0.015081,0.089778,0.013126,...,-0.072968,0.043577,-0.029841,-0.031149,0.05476,0.08645,0.078483,0.039733,0.009119,0.031608
school_longitude,0.396794,0.087219,1.0,-0.936741,-0.024144,-0.046944,-0.046944,0.011793,0.113149,0.004915,...,-0.179351,0.052164,0.011045,-0.101642,-0.017953,0.089759,0.074947,0.013196,-0.063454,0.071072
school_zip,-0.330108,-0.141947,-0.936741,1.0,0.010444,0.041063,0.041063,-0.000112,-0.120275,-0.009322,...,0.142832,-0.056527,0.005018,0.087218,0.004029,-0.099324,-0.081326,-0.030623,0.056863,-0.081094
fulfillment_labor_materials,-0.017939,0.000823,-0.024144,0.010444,1.0,-0.050427,-0.050427,-0.018141,-0.09759,-0.059273,...,0.027344,0.027794,-0.000834,0.038143,-0.007849,-0.073401,-0.067962,0.045438,-0.006781,-0.075406
total_price_excluding_optional_support,-0.031868,-0.01198,-0.046944,0.041063,-0.050427,1.0,1.0,0.118112,-0.014668,0.114397,...,-0.001142,0.011279,0.007838,0.00943,-0.089427,-0.058382,-0.021915,-0.029764,0.031169,0.007444
total_price_including_optional_support,-0.031868,-0.01198,-0.046944,0.041063,-0.050427,1.0,1.0,0.118112,-0.014668,0.114397,...,-0.001142,0.011279,0.007838,0.00943,-0.089427,-0.058382,-0.021915,-0.029764,0.031169,0.007444
students_reached,0.02734,-0.015081,0.011793,-0.000112,-0.018141,0.118112,0.118112,1.0,-0.01209,-0.018748,...,-0.014727,-0.000172,0.006456,0.014895,-0.015606,-0.032178,-0.005523,-0.011043,0.020889,0.0106
great_messages_proportion,0.05364,0.089778,0.113149,-0.120275,-0.09759,-0.014668,-0.014668,-0.01209,1.0,0.014509,...,-0.024997,0.011834,0.012318,-0.012731,-0.207684,0.309892,0.823336,0.130186,-0.149915,0.085711
teacher_referred_count,-0.022759,0.013126,0.004915,-0.009322,-0.059273,0.114397,0.114397,-0.018748,0.014509,1.0,...,-0.003558,0.01466,0.006229,0.004316,0.134782,0.233473,-0.052372,0.136053,-0.112542,-0.01036


In [44]:
#Exploratory analysis
target_columns = ['bool_donation_from_thoughtful_donor', 'teacher_referred_count', 'non_teacher_referred_count',
                 'great_messages_proportion', 'bool_one_non_teacher_referred_donor_giving_100_plus', 'bool_school_charter','total_price_including_optional_support']

for i in target_columns: 
    get_stats(fr,i)
    get_stats(fr,i, pred_vr)


 bool_donation_from_thoughtful_donor count    306905.000000
mean          0.019182
std           0.137164
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           1.000000
Name: bool_donation_from_thoughtful_donor, dtype: float64
0.0    0.980818
1.0    0.019182
Name: bool_donation_from_thoughtful_donor, dtype: float64
bool_fully_funded
0    0.006905
1    0.022118
Name: bool_donation_from_thoughtful_donor, dtype: float64

 teacher_referred_count count    306905.000000
mean          1.215979
std           2.672419
min           0.000000
25%           0.000000
50%           0.000000
75%           2.000000
max         151.000000
Name: teacher_referred_count, dtype: float64
0.0      0.640348
1.0      0.108389
2.0      0.084974
3.0      0.050289
4.0      0.034532
5.0      0.023848
6.0      0.016474
7.0      0.011570
8.0      0.008055
9.0      0.005500
10.0     0.003819
11.0     0.002802
12.0     0.001874
13.0     0.001440
14.0     0.001121
15

In [45]:
#Analyzing time variale
fr.dropna(subset=[time_var], inplace=True)

temp_time = fr[time_var].sort_values(axis=0, ascending=True, inplace=False)
#Get proportion we want in train set 
prop_train = temp_time.size*.8
prop_train
x = temp_time.loc[int(prop_train)] 
time_cuttoff = x

In [46]:
feats_tokeep = target_columns + [time_var] + [pred_vr]
eval_frame = keep_feats(feats_tokeep, fr)

In [47]:
raw_split = split_time(eval_frame, time_cuttoff, pred_vr, time_var)
split = process_split(raw_split, target_columns)

[nan  4.  3.]
[nan  4.  3.]
[nan  4.  3.]
[ 3.  4.  1. nan  2.]
[ 3.  4.  1. nan  2.]
[ 3.  4.  1. nan  2.]
[ 3.  4.  1. nan  2.]
[ 3.  4.  1. nan  2.]
[ 2.  3. nan  4.  1.]
[ 2.  3. nan  4.  1.]
[ 2.  3. nan  4.  1.]
[ 2.  3. nan  4.  1.]
[ 2.  3. nan  4.  1.]
[2 1 3 4]
[2 1 3 4]
[2 1 3 4]
[2 1 3 4]


In [48]:
#verifying that train and test columns are equal
print(split[0].columns)
print(split[1].columns)

Index(['bool_donation_from_thoughtful_donor',
       'bool_one_non_teacher_referred_donor_giving_100_plus',
       'bool_school_charter', 'teacher_referred_count_discrete_d_nan',
       'teacher_referred_count_discrete_d_4.0',
       'teacher_referred_count_discrete_d_3.0',
       'non_teacher_referred_count_discrete_d_3.0',
       'non_teacher_referred_count_discrete_d_4.0',
       'non_teacher_referred_count_discrete_d_1.0',
       'non_teacher_referred_count_discrete_d_nan',
       'non_teacher_referred_count_discrete_d_2.0',
       'great_messages_proportion_discrete_d_2.0',
       'great_messages_proportion_discrete_d_3.0',
       'great_messages_proportion_discrete_d_nan',
       'great_messages_proportion_discrete_d_4.0',
       'great_messages_proportion_discrete_d_1.0',
       'total_price_including_optional_support_discrete_d_2',
       'total_price_including_optional_support_discrete_d_1',
       'total_price_including_optional_support_discrete_d_3',
       'total_price_incl

In [None]:
#RUNNING LOOP
#Ran into some erros w SVC
models = ['RF', 'LR','GB','DT', 'KNN']
dict_models = complete_evaluation_loop(fn, models, small_grid, [.01, .02, .05, .10, .20, .30, .50], split)
#dict_models = complete_evaluation_loop(fn, models, test_grid, [.30], split)

{'n_estimators': 10, 'max_depth': 5, 'max_features': 'sqrt', 'min_samples_split': 2, 'n_jobs': -1}
{'n_estimators': 10, 'max_depth': 5, 'max_features': 'sqrt', 'min_samples_split': 10, 'n_jobs': -1}
{'n_estimators': 10, 'max_depth': 5, 'max_features': 'log2', 'min_samples_split': 2, 'n_jobs': -1}
{'n_estimators': 10, 'max_depth': 5, 'max_features': 'log2', 'min_samples_split': 10, 'n_jobs': -1}
{'n_estimators': 10, 'max_depth': 50, 'max_features': 'sqrt', 'min_samples_split': 2, 'n_jobs': -1}
{'n_estimators': 10, 'max_depth': 50, 'max_features': 'sqrt', 'min_samples_split': 10, 'n_jobs': -1}
{'n_estimators': 10, 'max_depth': 50, 'max_features': 'log2', 'min_samples_split': 2, 'n_jobs': -1}
{'n_estimators': 10, 'max_depth': 50, 'max_features': 'log2', 'min_samples_split': 10, 'n_jobs': -1}
{'n_estimators': 100, 'max_depth': 5, 'max_features': 'sqrt', 'min_samples_split': 2, 'n_jobs': -1}
{'n_estimators': 100, 'max_depth': 5, 'max_features': 'sqrt', 'min_samples_split': 10, 'n_jobs': -1}

In [None]:
dict_models

In [None]:
'''
Constructing frame from nested loop following 
https://stackoverflow.com/questions/13575090/construct-pandas-dataframe-from-items-in-nested-dictionary 
'''
x = pd.DataFrame.from_dict({(i,j): dict_models[i][j] 
                           for i in dict_models.keys() 
                           for j in dict_models[i].keys()},
                           orient='index')
x.to_csv("complete_all_obs.csv")

In [None]:
#RE-RUNNING WITH DIFFERENT TIME SPLITS - LESS YEARS COVERED. 

fr = fr[(fr['Year'] >= 2012 )] 
fr = fr[(fr['Year'] <= 2013 )] 

feats_tokeep = target_columns + [time_var] + [pred_vr]
eval_frame = keep_feats(feats_tokeep, fr)
models = ['RF', 'LR','GB','DT', 'KNN']

for size_ in [.5, .6, .7]:
    temp_time = fr[time_var].sort_values(axis=0, ascending=True, inplace=False)
    prop_train = temp_time.size*size_
    x = temp_time.loc[int(prop_train)] 
    time_cuttoff = x
    
    raw_split = split_time(eval_frame, time_cuttoff, pred_vr, time_var)
    split = process_split(raw_split, target_columns) 
    dict_models = complete_evaluation_loop(fn, models, small_grid, [.01, .02, .05, .10, .20, .30, .50], split)
    
    temporal = pd.DataFrame.from_dict({(i,j): dict_models[i][j] 
                           for i in dict_models.keys() 
                           for j in dict_models[i].keys()},
                           orient='index')
    name_string = "models_" + str(x)
    temporal.to_csv(name_string)