# MACHINE LEARNING FOR PUBLIC POLICY
# Homework 3 - Cristina Mac Gregor Vanegas
### Due: May 17, 2018

#### PART 1.A:
     1. Generating auxiliary functions for feature manipulation and generation
     2. Generating functions for exploration of data
#### Part 1.B: 
     3. Building functions for applying classifiers  
#### Part 1.C:
     4. Building functions for evaluating classifiers 
#### Part 1.D:
     5. Preparation for loop. 
#### Part 2: 
     Running the pipeline
    

## PART 1.A


##### Set up and import of data

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.cbook as cbook
import geopandas as gpd
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from itertools import product
from sklearn import preprocessing, cross_validation, svm, metrics, tree, decomposition, svm
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier, OrthogonalMatchingPursuit, RandomizedLogisticRegression
from sklearn.neighbors.nearest_centroid import NearestCentroid
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import ParameterGrid
from sklearn.metrics import *
from sklearn.preprocessing import StandardScaler
import datetime
from datetime import timedelta
from datetime import datetime



In [3]:
def read_files(file_name):
    '''
    Loads a downloaded csv file into a pandas dataframe.
    Inputs: file_name: location of the csv, passed straight to pd.read_csv
    Outputs: pandas dataframe with the file contents
    '''
    return pd.read_csv(file_name)

In [4]:
def merge_frames(frame1, frame2, id1, id2) :
    '''
    Joins two dataframes on a shared identifier.
    Inputs: frame1, frame2: pandas frames
            id1: name of the identifier column in frame1
            id2: name of the identifier column in frame2 (renamed to id1
                 before merging)
    Outputs: merged frame (pandas default join, i.e. only matching ids)
    '''
    renamed = frame2.rename(columns={id2: id1})
    return frame1.merge(renamed, on=id1)

In [5]:
def get_geo(shape_file, frame, var, shp_name, level_str):
    '''
    Creates a geopandas file at the geographical level specified, given a pandas
    dataframe and a shape or geojson file.
    Inputs: shape_file: shapefile or geojson file
            frame: pandas frame
            var: not used by this function; kept so existing calls keep working
            shp_name: name of column in geojson file
            level_str: name of column in frame
    Outputs:
        Extended geo frame (geopandas object)

    '''
    # The previous version started with a bare frame.groupby() call: it has no
    # grouping key, so it raised a TypeError, and its result was never used.
    geo_df = gpd.read_file(shape_file)
    # Align the key name of the shape file with the one used in the frame.
    geo_df = geo_df.rename(columns={shp_name: level_str})
    # Left join: every geometry is kept, even without a match in the frame.
    geo_df_ext = geo_df.merge(frame, on=level_str, how = 'left')
    return geo_df_ext

##### 1. Pre-processing auxiliary data functions

In [6]:
def check_mv(frame, var):
    '''
    Prints out the share of missing (1) and non-missing (0) values for a
    given variable.
    Inputs: frame: pandas frame
            var: name of the column to inspect
    Outputs: None (the shares are printed)
    '''
    # Computed on a separate series: the frame handed in by the caller is not
    # modified (earlier versions left "temp"/"temp2" columns behind, which then
    # leaked into the feature set, and failed on non-numeric columns).
    missing_flag = frame[var].isnull().astype(int)
    print("Missing values", missing_flag.value_counts(True))

In [7]:
def fill_mvals(frame, var, measure = "mean"):
    '''
    Setting missing values to a measure of central tendency as specified by parameters.
    Inputs: frame: pandas frame (the column is replaced in place)
            var: name of the column to fill
            measure: "mean" (default) or "median"
    Outputs: tuple (frame, value used for filling), so the same value can be
             re-applied to another data set
    Raises: ValueError if measure is neither "mean" nor "median"
    '''
    if measure == "median":
        md = frame[var].median()
    elif measure == "mean":
        md = frame[var].mean()
    else:
        # Fail with a clear message instead of an unbound-variable error.
        raise ValueError('measure must be "mean" or "median", got {!r}'.format(measure))
    frame[var] = frame[var].fillna(md)
    return frame, md

In [8]:
def winsorize(frame, var, level = .99):
    '''
    Winsorizing process: values above the requested quantile of the variable
    (99th percentile by default) are replaced by that quantile.
    Inputs: frame: pandas frame (the column is replaced in place)
            var: name of the column to cap
            level: quantile used as the upper cap
    Outputs: tuple (frame, cap value)
    '''
    cap = frame[var].quantile(level)

    def _limit(value):
        # Only the upper tail is touched; everything else passes through.
        if value > cap:
            return cap
        return value

    frame[var] = frame[var].apply(_limit)
    return frame, cap

In [9]:
def generate_buckets(frame, var, nbuckets):
    '''
    Auxiliary function for discretization. Generates equal-width bins for a variable
    according to the number of bins specified as a parameter.
    Inputs: frame: pandas frame
            var: name of the column to split
            nbuckets: number of bins (positive integer)
    Outputs: list with the upper boundary of every bucket (nbuckets elements,
             the last one being the maximum of the variable)
    Raises: ValueError if nbuckets is smaller than 1
    '''
    if nbuckets < 1:
        raise ValueError("nbuckets must be a positive integer")
    min_ = frame[var].min()
    max_ = frame[var].max()
    step = (max_  - min_ ) / nbuckets
    # Boundaries are derived from the bucket index rather than accumulated, so
    # the list always has exactly nbuckets entries and cannot loop forever
    # when min_ == max_.
    steps = [min_ + step * k for k in range(1, nbuckets)]
    # The last boundary is the maximum itself, so rounding can never leave
    # the largest observation outside the final bucket.
    steps.append(max_)
    return steps

In [10]:
def cats(row, buckets, var):
    '''
    Auxiliary function for discretization. Returns the category (1-based position
    of the bucket) to which the value of a variable corresponds.
    Inputs: row: row of a frame (anything indexable by var)
            buckets: upper boundaries of the buckets, in increasing order
            var: name of the variable to classify
    Outputs: bucket number, or None when the value is above the last boundary
             (or is missing)
    '''
    value = row[var]
    # The first bucket is open below: starting the lower bound at 0 left
    # zero and negative values without a category.
    lower = float("-inf")
    for group, upper in enumerate(buckets, start=1):
        if lower < value <= upper:
            return group
        lower = upper
    return None

In [11]:
def discretize(frame, var, buckets = None, quartiles = False, num_buckets = None):
    '''
    Returns the frame with a discrete version of a variable added as
    "<var>_discrete". If buckets are specified, they are used as thresholds. If not,
    quartiles can be used to discretize a continuous variable. If neither buckets nor
    quartiles are specified, a number of equal-width buckets can be used.
    Inputs: frame: pandas frame (the new column is added in place)
            var: name of the column to discretize
            buckets: list of upper boundaries
            quartiles: if True, boundaries are the quartiles and the maximum
            num_buckets: number of equal-width buckets; takes precedence over
                         the other two options when given
    Outputs: frame with the additional column
    Raises: ValueError when no way of building the buckets was specified
    '''
    new_name = str(var) + "_discrete"

    if quartiles:
        buckets = [frame[var].quantile(q) for q in (.25, .50, .75)]
        buckets.append(frame[var].max())

    if num_buckets:
        # Previously referenced a misspelled name and raised a NameError.
        buckets = generate_buckets(frame, var, num_buckets)

    if buckets is None:
        raise ValueError("specify buckets, quartiles=True or num_buckets")

    frame[new_name] = frame.apply(cats, axis=1, args = (buckets, var))
    return frame

In [12]:
def dummify(frame, var, threshold = None):
    '''
    Makes dummy variables out of a variable.
    With a threshold, a single column "<var>_d" is added, equal to 1 when the
    value is below the threshold and 0 otherwise. Without a threshold, one column
    "<var>_d_<category>" is added for each distinct value of the variable.
    Inputs: frame: pandas frame (columns are added in place)
            var: name of the column to transform
            threshold: optional cut point (0 is a valid threshold)
    Outputs: frame with the additional column(s)
    '''
    # "is not None" so that a threshold of 0 is honoured.
    if threshold is not None:
        new_name = str(var) + "_d"
        # Compare the column itself: applying over the whole frame without
        # axis=1 iterated over columns and failed with a KeyError.
        frame[new_name] = (frame[var] < threshold).astype(int)
    else:
        for category in frame[var].unique():
            new_name = str(var) + "_d_" + str(category)
            frame[new_name] = (frame[var] == category).astype(int)

    return frame

In [13]:
def count_cases(row, frame, plusminus_range, var):
    '''
    Counts how many cases of the frame fall within a range centred on the value
    of the given row.
    Inputs: row: row (anything indexable by var) used as the centre of the range
            frame: pandas frame whose cases are counted
            plusminus_range: half-width of the range (both ends are inclusive)
            var: name of the variable used for the comparison
    Outputs: number of rows of frame within the range (int)
    '''
    min_time = row[var] - plusminus_range
    max_time = row[var] + plusminus_range
    # The frame column is compared against the range: comparing the row with
    # its own range is always true and cannot be used to index the frame.
    in_range = (frame[var] >= min_time) & (frame[var] <= max_time)
    return int(in_range.sum())

##### 2. Functions for data exploration

In [14]:
def get_stats(frame, target_var, group_vars = None):
    '''
    Prints statistics for a variable. Without group_vars, the general
    description and the relative frequency of every value are printed; with
    group_vars, the mean of the variable within each group is printed instead.
    '''
    if group_vars:
        print(frame.groupby(group_vars)[target_var].mean())
        return

    column = frame[target_var]
    print("\n", target_var, column.describe())
    print(column.value_counts(True))

In [15]:
def print_map_byvar(frame, varbs):
    '''
    Plots a map of the geographic distribution of the variables we wish to see.
    Inputs: frame: frame to plot; presumably the geopandas frame returned by
                   get_geo (its plot method must accept column/cmap) - confirm
            varbs: list of column names, one map per column
    Outputs: None (figures are shown)
    '''
    for i in varbs:
        # Plot the frame received as argument; the previous version relied on
        # a global named geo_df that is not defined in this file.
        frame.plot(column=i, cmap='OrRd')
        plt.title(i)
        plt.show()

In [16]:
def show_cor(frame):
    '''
    Returns the matrix of spearman correlations between the columns of a
    complete dataframe.
    '''
    correlations = frame.corr(method="spearman")
    return correlations

In [17]:
def scat(frame, varbs, target_var):
    '''
    Shows one scatter plot per feature in varbs, with the predicted variable
    on the horizontal axis and the feature on the vertical axis.
    '''
    for feature in varbs:
        plt.scatter(frame[target_var], frame[feature])
        plt.title("{} vs {}".format(target_var, feature))
        plt.xlabel(target_var)
        plt.ylabel(feature)
        plt.show()

## PART 1.B 

##### 3. Functions for building classifiers


In [18]:
def keep_feats(varbs, frame):
    '''
    Selects the columns listed in varbs, i.e. the features we want to
    include as predictive features.
    '''
    return frame[varbs]

In [19]:
def drop_feats(varbs, frame):
    '''
    Returns a copy of the frame without the columns listed in varbs, i.e.
    without the variables we don't want to include as predictive features.
    '''
    return frame.drop(columns=varbs)

In [20]:
def split(frame, test_percentage, target_var):
    '''
    Splits data into train and test sections with train_test_split.
    Inputs: frame: pandas frame holding features and target
            test_percentage: passed to train_test_split as test_size
            target_var: name of the target column
    Outputs: list [x_train, x_test, y_train, y_test]
    '''
    features = frame.drop(target_var, axis=1)
    target = frame[target_var]
    pieces = train_test_split(features, target, test_size=test_percentage)
    x_train, x_test, y_train, y_test = pieces
    return [x_train, x_test, y_train, y_test]

##### 4. Functions for evaluation

In [21]:
def conf_matrix(output_array, real_vals, threshold):
    '''
    Builds the confusion matrix counts from test predictions and true values,
    for a given threshold.
    Inputs:
        output_array: (array) predicted values from test fraction of the data
        real_vals: (array) true values from test fraction of data (y_test)
        threshold: (float) a case is predicted positive when its predicted
                   value is strictly above this threshold
    Outputs:
        list [TP, FP, TN, FN]
    '''
    paired = pd.DataFrame(data={'pred': output_array, 'real': real_vals})

    flagged = paired["pred"] > threshold
    truly_pos = paired["real"] == 1
    truly_neg = paired["real"] == 0

    # Rows whose real value is neither 0 nor 1 are not counted anywhere.
    TP = int((flagged & truly_pos).sum())
    FP = int((flagged & truly_neg).sum())
    TN = int((~flagged & truly_neg).sum())
    FN = int((~flagged & truly_pos).sum())

    return [TP, FP, TN, FN]

In [22]:
def accuracy(conf_m):
    '''
    Share of correctly classified cases, from a [TP, FP, TN, FN] list.
    Returns 0 when the matrix is empty.
    '''
    TP, FP, TN, FN = (conf_m[k] for k in range(4))
    total = TP + TN + FP + FN
    if total == 0:
        return 0
    return (TP + TN) / total

In [23]:
def recall(conf_m):
    '''
    Share of the real positives that were predicted positive, from a
    [TP, FP, TN, FN] list. Returns 0 when there are no real positives.
    '''
    TP, FP, TN, FN = (conf_m[k] for k in range(4))
    real_positives = TP + FN
    if real_positives == 0:
        return 0
    return TP / real_positives

In [24]:
def precision(conf_m):
    '''
    Share of the predicted positives that are real positives, from a
    [TP, FP, TN, FN] list. Returns 0 when nothing was predicted positive.
    '''
    TP, FP, TN, FN = (conf_m[k] for k in range(4))
    predicted_positives = TP + FP
    if predicted_positives == 0:
        return 0
    return TP / predicted_positives

In [25]:
def specificity(conf_m):
    '''
    Share of the real negatives that were predicted negative (true negative
    rate), from a [TP, FP, TN, FN] list: TN / (TN + FP).
    Returns 0 when there are no real negatives.
    '''
    TP, FP, TN, FN = conf_m[0], conf_m[1], conf_m[2], conf_m[3]
    # The denominator is every real negative, i.e. TN + FP. Dividing by
    # TN + FN gives the negative predictive value instead.
    if (TN + FP) == 0:
        return  0
    spec = (TN) / (TN + FP)
    return spec

In [26]:
def f1(prec, rec):
    '''
    F1 score computed from precision and recall. Returns 0 when both
    add up to zero.
    '''
    denominator = prec + rec
    if denominator == 0:
        return 0
    return 2 * (prec * rec) / denominator
    

In [27]:
def get_main_metrics(output_array, real_vals, cuttoff):
    '''
    Computes the main evaluation metrics for one cutoff.
    Inputs: output_array: predicted values
            real_vals: true values
            cuttoff: threshold handed to conf_matrix
    Outputs: tuple (recall, precision, accuracy, specificity, f1)
    '''
    counts = conf_matrix(output_array, real_vals, cuttoff)
    rec = recall(counts)
    prec = precision(counts)
    return rec, prec, accuracy(counts), specificity(counts), f1(prec, rec)

In [28]:
def get_metrics_array(output_array, real_vals, steps = 20):
    '''
    Computes the main metrics over a series of cutoffs: 0, steps/100,
    2*steps/100, ... (always below 1), so steps is the distance between
    consecutive cutoffs in percentage points.
    Outputs: tuple of four lists (recall, precision, accuracy, specificity),
             one element per cutoff
    '''
    cutoffs = [pct / 100 for pct in range(0, 100, steps)]
    rows = [get_main_metrics(output_array, real_vals, cut) for cut in cutoffs]

    recall_arr = [row[0] for row in rows]
    precision_arr = [row[1] for row in rows]
    acc_arr = [row[2] for row in rows]
    spec_arr = [row[3] for row in rows]
    return recall_arr, precision_arr, acc_arr, spec_arr

In [29]:
def precision_rec(output_array, real_vals):
    '''
    Shows the precision-recall curve obtained from the metrics computed by
    get_metrics_array (default cutoffs).
    Inputs: output_array: predicted values
            real_vals: true values
    Outputs: None (the figure is shown)
    '''
    r_arr, p_arr, a_arr, s_arr = get_metrics_array(output_array, real_vals)

    # 'b' (blue): the previous value 'a' is not a colour matplotlib
    # recognises, so the plot call failed.
    plt.step(r_arr, p_arr, color='b', alpha=0.2, where='post')
    plt.fill_between(r_arr, p_arr, step='post', alpha=0.2, color='b')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    plt.title('Precision-Recall curve')
    plt.show()

In [30]:
def prec_rec_auc(output_array, real_vals):
    '''
    Area under the precision-recall points returned by get_metrics_array,
    computed with metrics.auc (recall as x, precision as y).
    '''
    recalls, precisions, _, _ = get_metrics_array(output_array, real_vals)
    return metrics.auc(recalls, precisions)

In [31]:
def roc_auc(output_array, real_vals):
    '''
    Area under the ROC curve.
    Inputs: output_array: predicted scores
            real_vals: true values
    Outputs: area (float)
    '''
    # roc_auc_score expects the true labels first and the scores second.
    area = roc_auc_score(real_vals, output_array)
    return area

In [32]:
def baseline(mode, real_vals):
    '''
    Returns the most frequent value(s) of the true values, to be used as a
    naive baseline prediction.
    Inputs: mode: not used (it was overwritten before being read); kept so
                  existing calls keep working
            real_vals: true values (list, array or pandas series)
    Outputs: pandas series with the mode(s) of real_vals
    '''
    # Computed from the argument: the previous version read a global frame
    # that is not defined in this file.
    mode = pd.Series(real_vals).mode()
    return mode

##### Functions for creating models 

In [33]:
'''
From DSSG Magic loop - different values for each model. 
Reference: DSSG magic loop; source https://github.com/rayidghani/magicloops

'''
# fn: short model key -> classifier class (not an instance); the class is
# instantiated with default arguments in apply_model.
fn = {  'RF': RandomForestClassifier,
        'LR': LogisticRegression,
        'SVM': svm.SVC,
        'GB': GradientBoostingClassifier,
        'DT': DecisionTreeClassifier,
        'KNN': KNeighborsClassifier
            }
# small_grid: same keys as fn -> {parameter name: list of values to try}.
# Every combination of the listed values is a candidate configuration.
small_grid = { 
        'RF':{'n_estimators': [10,100], 'max_depth': [5,50], 'max_features': ['sqrt','log2'],'min_samples_split': [2,10], 'n_jobs': [-1]},
        'LR': { 'penalty': ['l1','l2'], 'C': [0.00001,0.001,0.1,1,10]},
        'GB': {'n_estimators': [10,100], 'learning_rate' : [0.001,0.1,0.5],'subsample' : [0.1,0.5,1.0], 'max_depth': [5,50]},
        'DT': {'criterion': ['gini', 'entropy'], 'max_depth': [1,5,10,20,50,100],'min_samples_split': [2,5,10]},
        'SVM' :{'C' :[0.00001,0.0001,0.001,0.01,0.1,1,10],'kernel':['linear']},
        'KNN' :{'n_neighbors': [1,5,10,25,50,100],'weights': ['uniform','distance'],'algorithm': ['auto','ball_tree','kd_tree']}
           }

In [34]:
def split_time(frame, cuttoff, target_var, time_var):
    '''
    Splits data into train and test sections according to time: rows up to
    (and including) the cutoff are used for training, later rows for testing.
    The time variable is removed from both feature sets.
    Reference: DSSG magic loop; source https://github.com/rayidghani/magicloops
    Outputs: list [x_train, x_test, y_train, y_test]
    '''
    def _features_and_target(mask):
        # Drop the time column first, then separate target from features.
        subset = frame[mask].drop(time_var, axis=1)
        return subset.drop(target_var, axis=1), subset[target_var]

    x_train, y_train = _features_and_target(frame[time_var] <= cuttoff)
    x_test, y_test = _features_and_target(frame[time_var] > cuttoff)

    return [x_train, x_test, y_train, y_test]

In [35]:
def process_split(raw_split):
    '''
    Pre-processes a train/test split: for every feature, missing values are
    filled with the mean and the upper tail is winsorized. Both the mean and the
    winsorizing cap are computed on the training features only and then re-applied
    to the test features, so no information from the test set is used.
    Inputs: raw_split: list [x_train, x_test, y_train, y_test]
            (assumes every feature column is numeric - confirm against caller)
    Outputs: list [x_train, x_test, y_train, y_test] with processed features;
             the targets are returned untouched
    '''
    xtr, xtst, ytr, ytst = raw_split[0], raw_split[1], raw_split[2], raw_split[3]
    # Work on copies so the frames handed in by the caller are not modified.
    xtr = xtr.copy()
    xtst = xtst.copy()
    for i in xtr.columns.values:
        # Both helpers return (frame, value): keep the value learnt on train.
        xtr, md = fill_mvals(xtr, i, "mean")
        xtr, cut = winsorize(xtr, i)

        # Same transformations on the test features, with the train values.
        xtst[i] = xtst[i].apply(lambda x: md if pd.isnull(x) else x)
        xtst[i] = xtst[i].apply(lambda x: cut if x > cut else x)

    return [xtr, xtst, ytr, ytst]

In [36]:
def complete_evaluation_loop(fn, models_to_run, model_grid, thresholds_given, split):
    '''
    Fits every requested model with every combination of the parameter values of
    its grid and evaluates the predictions at each threshold.
    Inputs: fn: dictionary {model key: classifier class}
            models_to_run: model keys to evaluate (other grid entries are skipped)
            model_grid: dictionary {model key: {parameter name: list of values}}
            thresholds_given: cutoffs handed to get_main_metrics
                (NOTE(review): treated as cutoffs on the predicted score, as in
                conf_matrix - confirm this is the intended meaning)
            split: list [x_train, x_test, y_train, y_test]
    Outputs: dictionary keyed by (model key, combination number); each value is a
             dictionary with 'model', 'parameters' (text description) and
             'metrics' ({threshold: {'recall', 'precision', 'accuracy',
             'specificity', 'f1'}})
    '''
    results = {}

    for model, params in model_grid.items():
        if model not in models_to_run:
            continue

        parameters = list(params.keys())
        # One tuple per combination, values in the same order as parameters.
        for idx, item in enumerate(product(*params.values())):
            preds, real = apply_model(fn, model, parameters, list(item), split)

            d = {'model': model,
                 'parameters': str(parameters) +' = '+ str(item),
                 'metrics': {}}
            for threshold in thresholds_given:
                rec, prec, acc, spec, f1_ = get_main_metrics(preds, real, threshold)
                d['metrics'][threshold] = {'recall': rec,
                                           'precision': prec,
                                           'accuracy': acc,
                                           'specificity': spec,
                                           'f1': f1_}
            # A fresh dictionary per combination: reusing a single one kept
            # only the last combination.
            results[(model, idx)] = d

    return results

In [37]:
def apply_model(fn, model, keys, values, split):
    '''
    Fits one model with one set of parameters on the training data and scores
    the test data.
    Inputs: fn: dictionary {model key: classifier class}
            model: key of the model to use
            keys: parameter names
            values: parameter values, in the same order as keys
            split: list [x_train, x_test, y_train, y_test]
    Outputs: tuple (scores for the test set, y_test). The scores are the
             second column of predict_proba when the classifier offers it,
             and the output of decision_function otherwise.
    Visited for information about ziping
    https://stackoverflow.com/questions/209840/map-two-lists-into-a-dictionary-in-python
    '''

    x_tr, x_tst, y_tr, y_tst = split[0], split[1], split[2], split[3]
    skfunction = fn[model]()

    params = dict(zip(keys, values))
    print(params)
    skfunction.set_params(**params)
    skfunction.fit(x_tr, y_tr)
    # Not every classifier exposes predict_proba (checked after set_params,
    # since availability can depend on the parameters); fall back to
    # decision_function, as the referenced magic loop does.
    if hasattr(skfunction, "predict_proba"):
        preds = skfunction.predict_proba(x_tst)[:,1]
    else:
        preds = skfunction.decision_function(x_tst)
    return preds, y_tst


## PART 2


In [38]:
# Processing initial data: load the projects and outcomes files, join them on
# the project id and keep only the projects posted from 2011 to 2014.
fr1 = read_files('data/projects.csv')
fr2 = read_files('data/outcomes.csv')
fr = merge_frames(fr1, fr2, 'projectid', 'projectid')

# Parse the posting date once and derive the year from the parsed column.
fr['date_posted'] = pd.to_datetime(fr['date_posted'])
fr['Year'] = fr['date_posted'].dt.year

fr = fr[fr['Year'].between(2011, 2014)]

In [None]:
# Setting globals

# Identifiers and coordinates are not used as features.
fr = drop_feats(
    [
        'projectid',
        'teacher_acctid',
        'schoolid',
        'school_ncesid',
        'school_latitude',
        'school_longitude',
    ],
    fr,
)
all_cols = fr.columns.tolist()

# Target variable, time variable and the date separating train from test.
pred_vr = 'fully_funded'
time_var = 'date_posted'
time_cuttoff = datetime.strptime('2013-10-01', '%Y-%m-%d')

# Keys of the models to evaluate.
models = ['RF', 'LR', 'SVM', 'GB', 'DT', 'KNN']

In [None]:
# Exploratory analysis: scatter plots of every column against the target,
# followed by the printed statistics of each column.
scat(fr, all_cols, pred_vr)
for column_name in all_cols:
    get_stats(fr, column_name)


In [None]:
# Spearman correlation matrix of the whole frame (shown as the cell output).
show_cor(fr)

In [None]:
# Scatter plots of every column against the target variable.
# NOTE(review): same call as in the exploratory-analysis cell - possibly redundant.
scat(fr, all_cols, pred_vr)

In [None]:
#SETTING UP DATA
bool_features = ['school_charter_ready_promise',
 'teacher_teach_for_america', 'at_least_1_teacher_referred_donor']

# Recode the 't'/'f' flags (features and target) as integers: 'f' becomes 0 and
# anything else (missing values included) becomes 1. Integers rather than the
# strings '0'/'1' because conf_matrix compares the real values with 0 and 1.
# Lists are combined with "+": "+=" is a statement and cannot be used inside
# an expression.
for var in bool_features + [pred_vr]:
    fr[var] = fr[var].apply(lambda x: 0 if (x == 'f') else 1)

ind_features = bool_features + ['students_reached', 'school_zip','total_price_excluding_optional_support' ]

# Features plus the time variable (needed for the split) and the target.
feats_tokeep = ind_features + [time_var, pred_vr]
test_frame = keep_feats(feats_tokeep, fr)

In [None]:
# Time-based train/test split of the modelling frame, followed by the
# pre-processing of the resulting pieces.
# NOTE(review): the name `split` rebinds the `split` function defined earlier
# in this file; that function cannot be called after this cell runs.
raw_split = split_time(test_frame, time_cuttoff, pred_vr,time_var)
split = process_split(raw_split)

In [None]:
#RUNNING LOOP
# Calls complete_evaluation_loop with the model classes, the model keys, the
# parameter grid, a list of floats passed as thresholds_given, and the split.
dict_models = complete_evaluation_loop(fn, models, small_grid, [.01, .02, .05, .10, .20, .30, .50], split)

In [None]:
# Empty frame with one column per piece of information kept for a model run.
results_df = pd.DataFrame(
    columns=[
        'model_type',
        'clf',
        'parameters',
        'outcome',
        'validation_date',
        'group',
        'train_set_size',
        'validation_set_size',
        'predictors',
        'baseline',
        'precision_at_5',
        'precision_at_10',
        'precision_at_20',
        'precision_at_30',
        'precision_at_40',
        'precision_at_50',
        'recall_at_5',
        'recall_at_10',
        'recall_at_20',
        'recall_at_30',
        'recall_at_40',
        'recall_at_50',
        'auc-roc',
    ]
)

In [None]:
# Display the value returned by complete_evaluation_loop.
dict_models