# MACHINE LEARNING FOR PUBLIC POLICY
# Homework 2 - Cristina Mac Gregor Vanegas
### Due: April 17, 2018

The notebook is organized in the following way: first, all the functions needed to carry out the analysis are defined, for points 1 through 6. In the second part of the notebook, the functions are called to execute the analysis.
#### PART 1.A:
     1. Loading data
     2. Exploring data
     3. Pre-processing data
     4. Generating features and predictors. 
#### Part 1.B: 
     5. Building KNN classifier
     6. Evaluating the classifier. 
#### Part 2.A
    Running exploratory analysis
#### Part 2.B
    Running model

## PART 1.A


##### Read Data functions

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.cbook as cbook
import geopandas as gpd
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KNeighborsClassifier
try:
    # scikit-learn >= 0.18 ships train_test_split in model_selection;
    # the cross_validation module was removed in 0.20.
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

In [None]:
def read_files(file_name):
    '''
    Loads a downloaded csv file into a pandas dataframe.
    Inputs:
        file_name: location of the csv, passed unchanged to pd.read_csv
    Outputs:
        pandas dataframe with the parsed contents
    '''
    return pd.read_csv(file_name)

##### Exploring and pre-processing data functions

In [None]:
def check_mv(frame, var):
    '''
    Prints the share of missing (1) and non-missing (0) values of a variable.

    Side effects: two helper columns are written on frame:
        "temp":  the values of frame[var], with values whose float() is
                 falsy (i.e. zero) replaced by NaN
        "temp2": 1 where frame[var] is null, 0 otherwise
    Nothing is returned.
    '''
    def _nan_if_zero(value):
        # float(value) is falsy only when the value is numerically zero
        if float(value):
            return value
        return np.nan

    def _null_flag(value):
        return 1 if pd.isnull(value) else 0

    frame["temp"] = frame[var].apply(_nan_if_zero)
    frame["temp2"] = frame[var].apply(_null_flag)

    # normalize=True -> relative frequencies instead of raw counts
    shares = frame["temp2"].value_counts(True)
    print("Missing values", shares)


In [None]:
def clean_var(frame, var):
    '''
    Cleans one column in place and returns the same frame.
    Steps, in order:
        1. "-" placeholders become NaN, every other value is cast to float
        2. values above the 99th percentile are capped at that percentile
        3. remaining missing values are replaced with the column mean
           (the mean is computed after the capping step)
    Inputs:
        frame: pandas dataframe (modified in place)
        var: name of the column to clean
    Outputs:
        the same dataframe object
    '''
    def _as_number(raw):
        return np.nan if (raw == "-") else float(raw)

    frame[var] = frame[var].apply(_as_number)

    # Winsorizing: outliers are capped at the 99th percentile, not removed.
    cap = frame[var].quantile(.99)
    print("dropping outliers")
    frame[var] = frame[var].mask(frame[var] > cap, cap)

    print("setting mv to mean")
    col_mean = frame[var].mean()
    print(col_mean)
    frame[var] = frame[var].fillna(col_mean)

    return frame

In [None]:
def get_stats(frame, target_var, group_vars = None):
    '''
    Prints summary statistics for one variable.
    Inputs:
        frame: pandas dataframe
        target_var: name of the column to summarize
        group_vars: optional column name (or list of names) to group by
    Behaviour:
        - no group_vars (None or empty): prints describe() and the relative
          frequencies of target_var
        - with group_vars: prints the mean of target_var within each group
    Nothing is returned.
    '''
    # Exactly one branch runs. An empty group_vars ([] or "") counts as
    # "no grouping" rather than reaching groupby, which rejects empty keys.
    if group_vars:
        print(frame.groupby(group_vars)[target_var].mean())
    else:
        column = frame[target_var]
        print("\n", target_var, column.describe())
        print(column.value_counts(True))

In [None]:
def get_geo(shape_file, frame, var, shp_name, level_str):
    '''
    Creates a geopandas frame by attaching the rows of a pandas dataframe
    to the geometries read from a shape or geojson file.
    Inputs: shape_file: shapefile or geojson file
            frame: pandas frame
            var: currently unused; kept so existing calls keep working
            shp_name: name of the joining column in the geo file
            level_str: name of the joining column in frame
    Outputs:
        Extended geo frame (geopandas object). The merge is a left join on
        level_str, so every geometry is kept even without matching rows.

    Note: frame is merged as-is, without any aggregation. Pass a frame that
    is already aggregated at level_str if one row per geometry is wanted.
    '''
    geo_df = gpd.read_file(shape_file)
    # Rename the geo file's key so both sides share the join column name.
    geo_df = geo_df.rename(columns={shp_name: level_str})
    return geo_df.merge(frame, on=level_str, how='left')

In [None]:
def print_map_byvar(frame, varbs):
    '''
    Plots one map per variable, colouring the geometries by its value.
    Inputs:
        frame: geopandas frame holding the geometries and the variables
        varbs: iterable of column names to map
    Nothing is returned; each figure is shown with plt.show().
    '''
    for column_name in varbs:
        # Plot the frame that was passed in, not a notebook-level global.
        frame.plot(column=column_name, cmap='OrRd')
        plt.title(column_name)
        plt.show()

In [None]:
def show_cor(frame):
    '''
    Returns the matrix of pairwise spearman correlations between the
    columns of a dataframe (nothing is printed by the function itself).
    '''
    correlations = frame.corr(method="spearman")
    return correlations


In [None]:
def scat(frame, varbs, target_var):
    '''
    Shows one scatter plot per candidate feature, with the predicted
    variable on the x axis and the feature on the y axis.
    Inputs:
        frame: pandas dataframe
        varbs: iterable of feature column names
        target_var: name of the predicted variable
    '''
    for feature in varbs:
        plt.scatter(frame[target_var], frame[feature])
        heading = "{} vs {}".format(target_var, feature)
        plt.title(heading)
        plt.xlabel(target_var)
        plt.ylabel(feature)
        plt.show()


##### Generate features functions

In [None]:
def discretize_equal(frame, var, buckets):
    '''
    Discretizes a continuous variable into equal-width buckets.
    Inputs:
        frame: pandas dataframe (modified in place)
        var: name of the continuous column
        buckets: number of equal-width buckets (positive integer)
    Outputs:
        the same dataframe with a new column "<var>_discrete" holding the
        bucket number, from 1 (lowest values) to buckets (highest values).
        Missing values stay missing.
    Raises:
        ValueError if buckets is smaller than 1.
    '''
    if buckets < 1:
        raise ValueError("buckets must be a positive integer")

    new_name = str(var) + "_discrete"
    # labels=False yields 0-based bucket codes; shift so groups start at 1.
    codes = pd.cut(frame[var], bins=buckets, labels=False)
    frame[new_name] = codes + 1
    return frame

In [None]:
def discretize_quartiles(frame, var, buckets = 4):
    '''
    Discretizes a continuous variable into quantile-based buckets, so that
    each bucket holds roughly the same number of observations.
    Inputs:
        frame: pandas dataframe (modified in place)
        var: name of the continuous column
        buckets: number of quantile buckets; 4 (the default) gives quartiles
    Outputs:
        the same dataframe with a new column "<var>_discrete_q" holding the
        bucket number, starting at 1 for the lowest values. Missing values
        stay missing.
    '''
    new_name = str(var) + "_discrete_q"
    # duplicates="drop": heavily tied data can produce repeated quantile
    # edges; they are merged, which may yield fewer than `buckets` groups.
    codes = pd.qcut(frame[var], q=buckets, labels=False, duplicates="drop")
    # labels=False yields 0-based bucket codes; shift so groups start at 1.
    frame[new_name] = codes + 1
    return frame

In [None]:
def make_dummies(frame, var, threshold = None):
    '''
    Makes dummy (0/1) variables out of a column.
    Inputs:
        frame: pandas dataframe (modified in place)
        var: name of the source column
        threshold: optional cut-off value
    Behaviour:
        - with a threshold: adds one column "<var>_d", equal to 1 where the
          value is strictly below the threshold and 0 otherwise
        - without a threshold: adds one column "<var>_d_<value>" for each
          distinct non-missing value of the column
    Outputs:
        the same dataframe with the new columns added.
    '''
    # "is not None" so that a threshold of 0 is still treated as a threshold.
    if threshold is not None:
        new_name = str(var) + "_d"
        frame[new_name] = (frame[var] < threshold).astype(int)
    else:
        # The categories are collected up front, before columns are added.
        for value in frame[var].dropna().unique():
            new_name = str(var) + "_d_" + str(value)
            frame[new_name] = (frame[var] == value).astype(int)
    return frame

## PART 1.B 

##### Build classifier

In [None]:
def drop_feats(varbs, frame):
    '''
    Returns a copy of the frame without the columns that should not be
    used as predictive features. The frame passed in is left untouched.
    Inputs:
        varbs: column name or list of column names to remove
        frame: pandas dataframe
    '''
    return frame.drop(labels=varbs, axis=1)

In [None]:
def split(frame, test_percentage, target_var, random_state = None):
    '''
    Splits data into train and test sections.
    Inputs:
        frame: pandas dataframe holding the features and the target
        test_percentage: share of the rows assigned to the test section
        target_var: name of the target column
        random_state: optional seed passed to train_test_split, so that the
            same split can be reproduced; None keeps the split random
    Outputs:
        x_train, x_test, y_train, y_test, where the x frames hold every
        column except target_var and the y series hold target_var
    '''
    features = frame.drop(target_var, axis=1)
    target = frame[target_var]
    x_train, x_test, y_train, y_test = train_test_split(
        features, target, test_size=test_percentage, random_state=random_state)

    return x_train, x_test, y_train, y_test

In [None]:
def knn_f(x_tr, y_tr, x_tst, y_tst, nn = 10, p =3, weights_ = 'distance'):
    '''
    Fits a KNN classifier on the training data and applies it to the
    testing data, according to the specifications provided.
    Inputs:
        x_tr, y_tr: training features and training labels
        x_tst: testing features
        y_tst: testing labels; not used here, kept so existing calls keep
            working
        nn: number of neighbors
        p: power parameter of the minkowski distance
        weights_: weighting scheme passed to the classifier
            ('uniform' or 'distance')
    Outputs:
        the result of predict_proba on x_tst: predicted class probabilities,
        one row per test observation and one column per class
    '''
    # p is given through the classifier's own argument rather than through
    # metric_params, which is how scikit-learn documents it for minkowski.
    knn = KNeighborsClassifier(n_neighbors=nn, metric='minkowski', p=p,
                               weights=weights_)
    knn.fit(x_tr, y_tr)
    return knn.predict_proba(x_tst)

##### Evaluate classifier

In [None]:
def test_evals(output_array, real_vals, evals, threshold):
    '''
    Returns an evaluation measure for the test predictions against the
    true values, for a given threshold and a given measure of evaluation.
    Inputs:
        output_array: (array) predicted probabilities for the test fraction
                    of the data. Either one probability per observation, or
                    a 2-D array with one column per class (predict_proba
                    output), in which case the last column is used.
        real_vals: (array) true values from test fraction of data (y_test),
                    coded as 0 / 1
        evals: (str) measure of evaluation. Can be accuracy, recall,
                    precision or specificity
        threshold: (float) threshold for predicted probabilities; an
                    observation is predicted positive when its probability
                    is greater than or equal to the threshold
    Outputs:
        Score (float). 0.0 is returned when the measure is undefined
        because its denominator is zero.
    Raises:
        ValueError if evals is not one of the supported measures.
    '''
    scores = np.asarray(output_array, dtype=float)
    if scores.ndim == 2:
        # assumes the last column is the positive (1) class - TODO confirm
        # for training data in which only one class is present
        scores = scores[:, -1]
    actual = np.asarray(real_vals)

    predicted_pos = scores >= threshold
    predicted_neg = ~predicted_pos
    actual_pos = actual == 1
    actual_neg = actual == 0

    # The confusion matrix is built from the thresholded prediction,
    # not from the raw probability.
    TP = int(np.sum(predicted_pos & actual_pos))
    FP = int(np.sum(predicted_pos & actual_neg))
    TN = int(np.sum(predicted_neg & actual_neg))
    FN = int(np.sum(predicted_neg & actual_pos))

    # measure -> (numerator, denominator)
    ratios = {
        'accuracy': (TP + TN, TP + TN + FP + FN),
        'recall': (TP, TP + FN),
        'precision': (TP, TP + FP),
        # true negatives over all actual negatives
        'specificity': (TN, TN + FP),
    }
    if evals not in ratios:
        raise ValueError(
            "evals must be one of: accuracy, recall, precision, specificity")

    numerator, denominator = ratios[evals]
    if denominator == 0:
        return 0.0
    return numerator / denominator


# The name starts with "test_": flag it so pytest never collects this helper
# as a test case if the code is imported from a test module.
test_evals.__test__ = False

In [None]:
def test_models(pvals, numn, weights, tested_param, threshold):
    '''
    Tests all possible specifications for a knn model given
    -pvals (tuple) : (start, stop) bounds for p in the knn model
                    specification; stop is excluded, as in range()
    -numn (list) : the values of number of neighbors that we want to test
    -weights (list) : kind of weights applied
    -tested_param (str): measure of evaluation. Can be accuracy, recall,
                    precision or specificity
    -threshold (float):  threshold for predicted probabilities.
    The data is read from the notebook-level variables xtrain, ytrain,
    xtest and ytest, which must exist before this function is called.
    Outputs the information for the best model as a tuple
    (p, number of neighbors, weights, score); when several models tie, the
    first one tested is kept. Returns None if no specification was tested.
    '''
    keep = None
    # Start below any attainable score so the first model tested is always
    # recorded, even when every score is 0.
    best_score = float("-inf")
    for p_vals in range(pvals[0], pvals[1]):
        for num_n in numn:
            for w in weights:
                pred = knn_f(xtrain, ytrain, xtest, ytest, num_n, p_vals, w)
                score_ = test_evals(pred, ytest, tested_param, threshold)

                if score_ > best_score:
                    keep = (p_vals, num_n, w, score_)
                    best_score = score_

    return keep


# The name starts with "test_": flag it so pytest never collects this helper
# as a test case if the code is imported from a test module.
test_models.__test__ = False

## PART 2.A


In [None]:
# Relative paths to the input files.
csv = "data/credit-data.csv"
geo = "data/chi_boundaries_zip.geojson"
# Load the credit data into the notebook-level frame `fr`, used by the cells
# below, and preview its first rows.
fr = read_files(csv)
fr.head()

In [None]:
# Every column of the credit data that is explored below.
all_vars = ["SeriousDlqin2yrs", "RevolvingUtilizationOfUnsecuredLines", "age", 
            "zipcode", "NumberOfTime30-59DaysPastDueNotWorse", "DebtRatio", 
            "MonthlyIncome","NumberOfOpenCreditLinesAndLoans", "NumberOfTimes90DaysLate", 
            "NumberRealEstateLoansOrLines", "NumberOfTime60-89DaysPastDueNotWorse",
            "NumberOfDependents"]

# For each variable: share of missing values, then general statistics.
# Note: check_mv also writes the helper columns "temp" and "temp2" on fr.
for i in all_vars:
    print ("******", i,"****** \n")
    print("Missing values percentages; 0 = not missing; 1 = mv")
    check_mv(fr, i)
    print("-----------")
    get_stats(fr, i)
    print("----------- \n" )

In [None]:
corr = show_cor(fr)
corr  # shows correlation grid

# We are particularly interested in the way SeriousDlqin2yrs is related to the other variables. The variables with the
# strongest relation are RevolvingUtilizationOfUnsecuredLines and all the variables counting the number of times late,
# in particular the number of times 90 days late.
# Since, in turn, NumberOfTimes90DaysLate is relatively highly correlated with the variables measuring
# DaysPastDueNotWorse, we will only focus on this variable from this group.

# Age seems to be relatively strong, but zipcode and number of dependents seem less strongly correlated. We proceed to
# observe dynamics grouping by zipcode and number of dependents. Monthly income is moderately and negatively correlated
# with the target variable.


In [None]:
# Subset of variables whose means are compared across zipcodes.
vars_selected = ["SeriousDlqin2yrs", "age", "DebtRatio", 
            "MonthlyIncome", "NumberOfOpenCreditLinesAndLoans", 
            "NumberRealEstateLoansOrLines", "NumberOfDependents"]

# Printing stats by zipcode: with a grouping variable, get_stats prints the
# mean of each variable within every zipcode.
for i in vars_selected: 
    print ("******", i,"****** \n")
    get_stats(fr, i, "zipcode")
    print("----------- \n" )

In [None]:
# Share of the rows held out for testing.
test_percentage = .3
# we can also further loop changing test percentage. 
# Columns excluded from the model: the two DaysPastDueNotWorse counters, and
# the "temp"/"temp2" helper columns that check_mv wrote on fr. The drop
# expects those helper columns to exist, so the exploration cell has to be
# run before this one.
vars_remove = ["NumberOfTime30-59DaysPastDueNotWorse", "NumberOfTime60-89DaysPastDueNotWorse", "temp", "temp2"]
fr2 = drop_feats(vars_remove, fr)

In [None]:
# Feature columns that are cleaned before modelling.
feats = ["RevolvingUtilizationOfUnsecuredLines", "age", 
         "zipcode","DebtRatio", "MonthlyIncome","NumberOfOpenCreditLinesAndLoans", 
          "NumberOfTimes90DaysLate", "NumberRealEstateLoansOrLines",
         "NumberOfDependents"]

# Split first, then clean the train and test features separately; clean_var
# modifies each frame in place, so its return value is not needed.
# NOTE(review): the 99th-percentile cap and the mean used by clean_var are
# therefore computed on each section on its own - confirm this is intended.
xtrain, xtest, ytrain, ytest = split(fr2, test_percentage, "SeriousDlqin2yrs")  
for i in xtrain, xtest: 
    for v in feats:         
        clean_var(i, v)

In [None]:
# Grid of specifications handed to test_models.
# pvals is used as range(pvals[0], pvals[1]), i.e. p = 1, ..., 9.
pvals = (1, 10)
numn = [1,5,10,20,40,80,100]
ws = ['uniform','distance']
test_param = 'accuracy'
thresh = .7 #Note: can also loop further with threshold
# test_models reads xtrain, ytrain, xtest and ytest from the notebook scope
# and returns the tuple (p, number of neighbors, weights, score).
best_model = test_models(pvals, numn, ws, test_param, thresh)
print("best p:", best_model[0])
print("number of neighborhs:", best_model[1])
print("w: ", best_model[2])
print("Score for {}: {}".format(test_param, best_model[3]))