# Final Modeling Notebook Intro

Authors: Tim Gorman, Yu Cao, Ling Zhou

In this notebook we model the data output by data_prep2.0 (`"./foursquare-location-matchin/cao_code/data_prep2.0.ipynb"`). <br>
This data has some important aspects: <br>
* Curated Ground Truth Data
    * This is the raw data from Kaggle after removing NaNs and noise and keeping only the pairs where country_1 and country_2 are both 'US'.
    * Label:
        * match - Indicates whether two ids represent the same point of interest
    * Features:
        * id_1, id_2
        * Modeling Features: 
        * name_1, name_2 
        * latitude_1, latitude_2 
        * longitude_1, longitude_2 
        * address_1, address_2
        * city_1, city_2
        * state_1, state_2
        * country_1, country_2
        * url_1, url_2
        * phone_1, phone_2
        * categories_1, categories_2
* Differenced Data
    * Lat./Long. Differences
        * Simple Euclidean Difference (geo_diff)
        * Angular Difference (geo_theta_diff)
    * String differences
        * Three different ways to calculate differences
            * Sequence Matching (*_seq)
            * Levenshtein Distance (*_lev)
            * Cosine Similarity (*_csim)

We then take the different ways to difference data and run them through a Logistic regression model to see which differencing method performs best. Based on this, the best method will be sent through a random forest classification model. At the end of the notebook we'll compare all of the results against two naive models.
  

# Library Imports

In [None]:
#Library Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import (train_test_split, cross_validate,
                                     StratifiedKFold, GridSearchCV, RandomizedSearchCV)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (confusion_matrix, roc_auc_score, f1_score,
                             accuracy_score, precision_score, recall_score,
                             precision_recall_curve, roc_curve)
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
import matplotlib.pyplot as plt
import random

In [None]:
# Load the curated pair-level data that comes out of "data_prep2.0".
# Forward slashes keep this relative path valid on Windows, macOS and Linux;
# the previous backslash-separated raw string only resolved on Windows.
data = pd.read_csv("../data_curated/pairs_us_diffs.csv", low_memory = False)

In [None]:
# Maps each string-metric family to the model input columns that use it.
# Every family shares the angular geo difference and then adds one column per
# text field, named "<field><suffix>" (cosine similarity, sequence matching,
# Levenshtein).
_string_fields = ['name', 'address', 'city', 'state', 'zip', 'url', 'phone',
                  'categories']

feature_dict = {
    'x_col_csim': ['geo_theta_diff'] + [field + '_csim' for field in _string_fields],
    'x_col_seq': ['geo_theta_diff'] + [field + '_diff_seq' for field in _string_fields],
    'x_col_lev': ['geo_theta_diff'] + [field + '_diff_lev' for field in _string_fields],
}

# Function Definitions

In [None]:
# Defining Modeling Functions
def logistic_regression_pipe(features, labels):
    """Grid-search logistic-regression hyperparameters on a training split.

    A stratified 80/20 split (seed 614) is taken from ``features``/``labels``
    and only the training part is searched; the held-out part is not used
    here. Every solver / C combination (L2 penalty) is scored by accuracy
    under 10-fold stratified cross-validation, a per-candidate summary is
    printed, and the winning parameters are returned.

    Parameters
    ----------
    features : feature table passed to ``train_test_split``.
    labels : class labels; also used to stratify the split.

    Returns
    -------
    dict
        ``best_params_`` of the fitted ``GridSearchCV``.
    """
    train_x, _, train_y, _ = train_test_split(
        features, labels,
        test_size=0.2,
        shuffle=True,
        stratify=labels,
        random_state=614,
    )

    folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=614)

    # Hyperparameter grid: three solvers x nine regularisation strengths.
    param_grid = {
        'solver': ['newton-cg', 'lbfgs', 'liblinear'],
        'penalty': ['l2'],
        'C': [100, 50, 10, 5, 1.0, 0.5, 0.1, 0.05, 0.01],
    }

    search = GridSearchCV(
        estimator=LogisticRegression(),
        param_grid=param_grid,
        scoring='accuracy',
        cv=folds,
        n_jobs=-1,
        error_score=0,  # a candidate whose fit fails is scored 0 instead of raising
    )
    search.fit(train_x, train_y)

    print("Best mean test score {:5.5f} using {}".format(search.best_score_, search.best_params_))

    # One line per candidate: mean accuracy, its std across folds, the params.
    results = search.cv_results_
    print("mean     std             param")
    print("------------------------------"*2)
    row_template = "{:5.5f}  ({:5.5f}) with: {}"
    for row in zip(results['mean_test_score'], results['std_test_score'], results['params']):
        print(row_template.format(*row))

    return search.best_params_

def random_forest_pipe(features, labels):
    """Run a randomised hyperparameter search for a random-forest classifier.

    A stratified 80/20 split (seed 614) is taken from ``features``/``labels``
    and the search is fitted on the training part only. Ten parameter
    combinations are sampled from the grid below and each is evaluated with
    3-fold cross-validation.

    Parameters
    ----------
    features : feature table passed to ``train_test_split``.
    labels : class labels; also used to stratify the split.

    Returns
    -------
    dict
        ``best_params_`` of the fitted ``RandomizedSearchCV`` with the keys
        ``n_estimators``, ``max_features``, ``max_depth``,
        ``min_samples_split``, ``min_samples_leaf`` and ``bootstrap``.
    """
    # Stratify on the labels that were passed in. The previous version reached
    # for the notebook-global ``data['match']``, which breaks (or silently
    # mis-stratifies) as soon as the function is called with any other labels.
    x_train, _, y_train, _ = train_test_split(features, labels,
                                              shuffle      = True,
                                              random_state = 614,
                                              test_size    = 0.2,
                                              stratify     = labels)
    #Defining Estimators
    # Number of trees in random forest
    n_estimators = [int(x) for x in np.linspace(start = 200, stop = 1200, num = 6)]
    # Number of features to consider at every split.
    # 'auto' was an alias of 'sqrt' for classifiers and is no longer accepted
    # by current scikit-learn, so it is replaced by a genuinely different option.
    max_features = ['sqrt', 'log2']
    # Maximum number of levels in tree
    max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
    max_depth.append(None)
    # Minimum number of samples required to split a node
    min_samples_split = [2, 5, 10]
    # Minimum number of samples required at each leaf node
    min_samples_leaf = [2, 4, 10]
    # Method of selecting samples for training each tree
    bootstrap = [True, False]
    # Create the random grid
    random_grid = {'n_estimators'     : n_estimators,
                   'max_features'     : max_features,
                   'max_depth'        : max_depth,
                   'min_samples_split': min_samples_split,
                   'min_samples_leaf' : min_samples_leaf,
                   'bootstrap'        : bootstrap}
    # Base model to tune; seeded so candidate scores are repeatable between runs
    rf = RandomForestClassifier(random_state = 614)
    # Random search of parameters, using 3 fold cross validation,
    # search across 10 sampled combinations, and use all available cores
    rf_random = RandomizedSearchCV(estimator = rf,
                                   param_distributions = random_grid,
                                   n_iter = 10, cv = 3, verbose=2,
                                   random_state=614, n_jobs = -1)
    # Fit the random search model
    rf_random.fit(x_train, y_train)

    print(rf_random.best_params_)

    return rf_random.best_params_


# Logistic Regression With Hyperparameter tuning and K-fold Cross_validation

In [None]:
# Tune a logistic regression once per string-metric feature set and keep the
# winning hyperparameters under that feature set's key.
best_params_dict = {}
for features, columns in feature_dict.items():
    print(features)
    best_params = logistic_regression_pipe(features=data[columns], labels=data['match'])
    best_params_dict[features] = best_params

In [None]:
# Show the hyperparameters the grid search selected for the Levenshtein feature set.
best_params_dict['x_col_lev']

What I take from the results above is that the Levenshtein distance works best as the string-difference metric. I ran this twice, once with "geo_diff" and once with "geo_theta_diff" (the "geo_diff" run is not shown). They give nearly identical results, so I'll stick with "geo_theta_diff" because I believe it to be the more accurate metric. <br> <br>
Here's what I will use for the "best logistic regression" on the test set.

x_col_lev
Best: 0.760583 using {'C': 0.01, 'penalty': 'l2', 'solver': 'liblinear'}

In [None]:
# Shuffled 10-fold stratified splitter (seed 614) used as the `cv` argument of
# the cross-validation run below.
skf = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 614)

In [None]:
# Rebuild the logistic regression from the hyperparameters stored for the
# Levenshtein feature set; the stored dict holds exactly 'C', 'penalty' and
# 'solver', so it can be unpacked straight into the constructor.
lreg_best = LogisticRegression(**best_params_dict['x_col_lev'])

In [None]:
# Stratified 80/20 split of the Levenshtein features; the held-out 20% is the
# test set the tuned logistic regression is scored on further down.
x_train, x_test, y_train, y_test = train_test_split(
    data[feature_dict['x_col_lev']],
    data['match'],
    test_size=0.2,
    shuffle=True,
    stratify=data['match'],
    random_state=614,
)

In [None]:
# Cross-validate the tuned model on the training split with the shared
# stratified folds, collecting five metrics per fold.
scores = cross_validate(
    lreg_best,
    x_train,
    y_train,
    scoring=['accuracy', 'roc_auc', 'f1', 'average_precision', 'jaccard'],
    cv=skf,
    n_jobs=-1,
)

Results

In [None]:
# Report the fold-averaged value of every cross-validation metric,
# heading first and value on the following line.
for heading, key in [('Mean Accuracy', 'test_accuracy'),
                     ('Mean roc_auc', 'test_roc_auc'),
                     ('Mean F1', 'test_f1'),
                     ('Mean Average Precision', 'test_average_precision'),
                     ('Mean Jaccard', 'test_jaccard')]:
    print(heading)
    print(np.mean(scores[key]))

## Logistic Regression on Test Set

In [None]:
# Fit on the full training split, then label the held-out pairs
# (fit() returns the estimator itself, so the calls can be chained).
y_pred = lreg_best.fit(x_train, y_train).predict(x_test)

In [None]:
# Confusion matrix of the logistic regression's predicted labels on the test split.
confusion_matrix(y_test, y_pred)

In [None]:
# ROC AUC measures how well the model ranks pairs, so it is computed from the
# predicted probability of the positive class (column 1 of predict_proba).
# Passing the already-thresholded labels, as before, only evaluates the single
# default operating point and understates the model's ranking quality.
roc_auc_score(y_test, lreg_best.predict_proba(x_test)[:, 1])

In [None]:
# F1 score of the logistic regression's predicted labels on the test split.
f1_score(y_test, y_pred)

In [None]:
# Accuracy of the logistic regression's predicted labels on the test split.
accuracy_score(y_test, y_pred)

In [None]:
# Precision/recall pairs over the probability thresholds, computed on the test
# split from the second predict_proba column (the positive class).
precision, recall, thresholds = precision_recall_curve(y_test, lreg_best.predict_proba(x_test)[:,1])

In [None]:
# Precision-recall curve of the logistic regression on the test split.
fig, ax = plt.subplots(figsize=(12,8))
ax.plot(recall, precision)
ax.set_xlabel("Recall", fontsize=16)
ax.set_ylabel("Precision", fontsize=16)
plt.show()

In [None]:
# False/true positive rates over the probability thresholds on the test split.
# NOTE: this rebinds `thresholds`, which the precision-recall cell also assigns.
fpr, tpr, thresholds = roc_curve(y_test, lreg_best.predict_proba(x_test)[:,1])

In [None]:
# ROC curve of the logistic regression on the test split.
fig, ax = plt.subplots(figsize=(12,8))
ax.plot(fpr, tpr)
ax.set_xlabel("False Positive Rate", fontsize=16)
ax.set_ylabel("True Positive Rate", fontsize=16)
plt.show()

In [None]:
# Sweep the decision cutoff over [0, 1] in steps of 0.01 and record how the
# training-set metrics respond when a pair is labelled a match whenever its
# predicted probability reaches the cutoff.
cutoffs = np.arange(0, 1.01, 0.01)
y_train_prob = lreg_best.predict_proba(x_train)[:,1]

# The grid is built once (it used to be rebuilt on every iteration), and the
# unused `precision` / `recall` lists are gone: they were never filled and
# overwrote the precision-recall arrays computed in an earlier cell.
accs, aucs, f1s = [], [], []
for cutoff in cutoffs:
    y_train_pred = 1*(y_train_prob >= cutoff)

    accs.append(accuracy_score(y_train, y_train_pred))
    # AUC of thresholded labels, i.e. evaluated at this single cutoff
    aucs.append(roc_auc_score(y_train, y_train_pred))
    f1s.append(f1_score(y_train, y_train_pred))

# One scatter plot per metric, all against the cutoff.
for metric_values, y_label in [(accs, "Training Accuracy"),
                               (aucs, "AUC"),
                               (f1s, "F1")]:
    plt.figure(figsize=(12,8))

    plt.scatter(cutoffs, metric_values)

    plt.xlabel("Cutoff",fontsize=16)
    plt.ylabel(y_label,fontsize=16)

    plt.show()


# Random Forest with Hyperparameter Tuning

Because the Levenshtein distance served as the best string difference metric in the logistic regression analysis, we'll use it as the input data set for the following random forest model.

In [None]:
# Stratified 80/20 train/test split using the Levenshtein columns
# (same seed and settings as the split used for the logistic regression).
x_train, x_test, y_train, y_test = train_test_split(
    data[feature_dict['x_col_lev']],
    data['match'],
    test_size=0.2,
    shuffle=True,
    stratify=data['match'],
    random_state=614,
)

## Hyperparameter Tuning

In [None]:
# Run the randomised random-forest search on the Levenshtein feature set.
# NOTE: this rebinds `best_params`, the name the logistic-regression tuning
# loop also assigned; from here on it holds the random-forest parameters.
best_params = random_forest_pipe(features = data[feature_dict['x_col_lev']], labels = data['match'])

## Tuned Random Forest on Test Data

In [None]:
# Rebuild the forest from every hyperparameter the search tuned. `bootstrap`
# was part of the search space but used to be dropped here, so the final model
# could differ from the configuration that actually won the search.
# `n_estimators` is now passed by keyword instead of by position.
rf_best = RandomForestClassifier(n_estimators = best_params['n_estimators'],
                                max_features = best_params['max_features'],
                                max_depth = best_params['max_depth'],
                                min_samples_split = best_params['min_samples_split'],
                                min_samples_leaf = best_params['min_samples_leaf'],
                                bootstrap = best_params['bootstrap'],
                                random_state = 614)

In [None]:
# Stratified 80/20 split of the Levenshtein features for fitting and scoring
# the tuned random forest (same seed and settings as the earlier splits).
x_train, x_test, y_train, y_test = train_test_split(
    data[feature_dict['x_col_lev']],
    data['match'],
    test_size=0.2,
    shuffle=True,
    stratify=data['match'],
    random_state=614,
)

In [None]:
# Fit the tuned random forest on the training split.
rf_best.fit(x_train,y_train)

In [None]:
# Predicted labels of the random forest for the held-out test split.
y_rf_pred = rf_best.predict(x_test)

In [None]:
# F1 score of the random forest's predicted labels on the test split.
f1_score(y_test, y_rf_pred)

In [None]:
# Accuracy of the random forest's predicted labels on the test split.
accuracy_score(y_test, y_rf_pred)

In [None]:
# ROC AUC measures how well the model ranks pairs, so it is computed from the
# predicted probability of the positive class (column 1 of predict_proba)
# rather than from the already-thresholded labels used before.
roc_auc_score(y_test, rf_best.predict_proba(x_test)[:, 1])

In [None]:
# Confusion matrix of the random forest's predicted labels on the test split.
confusion_matrix(y_test, y_rf_pred)

In [None]:
# Show the importances next to the column they belong to, largest first;
# the bare array gave no indication of which value maps to which feature.
pd.Series(rf_best.feature_importances_, index=x_train.columns).sort_values(ascending=False)

## A Naive Model V1: (developed by Ling Zhou) 

In [None]:
# Columns the naive rule looks at. Note that it uses 'geo_diff', whereas the
# fitted models above use 'geo_theta_diff'.
baseline_cols = ['geo_diff', 'name_diff_lev', 'address_diff_lev']

In [None]:
# Stratified 80/20 split restricted to the naive model's columns
# (same seed and settings as the splits used for the fitted models).
x_train, x_test, y_train, y_test = train_test_split(
    data[baseline_cols],
    data['match'],
    test_size=0.2,
    shuffle=True,
    stratify=data['match'],
    random_state=614,
)

In [None]:
def baseline(df, cols, thresh, seed=None):
    """Naive matcher: call a pair a match when all its differences are small.

    Every row first receives a coin-flip guess. Rows whose largest value
    across ``cols`` is below ``thresh`` are then forced to ``True``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the difference columns; it is not modified.
    cols : list of str
        Columns compared against ``thresh``.
    thresh : float
        A row is labelled a match when ``max(row[cols]) < thresh``.
    seed : int, optional
        Seed for the coin flips. With the default ``None`` the flips come
        from NumPy's global random state, exactly as before.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with an added boolean ``match`` column.
    """
    df_copy = df.copy()

    # Random guess for every row
    if seed is None:
        coin_flips = np.random.choice([True, False], len(df))
    else:
        coin_flips = np.random.RandomState(seed).choice([True, False], len(df))
    df_copy['match'] = coin_flips

    # Overwrite only the 'match' cell of the close rows. Without the column
    # selector the assignment replaced every column of those rows with True
    # and destroyed the feature values in the returned frame.
    is_close = df_copy[cols].max(axis=1) < thresh
    df_copy.loc[is_close, 'match'] = True

    return df_copy

In [None]:
# Score the naive rule on the held-out pairs (Levenshtein-based columns).
# The rule falls back to coin flips drawn from NumPy's global random state,
# so that state is seeded first; otherwise every run reports different metrics.
np.random.seed(614)
y_pred = baseline(x_test, baseline_cols, 0.1).match
print(confusion_matrix(y_test,y_pred),'\n', roc_auc_score(y_test,y_pred))

In [None]:
# F1 score of the naive rule's labels on the test split.
f1_score(y_test, y_pred)

In [None]:
# Accuracy of the naive rule's labels on the test split.
accuracy_score(y_test, y_pred)

In [None]:
# ROC AUC of the naive rule's labels on the test split (the same call is also
# printed by the evaluation cell above).
roc_auc_score(y_test, y_pred)

## A Naive Model V2: Guess all matches are True

In [None]:
# "Always a match" baseline: one True per test row. The vector is constant,
# so it is built directly instead of being sampled from a one-element list
# through the random number generator.
naive_pred = np.ones(len(y_test), dtype=bool)

In [None]:
# Display the constant prediction vector.
naive_pred

In [None]:
# Confusion matrix of the all-True guess on the test split.
confusion_matrix(y_test, naive_pred)

In [None]:
# ROC AUC of the all-True guess on the test split.
roc_auc_score(y_test, naive_pred)

In [None]:
# F1 score of the all-True guess on the test split.
f1_score(y_test, naive_pred)

In [None]:
# Accuracy of the all-True guess on the test split.
accuracy_score(y_test, naive_pred)