In this notebook I train a logistic regression model on the data set "pairs_final_diffed_us.csv". That data is drawn from "pairs.csv" and limited to country code US.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
import seaborn as sns

In [None]:
# Load the US-only pairs data set into a DataFrame.
# NOTE(review): this is an absolute path on the original author's machine; point it at your own copy before running.
data = pd.read_csv(r"C:\Users\gorma\OneDrive\Documents\Erdos\foursquare-location-matching\tim_code\working_data\pairs_final_diffed_us.csv")

In [None]:
# List the column names of the loaded frame.
data.columns

In [None]:
# Discard the 'Unnamed: 0' column that came in with the CSV; it is not used below.
data = data.drop(columns=['Unnamed: 0'])

## Inspecting Data Before Modeling

In [None]:
# Preview the first rows of the data.
data.head()

In [None]:
# Summarize column dtypes and non-null counts.
data.info()

In [None]:
# Feature columns fed to the model; 'match' is the target and is excluded.
x_col = [
    'theta_diff',
    'name_cosines',
    'full_address_cosines',
    'categories_cosines',
]

In [None]:
# 80/20 train/test split, stratified on 'match' so both parts keep the same
# class balance; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    data[x_col],
    data['match'],
    test_size=0.2,
    random_state=614,
    shuffle=True,
    stratify=data['match'],
)

## Calculating True/False Ratio

In [None]:
# Number of matching pairs per non-matching pair in the training labels.
len(y_train.loc[y_train == True]) / len(y_train.loc[y_train == False])

In [None]:
# Share of the training labels that are matches.
len(y_train.loc[y_train == True]) / len(y_train)

## Modeling the Unscaled Data

In [None]:
# Logistic regression with scikit-learn's default hyperparameters.
lreg = LogisticRegression()

In [None]:
# Fit on the training split of the unscaled features.
lreg.fit(x_train, y_train)

In [None]:
# Hard class predictions for the held-out test split.
y_pred = lreg.predict(x_test)

In [None]:
# Display the predicted labels.
y_pred

In [None]:
# Confusion matrix on the test split (rows: true labels, columns: predicted labels).
confusion_matrix(y_test, y_pred)

In [None]:
# ROC AUC measures how well the model ranks positives above negatives, so it
# must be scored on the predicted probability of the positive class (column 1
# of predict_proba), not on the already-thresholded labels in y_pred.
roc_auc_score(y_test, lreg.predict_proba(x_test)[:, 1])

In [None]:
# Sanity check: number of predictions relative to number of test labels.
# y_pred was produced from x_test, so this is expected to be 1.0.
len(y_pred)/len(y_test)

## Modeling the Scaled Data

In [None]:
# Load the scaled version of the US pairs data set.
# NOTE(review): this is an absolute path on the original author's machine; point it at your own copy before running.
data_scaled = pd.read_csv(r"C:\Users\gorma\OneDrive\Documents\Erdos\foursquare-location-matching\tim_code\working_data\pairs_final_diffed_us_scaled.csv")

In [None]:
# Preview the first rows of the scaled data.
data_scaled.head()

In [None]:
# Stratified 80/20 split of the scaled data, with the same seed as the
# unscaled run. Stratify on the labels of the frame that is actually being
# split (data_scaled), not on the labels of the unscaled frame: the two are
# only interchangeable if both files have identical row order and length.
x_train, x_test, y_train, y_test = train_test_split(data_scaled[x_col], data_scaled['match'],
                                                    shuffle = True,
                                                    random_state = 614,
                                                    test_size = 0.2,
                                                    stratify = data_scaled['match'])
# Refit a fresh default logistic regression on the scaled features and
# predict hard labels for the scaled test split.
lreg = LogisticRegression()
lreg.fit(x_train, y_train)
y_pred = lreg.predict(x_test)

In [None]:
# Display the training labels of the scaled split.
y_train

In [None]:
# Display the test labels of the scaled split.
y_test

In [None]:
# Confusion matrix for the model trained on the scaled features.
confusion_matrix(y_test, y_pred)

In [None]:
# ROC AUC measures how well the model ranks positives above negatives, so it
# must be scored on the predicted probability of the positive class (column 1
# of predict_proba), not on the already-thresholded labels in y_pred.
roc_auc_score(y_test, lreg.predict_proba(x_test)[:, 1])

So scaling the features didn't change the results much.

## Now playing with the classification threshold on the scaled data

In [None]:
# Display the training features of the scaled split.
x_train

In [None]:
# Predicted probability of the positive class for every training row.
y_train_prob = lreg.predict_proba(x_train)[:, 1]
# Candidate decision thresholds: 0.00, 0.01, ..., 1.00.
cutoffs = np.arange(0, 1.01, 0.01)


In [None]:
# Display the predicted positive-class probabilities for the training rows.
y_train_prob

In [None]:
# Sweep the decision threshold and record, for each cutoff, the training
# accuracy and the ROC AUC of the resulting hard 0/1 predictions.
accs = []
aucs = []
for cutoff in cutoffs:
    # Label a row positive when its predicted probability reaches the cutoff.
    y_train_pred = 1*(y_train_prob >= cutoff)

    # Fraction of training rows whose thresholded prediction equals the label.
    accs.append(np.mean(y_train_pred == y_train))
    # AUC of the thresholded labels (not of the probabilities), so it varies
    # with the cutoff.
    aucs.append(roc_auc_score(y_train, y_train_pred))

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Training accuracy as a function of the decision cutoff.
fig, ax = plt.subplots(figsize=(12, 8))

ax.scatter(cutoffs, accs)

ax.set_xlabel("Cutoff", fontsize=16)
ax.set_ylabel("Training Accuracy", fontsize=16)

plt.show()

In [None]:
# AUC of the thresholded training predictions as a function of the decision cutoff.
fig, ax = plt.subplots(figsize=(12, 8))

ax.scatter(cutoffs, aucs)

ax.set_xlabel("Cutoff", fontsize=16)
ax.set_ylabel("AUC", fontsize=16)

plt.show()

# A Naive Model

In [None]:
# Coin-flip baseline: predict True or False uniformly at random for each test row.
sample_arr = [True, False]

# Draw from a seeded generator (same seed as the train/test split) so the
# baseline's confusion matrix and AUC are reproducible across runs.
naive_pred = np.random.default_rng(614).choice(sample_arr, len(y_test))

In [None]:
# Confusion matrix of the coin-flip baseline on the test labels.
confusion_matrix(y_test, naive_pred)

In [None]:
# AUC of the coin-flip baseline, for comparison with the fitted model.
roc_auc_score(y_test, naive_pred)

In [None]:
# Constant baseline: predict True for every test row.
sample_arr = [True]

# There is only one value to choose from, so build the constant array
# directly instead of drawing it through the random number generator.
naive_pred = np.full(len(y_test), sample_arr[0])

In [None]:
# Display the all-True baseline predictions.
naive_pred

In [None]:
# Confusion matrix of the all-True baseline on the test labels.
confusion_matrix(y_test, naive_pred)

In [None]:
# AUC of the all-True baseline; a constant prediction carries no ranking
# information, so this is expected to come out at 0.5.
roc_auc_score(y_test, naive_pred)

In [None]:
# Number of rows in the test split.
len(y_test)