# Data Prep for Modeling

Author: Tim Gorman

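This notebook prepares the pairs data for modeling: it cleans the string columns, computes the angular difference between the two coordinates of each pair, and computes cosine similarities between the paired string features. Only the subset of pairs.csv in which both records have the country code 'US' is used.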

In [None]:
import pandas as pd
import numpy as np
import string
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
# Rebinds the name `stopwords` from the imported NLTK corpus object to the list
# of English stop words returned by .words('english').
# NOTE(review): nothing in the visible cells reads this list — confirm whether it is still needed.
stopwords = stopwords.words('english')

# from googletrans import Translator, constants
import seaborn as sns
import langdetect
#from langdetect import detect
import matplotlib.pyplot as plt

In [None]:
# Load the raw pairs table; the path is relative to the notebook's working directory.
df_pairs = pd.read_csv('../../data_raw/pairs.csv')

In [None]:
# Summarize the columns, dtypes, and non-null counts of the raw table.
df_pairs.info()

## Reducing to US Matches

In [None]:
# Keep only the rows where both records of the pair have country code 'US'.
both_us = (df_pairs['country_1'] == 'US') & (df_pairs['country_2'] == 'US')
# .copy() makes the subset an independent frame, so the column assignments in
# later cells modify it directly instead of raising SettingWithCopyWarning.
df_pairs = df_pairs[both_us].copy()

In [None]:
# Preview the first rows of the US-only subset.
df_pairs.head()

## Defining Types, Filling NA Values, Lowercasing, and Removing Punctuation

In [None]:
# Text columns, listed as (_1, _2) pairs — one pair per line.
string_features = [
    'name_1', 'name_2',
    'address_1', 'address_2',
    'city_1', 'city_2',
    'state_1', 'state_2',
    'zip_1', 'zip_2',
    'country_1', 'country_2',
    'url_1', 'url_2',
    'phone_1', 'phone_2',
    'categories_1', 'categories_2',
]

In [None]:
# Coordinate columns for the two records of each pair.
position_features = [
    'latitude_1', 'latitude_2',
    'longitude_1', 'longitude_2',
]

In [None]:
# Normalize every text column: missing -> '', cast to str, lower-case, strip punctuation.
#
# Punctuation is removed with a translation table rather than a regex character
# class: Series.str.replace treats its pattern as a literal string by default on
# pandas >= 2.0 (so the '[...]' pattern would match nothing), and the unescaped
# '\]' inside the class would leave backslashes in place even in regex mode.
punctuation_table = str.maketrans('', '', string.punctuation)
df_pairs[string_features] = (
    df_pairs[string_features]
    .fillna('')
    .astype(str)
    .apply(lambda column: column.str.lower().str.translate(punctuation_table))
)


In [None]:
# Preview the frame after the string columns have been normalized.
df_pairs.head()

In [None]:
# Make sure the coordinate columns are 64-bit floats before doing trigonometry on them.
coordinates = df_pairs[position_features]
df_pairs[position_features] = coordinates.astype(np.float64)

In [None]:
# Renumber the rows 0..n-1 after filtering; drop=True discards the old index
# directly instead of materializing it as an 'index' column and dropping it again.
df_pairs = df_pairs.reset_index(drop=True)

## Calculating Angular Difference for Lat. and Long.

In [None]:
def angular_difference(latitude_1, longitude_1, latitude_2, longitude_2):
    """Central angle, in radians, between two points given in degrees.

    Uses the Vincenty great-circle formula on a unit sphere. Inputs may be
    scalars, arrays, or pandas Series (combined element-wise).

    Returns values in [0, pi]; NaN inputs propagate to NaN.
    """
    lat_1 = np.radians(latitude_1)
    lat_2 = np.radians(latitude_2)
    lon_delta = np.radians(longitude_1 - longitude_2)

    # sin(theta) and cos(theta) of the central angle, up to a common positive factor.
    sin_theta = np.sqrt(
        (np.cos(lat_2) * np.sin(lon_delta)) ** 2
        + (np.cos(lat_1) * np.sin(lat_2)
           - np.sin(lat_1) * np.cos(lat_2) * np.cos(lon_delta)) ** 2
    )
    cos_theta = (
        np.sin(lat_1) * np.sin(lat_2)
        + np.cos(lat_1) * np.cos(lat_2) * np.cos(lon_delta)
    )

    # arctan2 keeps the quadrant, so angles above 90 degrees are not folded back
    # into [0, pi/2] (as abs(arctan(sin/cos)) would do) and cos_theta == 0 does
    # not cause a division by zero.
    return np.arctan2(sin_theta, cos_theta)


df_pairs['theta_diff'] = angular_difference(
    df_pairs['latitude_1'], df_pairs['longitude_1'],
    df_pairs['latitude_2'], df_pairs['longitude_2'],
)

## Calculating Cosine Similarities for String Features

In [None]:
def _pair_cosine(text_1, text_2):
    """Cosine similarity between the token-count vectors of two strings.

    Returns -1 (the "not comparable" marker) when either string is empty or
    whitespace-only, or when vectorizing raises ValueError (e.g. neither string
    contains a token CountVectorizer keeps, so the vocabulary is empty).
    """
    if not text_1.strip() or not text_2.strip():
        return -1
    try:
        counts = CountVectorizer().fit_transform([text_1, text_2])
        return cosine_similarity(counts)[0][1]
    except ValueError:
        return -1


for item in string_features:
    # Each pair is handled once, driven by its '_1' column.
    if "_1" not in item:
        continue
    print(item)
    partner = item.replace('_1', '_2')
    # Exactly one value is produced per row, in row order. The previous loop
    # skipped the append when ValueError was raised, which shifted every later
    # similarity onto the wrong row.
    df_pairs[item.replace('_1', '_cos')] = [
        _pair_cosine(text_1, text_2)
        for text_1, text_2 in zip(df_pairs[item], df_pairs[partner])
    ]
            
        

Filling any remaining NaNs in the cosine features with -1.

In [None]:
# Names of the cosine-similarity columns, one per string-feature pair.
cos_features = [
    'name_cos',
    'address_cos',
    'city_cos',
    'state_cos',
    'zip_cos',
    'country_cos',
    'url_cos',
    'phone_cos',
    'categories_cos',
]

In [None]:
# Replace any NaN in the cosine columns with -1, the same marker the similarity
# loop assigns when a value is empty or cannot be vectorized.
df_pairs[cos_features] = df_pairs[cos_features].fillna(-1)

In [None]:
# The raw text columns are no longer needed once their similarities are computed.
df_pairs = df_pairs.drop(columns=string_features)

In [None]:
# Both countries are 'US' on every remaining row, so this similarity is constant.
df_pairs = df_pairs.drop(columns='country_cos')

In [None]:
# Preview the final feature table.
df_pairs.head()

In [None]:
# Check the dtypes and non-null counts of the final feature table.
df_pairs.info()

In [None]:
# Summary statistics of the final feature table.
df_pairs.describe()

In [None]:
# Pairwise correlations between the numeric and boolean columns.
# The columns are selected explicitly because DataFrame.corr() stopped silently
# dropping non-numeric columns in pandas 2.0 and raises on them instead.
# NOTE(review): assumes the frame may still hold non-numeric columns (e.g. ids) — confirm.
df_pairs_corr = df_pairs.select_dtypes(include=['number', 'bool']).corr()

In [None]:
# Draw the correlation matrix on an explicit Axes and give it a title so the
# figure is readable on its own; the trailing ';' hides the text repr.
fig, ax = plt.subplots()
sns.heatmap(df_pairs_corr, ax=ax)
ax.set_title('Correlation between pair features');

# Saving dataframe to CSV

In [None]:
# Write the prepared feature table to the curated-data folder.
# NOTE(review): to_csv also writes the row index as an unnamed first column by
# default — confirm how downstream readers load this file (e.g. index_col=0)
# before switching to index=False.
df_pairs.to_csv('../../data_curated/pairs_us_cosines.csv')