# Matchmaking!

In [None]:
from sklearn import metrics
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
import pandas as pd
import numpy as np

In [None]:
# Load the profiles table; pandas reads the CSV straight out of the zip archive.
df = pd.read_csv('profiles_revised.csv.zip')
# Show three randomly chosen rows as a quick sanity check of the columns.
df.sample(3)

In [None]:
# (number of rows, number of columns) of the loaded table.
df.shape

## Simple Idea: Find Commonalities

In [None]:
# Frequency of each body_type value (NaN entries are excluded by default).
df['body_type'].value_counts()

In [None]:
# Frequency of each drinks value (NaN entries are excluded by default).
df['drinks'].value_counts()

### Putting People Together

Suppose we want to find a match for this person:

In [None]:
# Use the first row of the table as the person to find matches for.
match_me = df.iloc[0]
match_me

In [None]:
# Keep only profiles that share both the body type and the drinking habit
# of the person being matched.
same_body_type = df['body_type'] == match_me['body_type']
same_drinks = df['drinks'] == match_me['drinks']
matches = df[same_body_type & same_drinks]
matches.shape

### Assuring Sexual Compatibility

In [None]:
# Narrow the candidates to straight women.
# NOTE(review): these criteria are hard-coded and presumably assume the
# target is a straight man -- confirm against match_me.
is_female = matches['sex'] == 'f'
is_straight = matches['orientation'] == 'straight'
full_matches = matches[is_female & is_straight]
full_matches.sample(3)

In [None]:
# How many candidates remain after the sex/orientation filter.
full_matches.shape

## More Sophisticated Matching

### One-Hot Encoding

In [None]:
# Columns used as features for the similarity-based matching below.
vars_that_matter = ['diet', 'drinks', 'drugs', 'smokes']

In [None]:
# Restrict the table to the feature columns, keeping every row.
filtered = df.loc[:, vars_that_matter]

In [None]:
# Number of missing values in each feature column.
filtered.isna().sum()

In [None]:
# Learn the most frequent value of each feature column; missing entries
# will be filled with it. fit() returns the imputer itself.
si = SimpleImputer(strategy='most_frequent').fit(filtered)

si

In [None]:
# transform() returns a bare array, so rebuild a DataFrame with the
# original row index and column labels.
imputed_values = si.transform(filtered)
imputed = pd.DataFrame(
    imputed_values,
    index=filtered.index,
    columns=filtered.columns,
)
imputed.sample(3)

In [None]:
# Build an encoder that returns a dense array rather than a sparse matrix.
# scikit-learn renamed the `sparse` keyword to `sparse_output` in 1.2 and
# removed the old name in 1.4, so try the new spelling first and fall back
# to the old one on releases that do not know it.
try:
    ohe = OneHotEncoder(sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(sparse=False)

ohe.fit(imputed)

In [None]:
# One-hot encode the imputed features and label the resulting columns.
# `get_feature_names` was removed in scikit-learn 1.2; prefer its
# replacement `get_feature_names_out` when the installed version has it.
# NOTE: on newer releases the labels may be prefixed with the original
# column names instead of x0, x1, ...; nothing below depends on the labels.
if hasattr(ohe, 'get_feature_names_out'):
    feature_names = ohe.get_feature_names_out()
else:
    feature_names = ohe.get_feature_names()

encoded = pd.DataFrame(ohe.transform(imputed),
                       columns=feature_names,
                       index=imputed.index)
encoded.sample(3)

Here is our to-be-matched person again, now encoded:

In [None]:
# One-hot vector of the person being matched (first row of `encoded`) as a
# 1-D array. No width is hard-coded, so this keeps working when the number
# of encoded columns changes; copy() keeps the array independent of the frame.
match_me_encoded = encoded.iloc[0].values.copy()

### Cosine Similarity

In [None]:
# Numerator of cosine similarity: dot product of the target vector with each
# remaining profile (rows 1 onward), computed as one matrix-vector product
# instead of a Python-level loop over rows.
numerators = encoded.iloc[1:].values @ match_me_encoded

In [None]:
# Denominator of cosine similarity: product of the two Euclidean norms.
# The target's norm does not depend on the row, so compute it once; the
# norms of all remaining profiles (rows 1 onward) come from one vectorised call.
match_me_norm = np.linalg.norm(match_me_encoded)
other_norms = np.linalg.norm(encoded.iloc[1:].values, axis=1)
denominators = match_me_norm * other_norms

In [None]:
# Cosine similarity = dot product / product of norms, element-wise per profile.
sim_scores = numerators / denominators

In [None]:
# Put a score of 1 in front for the target's own row, followed by the
# computed scores for the other profiles.
sim_scores_full = [1, *sim_scores]

In [None]:
# Store the scores as a new column; the list is assigned positionally, one
# entry per row of `encoded`.
encoded['similarity'] = sim_scores_full

In [None]:
# Rows whose similarity score is exactly 1.
is_perfect_match = encoded['similarity'].eq(1)
encoded.loc[is_perfect_match]

In [None]:
# Pairwise cosine similarity among the first ten profiles, as a cross-check
# with scikit-learn. The derived 'similarity' column is a result, not a
# feature, so drop it first; otherwise it would be fed into the calculation
# and skew the matrix. errors='ignore' keeps the cell usable when the column
# has not been added yet.
features_only = encoded.drop(columns='similarity', errors='ignore')
metrics.pairwise.cosine_similarity(features_only[:10])

### Putting Variables on a Scale