In [None]:
import h5py
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os, re, shutil
import torch

from datetime import datetime, timedelta
from google.cloud import storage
from mpl_toolkits.mplot3d import Axes3D

from PIL import Image

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.neighbors import NearestNeighbors
from torch.utils.data import DataLoader
from torchvision import transforms, datasets, utils

from facenet_pytorch.models import inception_resnet_v1
from facenet_pytorch import training

In [None]:
### Parameterization
# District whose users and likes are processed in this run; it is used below to
# filter the likes/users tables and to name the local image folder and output file.
DISTRICT_ID = 1

In [None]:
### Local filesystem locations
BASE_DIR = "."
# Per-district scratch folder that receives the cropped face images.
CROPPED_DIR = f"{BASE_DIR}/cropped_{DISTRICT_ID}"

### GCS declarations
BUCKET = "...."
INPUTS = "inputs/image-clustering/"
CROPPED_IMAGES = "cropped_faces/"
MODEL_STATE_DICT = "model_state_dict.pkl"
# Appended to "gs://" + BUCKET when the recommendations are written out.
OUTPUT_URI = "/outputs/image_clustering_v1_district_{}.csv".format(DISTRICT_ID)

### Input GCS objects
users_file = "gs://.../inputs/users.csv"
likes_file = "gs://.../inputs/implicit-svd/likes_v2.csv"

In [None]:
# Hyperparameters

### Batch size for computing CNN embeddings. The model is already trained, so this
### mostly controls memory use during inference.
BATCH_SIZE = 16

### Nearest-neighbour search width and number of recommendations per user
NNEIGHBORS = 300
NRECS = 150

### Run on the GPU when one is visible to torch, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# GCS client and a handle on BUCKET; `bucket` is used below to list the cropped
# images and to fetch the model checkpoint.
client = storage.Client('....')
bucket = client.get_bucket(BUCKET)

### Obtaining Likes data for use in Nearest Neighbors Search and Filtering

In [None]:
likes_cols = [
    '....'
]

# Likes restricted to pairs where both the liker and the target belong to this district.
likes_df = (
    pd.read_csv(likes_file, names=likes_cols)
    .query("(user_district_id == @DISTRICT_ID) and (target_district_id == @DISTRICT_ID)")
)

# Master users list for this district (the CSV has no header row).
users = (
    pd.read_csv(users_file, header=None, names=['user_id', 'gender', 'district_id'])
    .query("district_id == @DISTRICT_ID")
)

In [None]:
# Keep only likes made by users present in the district's users list, then show
# (number of distinct likers, number of users) side by side.
likes_df = likes_df.loc[likes_df['user_id'].isin(users['user_id'].unique())]
likes_df['user_id'].nunique(), len(users)

#### The above is a sanity check to make sure that the likes data corresponds with our master users list: if someone liked someone and therefore appears in the likes data, we expect them to be in the users list as well. I don't expect a 1-to-1 correspondence, but the closer the better.

#### Obtaining cropped photos

Structure is simply /user_id/photos.jpg in GCS.

However, we want to reduce the images we will use to those in the users.csv file, to avoid providing recs with inactive users. So, locally, we only download users for this district, and these are the only images we will use as a basis for recommendation, together with likes data to find nearest neighbors.

Inclusion criteria have a major impact on interpretability of results, so here they are, at least for this model and the data inputs:
From users.sql:

```sql
Redacted: custom business conditions
```
These match the likes.sql, though the likes.sql also screens for new users (account less than 1 day old). 

In [None]:
# Every blob under the cropped-faces prefix whose name contains '.jpg'.
img_blobs = [b for b in bucket.list_blobs(prefix = INPUTS + CROPPED_IMAGES)  if '.jpg' in b.name]

In [None]:
# Number of district users that have at least one cropped image in the bucket.
# The fifth path component of the blob name (index 4) is parsed as the user id.
len(set(users.user_id.unique()) & {int(b.name.split('/')[4]) for b in img_blobs})

The above tells us how many users in this district's users list we have cropped images for. These are the people that we can recommend.

In [None]:
# Create the scratch folder and one sub-folder per gender code ('1' and '2').
# exist_ok makes this safe to re-run and also repairs a partially created tree
# (e.g. CROPPED_DIR exists but a gender sub-folder is missing).
for gender_dir in ('1', '2'):
    os.makedirs(os.path.join(CROPPED_DIR, gender_dir), exist_ok=True)

In [None]:
n_images_downloaded = 0

# Hoisted out of the loop: a set gives O(1) membership instead of re-computing and
# scanning users.user_id.unique() for every blob.
district_user_ids = set(users.user_id.unique())
blob_pattern = re.compile(r"cropped_faces/(\d)/(.*)/(.*\.jpg)$")

for img_blob in img_blobs:
    groups = blob_pattern.search(img_blob.name)
    # Skip blobs that do not follow cropped_faces/<gender>/<user_id>/<file>.jpg or
    # whose user-id component is not a plain integer, instead of crashing on them.
    if groups is None or not groups.group(2).isdecimal():
        continue
    if int(groups.group(2)) not in district_user_ids:
        continue

    userdir = CROPPED_DIR + "/" + groups.group(1) + "/" + groups.group(2) + "/"
    filename = groups.group(3)
    os.makedirs(userdir, exist_ok=True)
    # Only fetch files that are not already present locally, so re-runs are cheap.
    if not os.path.exists(userdir + filename):
        img_blob.download_to_filename(userdir + filename)
        n_images_downloaded += 1

n_images_downloaded

#### Data transformation, custom dataset, batch loading

In [None]:
def prewhiten(x):
    """
    Standardise a tensor using its own global mean and standard deviation.

    The standard deviation is clamped from below at 1/sqrt(number of elements) so
    that (near-)constant inputs do not cause a division by zero.
    """
    floor = 1.0 / (float(x.numel()) ** 0.5)
    spread = x.std().clamp(min=floor)
    return (x - x.mean()) / spread

In [None]:
# Image -> tensor pipeline applied to every cropped face, in this order:
# resize, convert to tensor, per-image prewhitening, then per-channel normalisation.
# NOTE(review): the mean/std triples look like the usual ImageNet channel statistics — confirm.
preprocess = transforms.Compose([
    transforms.Resize(182),
    transforms.ToTensor(),
    prewhiten,
    transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
])

In [None]:
# One ImageFolder per gender sub-directory; each user-id folder becomes a class.
# (Presumably "1" holds male and "2" female users, judging by the variable names — confirm.)
mimages = datasets.ImageFolder(CROPPED_DIR + "/1", transform=preprocess)
fimages = datasets.ImageFolder(CROPPED_DIR + "/2", transform=preprocess)

### Invert class_to_idx so a dataset class index can be mapped back to its
### folder name, i.e. the target user id (as a string).
f_idx_to_class = {ix: tid for tid, ix in fimages.class_to_idx.items()}
m_idx_to_class = {ix: tid for tid, ix in mimages.class_to_idx.items()}

In [None]:
def create_loader(dataset):
    """
    Wrap ``dataset`` in a DataLoader suitable for inference.

    - The batch size is BATCH_SIZE, reduced to the dataset size when the dataset is
      smaller than one batch, and never below 1 (a batch size of 0 is rejected by
      DataLoader).
    - Shuffling is disabled: labels are collected alongside the outputs, so order
      carries no information and a fixed order keeps runs reproducible.
    - drop_last is False so that the final, possibly shorter, batch is still
      embedded; every image must receive an embedding at inference time.
    """
    return DataLoader(
        dataset=dataset,
        batch_size=max(1, min(BATCH_SIZE, len(dataset))),
        shuffle=False,
        drop_last=False,
    )

### Using "Facenet" Inception-ResNet-v1

In [None]:
def generate_features_embedding(model, dataset, mapping):
    """
    Run ``model`` over every image in ``dataset`` and collect the outputs.

    Parameters
    ----------
    model : torch module; moved to ``device`` and switched to eval mode here.
    dataset : dataset yielding (image_tensor, class_index) pairs.
    mapping : dict from class index to label (the target user id).

    Returns
    -------
    (output, labels) : two arrays of equal length; ``output`` has one row of model
        output per image and ``labels[i]`` is the mapped label of row ``i``.

    Raises
    ------
    ValueError : if the dataset yields no batches.
    """
    model.to(device)
    model.eval()
    output = []
    labels = []
    with torch.no_grad():
        for images, class_ixs in create_loader(dataset):
            out = model(images.to(device))
            # Move results to the CPU immediately so GPU memory is not exhausted.
            output.append(out.detach().cpu().numpy())
            labels.extend(mapping[int(ix)] for ix in class_ixs)
    if not output:
        raise ValueError("dataset produced no batches; cannot build an embedding")
    # Concatenate the list of per-batch arrays directly. Wrapping it in np.array
    # first breaks when the last batch is shorter than the others (ragged array).
    return np.concatenate(output), np.array(labels)

### PCA and TSNE for dimensionality reduction and separation

In [None]:
def do_PCA(n_components, embedding):
    """Fit a PCA with ``n_components`` on ``embedding`` and return the projected data."""
    return PCA(n_components=n_components).fit_transform(embedding)

In [None]:
def do_tsne(p, reduced):
    """
    Project ``reduced`` to 3 dimensions with t-SNE.

    Parameters
    ----------
    p : requested perplexity.
    reduced : array-like of shape (n_samples, n_features), typically PCA output.

    Returns
    -------
    Array with one 3-dimensional row per input sample.
    """
    # t-SNE requires perplexity < n_samples. Clamp it so that small districts do
    # not fail here, mirroring the small-dataset guards used for the loader and PCA.
    perplexity = min(p, max(len(reduced) - 1, 1))
    # NOTE(review): newer scikit-learn releases rename n_iter to max_iter — confirm
    # against the installed version before upgrading.
    tsne = TSNE(
        perplexity=perplexity,
        learning_rate=200,
        n_iter=700,
        verbose=0,
        n_components=3
    )
    return tsne.fit_transform(reduced)

#### Filtering function with CL-side conditions to avoid serving up users who will be filtered out by the app

In [None]:
def filter_items(user_id, likes):
    """
    Apply business-related filtering conditions.

    Returns the unique ``target_user_id`` values of the rows in ``likes`` that match
    any of three conditions; these are the targets that must NOT be recommended to
    ``user_id``.

    NOTE: in this copy of the notebook the three conditions are redacted and appear
    as empty-string placeholders, so the function is not runnable as written.
    """
    # Age of the requesting user, taken from their first row in the likes table.
    # It is not referenced by the placeholder conditions below; presumably the
    # redacted conditions use it — confirm against the original.
    user_age = likes[likes.user_id == user_id].user_age.iloc[0]

    filtered_target_users = likes[ 
      ('') |
      ('') |
      ('') # find users previously liked by this user; this is necessary, unlike with implicit
    ].target_user_id.unique()
    
    return filtered_target_users

#### Recommendation generation with N target neighbors

In [None]:
def generate_recs(embedding_df, likes_):
    """
    Build up to NRECS recommendations per user from the neighbours of users they liked.

    A NearestNeighbors model is fitted on the 'x', 'y', 'z' columns of
    ``embedding_df`` with min(NNEIGHBORS, number of rows) neighbours. For each user
    in ``likes_`` a random subset of at most NNEIGHBORS liked target users is taken
    and the neighbours of each are looked up. Candidates are then visited rank by
    rank (closest neighbours of every liked user first, then second closest, ...)
    until NRECS accepted recommendations have been collected. Because the lookup
    width does not depend on how many likes a user has, a user with a single like
    can still receive recommendations.

    A candidate is rejected when it is returned by ``filter_items``, when the user
    has already liked it, or when it has already been recommended to this user.

    Parameters
    ----------
    embedding_df : DataFrame with columns 'x', 'y', 'z' and 'target_user_id'. A
        target user may occupy several rows; only the first matching row is used
        as the query point for that user.
    likes_ : DataFrame with at least 'user_id' and 'target_user_id', plus whatever
        columns ``filter_items`` reads.

    Returns
    -------
    dict mapping each user_id to a list of recommended target_user_id values, in
    the order they were accepted (possibly empty). Empty dict if ``embedding_df``
    has no rows.
    """
    recommendations = {}
    n_potential_targets = embedding_df.shape[0]
    if n_potential_targets == 0:
        return recommendations

    coords = embedding_df[['x', 'y', 'z']]
    target_ids = embedding_df.target_user_id.values

    # set nneighbors and fit model
    neighbors_model = NearestNeighbors(n_neighbors=min(NNEIGHBORS, n_potential_targets))
    neighbors_model.fit(coords)

    # Neighbour lists depend only on the liked user, not on who liked them, so they
    # are computed once and shared across all users.
    neighbor_cache = {}

    for user_id in likes_.user_id.unique():
        user_recs = []
        recommendations[user_id] = user_recs

        liked_users = likes_[likes_.user_id == user_id].target_user_id.values

        # Everything that must not be recommended. A set gives O(1) membership, and
        # accepted recs are added to it so duplicates are rejected too.
        excluded = set(filter_items(user_id, likes_))
        excluded.update(liked_users)

        ### take a random subset for users who have liked more than NNEIGHBORS targets,
        ### so at most NNEIGHBORS x NNEIGHBORS candidates are considered per user
        shuffled_subset = np.random.permutation(liked_users)[:NNEIGHBORS]

        ranked = []
        for liked_user in shuffled_subset:
            if liked_user not in neighbor_cache:
                query = coords[target_ids == liked_user]
                if len(query) == 0:
                    # No embedding for this liked user: nothing to look up.
                    neighbor_cache[liked_user] = None
                else:
                    k_neighbors_ix = neighbors_model.kneighbors(
                        query,
                        return_distance=False
                    )[0]
                    neighbor_cache[liked_user] = target_ids[k_neighbors_ix]
            if neighbor_cache[liked_user] is not None:
                ranked.append(neighbor_cache[liked_user])

        if not ranked:
            continue

        # Rows are liked users, columns are neighbour ranks. Column-major order
        # visits every liked user's j-th nearest neighbour before any (j+1)-th one.
        # All rows are used; the liked users themselves (rank 0) are rejected
        # through `excluded` rather than by slicing.
        for neighbor in np.stack(ranked).flatten(order='F'):
            if neighbor in excluded:
                continue
            user_recs.append(neighbor)
            excluded.add(neighbor)
            if len(user_recs) == NRECS:
                break
    return recommendations

In [None]:
def main(model, dataset, likes_, mapping, perplexity = 30):
    """
    End-to-end recommendation pipeline for one image dataset.

    Steps: embed every image with ``model``, reduce with PCA, project to 3-D with
    t-SNE, restrict the likes to targets that have an embedding, and generate
    recommendations from nearest neighbours.

    Parameters
    ----------
    model : network used to embed the images.
    dataset : image dataset of the users that may be recommended.
    likes_ : likes DataFrame, already restricted to the gender being recommended.
    mapping : dict from dataset class index to target user id.
    perplexity : t-SNE perplexity.

    Returns
    -------
    dict mapping user_id to a list of recommended target user ids.
    """
    embedding, labels = generate_features_embedding(model, dataset, mapping)

    # PCA cannot produce more components than there are samples or features, so the
    # target of 50 is capped by both dimensions of the embedding matrix.
    pca_reduced = do_PCA(
        n_components = min(50, *embedding.shape),
        embedding = embedding
    )

    tsne_reduced = do_tsne(perplexity, pca_reduced)

    embedding_df = pd.DataFrame(
        data = tsne_reduced,
        columns = [
            'x',
            'y',
            'z'
        ]
    )

    # Labels come from folder names, i.e. strings; convert them to integer ids.
    embedding_df["target_user_id"] = labels
    embedding_df["target_user_id"] = embedding_df.target_user_id.astype(int)

    print(f"Average number of face photos per target user is {embedding_df.groupby('target_user_id').x.count().mean()}")

    # Reduce target users to those we have images for, since we need images to make
    # recommendations. Compare against the integer ids, not the raw string labels,
    # so the match does not depend on implicit str/int coercion.
    likes_ = likes_[likes_.target_user_id.isin(embedding_df["target_user_id"])]

    print("N unique for likes data with condition of being in photos data: \n")
    print(likes_[['user_id', 'target_user_id']].nunique())

    recommendations = generate_recs(embedding_df, likes_)

    return recommendations

#### Obtaining and loading model weights

In [None]:
# Fetch the trained checkpoint once; later runs reuse the local copy.
if not os.path.exists(f"{BASE_DIR}/model_state_dict.pkl"):
    blob = bucket.get_blob(INPUTS + MODEL_STATE_DICT)
    # get_blob returns None when the object is missing; fail with a clear message
    # rather than an AttributeError on None.
    if blob is None:
        raise FileNotFoundError(
            f"gs://{BUCKET}/{INPUTS + MODEL_STATE_DICT} not found; cannot load model weights"
        )
    blob.download_to_filename(f"{BASE_DIR}/model_state_dict.pkl")

In [None]:
# Instantiate the InceptionResnetV1 network; its weights are overwritten by the
# downloaded checkpoint in a later cell.
# NOTE(review): num_classes presumably has to match the checkpoint's classifier head — confirm.
inceptionv1 = inception_resnet_v1.InceptionResnetV1(pretrained='vggface2', num_classes = 22)

In [None]:
# Map the checkpoint tensors onto the device chosen above. The previous condition
# compared a torch.device with the string 'cpu', which is never equal, so
# map_location was always None and a GPU-saved checkpoint could not be loaded on a
# CPU-only machine.
# SECURITY: torch.load unpickles the file; only load checkpoints from a trusted bucket.
model_state_dict = torch.load(f'{BASE_DIR}/model_state_dict.pkl', map_location=device)

In [None]:
# The checkpoint is a dict; the network weights live under its 'model_state_dict' key.
inceptionv1.load_state_dict(model_state_dict['model_state_dict'])

### Generating for men

We subset the likes data to only include likes with target_gender equal to the opposite gender of the users.

In [None]:
# Candidates come from the `fimages` dataset, and the likes are restricted to
# target_gender == 2 so that they refer to the same pool of target users.
male_recs = main(
    model=inceptionv1,
    dataset=fimages,
    likes_=likes_df.loc[likes_df["target_gender"] == 2],
    mapping=f_idx_to_class,
)

In [None]:
# Flatten {user_id: [rec, ...]} into one (user_id, recommended_user_id) row per rec.
df_m = pd.DataFrame(
    data=[[user_id, rec] for user_id, recs in male_recs.items() for rec in recs],
    columns=["user_id", "recommended_user_id"],
).sort_values(by="user_id")

# The dict is no longer needed once the frame exists.
del male_recs

In [None]:
# Summary statistics of the number of recommendations produced per user.
df_m.groupby('user_id').count().recommended_user_id.describe()

### Generating for women

In [None]:
# Candidates come from the `mimages` dataset, and the likes are restricted to
# target_gender == 1 so that they refer to the same pool of target users.
female_recs = main(
    model=inceptionv1,
    dataset=mimages,
    likes_=likes_df.loc[likes_df["target_gender"] == 1],
    mapping=m_idx_to_class,
)

In [None]:
# Flatten {user_id: [rec, ...]} into one (user_id, recommended_user_id) row per rec.
df_f = pd.DataFrame(
    data=[[user_id, rec] for user_id, recs in female_recs.items() for rec in recs],
    columns=["user_id", "recommended_user_id"],
).sort_values(by="user_id")

# The dict is no longer needed once the frame exists.
del female_recs

In [None]:
# Histogram (50 bins) of the number of recommendations produced per user.
df_f.groupby('user_id').count().recommended_user_id.plot(kind = 'hist', bins=50)

### Combine and deliver to GCS

In [None]:
# Stack both result sets and write them to the bucket as a CSV without header or index.
df_concat = pd.concat((df_m, df_f), ignore_index=True)
df_concat.to_csv(f"gs://{BUCKET}{OUTPUT_URI}", header=False, index=False)

### Clean up of all downloaded images

In [None]:
# Remove the local scratch folder and every image downloaded into it.
shutil.rmtree(CROPPED_DIR)