In [None]:
import boto3
import gc
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import sklearn

from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer

%matplotlib inline

In [None]:
import sagemaker

from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.predictor import csv_serializer

# One SageMaker session is created here and reused; the default bucket is
# looked up on that same session instead of constructing a second Session.
session = sagemaker.Session()
role = get_execution_role()
bucket = session.default_bucket()

# Seed passed to the randomized group split further down.
random_state = 1

## Overcommit Memory to avoid error

Run the following from a root shell in a terminal:

    sudo -i
    echo 1 > /proc/sys/vm/overcommit_memory

## Download Data

Before downloading, add your Kaggle API credentials to `~/.kaggle/kaggle.json`.

In [None]:
# One-time data download, left commented out so a normal run does not repeat it:
# install the Kaggle CLI, fetch the shopee-product-matching archive into /tmp,
# then unzip it into ../data.
#!pip install kaggle
#!kaggle competitions download -c shopee-product-matching -p /tmp
#!unzip -q /tmp/shopee-product-matching.zip -d ../data

## Explore Data

### Features and Calculated Statistics

### Sampling

### Abnormalities

In [None]:
# Load the training metadata from ../data/train.csv and preview the first rows.
df = pd.read_csv('../data/train.csv')
df.head()

In [None]:
# Column dtypes and non-null counts of the raw frame.
df.info()

In [None]:
# Frequency table of every column, each preceded by blank lines for readability.
for column_name in df.columns:
    print('\n')
    column_frequencies = df[column_name].value_counts()
    print(column_frequencies)

In [None]:
# Keep only the part of posting_id after the last underscore (drops the
# leading "<prefix>_"). Taking the last piece instead of column 1 of an
# expanded split makes the cell safe to re-run: an id that has already been
# stripped contains no underscore and is left unchanged, whereas the expanded
# split would then have no column 1 and raise a KeyError.
df['posting_id'] = df['posting_id'].str.split('_').str[-1]
#df = df.set_index('posting_id')
df.head()

In [None]:
# Re-inspect dtypes and non-null counts after rewriting posting_id above.
df.info()

In [None]:
# The 100 most frequent image file names, drawn as a horizontal bar chart.
image_counts = df['image'].value_counts().iloc[:100]
image_counts.plot(kind='barh', figsize=(15, 25))

In [None]:
# The 100 most frequent perceptual image hashes, drawn as a horizontal bar chart.
image_phash_counts = df['image_phash'].value_counts().iloc[:100]
image_phash_counts.plot(kind='barh', figsize=(15, 25))

In [None]:
# The 100 most frequent titles, drawn as a horizontal bar chart.
title_counts = df['title'].value_counts().iloc[:100]
title_counts.plot(kind='barh', figsize=(15, 25))

In [None]:
# The 100 largest label groups, drawn as a horizontal bar chart.
label_group_counts = df['label_group'].value_counts().iloc[:100]
label_group_counts.plot(kind='barh', figsize=(15, 25))

In [None]:
import cv2

BASE = '../data/train_images/'

def displayDF(train, random=False, COLS=6, ROWS=4, path=BASE):
    """Show a ROWS x COLS grid of product images, each captioned with its title.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame read by position: column 1 is used as the image file name and
        column 3 as the title.
    random : bool
        If True every tile shows a uniformly random row (repeats possible);
        otherwise rows 0 .. ROWS*COLS-1 are shown in order.
    COLS, ROWS : int
        Grid size. Each grid row is drawn as its own 20x5 figure.
    path : str
        Prefix prepended to each image file name.

    Raises
    ------
    FileNotFoundError
        If an image cannot be read from ``path + name``.
    """
    for k in range(ROWS):
        plt.figure(figsize=(20,5))
        for j in range(COLS):
            if random:
                row = np.random.randint(0,len(train))
            else:
                row = COLS*k + j
                if row >= len(train):
                    # Fewer rows than grid tiles: leave the remaining tiles empty.
                    break
            name = train.iloc[row,1]
            title = train.iloc[row,3]
            # Insert a line break after every 20th character so long titles
            # do not overlap the neighbouring tiles.
            title_with_return = ""
            for i,ch in enumerate(title):
                title_with_return += ch
                if i != 0 and i % 20 == 0: title_with_return += '\n'
            img = cv2.imread(path+name)
            if img is None:
                # cv2.imread signals an unreadable file by returning None.
                raise FileNotFoundError('Could not read image: ' + path + name)
            # OpenCV loads pixels as BGR while matplotlib expects RGB; without
            # the conversion the red and blue channels are displayed swapped.
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            plt.subplot(1,COLS,j+1)
            plt.title(title_with_return)
            plt.axis('off')
            plt.imshow(img)
        plt.show()
        
displayDF(df, random=True)

### Missing Values

In [None]:
# Number of missing values in each column.
df.isnull().sum()

## Process Data

### Split Data

In [None]:
# Group-aware splitter: all rows of one label_group land in the same split.
gss = sklearn.model_selection.GroupShuffleSplit(test_size=.33, n_splits=1, random_state = random_state)

# First split: hold out 33% of the groups as the test set.
train_val_idxs, test_idxs = next(gss.split(df, groups=df['label_group']))

train_val = df.iloc[train_val_idxs]
test = df.iloc[test_idxs]

# Second split: carve the validation set out of the remaining rows only, so
# that train, validation and test never share a label_group.
train_idxs, val_idxs = next(gss.split(train_val, groups=train_val['label_group']))

train = train_val.iloc[train_idxs]
val = train_val.iloc[val_idxs]

# Guard against leakage between the three splits.
assert set(train['label_group']).isdisjoint(test['label_group'])
assert set(train['label_group']).isdisjoint(val['label_group'])
assert set(val['label_group']).isdisjoint(test['label_group'])
assert len(train) + len(val) + len(test) == len(df)

In [None]:
# Frequency table of every column of the training split (not of the full frame).
for attr in train:
    print('\n')
    print(train[attr].value_counts())

In [None]:
# Frequency table of every column of the test split (not of the full frame).
for attr in test:
    print('\n')
    print(test[attr].value_counts())

In [None]:
# Frequency table of every column of the validation split (not of the full frame).
for attr in val:
    print('\n')
    print(val[attr].value_counts())

In [None]:
# Separate the target column (label_group) from the features of each split.
X_train = train.drop(['label_group'], axis=1)
Y_train = train['label_group']

X_test = test.drop(['label_group'], axis=1)
Y_test = test['label_group']

# The validation pair must come from `val`; building it from `test` would make
# the validation data a copy of the test data.
X_val = val.drop(['label_group'], axis=1)
Y_val = val['label_group']

In [None]:
# Local staging directory for the exported split files. exist_ok=True makes
# the call a no-op when the directory is already there, so no separate
# existence check is needed.
data_dir = '../data/shopee'
os.makedirs(data_dir, exist_ok=True)

In [None]:
# Write each split as a header-less, index-less CSV with the label in the
# first column, followed by the feature columns.
for file_name, labels, features in (
    ('test.csv', Y_test, X_test),
    ('validation.csv', Y_val, X_val),
    ('train.csv', Y_train, X_train),
):
    labelled_split = pd.concat([labels, features], axis=1)
    labelled_split.to_csv(os.path.join(data_dir, file_name), header=False, index=False)

In [None]:
# Key prefix under which every file of this project is uploaded.
prefix = 'shopee'

# Upload the three split files in the order test, validation, train and keep
# the location returned for each of them.
test_location, val_location, train_location = (
    session.upload_data(os.path.join(data_dir, file_name), key_prefix=prefix)
    for file_name in ('test.csv', 'validation.csv', 'train.csv')
)

## Model Implementation

### Metrics

### Algorithms

### Techniques

### Complications

In [None]:
def evaluate_model(df, column_name='preds'):
    """Print the mean row-wise F1 score of the predictions in ``column_name``.

    The ground truth of a row is the set of ``posting_id`` values that share
    its ``label_group``. The frame is modified in place: a ``target`` column
    (ground-truth ids) and an ``f1`` column (per-row score) are added.
    Nothing is returned; the mean score is printed.
    """
    ids_by_group = df.groupby('label_group')['posting_id'].unique().to_dict()
    df['target'] = df['label_group'].map(ids_by_group)

    def row_f1(row):
        # F1 of two id sets: 2 * |intersection| / (|truth| + |predicted|).
        truth = row['target']
        predicted = row[column_name]
        overlap = len(np.intersect1d(truth, predicted))
        return 2 * overlap / (len(truth) + len(predicted))

    df['f1'] = df.apply(row_f1, axis=1)
    print('CV Score =', df['f1'].mean())

### Predict Using Phash

In [None]:
# Phash baseline: each posting is predicted to match every training posting
# that has exactly the same image_phash (itself included).
postings_by_phash = X_train.groupby('image_phash')['posting_id'].unique().to_dict()
X_train['preds_phash'] = X_train['image_phash'].map(postings_by_phash)
X_train.head()

In [None]:
# Re-attach the labels by index, then score the phash predictions.
train = pd.merge(X_train, Y_train, left_index=True, right_index=True)

evaluate_model(train, column_name='preds_phash')

### Predict Using Title Similarity

In [None]:
# Exact-title baseline: each posting is predicted to match every training
# posting with an identical title (itself included).
postings_by_title = X_train.groupby('title')['posting_id'].unique().to_dict()
X_train['preds_title'] = X_train['title'].map(postings_by_title)
X_train.head()

In [None]:
# Re-attach the labels by index, then score the exact-title predictions.
train = pd.merge(X_train, Y_train, left_index=True, right_index=True)

evaluate_model(train, column_name='preds_title')

### Predict Using Image Similarity

In [None]:
!pip install keras_efficientnets

In [None]:
import tensorflow as tf
#from tensorflow.keras.applications import EfficientNetB0
from keras_efficientnets import EfficientNetB0

class DataGenerator(tf.keras.utils.Sequence):
    """Keras Sequence that serves batches of resized images listed in a frame.

    Each item is an array of inputs only (no labels). Images are read from
    ``path + row.image`` and resized to ``img_size`` x ``img_size``; pixel
    values are stored as float32 without any rescaling.
    """
    def __init__(self, df, img_size=256, batch_size=32, path=BASE):
        self.df, self.img_size = df, img_size
        self.batch_size, self.path = batch_size, path
        # Positional row numbers, sliced per batch in __getitem__.
        self.indexes = np.arange(len(df))

    def __len__(self):
        """Return the number of batches, counting a final partial batch."""
        full_batches, leftover = divmod(len(self.df), self.batch_size)
        return full_batches + (1 if leftover != 0 else 0)

    def __getitem__(self, index):
        """Return the image array for batch number ``index``."""
        start = index * self.batch_size
        batch_rows = self.indexes[start:start + self.batch_size]
        return self.__data_generation(batch_rows)

    def __data_generation(self, indexes):
        """Load and resize the images of the given row positions."""
        side = self.img_size
        batch = np.zeros((len(indexes), side, side, 3), dtype='float32')
        for slot, image_name in enumerate(self.df.iloc[indexes]['image']):
            picture = cv2.imread(self.path + image_name)
            batch[slot,] = cv2.resize(picture, (side, side))
        return batch

In [None]:
# EfficientNetB0 with ImageNet weights, no classification head and average
# pooling, used here as a fixed feature extractor.
model = EfficientNetB0(weights='imagenet',include_top=False, pooling='avg', input_shape=None)
# Batches of 128 images taken from the rows of `train`.
train_gen = DataGenerator(train, batch_size=128)
# One embedding vector per image; the shape is printed below.
image_embeddings = model.predict(train_gen,verbose=1)
print('image embeddings shape is',image_embeddings.shape)

In [None]:
# Number of neighbours retrieved for every image embedding.
KNN = 50
# fit() returns the estimator itself, so construction and fitting can be chained.
model = NearestNeighbors(n_neighbors=KNN).fit(image_embeddings)
# Query the fitted index with the same embeddings it was built from.
distances, indices = model.kneighbors(image_embeddings)

### Predict Using Combined Predictions

In [None]:
def combine_for_cv(row):
    """Return the sorted, de-duplicated union of a row's phash and title predictions."""
    return np.union1d(row.preds_phash, row.preds_title)

In [None]:
# Re-attach the labels by index, merge the phash and title predictions of
# every row into a single 'preds' column, and score that column.
train = pd.merge(X_train, Y_train, left_index=True, right_index=True)
combined_predictions = train.apply(combine_for_cv, axis = 1)
train['preds'] = combined_predictions

evaluate_model(train)

## Model Refinement

### Initial Solution

### Intermediate Solutions

### Final Solution

## Model Evaluation and Validation

## Model Justification

### Benchmark Comparison

### Justification

In [None]:
def hamming_distance(hash1, hash2):
    """Count the positions at which two sequences differ.

    Positions are compared pairwise up to the length of the shorter input;
    any extra trailing elements of the longer one are ignored.
    """
    return sum(1 for left, right in zip(hash1, hash2) if left != right)

In [None]:
# Perceptual hash of every training image, as a Series.
phashes = X_train['image_phash']

In [None]:
from scipy.spatial import distance

# Probe with the first training hash only: comparing every hash against every
# other one with a Python-level apply is quadratic in the number of rows.
phash = phashes.iloc[0]

# distance.hamming must receive the hashes as character sequences; given the
# whole string it compares them as a single element (or rejects them as not
# 1-D, depending on the SciPy version). It returns the fraction of differing
# positions, so similarity is 1 minus that value. Keeping similarities above
# 0.9 selects near-identical hashes; thresholding the raw distance instead
# would select the most dissimilar ones.
# Assumes all hashes have the same length -- TODO confirm.
phash_similarities = phashes.apply(
    lambda other: 1 - distance.hamming(list(phash), list(other))
)

# Row positions (within phashes) of the hashes similar to the probe hash.
preds = [np.where(phash_similarities > 0.9)[0]]

preds

In [None]:
# All training rows that belong to the same label group as the first row.
tmp = Y_train.reset_index()
first_label_group = tmp.iloc[0]['label_group']
same_group_mask = tmp['label_group'] == first_label_group
tmp[same_group_mask].dropna()

In [None]:
# NOTE(review): 22431 is a hard-coded row position whose origin is not shown
# in this notebook -- confirm it is still valid for the current split.
# preds[0] is overwritten in place, so re-running this cell appends the value again.
preds[0] = np.append(preds[0], 22431)
# Show the rows of `train` at the collected positions.
train.iloc[preds[0]]

In [None]:
# TF-IDF vectorizer for the titles: English stop words removed, binary term
# occurrence, vocabulary capped at 10,000 terms. Note that `model` is reused
# here for a different kind of object than in the image cells.
model = TfidfVectorizer(stop_words='english', binary=True, max_features=10000)
# .toarray() turns the sparse result into a dense array (one row per title,
# at most 10,000 columns). NOTE(review): this dense copy may be large --
# presumably the reason memory overcommit is enabled above; confirm.
text_embeddings = model.fit_transform(X_train.title).toarray()
text_embeddings.shape

In [None]:
from sklearn.metrics.pairwise import linear_kernel

#cosine_similarities = linear_kernel(text_embeddings, text_embeddings)

# Dot products of all title vectors against column batches of 1024 titles.
# The result keeps the original flat layout: the (n_rows x batch) blocks are
# flattened row-major and laid end to end. The blocks are written straight
# into one preallocated float64 array, because extending a Python list would
# create a separate float object for each of the n_rows * n_rows values.
batchsize = 1024
n_rows = text_embeddings.shape[0]
cosine_similarities = np.empty(n_rows * n_rows, dtype=np.float64)
write_offset = 0
for i in range(0, n_rows, batchsize):
    # Slicing clamps at the end of the array, so the last batch may be shorter.
    block = linear_kernel(text_embeddings, text_embeddings[i:i + batchsize])
    cosine_similarities[write_offset:write_offset + block.size] = block.ravel()
    write_offset += block.size

In [None]:
# Save the flat similarity array as a single-column CSV (no header, no index)
# and upload that file under the project prefix.
similarities_csv = os.path.join(data_dir, 'train_cosine_similarities.csv')
pd.DataFrame(data=cosine_similarities).to_csv(similarities_csv, header=False, index=False)
train_cosine_similarities_location = session.upload_data(similarities_csv, key_prefix=prefix)

In [None]:
from sklearn.metrics.pairwise import linear_kernel

# For every title, predict the postings whose title vector has a dot product
# above 0.7 with it. Similarities are computed one row at a time.
preds = []

for row_number in range(len(text_embeddings)):
    query_vector = text_embeddings[row_number:row_number+1]
    cosine_similarities = linear_kernel(query_vector, text_embeddings).flatten()
    matching_positions = np.where(cosine_similarities > 0.7)[0]
    preds.append(X_train.iloc[matching_positions].posting_id.values)

X_train['preds_title'] = preds
X_train.head()

In [None]:
# Re-attach the labels by index, then score the similarity-based title predictions.
train = pd.merge(X_train, Y_train, left_index=True, right_index=True)

evaluate_model(train, column_name='preds_title')

In [None]:
import io

# Serialize the title embeddings together with their labels into an
# in-memory buffer, then rewind it so the upload starts at the first byte.
buf = io.BytesIO()
sagemaker.amazon.common.write_numpy_to_dense_tensor(buf, text_embeddings, Y_train.to_numpy())
buf.seek(0)

# S3 keys always use '/', so the key is built explicitly instead of with
# os.path.join (which follows the local OS separator). The same key and the
# same `bucket` are used for both the upload and the URI, so they cannot drift.
# NOTE(review): the object keeps its '.csv' name although the buffer is
# written by write_numpy_to_dense_tensor and is not CSV text -- consider renaming.
embeddings_key = f"{prefix}/title_text_embeddings.csv"
boto3.resource('s3').Bucket(bucket).Object(embeddings_key).upload_fileobj(buf)
s3_train_data = f"s3://{bucket}/{embeddings_key}"

In [None]:
# Show the S3 URI that will be passed to training.
print(s3_train_data)

In [None]:
def trained_estimator_from_hyperparams(s3_train_data, hyperparams, output_path):
    """
    Build an Estimator for the "knn" image, apply ``hyperparams``, fit it on
    ``s3_train_data`` and return the fitted estimator.

    Nothing is deployed here; the caller deploys the returned estimator.
    Only a "train" channel is passed to ``fit``.
    """
    # Image URI for the "knn" algorithm in the current boto3 region.
    image_uri = sagemaker.amazon.amazon_estimator.get_image_uri(boto3.Session().region_name, "knn")

    estimator = sagemaker.estimator.Estimator(
        image_uri,
        get_execution_role(),
        instance_count=1,
        instance_type="ml.m5.2xlarge",
        output_path=output_path,
        sagemaker_session=sagemaker.Session(),
    )
    estimator.set_hyperparameters(**hyperparams)

    # Train on the single "train" channel.
    estimator.fit({"train": s3_train_data})
    return estimator





#s3_train_data = os.path.join(data_dir, 'title_text_embeddings.csv')
#boto3.resource('s3').Bucket(bucket).Object(s3_train_data).upload_fileobj(buf)
#s3_train_data = f"s3://{bucket}/{prefix}/train/{key}"



#s3_train_data = os.path.join(data_dir, 'title_text_embeddings.csv')
#pd.DataFrame(data=text_embeddings).to_csv(s3_train_data, header=False, index=False)
#train_location = session.upload_data(os.path.join(s3_train_data), key_prefix=prefix)

# feature_dim and sample_size are read from the embedding matrix instead of
# being hard-coded: the vectorizer may yield fewer than its 10,000-term cap,
# and the number of training rows changes whenever the split changes.
n_samples, n_features = text_embeddings.shape
hyperparams = {"feature_dim": n_features, "k": 10, "sample_size": n_samples, "predictor_type": "classifier"}
output_path = f"s3://{session.default_bucket()}/{prefix}/output"

knn_estimator = trained_estimator_from_hyperparams(s3_train_data, hyperparams, output_path)

In [None]:
# Deploy the trained estimator to one ml.m4.xlarge instance.
# NOTE(review): no cell in the visible notebook deletes this endpoint --
# confirm it is cleaned up elsewhere.
knn_predictor = knn_estimator.deploy(
    initial_instance_count=1,
    instance_type='ml.m4.xlarge'
)

In [None]:
# Number of neighbours retrieved for every title embedding.
KNN = 10
# fit() returns the estimator itself, so construction and fitting can be chained.
model = NearestNeighbors(n_neighbors=KNN).fit(text_embeddings)
# Query the fitted index with the same embeddings it was built from.
distances, indices = model.kneighbors(text_embeddings)

In [None]:
def plot_row_distances(n):
    """Plot the neighbour distances of training row ``n`` and print its neighbours.

    Reads the module-level ``distances`` / ``indices`` arrays returned by
    ``kneighbors`` together with ``X_train`` / ``Y_train``.

    Parameters
    ----------
    n : int
        Row position (0-based) within the arrays passed to ``kneighbors``.
    """
    # Take the number of points from the data so the plot still works when
    # the neighbour count is changed.
    neighbor_count = distances.shape[1]

    plt.figure(figsize=(20, 3))
    plt.plot(np.arange(neighbor_count), np.array(distances[n, ]), 'o-')
    plt.title('Text Distance From Train Row %i to Other Train Rows'% n, size=16)
    plt.ylabel('Distance to Train Row %i'% n, size=14)
    plt.xlabel('Index Sorted by Distance to Train Row %i'% n, size=14)
    plt.show()

    # kneighbors returns row positions, not index labels, so the rows must be
    # looked up with .iloc. X_train keeps the index labels of the original
    # frame, and .loc would select different (or missing) rows.
    neighbor_positions = np.array(indices[n, :10])
    neighbor_titles = X_train.iloc[neighbor_positions][['title']]
    neighbor_labels = Y_train.iloc[neighbor_positions]
    print(neighbor_titles.merge(neighbor_labels, left_index=True, right_index=True))
    
plot_row_distances(1)

In [None]:
# For every training row, keep the neighbours closer than 0.7.
# np.where(distances[i] < 0.7) alone only gives slots within the neighbour
# list (0 .. k-1); they are mapped through `indices` to obtain the neighbours'
# row positions, which are then resolved positionally to posting ids.
posting_ids = X_train.posting_id.values

preds = []

for i in range(len(distances)):
    close_neighbor_positions = indices[i][distances[i] < 0.7]
    preds.append(posting_ids[close_neighbor_positions])

In [None]:
# Store the neighbour-based predictions (this replaces any earlier
# 'preds_title' column) and preview the frame.
X_train['preds_title'] = preds
X_train.head()

In [None]:
# Re-attach the labels by index, then score the neighbour-based title predictions.
train = pd.merge(X_train, Y_train, left_index=True, right_index=True)

evaluate_model(train, column_name='preds_title')

In [None]:
   
# Titles whose vector has a dot product above 0.7 with the first title,
# listed with their row positions and similarity values.
cosine_similarities = linear_kernel(text_embeddings[0:1], text_embeddings).flatten()
related_docs_indices = np.where(cosine_similarities > 0.7)[0]

tmp = pd.DataFrame({
    'indices': related_docs_indices,
    'similarities': cosine_similarities[related_docs_indices],
})

tmp

In [None]:
from sklearn.metrics.pairwise import linear_kernel

# Dot-product similarities of the first title against all titles.
cosine_similarities = linear_kernel(text_embeddings[0:1], text_embeddings).flatten()

# For every training row, keep the neighbours closer than 0.7. The boolean
# mask over distances[i] refers to slots in the neighbour list, so it is
# applied to indices[i] to obtain the neighbours' row positions, which are
# then resolved positionally to posting ids.
posting_ids = X_train.posting_id.values

preds = []

for i in range(len(distances)):
    close_neighbor_positions = indices[i][distances[i] < 0.7]
    preds.append(posting_ids[close_neighbor_positions])

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity of the first title to every title. Values below 0.7 are
# zeroed, then the 50 largest are listed in descending order.
cosine_similarities = cosine_similarity(text_embeddings[0:1], text_embeddings).flatten()
below_threshold_indices = cosine_similarities < 0.7
cosine_similarities[below_threshold_indices] = 0

descending_order = cosine_similarities.argsort()[::-1]
related_docs_indices = descending_order[:50]

tmp = pd.DataFrame({
    'indices': related_docs_indices,
    'similarities': cosine_similarities[related_docs_indices],
})

tmp

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# NOTE(review): this computes the similarity of every title against every
# other title (one row and one column per title) only to print it, which
# may need a lot of memory -- confirm the full matrix is really wanted.
print(cosine_similarity(text_embeddings, text_embeddings))