# Introduction

Steps I followed
- Understand your data, check distributions
- Create a good Baseline
- Improve using Model
- Inspect Model
- Iterate

# Imports and Reading Data

In [6]:
from keras import backend as K
import time
import matplotlib.pyplot as plt
import numpy as np_utils
%matplotlib inline
from keras.models import Sequential
from keras.layers.convolutional import Convolution2D, MaxPooling2D, DepthwiseConv2D, Conv2D, SeparableConv2D, MaxPooling1D, AveragePooling1D
from keras.layers import Input, concatenate, LeakyReLU
import gensim.downloader as api
from sklearn.model_selection import train_test_split
from keras.layers import Activation, Flatten, Dense, Dropout
from keras.layers.normalization import BatchNormalization
from keras.layers.pooling import GlobalAveragePooling2D
from keras.utils import np_utils
from keras.preprocessing.image import ImageDataGenerator
from keras.optimizers import SGD, Nadam, Adam
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, LearningRateScheduler
from keras.regularizers import l2
%config InlineBackend.figure_format='retina'
from keras_contrib.callbacks import CyclicLR
from keras.models import Model
from keras.layers import Conv1D, GlobalMaxPooling1D, GlobalAveragePooling1D
from data_science_utils.vision.keras import *
from time import time
import pandas as pd
import numpy as np

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last
# one. Several cells below list multiple bare expressions (e.g. two
# `.nunique()` calls followed by a set difference) and rely on this setting.
InteractiveShell.ast_node_interactivity = "all"
import missingno as msno
import re
from joblib import Parallel, delayed
from data_science_utils import dataframe as df_utils
from data_science_utils import models as model_utils
from data_science_utils import plots as plot_utils
from data_science_utils.dataframe import column as column_utils
from data_science_utils import misc as misc
from data_science_utils import preprocessing as pp_utils
from data_science_utils import nlp as nlp_utils

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.datasets import imdb
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from data_science_utils.dataframe import get_specific_cols

import more_itertools
from more_itertools import flatten
import ast
from sklearn.preprocessing import LabelEncoder

import gc


import sys
import os
# Put the notebook's current working directory on the import path so the
# local `lib` module can be imported.
sys.path.append(os.getcwd())
from importlib import reload
import lib
# Reload `lib` so edits made to it after the kernel started are picked up
# before its names are star-imported below.
reload(lib)
# NOTE(review): the star import hides where helpers come from. Presumably the
# preprocess_* and get_*_le functions used in later cells live in lib.py
# (or in one of the other star imports above) — confirm.
from lib import *

from oclr import OneCycleLR, LRFinder
from data_science_utils.models import mean_absolute_percentage_error
from sklearn.metrics import mean_squared_error


<module 'lib' from '/Users/ahemf/Desktop/ML_hackathon/lib.py'>

In [None]:
# Read the raw competition files: the labelled training rows and the public
# test features that predictions are generated for.
df_train, df_test = (
    pd.read_csv("price_prediction/training.csv"),
    pd.read_csv("price_prediction/public_test_features.csv"),
)

# Data Preprocessing

- highlight `replace_numbers` and `clean_text`
- highlight `word lemmatization`

In [None]:
# Word-level preprocessing of the training frame. The helper is not defined in
# this notebook; `jobs=32` is presumably the number of parallel workers — confirm.
df_train = preprocess_for_word_cnn(df_train,jobs=32)
# get_text_le("text") hands back three objects: a callable used to fit on a
# frame, a callable used to transform a frame, and a third object `le`.
# NOTE(review): presumably `le` is the underlying token encoder for the "text"
# column — confirm against lib.
gl_le_train,gl_le_transform, le = get_text_le("text")
# Fit on the training frame only. The result is bound to `_` so the cell does
# not display it (ast_node_interactivity is set to "all" above).
_ = gl_le_train(df_train)
df_train['text_encoded'] = gl_le_transform(df_train)

# The test frame gets the same preprocessing and is encoded with the
# transform that was fitted on the training frame; it is never re-fitted.
df_test = preprocess_for_word_cnn(df_test,jobs=32)
df_test['text_encoded'] = gl_le_transform(df_test)


In [None]:
# Character-level preprocessing of both frames. The helper is not defined in
# this notebook; later cells read a "char" column, presumably created here — confirm.
df_train = preprocess_for_char_cnn(df_train,jobs=32)
df_test = preprocess_for_char_cnn(df_test,jobs=32)
# Same three-part pattern as the word encoder: a fit callable, a transform
# callable and a third object `char_le`, all keyed on the "char" column name.
char_le_train,char_le_transform, char_le = get_char_le("char")
# Fit on the training frame only; bound to `_` to suppress the cell output.
_ = char_le_train(df_train)

# Encode both frames with the transform fitted on the training frame.
df_train['char_encoded'] = char_le_transform(df_train)
df_test['char_encoded'] = char_le_transform(df_test)

In [None]:
# Preview the first rows to check that the `text_encoded` and `char_encoded`
# columns assigned in the previous cells are present.
df_train.head()

In [None]:
# df_test[["ID","GL","text","text_encoded","char","char_encoded"]].to_csv("price_prediction/test.csv",index=False)
# df_train[["ID","GL","text","text_encoded","char","char_encoded","PRICE"]].to_csv("price_prediction/train.csv",index=False)

# Baselining

## GL Based baseline

In [None]:
# Number of distinct GL values in each split.
df_train["GL"].nunique()
df_test["GL"].nunique()

# GL values that occur in the test split but never in the training split;
# a GL-mean baseline has no direct estimate for these.
set(df_test["GL"].unique()).difference(set(df_train["GL"].unique()))


In [None]:
# Baseline: predict each test row's price as the mean training price of its GL.
df_gl_means = df_train.groupby(["GL"])[["PRICE"]].mean().reset_index()

df_results = (
    df_test
    .merge(df_gl_means, how="left", on=["GL"])
    .loc[:, ["ID", "PRICE"]]
    # Rows whose GL was not matched by the merge have no price; fill them with
    # the mean of the prices that were predicted for the other test rows.
    .assign(PRICE=lambda frame: frame["PRICE"].fillna(frame["PRICE"].mean()))
)

df_results.head()

df_results.to_csv("baseline.csv", index=False)


In [None]:
# Plot GL level pricing

## Fasttext commandline based baseline

In [None]:
df_train_ft = preprocess_for_fasttext_cmd(df_train.copy(),jobs=32)
df_test_ft = preprocess_for_fasttext_cmd(df_test.copy(),jobs=32)

df_train['RPRICE'] = np.ceil(np.sqrt(df_train['PRICE']))
df_train['label'] = '__label__'
df_train['label'] = df_train['label'] + df_train['RPRICE'].astype(str)

df_train['text'] = df_train['label']
df_train['text'] = df_train['text'] + " "
df_train['text'] = df_train['text'] + df_train['char']

df_train[df_train['RPRICE']>=80][["text","price"]].sample(10)
df_test['text'] = df_test['char']

train,test = train_test_split(df_train[['text']],test_size=0.2, random_state=42)
train[['text']].to_csv("fastText-0.2.0/train-1.txt",header=False,index=False)
test[['text']].to_csv("fastText-0.2.0/train-2.txt",header=False,index=False)

df_train[['text']].to_csv("fastText-0.2.0/train.txt",header=False,index=False)
df_test[['text']].to_csv("fastText-0.2.0/test.txt",header=False,index=False)

!head -n2 fastText-0.2.0/train.txt

In [None]:
# Turn fastText's predicted labels back into prices and write a submission.
# prediction.txt is produced outside this notebook by the fastText command
# line; it is read here as one predicted "__label__<bucket>" string per row.
df_results = pd.read_csv("fastText-0.2.0/prediction.txt",header=None)

df_results.shape
df_results.columns=["result"]
df_results.head()

# Drop the "__label__" prefix and parse the remaining bucket value.
df_results['values'] = df_results['result'].str[len("__label__"):].astype(float)
# The labels are built as ceil(sqrt(PRICE)) in the fastText training cell, so
# the inverse transform is squaring. The previous cube did not invert that
# encoding and inflated every predicted price.
# NOTE(review): if prediction.txt came from a run trained on cube-root
# buckets instead, change the encoding in the training cell to match.
df_results['values'] = np.square(df_results['values'])
df_results.head()

# Predictions are matched to test rows by position, so the row counts must agree.
assert len(df_results) == len(df_test), "prediction.txt row count differs from df_test"

# Copy the ID column so the assignment below writes to an independent frame
# (no SettingWithCopyWarning), and assign by position rather than by index so
# a non-default df_test index cannot silently produce NaN prices.
df_sub = df_test[['ID']].copy()
df_sub['PRICE'] = df_results['values'].values
# File name kept unchanged so existing references to it keep working.
df_sub.to_csv("fasttext-cubic.csv",index=False)


# Modelling

- We use an embedding dimension of 50 throughout

## Intro to 1D CNNs and Word Embeddings

**The Analogues**
- 1 image = 1 Asin Full Text
- Image Channels = Embedding Dimensions
- Images are 2D (depth is 3 channels), Text is 1D sequence (depth is 50 embedding dimensions)
- Edges/Gradients/Patterns in images = Text Phrases and important multi-word sequences

**Understanding the structure of each row**

Initially we have : Words -> Full text

Finally after embedding we get : Each word as a list of 50 numbers, Text as list of Words. 

Single row shape is `(1,50,#Words)`

Full Data shape is `(#Num examples, 50, #Word_Per_Row)`

## Winning Model: Using Words and Embedding Layer

## Other Models

- Model 2: Using Characters and Embedding Layer
    - Very little preprocessing is needed, since the set of characters used is only around 128.
    - Since the sequence length is very long, this model is harder to tune, though.
- Model 3: Using Pretrained Glove-twitter-50 Embeddings
- Model 4: The Fallen Ensemble

# Inspecting the Model

## Inspecting Embeddings

- Word Cloud and hue on price bucket (Select top 10 words from each bucket)
- 

## Model Params and Summary

## Model Image representation

In [2]:
# Ensemble's image

## How Do Costly vs Cheaper Asins Look in Images

- We use a 3 channel intermediate layer and use it here
- We check 5 images from each price bucket

## Error Inspection

- We inspect top 10 Asins by RMSE, MAE, MAPE to see if we can change our preprocessing style

# Observations and Suggestion for Improvements

## Observations

- Finding Good Learning rate was important
- Model Trained Embeddings did better than Pretrained Embeddings (Data size may be a reason)
- Going Wide Helped
- The P3.8x Large SageMaker instance is multi-GPU (8 total); using all of them needs separate TensorFlow sessions, so I used separate notebooks
- Normalizing word vectors for the pretrained model lowered performance: word-vector lengths matter for text classification tasks, whereas for NLP tasks like similarity they don't, since cosine distance is used.
- The Process of using Word CNNs is easily carried over to Sub-categories Classification
- Why did ensembling not work?
    - Models with dropouts are like ensembles themselves
    - Only significant architecture or preprocessing step difference will help
- Why did the char CNN not work?
    - Long training time and huge number of computations
- Following a structured process was important
    - You might think something will improve the results but trying things without reasoning usually gives sub-par results

In [3]:
# Target Distribution Histogram


## Improvements

- Training with Log/CubeRoot/Standard Scaling of target since target is skewed.
- Higher Embedding Dimensions
- Pretrained Fasttext or Bert/Elmo Models for text representation
- Inspecting Errors and finding text patterns we missed
- Trying POS tags with lemmatized words
- Bucketing/Quantizing: Dividing prices into cube-rooted buckets and then feeding the top 10 bucket predictions to the final model.
- LR scheduling, Cyclic LR and Super Convergence

## References

# Asin Classification into Sub-Categories