In [1]:
# Mount Google Drive at /content/drive; the pickles and model checkpoints loaded
# further down in this notebook are all read from paths under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import tensorflow as tf
from numpy import array
from numpy import asarray
from numpy import zeros
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Embedding, Conv1D, Conv2D, MaxPool1D, MaxPool2D, AveragePooling1D, GlobalAveragePooling1D, \
                                    Flatten, Dropout, Dense, BatchNormalization, LayerNormalization, Concatenate
from tensorflow.keras.layers import ReLU
from tensorflow.keras.models import Model
from tensorflow.keras import initializers, regularizers
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import LSTM

In [3]:
# from google.colab import drive
# drive.mount('/content/drive')
import warnings
warnings.filterwarnings("ignore")
import gc

import pandas as pd
import numpy as np
import string
import math
from tqdm.notebook import tqdm
tqdm.pandas()

import xgboost
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
import nltk
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer 
from nltk.tokenize import word_tokenize
# Download the NLTK data packages this notebook asks for (a no-op when they are
# already present, as the captured output below shows).
# NOTE(review): of the code visible in this notebook only word_tokenize /
# PorterStemmer are used; 'wordnet' and the POS tagger may be leftovers - confirm.
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
import scipy
from scipy.sparse import hstack

from sklearn.metrics import mean_squared_log_error
import pickle
import regex as re
import os
# Directory the Kaggle CLI is pointed at for its configuration; used by the
# `!kaggle competitions download` cell near the end of the notebook.
# NOTE(review): presumably this Drive folder holds kaggle.json - confirm.
os.environ['KAGGLE_CONFIG_DIR'] = '/content/drive/My Drive/Colab Notebooks/'

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


## **MLP**

### **Feature Engineering for MLP**

#### **Filling missing values**

In [6]:
def fill_missing_values(df):
    """Replace missing values in the four text columns with sentinel tokens.

    The frame is modified in place (callers in this notebook rely on that) and
    also returned for convenience.

    Sentinels: ``name`` -> 'unk_name', ``category_name`` -> 'unk_cat',
    ``brand_name`` -> 'unk_brand', ``item_description`` -> 'unk_descr'.

    :param df: DataFrame containing the four columns listed above.
    :return: the same DataFrame object, with the NaNs in those columns filled.
    """
    placeholders = {
        'name': 'unk_name',
        'category_name': 'unk_cat',
        'brand_name': 'unk_brand',
        'item_description': 'unk_descr',
    }
    for column, token in placeholders.items():
        # Assign the result back instead of `df[column].fillna(..., inplace=True)`:
        # the chained in-place form operates on a temporary Series and silently
        # does nothing once pandas Copy-on-Write is active.
        df[column] = df[column].fillna(token)
    return df

#### **Text Pre-processing**

In [7]:
import re
import string

def decontract_text(phrase):
    """Expand common English contractions in ``phrase``.

    The input is coerced with ``str()`` first, so non-string values (numbers,
    NaN) are accepted. Rules are applied one after another in the order listed
    below; the generic ``n't`` rule runs first, which is why "can't" becomes
    "ca not".

    :param phrase: any value; converted to ``str`` before processing.
    :return: the text with contractions replaced by their expanded forms.
    """
    expansions = (
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    text = str(phrase)
    for pattern, replacement in expansions:
        text = re.sub(pattern, replacement, text)
    return text

def stem_sentence(sentence):
    """Tokenize ``sentence`` with NLTK and return its Porter-stemmed tokens joined by spaces.

    :param sentence: text to stem.
    :return: a single string of stemmed tokens separated by one space.
    """
    stemmer = PorterStemmer()
    return " ".join(stemmer.stem(token) for token in word_tokenize(sentence))

# Stop words adapted from https://gist.github.com/sebleier/554280.
# Negation words ('no', 'nor', 'not', ...) are intentionally absent from this
# collection, so they survive the filtering below.
# Kept as a module-level frozenset so it is built once rather than on every call
# and so each token lookup is a hash probe instead of a linear list scan.
_MLP_STOPWORDS = frozenset([
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've",
    "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself',
    'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their',
    'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those',
    'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does',
    'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of',
    'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after',
    'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further',
    'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',
    'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very',
    's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're',
    've', 'y',
])


def preprocess_descriptive_text_column(sentance):
    """Clean one free-text value (item name or description) for the MLP features.

    Steps, in order: expand contractions, blank out the literal two-character
    sequences backslash-r, backslash-n and backslash-quote, replace every run of
    non-alphanumeric characters with a space, drop stop words
    (case-insensitively), lower-case, then Porter-stem each remaining token.

    :param sentance: raw text value; coerced to ``str`` by ``decontract_text``.
    :return: the cleaned, lower-cased, stemmed text as a single string.
    """
    sent = decontract_text(sentance)
    # These are escaped sequences stored as text, not real control characters.
    for escaped in ('\\r', '\\n', '\\"'):
        sent = sent.replace(escaped, ' ')
    sent = re.sub('[^A-Za-z0-9]+', ' ', sent)
    sent = ' '.join(e for e in sent.split() if e.lower() not in _MLP_STOPWORDS)

    root_sent = stem_sentence(sent.lower().strip())
    return root_sent

#### **Fill Missing Brand Names**

In [8]:
def brand_guesser(df, existing_brands, brand_names_categories):
    """Infer a brand for rows whose ``brand_name`` is the 'unk_brand' sentinel.

    For each such row the first entry of ``existing_brands`` that (a) occurs as
    a substring of the lower-cased item name and (b) lists the row's
    ``category_name`` among its known categories is used. Rows with no match
    keep 'unk_brand'; rows that already have a brand are left untouched.

    :param df: DataFrame with ``brand_name``, ``name`` and ``category_name`` columns.
    :param existing_brands: iterable of candidate brand strings, tried in order.
    :param brand_names_categories: mapping brand -> collection of category names
        that brand appears in. A brand missing from the mapping is treated as
        having no categories rather than raising ``KeyError``.
    :return: the same DataFrame object with ``brand_name`` overwritten in place.
    """
    filled_brands = []
    for brand_name, item_name, category in df[['brand_name', 'name', 'category_name']].values:
        if brand_name != 'unk_brand':
            filled_brands.append(brand_name)
            continue

        # Lower-case once per row instead of once per candidate brand.
        name_lower = str(item_name).lower()
        guess = next(
            (brand for brand in existing_brands
             if brand in name_lower and category in brand_names_categories.get(brand, ())),
            'unk_brand',
        )
        filled_brands.append(guess)

    df['brand_name'] = filled_brands
    return df

#### **Split Categories**

In [9]:
def split_text(text):
    """Split a '/'-separated category path into exactly three levels.

    The 'unk_cat' sentinel maps to three "No Label" entries. Paths with more
    than three segments keep only the first three; paths with fewer are padded
    with "No Label".

    Always returning three items matters because ``split_categories`` unpacks
    ``zip(*rows)`` into three columns. On a large batch ``zip`` happens to
    truncate every row to the shortest one, but a small or single-row batch
    containing a 2- or 5-segment category would otherwise fail to unpack.

    :param text: category string such as "Men/Tops/T-shirts", or 'unk_cat'.
    :return: list of three strings ``[general_cat, subcat_1, subcat_2]``.
    """
    if text == 'unk_cat':
        return ["No Label", "No Label", "No Label"]
    levels = text.split("/")[:3]
    levels += ["No Label"] * (3 - len(levels))
    return levels

def split_categories(df):
    """Expand ``category_name`` into ``general_cat``, ``subcat_1`` and ``subcat_2``.

    The three new columns are written onto ``df`` itself; the returned frame is
    a new object without the ``category_name`` column.

    :param df: DataFrame with a ``category_name`` column (NaNs already filled).
    :return: DataFrame with the three level columns and no ``category_name``.
    """
    level_rows = df['category_name'].apply(split_text)
    df['general_cat'], df['subcat_1'], df['subcat_2'] = zip(*level_rows)
    return df.drop('category_name', axis=1)

#### **Adding item-description and item-name word-count features**

In [10]:
def get_len_feature(col_series, scaler_text_len):
    """Return the scaled whitespace-token count of every entry in ``col_series``.

    :param col_series: pandas Series of strings.
    :param scaler_text_len: already-fitted object exposing ``transform``; it is
        called with an ``(n_rows, 1)`` array of counts.
    :return: whatever ``scaler_text_len.transform`` returns for that array.
    """
    def _count_tokens(text):
        return len(text.split())

    token_counts = col_series.apply(_count_tokens).values
    return scaler_text_len.transform(token_counts.reshape(-1, 1))

#### **Add is_expensive**

In [11]:
def get_is_expensive_feature(df, expensive_brands):
    """Add a binary ``is_expensive`` column: 1 when ``brand_name`` is in ``expensive_brands``.

    :param df: DataFrame with a ``brand_name`` column; modified in place.
    :param expensive_brands: collection of brand names considered expensive.
    :return: the same DataFrame object with the new column attached.
    """
    def _flag(brand):
        if brand in expensive_brands:
            return 1
        return 0

    df['is_expensive'] = df['brand_name'].apply(_flag)
    return df

#### **Make Shipping data sparse**

In [12]:
def get_shipping_feature(df):
    sparse_shipping = scipy.sparse.csr_matrix(df['shipping'].values)
    sparse_shipping = sparse_shipping.reshape(-1,1) # Now the shape will be (1111901, 1)
    return sparse_shipping

#### **Vectorizing data**

In [13]:
def vectorize_data(col_data, vectorizer):
    """Apply an already-fitted vectorizer to ``col_data``.

    :param col_data: the values to encode, passed straight to ``transform``.
    :param vectorizer: fitted object exposing ``transform``.
    :return: the result of ``vectorizer.transform(col_data)``.
    """
    return vectorizer.transform(col_data)

#### **Feature Engineering pipeline**

In [14]:
def feature_pipeline_mlp(X_data, existing_brands, brand_names_categories, expensive_brands,
                         general_cat_vectorizer, subcat_1_vectorizer, subcat_2_vectorizer,
                         brand_name_vectorizer, item_name_vectorizer, item_desc_vectorizer,
                         scaler_name_len, scaler_desc_len):
    """Turn raw listing rows into the sparse feature matrix fed to the MLP.

    Column blocks of the result, left to right: general_cat, subcat_1,
    subcat_2, brand_name, item name, item description (all vectorized), scaled
    description length, scaled name length, ``item_condition_id``, ``shipping``.

    Side effect: the missing-value fill, text clean-up and brand guessing are
    written onto the ``X_data`` object that was passed in, so the caller's
    frame is modified.

    :param X_data: DataFrame with name, item_condition_id, category_name,
        brand_name, shipping and item_description columns.
    :param existing_brands: candidate brands for ``brand_guesser``.
    :param brand_names_categories: brand -> categories mapping for ``brand_guesser``.
    :param expensive_brands: brands flagged by ``get_is_expensive_feature``.
    :param general_cat_vectorizer, subcat_1_vectorizer, subcat_2_vectorizer,
        brand_name_vectorizer, item_name_vectorizer, item_desc_vectorizer:
        fitted vectorizers, one per column.
    :param scaler_name_len, scaler_desc_len: fitted scalers for the word counts.
    :return: CSR matrix with one row per input row.
    """
    # --- clean-up -----------------------------------------------------------
    X_data = fill_missing_values(X_data)
    for text_column in ('item_description', 'name'):
        X_data[text_column] = X_data[text_column].apply(preprocess_descriptive_text_column)
    X_data['brand_name'] = X_data['brand_name'].apply(lambda x: str(x).lower())

    # --- derived columns ----------------------------------------------------
    X_data = brand_guesser(X_data, existing_brands, brand_names_categories)
    X_data = split_categories(X_data)

    name_len = get_len_feature(X_data['name'], scaler_name_len)
    desc_len = get_len_feature(X_data['item_description'], scaler_desc_len)

    # Adds the 'is_expensive' column to X_data. Note that this column is NOT
    # one of the blocks stacked below; the call is retained as-is.
    get_is_expensive_feature(X_data, expensive_brands)

    sparse_shipping = get_shipping_feature(X_data)

    # --- vectorization ------------------------------------------------------
    categorical_specs = (
        ('general_cat', general_cat_vectorizer),
        ('subcat_1', subcat_1_vectorizer),
        ('subcat_2', subcat_2_vectorizer),
        ('brand_name', brand_name_vectorizer),
    )
    categorical_blocks = [
        vectorize_data(X_data[column].values.astype('U'), vectorizer)
        for column, vectorizer in categorical_specs
    ]
    item_name_ohe = vectorize_data(X_data['name'], item_name_vectorizer)
    item_desc_ohe = vectorize_data(X_data['item_description'], item_desc_vectorizer)

    # --- assemble -----------------------------------------------------------
    condition_column = X_data['item_condition_id'].values.reshape(-1, 1)
    feature_blocks = (*categorical_blocks, item_name_ohe, item_desc_ohe,
                      desc_len, name_len, condition_column, sparse_shipping)
    return hstack(feature_blocks).tocsr()

## **CNNs**

### **Feature Engineering for CNNs**

#### **Text Pre-processing**

In [15]:
import re
import string

def decontract_text(phrase):
    """Expand common English contractions in ``phrase`` (coerced to ``str`` first).

    This repeats the definition given in the MLP section of the notebook with
    the same rules in the same order: ``n't``, ``'re``, ``'s``, ``'d``,
    ``'ll``, ``'t``, ``'ve``, ``'m``.

    :param phrase: any value; converted to ``str`` before processing.
    :return: the text with contractions replaced by their expanded forms.
    """
    patterns = (r"n\'t", r"\'re", r"\'s", r"\'d", r"\'ll", r"\'t", r"\'ve", r"\'m")
    replacements = (" not", " are", " is", " would", " will", " not", " have", " am")

    expanded = str(phrase)
    for pattern, replacement in zip(patterns, replacements):
        expanded = re.sub(pattern, replacement, expanded)
    return expanded

# Stop words adapted from https://gist.github.com/sebleier/554280.
# Negation words ('no', 'nor', 'not', ...) are intentionally absent from this
# collection, so they survive the filtering below.
# Kept as a module-level frozenset so it is built once rather than on every call
# and so each token lookup is a hash probe instead of a linear list scan.
_CNN_STOPWORDS = frozenset([
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've",
    "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself',
    'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their',
    'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those',
    'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does',
    'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of',
    'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after',
    'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further',
    'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',
    'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very',
    's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're',
    've', 'y',
])


def preprocess_descriptive_text_column_cnn(sentance):
    """Clean one free-text value for the CNN inputs (no stemming).

    Steps, in order: expand contractions, blank out the literal two-character
    sequences backslash-r, backslash-n and backslash-quote, replace every run of
    non-alphanumeric characters with a space, drop stop words
    (case-insensitively), and lower-case the result.

    :param sentance: raw text value; coerced to ``str`` by ``decontract_text``.
    :return: the cleaned, lower-cased text as a single string.
    """
    sent = decontract_text(sentance)
    # These are escaped sequences stored as text, not real control characters.
    for escaped in ('\\r', '\\n', '\\"'):
        sent = sent.replace(escaped, ' ')
    sent = re.sub('[^A-Za-z0-9]+', ' ', sent)
    sent = ' '.join(e for e in sent.split() if e.lower() not in _CNN_STOPWORDS)
    return sent.lower()

#### **Feature Engineering pipeline**

In [16]:
def feature_pipeline_cnn(X_data, existing_brands, brand_names_categories, expensive_brands, scaler_name_len, scaler_desc_len):
    """Prepare raw listing rows as a DataFrame ready for ``get_cnn_inputs``.

    Compared with the input, the returned frame has cleaned text columns, a
    guessed ``brand_name`` where it was missing, the three category-level
    columns in place of ``category_name``, scaled ``name_len`` / ``desc_len``
    columns and the binary ``is_expensive`` column.

    Side effect: the fill, text clean-up and brand guessing are written onto
    the ``X_data`` object that was passed in.

    :param X_data: DataFrame with name, item_condition_id, category_name,
        brand_name, shipping and item_description columns.
    :param existing_brands: candidate brands for ``brand_guesser``.
    :param brand_names_categories: brand -> categories mapping for ``brand_guesser``.
    :param expensive_brands: brands flagged by ``get_is_expensive_feature``.
    :param scaler_name_len, scaler_desc_len: fitted scalers for the word counts.
    :return: the engineered DataFrame.
    """
    X_data = fill_missing_values(X_data)

    # The description goes through the non-stemming CNN cleaner, while the name
    # goes through the stemming cleaner (both marked "temp step" by the author).
    text_cleaners = (
        ('item_description', preprocess_descriptive_text_column_cnn),
        ('name', preprocess_descriptive_text_column),
    )
    for column, cleaner in text_cleaners:
        X_data[column] = X_data[column].apply(cleaner)
    X_data['brand_name'] = X_data['brand_name'].apply(lambda x: str(x).lower())

    X_data = brand_guesser(X_data, existing_brands, brand_names_categories)
    X_data = split_categories(X_data)

    length_specs = (
        ('name_len', 'name', scaler_name_len),
        ('desc_len', 'item_description', scaler_desc_len),
    )
    for target_column, source_column, scaler in length_specs:
        X_data[target_column] = get_len_feature(X_data[source_column], scaler)

    return get_is_expensive_feature(X_data, expensive_brands)

### **Get Input for CNNs**

In [17]:
def categorical_embeddings(cat_data, le):
    """Encode a categorical column with an already-fitted label encoder.

    :param cat_data: pandas Series; its ``.values`` array is what gets encoded.
    :param le: fitted encoder exposing ``transform``.
    :return: the result of ``le.transform`` on the column values.
    """
    return le.transform(cat_data.values)

In [18]:
def text_embeddings(text, tokenizer, max_len_doc):
    """Convert texts into integer sequences padded to ``max_len_doc``.

    :param text: iterable of strings.
    :param tokenizer: fitted tokenizer exposing ``texts_to_sequences``.
    :param max_len_doc: target sequence length passed to ``pad_sequences``.
    :return: the padded array produced by ``pad_sequences`` with
        ``padding='post'`` (padding added after each sequence).
    """
    token_id_sequences = tokenizer.texts_to_sequences(text)
    return pad_sequences(token_id_sequences, maxlen=max_len_doc, padding='post')

In [19]:
def get_cnn_inputs(X_data, desc_tokenizer, name_tokenizer, desc_max_len_doc, name_max_len_doc, bn_le_ext, gc_le_ext, sc1_le_ext, sc2_le_ext):
    """Assemble the list of model inputs for the CNN models.

    The returned list has ten entries, in this order: padded description
    tokens, padded name tokens, encoded brand_name, general_cat, subcat_1,
    subcat_2, one-hot item_condition_id (depth 5), one-hot shipping (depth 2),
    one-hot is_expensive (depth 2), and a two-column numeric array
    ``[desc_len, name_len]``.

    :param X_data: DataFrame produced by ``feature_pipeline_cnn``.
    :param desc_tokenizer, name_tokenizer: fitted tokenizers for the two text columns.
    :param desc_max_len_doc, name_max_len_doc: padded lengths for the two text columns.
    :param bn_le_ext, gc_le_ext, sc1_le_ext, sc2_le_ext: fitted encoders for
        brand_name, general_cat, subcat_1 and subcat_2 respectively.
    :return: list of ten arrays/tensors as described above.
    """
    # Text inputs
    desc_text_padded = text_embeddings(X_data['item_description'].apply(str), desc_tokenizer, desc_max_len_doc)
    name_text_padded = text_embeddings(X_data['name'].apply(str), name_tokenizer, name_max_len_doc)

    # Categorical inputs
    encoder_by_column = (
        ('brand_name', bn_le_ext),
        ('general_cat', gc_le_ext),
        ('subcat_1', sc1_le_ext),
        ('subcat_2', sc2_le_ext),
    )
    categorical_inputs = [
        categorical_embeddings(X_data[column], encoder)
        for column, encoder in encoder_by_column
    ]

    # Numeric inputs
    numeric_input = pd.concat((X_data['desc_len'], X_data['name_len']), axis=1).to_numpy()

    # One-hot inputs
    # NOTE(review): tf.one_hot emits an all-zero row for an index >= depth; if
    # item_condition_id can equal 5 it gets no active slot with depth 5 - confirm
    # this matches how the models were trained.
    one_hot_inputs = [
        tf.one_hot(X_data['item_condition_id'], 5),
        tf.one_hot(X_data['shipping'], 2),
        tf.one_hot(X_data['is_expensive'], 2),
    ]

    return [desc_text_padded, name_text_padded, *categorical_inputs, *one_hot_inputs, numeric_input]

# **INFERENCE**

## **Importing pickles and models**

In [20]:
"""
This custom class is required to be loaded in production environment.
"""
from sklearn.preprocessing import LabelEncoder
import numpy as np

class LabelEncoderExt(object):
    """LabelEncoder wrapper that maps unseen values to an extra 'Unknown' class.

    ``fit`` appends the literal label 'Unknown' to the training values, and
    ``transform`` substitutes 'Unknown' for every value that was not seen
    during ``fit`` before delegating to the wrapped encoder.

    Attributes:
        label_encoder: the wrapped ``sklearn.preprocessing.LabelEncoder``.
        classes_: the fitted class array; only set once ``fit`` has been called.
    """

    def __init__(self):
        """Create the wrapper around a fresh, unfitted ``LabelEncoder``."""
        self.label_encoder = LabelEncoder()

    def fit(self, data_list):
        """
        Fit the encoder on all unique values plus the 'Unknown' label.
        :param data_list: iterable of strings
        :return: self
        """
        self.label_encoder = self.label_encoder.fit(list(data_list) + ['Unknown'])
        self.classes_ = self.label_encoder.classes_

        return self

    def transform(self, data_list):
        """
        Encode ``data_list``, assigning values not seen in ``fit`` to 'Unknown'.
        :param data_list: iterable of values to encode
        :return: the ids returned by the wrapped encoder's ``transform``
        """
        # One pass with a hash lookup. The previous implementation rebuilt the
        # whole list once per unseen value and relied on np.unique, which cannot
        # sort mixed str/None input and stringifies other mixed-type input.
        known_classes = set(self.label_encoder.classes_)
        new_data_list = [x if x in known_classes else 'Unknown' for x in data_list]

        return self.label_encoder.transform(new_data_list)

In [21]:
import pickle

# All artifacts live under one project folder on the mounted Drive; the paths
# are built from these two prefixes instead of repeating the full string.
_PROJECT_DIR = '/content/drive/My Drive/Colab Notebooks/Applied AI Assignments/Case Study 1 Mercari Price Suggestion/'
_PICKLED_VARS_DIR = _PROJECT_DIR + 'zzFINAL COMPLETED/AAIC Submission/Pickled vars/'


def _load_pickle(path):
    """Return the single object stored in the pickle file at ``path``.

    NOTE(review): ``pickle.load`` can execute arbitrary code contained in the
    file - only point this at artifacts produced by this project.
    """
    with open(path, 'rb') as f:
        return pickle.load(f)


# Brand lookup data used by brand_guesser / get_is_expensive_feature.
existing_brands = _load_pickle(_PROJECT_DIR + 'brands_list.pkl')
brand_names_categories = _load_pickle(_PROJECT_DIR + 'brand_names_categories_dict.pkl')
expensive_brands = _load_pickle(_PROJECT_DIR + 'expensive_brands.pkl')

# Fitted vectorizers and length scalers, stored together as one 8-item sequence.
(general_cat_vectorizer, subcat_1_vectorizer, subcat_2_vectorizer,
 brand_name_vectorizer, item_name_vectorizer, item_desc_vectorizer,
 scaler_name_len, scaler_desc_len) = _load_pickle(_PICKLED_VARS_DIR + 'vectorizers_scalers.pkl')

#########################################################

# Tokenizers and padded lengths for the CNN text inputs.
desc_tokenizer = _load_pickle(_PICKLED_VARS_DIR + 'desc_tokenizer.pkl')
name_tokenizer = _load_pickle(_PICKLED_VARS_DIR + 'name_tokenizer.pkl')
desc_max_len_doc = _load_pickle(_PICKLED_VARS_DIR + 'desc_max_len_doc.pkl')
name_max_len_doc = _load_pickle(_PICKLED_VARS_DIR + 'name_max_len_doc.pkl')

# Encoders for brand_name, general_cat, subcat_1 and subcat_2 (see get_cnn_inputs).
bn_le_ext = _load_pickle(_PICKLED_VARS_DIR + 'bn_le_ext.pkl')
gc_le_ext = _load_pickle(_PICKLED_VARS_DIR + 'gc_le_ext.pkl')
sc1_le_ext = _load_pickle(_PICKLED_VARS_DIR + 'sc1_le_ext.pkl')
sc2_le_ext = _load_pickle(_PICKLED_VARS_DIR + 'sc2_le_ext.pkl')

In [22]:
# Restore the three trained Keras models used by the ensemble in
# final_inference / final_metric. The checkpoint paths (including the
# "DL_Ensmeble" folder spelling) must match the folders on Drive exactly.

# MLP that consumes the sparse matrix from feature_pipeline_mlp.
mlp_model_path = "/content/drive/My Drive/Colab Notebooks/Applied AI Assignments/Case Study 1 Mercari Price Suggestion/zzFINAL COMPLETED/DL_Ensmeble/Model Checkpoints/Model_MLP_Checkpoints/BEST_MODEL.hdfs"
mlp_model_best = tf.keras.models.load_model(mlp_model_path)

# CNN that consumes the input list from get_cnn_inputs.
cnn_model_path = "/content/drive/My Drive/Colab Notebooks/Applied AI Assignments/Case Study 1 Mercari Price Suggestion/zzFINAL COMPLETED/DL_Ensmeble/Model Checkpoints/Model_CNN_Checkpoints/BEST_CNN_MODEL.hdfs"
cnn_model_best = tf.keras.models.load_model(cnn_model_path)

# Second CNN fed the same input list (NOTE(review): "FT" is not expanded anywhere in this notebook - confirm meaning).
ft_cnn_model_path = "/content/drive/My Drive/Colab Notebooks/Applied AI Assignments/Case Study 1 Mercari Price Suggestion/zzFINAL COMPLETED/DL_Ensmeble/Model Checkpoints/Model_FT_CNN_Checkpoints/BEST_FT_CNN_MODEL.hdfs"
ft_cnn_model_best = tf.keras.models.load_model(ft_cnn_model_path)

## <font color='green'>**Final Inference Function**</font>

In [None]:
def final_inference(X):
    """Predict prices for raw listing rows with the MLP + CNN ensemble.

    Each row of ``X`` must hold, in order: name, item_condition_id,
    category_name, brand_name, shipping, item_description. Every model's raw
    output is passed through ``np.exp`` and the three results are blended with
    weights 0.5 (MLP), 0.3 (FT-CNN) and 0.2 (CNN).

    NOTE(review): both feature pipelines receive the same ``raw_df`` object and
    the MLP pipeline writes its cleaned text back onto it, so the CNN pipeline
    starts from already-preprocessed text - confirm this matches training.

    :param X: list of rows (or anything ``pd.DataFrame`` accepts) with six fields each.
    :return: the weighted ensemble prediction array.
    """
    column_names = ['name', 'item_condition_id', 'category_name', 'brand_name', 'shipping', 'item_description']
    raw_df = pd.DataFrame(X)
    raw_df.columns = column_names

    # Feature engineering for each model family.
    X_data_mlp_inputs = feature_pipeline_mlp(
        raw_df, existing_brands, brand_names_categories, expensive_brands,
        general_cat_vectorizer, subcat_1_vectorizer, subcat_2_vectorizer,
        brand_name_vectorizer, item_name_vectorizer, item_desc_vectorizer,
        scaler_name_len, scaler_desc_len,
    )
    X_data_cnn = feature_pipeline_cnn(
        raw_df, existing_brands, brand_names_categories, expensive_brands,
        scaler_name_len, scaler_desc_len,
    )
    X_cnn_inputs = get_cnn_inputs(
        X_data_cnn, desc_tokenizer, name_tokenizer, desc_max_len_doc, name_max_len_doc,
        bn_le_ext, gc_le_ext, sc1_le_ext, sc2_le_ext,
    )

    # Each model's output is exponentiated before blending.
    mlp_price = np.exp(mlp_model_best.predict(X_data_mlp_inputs))
    ft_cnn_price = np.exp(ft_cnn_model_best.predict(X_cnn_inputs))
    cnn_price = np.exp(cnn_model_best.predict(X_cnn_inputs))

    return (mlp_price*0.5 + ft_cnn_price*0.3 + cnn_price*0.2)

In [37]:
%%time
# Single-row smoke test. Field order follows the columns assigned in
# final_inference: name, item_condition_id, category_name, brand_name,
# shipping, item_description. The brand is deliberately left missing (np.nan).
prediction = final_inference([["MLB Cincinnati Reds T Shirt Size XL", 3, "Men/Tops/T-shirts", np.nan, 1, "No description yet"]])
# [0][0] selects the one predicted value for the one input row.
print("$"+str(prediction[0][0]))

$8.152324
CPU times: user 205 ms, sys: 3.01 ms, total: 208 ms
Wall time: 203 ms


## <font color='green'>**Final Metric Function**</font>

In [26]:
def calculate_metric(y_act, y_pred):
    """Return the root mean squared logarithmic error (RMSLE).

    :param y_act: actual targets; must expose ``.values`` (e.g. a pandas Series).
    :param y_pred: predicted targets, array-like.
    :return: square root of sklearn's ``mean_squared_log_error``.
    """
    msle = mean_squared_log_error(y_act.values, y_pred)
    return np.sqrt(msle)

def final_metric(X, y):
    """Score the MLP + CNN ensemble on ``X`` against ``y`` using RMSLE.

    Predictions are produced the same way as in ``final_inference``: each
    model's output is passed through ``np.exp`` and blended with weights
    0.5 (MLP), 0.3 (FT-CNN) and 0.2 (CNN).

    NOTE(review): ``X`` is handed to both feature pipelines and is modified by
    them (the MLP pipeline writes cleaned text back onto it before the CNN
    pipeline runs) - pass a copy if the caller still needs the raw frame.

    :param X: DataFrame with name, item_condition_id, category_name,
        brand_name, shipping and item_description columns.
    :param y: actual prices; must expose ``.values`` (e.g. a pandas Series).
    :return: the RMSLE of the ensemble prediction.
    """
    # Feature engineering for each model family.
    X_data_mlp_inputs = feature_pipeline_mlp(
        X, existing_brands, brand_names_categories, expensive_brands,
        general_cat_vectorizer, subcat_1_vectorizer, subcat_2_vectorizer,
        brand_name_vectorizer, item_name_vectorizer, item_desc_vectorizer,
        scaler_name_len, scaler_desc_len,
    )
    X_data_cnn = feature_pipeline_cnn(
        X, existing_brands, brand_names_categories, expensive_brands,
        scaler_name_len, scaler_desc_len,
    )
    X_cnn_inputs = get_cnn_inputs(
        X_data_cnn, desc_tokenizer, name_tokenizer, desc_max_len_doc, name_max_len_doc,
        bn_le_ext, gc_le_ext, sc1_le_ext, sc2_le_ext,
    )

    # Each model's output is exponentiated before blending.
    mlp_price = np.exp(mlp_model_best.predict(X_data_mlp_inputs))
    ft_cnn_price = np.exp(ft_cnn_model_best.predict(X_cnn_inputs))
    cnn_price = np.exp(cnn_model_best.predict(X_cnn_inputs))
    ensemble_pred = (mlp_price*0.5 + ft_cnn_price*0.3 + cnn_price*0.2)

    return calculate_metric(y, ensemble_pred)

In [None]:
if 'train.tsv' not in os.listdir(): 
    !kaggle competitions download -c mercari-price-suggestion-challenge
    get_ipython().system_raw("7z x \*.7z && rm *.7z")
    get_ipython().system_raw("7z x \*.zip && rm *.zip")

In [None]:
# Load the tab-separated training file from the working directory.
train_df = pd.read_csv('train.tsv', sep='\t')

In [28]:
# Score the ensemble on the rows of train.tsv. `drop` returns a new frame, so
# train_df itself is not touched by the in-place edits the pipelines make.
metric = final_metric(train_df.drop(['train_id','price'], axis=1), train_df['price'])
print("RMSLE metric =",metric)

RMSLE metric = 0.3763031440856338
