## Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import re

from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder

from keras.layers import Input, Dense
from keras.models import Model

from scipy.sparse import hstack
from sklearn.metrics import mean_squared_log_error

import joblib
import gc

Using TensorFlow backend.


In [2]:
import warnings
warnings.filterwarnings('ignore')

## Defining Utility Functions

In [3]:
def clean_df(df):
    '''
    Input  -> Raw Pandas Dataframe (must contain a 'price' column)
    Output -> Pandas Dataframe restricted to the rows with 3 <= price <= 2000
    Task   -> Keep only the listings whose price lies inside the range that the Mercari Platform allows
    '''
    price = df['price']
    # between() is inclusive on both ends, i.e. 3 and 2000 themselves are kept
    within_limits = price.between(3, 2000)
    return df[within_limits]

In [4]:
def text_preprocessing(sent):
    '''
    Input  -> Raw text (string)
    Output -> Cleaned Text (string): lower-cased, lemmatized tokens joined by single spaces
    Task   -> The objective of this function is to clean the text and make it suitable for Bag of Words/TF-IDF vectorization
              Steps: expand negative contractions, drop literal "\\r"/"\\n" sequences and every character that is not a
              letter, a digit or a period, drop periods that do not touch a digit, collapse whitespace, lower-case,
              remove stop words and lemmatize

    Relies on the module-level names regex_special_chars, regex_decimal_digits, regex_white_space and stop_words
    '''
    # Decontraction
    # Matching is case-insensitive because the text is only lower-cased further down; otherwise "Don't" / "WON'T"
    # would skip the expansion and lose their "not"
    contractions = (
        (r"aren\'t", "are not"),
        (r"didn\'t", "did not"),
        (r"can\'t", "can not"),
        (r"couldn\'t", "could not"),
        (r"won\'t", "will not"),
        (r"wouldn\'t", "would not"),
        (r"haven\'t", "have not"),
        (r"shouldn\'t", "should not"),
        (r"doesn\'t", "does not"),
        (r"don\'t", "do not"),
        (r"mustn\'t", "must not"),
        (r"needn\'t", "need not"),
    )
    for pattern, expansion in contractions:
        sent = re.sub(pattern, expansion, sent, flags=re.IGNORECASE)

    #Removing the literal two-character sequences backslash-r and backslash-n
    #(real newline characters are handled by regex_special_chars below)
    sent = sent.replace('\\r', ' ')
    sent = sent.replace('\\n', ' ')

    #Removing all special characters except the period
    sent = regex_special_chars.sub(' ', sent)

    #Removing periods which are neither followed nor preceded by a digit
    #Ref: https://stackoverflow.com/questions/6599646/remove-decimal-point-when-not-between-two-digits
    sent = regex_decimal_digits.sub(' ', sent)

    #Converting multiple white spaces to single white space
    sent = regex_white_space.sub(' ', sent)

    #Removing space at starting and ending and converting to lower case
    sent = sent.strip().lower()

    sent_list = sent.split()

    #Stop words are filtered out before lemmatization, so only the surviving tokens are lemmatized
    lem = WordNetLemmatizer()
    text = [lem.lemmatize(word) for word in sent_list if word not in stop_words]
    sent = " ".join(text)

    return sent

In [5]:
#Defining the pre-compiled regexes which are used in the function text_preprocessing() to clean the text
#Raw strings are used so that "\d", "\." and "\s" reach the regex engine as written instead of being parsed as
#(invalid) Python string escapes
regex_special_chars = re.compile(r'[^A-Za-z0-9.]+')     # runs of anything that is not a letter, digit or period
regex_decimal_digits = re.compile(r'(?<!\d)\.(?!\d)')   # a period with no digit directly before and none directly after
regex_white_space = re.compile(r'\s+')                  # runs of whitespace

#Creating a slightly modified list of stopwords which does not contain "no", "nor" or "not"
stop_words = set(stopwords.words("english")) - {"no", "nor", "not"}

In [6]:
def preprocess_df(df):
    '''
    This function does preprocessing of the DataFrame including tasks like imputing the null values, joining multiple columns
    together and then applying the text_preprocessing() method defined above
    Input  -> Pandas DataFrame (Raw) with the columns 'name', 'brand_name', 'item_description', 'category_name',
              'shipping' and 'item_condition_id'. The input frame is NOT modified
    Output -> New Pandas DataFrame (Cleaned) with the columns 'name', 'text', 'shipping', 'item_condition_id'
    Task   -> Dealing with null values, Cleaning 'item_description' column, joining together various text fields and performing
              text preprocessing defined above
    '''
    text_columns = ['name', 'brand_name', 'item_description', 'category_name']

    #Work on a copy of just the needed columns so that the caller's DataFrame is left untouched
    df = df[text_columns + ['shipping', 'item_condition_id']].copy()

    #Fill Null values of the text fields with ''
    for col in text_columns:
        df[col] = df[col].fillna('')

    #Convert the placeholder "No description yet" to ''
    #case=False because this runs on the raw (not yet lower-cased) text
    df['item_description'] = df['item_description'].str.replace('^no description yet$', '', case=False, regex=True)

    #Combine various text fields to one single field is inspired from Kaggle Winners' solution
    #This helps in controlling the number of features generated when vectorization is applied on a text column
    #Ref: https://github.com/pjankiewicz/mercari-solution

    df['name'] = df['name'] + " " + df['brand_name']
    df['text'] = df['item_description'] + " " + df['name'] + " " + df['category_name']

    #Series.map is used per column instead of the deprecated DataFrame.applymap
    for col in ('name', 'text'):
        df[col] = df[col].map(text_preprocessing)

    return df[['name', 'text', 'shipping', 'item_condition_id']]

In [7]:
def tfidf_encoder(train_data, test_data, N_GRAMS = 1):
    '''
    TF-IDF encode two text collections with a vectorizer learned on the first one

    Input ->

        train_data       : Iterable of strings (list, array or Pandas Series); the vocabulary and IDF weights are learned here
        test_data        : Iterable of strings (list, array or Pandas Series); only transformed, never fitted on
        N_GRAMS(int)     : Largest n-gram size to generate. The n-gram range passed to the vectorizer is (1, N_GRAMS), so
                           N_GRAMS = 2 yields uni-grams and bi-grams. Defaults to 1 (uni-grams only)

    Output -> Tuple (train_matrix, test_matrix) of TF-IDF matrices produced by sklearn's TfidfVectorizer()

    Task   -> Fit the vectorizer on train_data, then transform both train_data and test_data with it
              Terms occurring in fewer than 3 training documents are dropped and at most 100000 features are kept
    '''
    tfidf_settings = {
        'min_df': 3,
        'max_features': 100000,
        'ngram_range': (1, N_GRAMS),
        'strip_accents': 'unicode',
        'analyzer': 'word',
        'token_pattern': r'\w{1,}',
    }
    tfidf = TfidfVectorizer(**tfidf_settings)

    train_matrix = tfidf.fit_transform(train_data)
    test_matrix = tfidf.transform(test_data)
    return train_matrix, test_matrix

In [8]:
def one_hot_encoder(train_data, test_data):
    '''
    One hot encode the train data and the test data with an encoder learned on the train data
    Input ->
        train_data : Training values the encoder is fitted on (the caller in this notebook passes a column reshaped to (-1, 1))
        test_data  : Testing values to be encoded with the categories learned from train_data
    Output -> Tuple (train_encoded, test_encoded) of one hot encoded matrices
    Task   -> Fit sklearn's OneHotEncoder() (default settings) on train_data and use it to transform both inputs
    '''
    encoder = OneHotEncoder()
    encoder.fit(train_data)
    return encoder.transform(train_data), encoder.transform(test_data)

In [9]:
def mlp_model_1(train_shape):
    '''
    Task  -> Build (without compiling) the smaller of the two MLPs
             Architecture: Sparse Input (train_shape) -> Dense (256) -> Dense (128) -> Dense (1)
             Hidden layers use ReLu; the single output unit keeps the default linear activation (f(x) = x)

    Input  -> train_shape: Number of input features of the data which will be fed to the MLP

    Output -> Uncompiled Keras Model
    '''
    sparse_input = Input(shape=(train_shape,), dtype='float32', sparse=True)

    hidden = sparse_input
    for units in (256, 128):
        hidden = Dense(units, activation='relu')(hidden)

    prediction = Dense(1)(hidden)
    return Model(sparse_input, prediction)
    
def mlp_model_2(train_shape):
    '''
    Task  -> Build (without compiling) the deeper of the two MLPs
             Architecture: Sparse Input (train_shape) -> Dense (1024) -> Dense (512) -> Dense (256) -> Dense (128)
             -> Dense (64) -> Dense (32) -> Dense (1)
             Hidden layers use ReLu; the single output unit keeps the default linear activation (f(x) = x)

    Input  -> train_shape: Number of input features of the data which will be fed to the MLP

    Output -> Uncompiled Keras Model
    '''
    sparse_input = Input(shape=(train_shape,), dtype='float32', sparse=True)

    hidden = sparse_input
    for units in (1024, 512, 256, 128, 64, 32):
        hidden = Dense(units, activation='relu')(hidden)

    prediction = Dense(1)(hidden)
    return Model(sparse_input, prediction)

In [10]:
def _train_mlp(build_model, X_train, y_train_log):
    '''
    Build, compile and train one MLP
    Input  -> build_model : function taking the number of input features and returning an uncompiled Keras model
              X_train     : training feature matrix
              y_train_log : log1p-transformed prices, shape (-1, 1)
    Output -> Trained Keras model (Adam optimizer, MSE loss, one epoch at each of the batch sizes 256, 512 and 1024)
    '''
    model = build_model(X_train.shape[1])
    model.compile(optimizer='adam', loss='mean_squared_error')
    for batch_size in (256, 512, 1024):
        model.fit(X_train, y_train_log, batch_size = batch_size, epochs = 1, verbose = 1)
    return model


def pipeline_function(test_datapoint):
    '''
    Input:  Pandas DataFrame holding the datapoint(s) to be priced (same columns as the raw data that preprocess_df() reads)
    Output: Numpy array of shape (number of datapoints, 1) with the predicted price(s)
    Task:   This function takes a datapoint as input and returns the target variable, i.e., the predicted price for that
            datapoint. The prediction is the weighted blend 0.42 * MLP-1 + 0.58 * MLP-2

    Note:   Every call reads 'train.tsv' from the working directory, re-fits all encoders and re-trains both MLPs from scratch
    '''
    # Load Training Data
    df_train = pd.read_csv('train.tsv', sep='\t')

    # Clean Training Data
    df_train = clean_df(df_train)

    # Extract the price column that will be used while training models
    y_train     = df_train['price'].values

    # Taking log of the price column so that we can directly optimize for RMSLE
    y_train_log = (np.log1p(y_train)).reshape((-1, 1))

    # Preprocess Training Data
    df_train = preprocess_df(df_train)

    # Preprocess Testing Data
    test_datapoint = preprocess_df(test_datapoint)

    # TF-IDF Encoding "name" and "text" columns
    train_tfidf_vectors_name, test_tfidf_vectors_name = tfidf_encoder(df_train['name'].values,
                                                                      test_datapoint['name'].values,
                                                                      N_GRAMS = 1)

    train_tfidf_vectors_text, test_tfidf_vectors_text = tfidf_encoder(df_train['text'].values,
                                                                      test_datapoint['text'].values,
                                                                      N_GRAMS = 2)

    # One Hot Encoding item_condition_id and shipping columns
    train_item_condition, test_item_condition = one_hot_encoder(np.reshape(df_train['item_condition_id'].values, (-1, 1)),
                                                                np.reshape(test_datapoint['item_condition_id'].values, (-1, 1)))

    train_shipping, test_shipping             = one_hot_encoder(np.reshape(df_train['shipping'].values, (-1, 1)),
                                                                np.reshape(test_datapoint['shipping'].values, (-1, 1)))

    # Combining all the encoded features to create the final train and test data matrices
    X_train = hstack((train_tfidf_vectors_name,
                      train_tfidf_vectors_text,
                      train_item_condition,
                      train_shipping)).tocsr().astype('float32')

    X_test  = hstack((test_tfidf_vectors_name,
                      test_tfidf_vectors_text,
                      test_item_condition,
                      test_shipping)).tocsr().astype('float32')

    # Training MLP Models (MLP-1 is fully trained before MLP-2 is built)
    mlp1 = _train_mlp(mlp_model_1, X_train, y_train_log)
    mlp2 = _train_mlp(mlp_model_2, X_train, y_train_log)

    # Obtaining predictions from MLP-1 and MLP-2
    # The models were trained on log1p(price), so expm1 maps the outputs back to the price scale
    y_pred_mlp_1 = np.expm1((mlp1.predict(X_test)[:, 0]).reshape(-1, 1))
    y_pred_mlp_2 = np.expm1((mlp2.predict(X_test)[:, 0]).reshape(-1, 1))

    # Weighted blend of the two models
    pred_final = 0.42*y_pred_mlp_1 + 0.58*y_pred_mlp_2

    return pred_final

In [11]:
def function_1():
    '''
    This function is used to calculate the predicted price for an input datapoint (selected from the test set)
    It reads 'test.tsv' from the working directory, takes the row at position 21, runs it through pipeline_function()
    and prints both the raw datapoint and the predicted price
    '''
    df_test        = pd.read_csv('test.tsv', sep='\t')
    test_datapoint = df_test[21:22]
    # A copy is handed to the pipeline because its preprocessing may write into the frame it receives;
    # this keeps the datapoint printed below in its raw form
    pred_final     = pipeline_function(test_datapoint.copy())
    print("Input datapoint is: \n", test_datapoint.values, "\n")
    print("Final predicted price for the given datapoint is =", np.round(pred_final, 2))

In [8]:
# Demo: price one listing taken from the test set (function_1 runs the full pipeline, which trains both MLPs)
function_1()

Input datapoint is: 
 [[21 'iPhone 6:6S Case Marble **SALE*' 1
  'Electronics/Cell Phones & Accessories/Cases, Covers & Skins' 'Apple' 1
  '**MESSAGE THE MARBLE COLOR AND SIZE YOU WOULD LIKE AFTER YOU ORDER** BUNDLE PROMOTION: ONE FOR [rm] OR PICK ANY TWO COLORS FOR ONLY [rm]!! (Get one for yourself and another for your friend or family!) Up for sale is a BRAND NEW Marble Pattern Case (Color Options: White/Black/Milky Blue/Pink) Designed Perfectly to fit BOTH iPhone 6 & 6S 4.7 ALSO AVAILABLE for iPhone 6/6S PLUS MODELS" Design Spec: - Easy Access to ALL ports - High Quality TPU Shell Shock Proof - Slim and Light Weighto - 360 Silicone Grip & Protection FAST FREE SHIPPING! All our products are shipped via USPS First Class with real- time TRACKING! CUSTOMER SERVICE We are an experienced seller and for us customer satisfaction is our priority. We work HARD to make sure all our customers are 100% satisfied! We respond quick so feel free to reach out to us with any questions! PRICED TO SELL

In [12]:
def rmsle(y_true, y_pred):
    '''
    Root Mean Squared Logarithmic Error between the true and the predicted values

    Input  -> y_true : true target value(s), scalar or array-like
              y_pred : predicted value(s), scalar or array-like of the same shape
    Output -> Single float, sqrt(mean((log1p(y_true) - log1p(y_pred))**2))
              For one pair of scalars this is simply |log1p(y_true) - log1p(y_pred)|
    '''
    log_diff = np.log1p(y_true) - np.log1p(y_pred)
    # Averaging over all points is what makes this a *mean* squared error; without the mean the expression
    # only gives the element-wise absolute log difference
    return np.sqrt(np.mean(log_diff ** 2))

In [15]:
def function_2():
    '''
    This function is used to calculate the predicted price for an input datapoint (selected from the train set) and the
    associated RMSLE calculated with the original datapoint

    Note: the selected row comes from 'train.tsv', the same file pipeline_function() trains on, so the reported
    error is an in-sample error
    '''
    df_train = pd.read_csv('train.tsv', sep='\t')
    df_train = clean_df(df_train)
    test_datapoint = df_train[1:2]
    price_original = test_datapoint['price'].values[0]
    # A copy is handed to the pipeline because its preprocessing may write into the frame it receives;
    # this keeps the datapoint printed below in its raw form
    pred_final = pipeline_function(test_datapoint.copy())[0][0]
    print("Input datapoint is: \n", test_datapoint.values[0], "\n")
    print("Final predicted price for the given datapoint is =", np.round(pred_final, 2))
    # Arguments follow the rmsle(y_true, y_pred) signature
    print("RMSLE for the given datapoint is = ", rmsle(price_original, pred_final))

In [9]:
# Demo: price one listing taken from the train set and report its error against the known price
function_2()

Input datapoint is: 
 [1 'Razer BlackWidow Chroma Keyboard' 3
 'Electronics/Computers & Tablets/Components & Parts' 'Razer' 52.0 0
 'This keyboard is in great condition and works like it came out of the box. All of the ports are tested and work perfectly. The lights are customizable via the Razer Synapse app on your PC.'] 

Final predicted price for the given datapoint is = 56.21
RMSLE for the given datapoint is =  0.07643717427746788
