## Final Submission

The Final.ipynb file should contain the following:

    It should have two functions.
    Function-1
        Should include entire pipeline, from data preprocessing to making final predictions.
        It should take in raw data as input.
        It should return predictions for your input. Here the input can be a single point or a set of points.
        def final_fun_1(X):
        .....
        .....
        ..... # you will use the best model that you found out with your experiments
        return predictions made on X ( Raw Data)
    Function-2
        Should include entire pipeline, from data preprocessing to making final predictions.
        It should take in raw data as input along with its target values.
        It should return the metric value that you are judging your models on.
        def final_fun_2(X,Y):
        .....
        .....
        ..... # you will use the best model that you found out with your experiments
        return final_metric computed on X ( Raw Data) and Y (target variable)

In [None]:
# Colab-only: mount Google Drive at /content/gdrive so the files referenced
# below under /content/gdrive/MyDrive/... can be read.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


### Importing all the required libraries

In [None]:
import math
import numpy as np
import pandas as pd
import nltk
import string
import re
import scipy
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from collections import Counter
from textblob import TextBlob
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_regression
import matplotlib.pyplot as plt
import pylab as plot
import seaborn as sn
import joblib
import warnings
from sklearn.preprocessing import OneHotEncoder
import seaborn as sns
from tqdm import tqdm_notebook, tqdm
warnings.filterwarnings('ignore')
nltk.download('wordnet')
nltk.download('vader_lexicon')
nltk.download('stopwords')
nltk.download('omw-1.4')
import gc
from scipy.sparse import hstack
from nltk.corpus import stopwords
from sklearn.metrics import mean_squared_log_error
import tensorflow as tf
from tensorflow.keras import Model


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


### Function -- 1

In [None]:
def function1(X):
    """
    Description -> Runs the whole inference pipeline on raw data and returns
    the predicted prices.

    Parameters
    ----------
    X : pandas.DataFrame
        Raw rows. The pipeline reads the columns 'name', 'brand_name',
        'item_description', 'category_name', 'shipping' and
        'item_condition_id'.

    Returns
    -------
    numpy.ndarray
        One predicted price per row of X: a weighted blend of the two saved
        Keras models, each output mapped through expm1 (presumably the models
        were trained on log1p(price) -- confirm against the training notebook).

    Notes
    -----
    * X itself is never modified; all preprocessing happens on a copy, so the
      function can be called repeatedly on the same frame (e.g. by function2)
      and returns the same predictions every time.
    * The TF-IDF vectorizers and one-hot encoders are re-fitted on the stored
      training frame on every call, so each call pays that cost.
    """
    train_frame_path = '/content/gdrive/MyDrive/binary_files/df_train_26Jan23_v1.joblib'
    model1_path = '/content/gdrive/MyDrive/binary_files/model1.hdf5'
    model2_path = '/content/gdrive/MyDrive/binary_files/model2.hdf5'

    # Work on a private copy: the steps below fill NaNs and overwrite the
    # 'name' / 'text' columns, which must not leak back into the caller's frame.
    X = X.copy()
    X.fillna('', inplace=True)
    X['item_description'] = X['item_description'].str.replace('^no description yet$', '', regex=True)

    # Reference: https://www.kaggle.com/c/mercari-price-suggestion-challenge/discussion/50256
    # https://github.com/divyanshjain19/Mercari_Price_Suggestion/blob/master/2_Preprocessing_and_featurizations.ipynb
    X['name'] = X['name'] + " " + X['brand_name']
    X['text'] = X['item_description'] + " " + X['name'] + " " + X['category_name']

    # Preprocessed training frame; its 'name', 'text', 'shipping' and
    # 'item_condition_id' columns are used to fit the encoders below.
    df_train = joblib.load(train_frame_path)

    # Ref: AAIC Notebook for Donors' Choose
    def decontracted(phrase):
        '''
        Description -> Replaces contracted forms (e.g. "don't") with their
                      expanded form (e.g. "do not") using regex substitutions.
        '''
        replacements = (
            (r"aren\'t", "are not"),
            (r"didn\'t", "did not"),
            (r"can\'t", "can not"),
            (r"couldn\'t", "could not"),
            (r"won\'t", "would not"),
            (r"wouldn\'t", "would not"),
            (r"haven\'t", "have not"),
            (r"shouldn\'t", "should not"),
            (r"doesn\'t", "does not"),
            (r"don\'t", "do not"),
            (r"mustn\'t", "must not"),
            (r"needn\'t", "need not"),
        )
        for pattern, expanded in replacements:
            phrase = re.sub(pattern, expanded, phrase)
        return phrase

    # NOTE(review): decontracted() is currently never applied. The two
    # `.apply(decontracted)` calls used to sit after the helper's `return` and
    # were therefore unreachable, so this pipeline has always fed the models
    # text WITHOUT contraction expansion. They are deliberately left disabled:
    # enabling them changes the model inputs. TODO confirm what the training
    # preprocessing did before wiring this in (also note "won't" is mapped to
    # "would not", not "will not").

    # Anything that is not a letter, digit or dot becomes a space.
    regex_special_chars = re.compile('[^A-Za-z0-9.]+')
    # A dot with no digit directly before it and no digit directly after it
    # (dots next to a digit, such as in "3.5", are kept).
    regex_decimal_digits = re.compile(r'(?<!\d)\.(?!\d)')
    regex_white_space = re.compile(r'\s+')

    # Negations carry sentiment, so "no", "nor" and "not" are kept.
    # Reference : https://stackabuse.com/removing-stop-words-from-strings-in-python/
    all_stopwords = set(stopwords.words("english")) - {"no", "nor", "not"}

    # Reference: https://www.geeksforgeeks.org/python-lemmatization-with-nltk/
    # Built once and reused for every sentence.
    lemmatizer = WordNetLemmatizer()

    def perform_lemma(sent, all_stopwords):
        '''
        Description -> Drops stop words from the whitespace-split sentence and
                      lemmatizes the remaining words with WordNet.
        '''
        kept = [lemmatizer.lemmatize(word) for word in sent.split() if word not in all_stopwords]
        return " ".join(kept)

    def process_text_data(sent):
        '''
        Description -> Cleans one text value: strips escaped line breaks and
                      special characters, normalizes whitespace, lower-cases,
                      then removes stop words and lemmatizes.
        '''
        # Literal two-character sequences backslash-r / backslash-n.
        sent = sent.replace('\\r', ' ')
        sent = sent.replace('\\n', ' ')

        sent = regex_special_chars.sub(' ', sent)
        sent = regex_decimal_digits.sub(' ', sent)

        # https://bobbyhadz.com/blog/python-remove-whitespace-regex
        sent = regex_white_space.sub(' ', sent)

        sent = sent.strip().lower()
        return perform_lemma(sent, all_stopwords)

    X['name'] = X['name'].apply(process_text_data)
    X['text'] = X['text'].apply(process_text_data)

    def tfidf_transform(train_text, test_text, max_ngram):
        '''
        Description -> Fits a TF-IDF vectorizer (1..max_ngram word n-grams,
                      at most 100000 features) on train_text and returns the
                      transformed test_text as a sparse matrix.
        '''
        vectorizer = TfidfVectorizer(max_features=100000,
                                     ngram_range=(1, max_ngram),
                                     strip_accents='unicode',
                                     analyzer='word',
                                     token_pattern=r'\w+')
        vectorizer.fit(train_text)
        transformed = vectorizer.transform(test_text)
        # Free the fitted vocabulary before the next large allocation.
        del vectorizer
        gc.collect()
        return transformed

    X_test_name = tfidf_transform(df_train['name'], X['name'], 1)
    X_test_text = tfidf_transform(df_train['text'], X['text'], 2)

    def one_hot_encoder_testdata(train_data, test_data):
        '''
        Description -> Fits a OneHotEncoder on train_data and returns the
                      encoded test_data.
        '''
        ohe_encoder = OneHotEncoder()
        ohe_encoder.fit(train_data)
        return ohe_encoder.transform(test_data)

    def as_column(values):
        # OneHotEncoder expects a 2-D (n_samples, 1) array.
        return np.reshape(values, (-1, 1))

    X_test_shipping = one_hot_encoder_testdata(as_column(df_train['shipping'].values),
                                               as_column(X['shipping'].values))
    X_test_item_condition = one_hot_encoder_testdata(as_column(df_train['item_condition_id'].values),
                                                     as_column(X['item_condition_id'].values))

    # Column order must stay: name TF-IDF, text TF-IDF, shipping, item condition.
    testframe = hstack((X_test_name,
                        X_test_text,
                        X_test_shipping,
                        X_test_item_condition)).tocsr().astype('float32')

    loaded_model1 = tf.keras.models.load_model(model1_path)
    loaded_model2 = tf.keras.models.load_model(model2_path)

    # First output column of each model, mapped through expm1.
    y_pred_test_model1 = np.expm1(loaded_model1.predict(testframe)[:, 0])
    y_pred_test_model2 = np.expm1(loaded_model2.predict(testframe)[:, 0])

    # Blend weight for model 1 (value taken "from training").
    wmin = 0.405
    final_predictions_test = wmin * y_pred_test_model1 + (1 - wmin) * y_pred_test_model2
    print(final_predictions_test)

    return final_predictions_test


### Function -- 2


In [None]:
def function2(X,y_true):
    """
    Description -> Scores the pipeline on labelled raw data.

    X is the raw input passed straight to function1; y_true holds the target
    values for the same rows. Returns the square root of the mean squared
    logarithmic error (RMSLE) between y_true and function1's predictions.
    """
    predicted_prices = function1(X)
    msle = mean_squared_log_error(y_true, predicted_prices)
    return np.sqrt(msle)


## Test on Large Set

In [None]:
# Load the full tab-separated train.tsv from Drive and predict a price for
# every row. NOTE(review): despite the variable name, this is the train file.
df_test= pd.read_csv('/content/gdrive/MyDrive/train.tsv', sep='\t')
predicted_price = function1(df_test)

[ 9.894726 56.904846 10.494579 ... 15.027779 14.315481 25.384705]


In [None]:
# Peek at predictions 2..10; `X` only exists inside function1, so the
# notebook-level name to inspect is `predicted_price`.
predicted_price[1:10]

array([56.904846, 10.494579, 29.515343, 23.389828, 69.261856, 40.889202,
        7.286416, 19.687225, 11.361588], dtype=float32)

In [None]:
# Score the full-file predictions against the 'price' column.
price_original = df_test['price']
print("RMSLE is = ", function2(X=df_test, y_true=price_original))

[ 9.894726 54.59494   9.135599 ... 15.027779 14.315481 25.384705]
RMSLE is =  0.28858137497814784


## Test on Small Set

In [None]:
# Small-sample check: reload the same file, keep only rows 20-29 and predict
# their prices.
df_test= pd.read_csv('/content/gdrive/MyDrive/train.tsv', sep='\t')[20:30]
predicted_price = function1(df_test)

[ 13.916389   22.321388  301.16        7.9210873  10.539222   75.43791
  18.355772   28.235123   12.587719   15.591455 ]


In [None]:
# Score the 10-row sample against its 'price' column.
price_original = df_test['price']
print("RMSLE is = ", function2(X=df_test, y_true=price_original))

[ 13.930695   23.001     277.09552     7.9210873  10.628723   64.76397
  18.21217    26.504126   11.667847   14.220484 ]
RMSLE is =  0.28542474902077025
