### Importing all the required libraries

In [None]:
# Mount Google Drive so that the files under /content/gdrive/MyDrive
# (test TSV and joblib artifacts used below) are reachable from this runtime.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
!pip install transformers
!pip install Afinn

In [None]:
import math
import numpy as np
import pandas as pd
import nltk
import string
import re
import scipy
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from collections import Counter
from transformers import pipeline
from textblob import TextBlob
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_regression
from afinn import Afinn
import matplotlib.pyplot as plt
import pylab as plot
import seaborn as sn
import joblib
import warnings
from sklearn.preprocessing import OneHotEncoder
import seaborn as sns
from tqdm import tqdm_notebook, tqdm
# Silence every warning for the rest of the notebook. This also hides
# deprecation notices, so disable it when debugging.
warnings.filterwarnings('ignore')
# Fetch the NLTK resources at runtime. 'stopwords' feeds the stop-word set built below;
# 'wordnet' / 'omw-1.4' are presumably what WordNetLemmatizer needs -- TODO confirm;
# 'vader_lexicon' belongs to the imported SentimentIntensityAnalyzer.
nltk.download('wordnet')
nltk.download('vader_lexicon')
nltk.download('stopwords')
nltk.download('omw-1.4')
import gc
from scipy.sparse import hstack
from nltk.corpus import stopwords

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


### Data vectorization and other preprocessing for the test data

In [None]:
# Load the raw test split from Drive; the file is tab-separated, hence sep='\t'.
df_test= pd.read_csv('/content/gdrive/MyDrive/test_stg2.tsv', sep='\t')

In [None]:
# Replace every missing value (in all columns) with an empty string so the
# string concatenations in the next cell never propagate NaN.
df_test.fillna('', inplace=True)
# Blank out descriptions that consist solely of the placeholder text.
# NOTE(review): the pattern is case-sensitive and runs before any lowercasing, so a
# capitalised placeholder would not match -- confirm the casing used in the raw data
# and keep it identical to what was done for the training frame.
df_test['item_description']  = df_test['item_description'].str.replace('^no description yet$', '', regex=True)

In [None]:
# Fold the brand into the product name, then build one combined text field:
# description + (name with brand) + category.
# NOTE: 'name' is overwritten in place, so re-running this cell appends the brand a second time.
df_test['name'] = df_test['name'] + " " + df_test['brand_name']
df_test['text'] = df_test['item_description'] + " " + df_test['name'] + " " + df_test['category_name']

In [None]:
# Load the cached training frame; its 'name', 'text', 'shipping' and 'item_condition_id'
# columns are what the vectorizers/encoders below are fitted on.
# Presumably it was preprocessed with the same steps as df_test -- TODO confirm.
# NOTE(review): joblib.load unpickles the file, so only load artifacts from a trusted location.
df_train = joblib.load('/content/gdrive/MyDrive/binary_files/df_train_26Jan23_v1.joblib')

In [None]:
# Ref: AAIC Notebook for Donors' Choose
def decontracted(phrase):
    '''
    Description -> Expands common negative contractions to their full form
                   (e.g. "won't" -> "will not", "can't" -> "can not") so that
                   the text data is uniform.

    Parameters:
        phrase (str): raw text.

    Returns:
        str: the text with every listed contraction replaced by its expansion.

    Matching is case-sensitive and only recognises the straight apostrophe (').
    NOTE: any text that is fed to the same fitted vectorizer (e.g. the training
    data) should be expanded with this same mapping to avoid train/test skew.
    '''
    # (contraction, expansion) pairs, applied in order. The contractions are plain
    # literals, so str.replace is sufficient and no regex is required.
    replacements = (
        ("aren't", "are not"),
        ("didn't", "did not"),
        ("can't", "can not"),
        ("couldn't", "could not"),
        ("won't", "will not"),      # fixed: was wrongly expanded to "would not"
        ("wouldn't", "would not"),
        ("haven't", "have not"),
        ("shouldn't", "should not"),
        ("doesn't", "does not"),
        ("don't", "do not"),
        ("mustn't", "must not"),
        ("needn't", "need not"),
    )
    for contraction, expansion in replacements:
        phrase = phrase.replace(contraction, expansion)

    return phrase

In [None]:
# Expand contractions in both text columns before any further cleaning.
# The function is passed directly; no wrapper lambda is needed.
df_test['name'] = df_test['name'].apply(decontracted)
df_test['text'] = df_test['text'].apply(decontracted)

In [None]:
#Reference: https://www.geeksforgeeks.org/python-lemmatization-with-nltk/
#Lemmatization groups the different inflected forms of a word so they can be analysed as a single item
def perform_lemma(sent,all_stopwords):
    '''
    Description -> Splits the sentence on whitespace, discards every token that is present in
                   all_stopwords, lemmatizes the remaining tokens with the WordNet lemmatizer
                   and returns them joined by single spaces.
    '''
    lemmatizer = WordNetLemmatizer()
    kept_lemmas = []
    for token in sent.split():
        # stop words are dropped before lemmatization
        if token in all_stopwords:
            continue
        kept_lemmas.append(lemmatizer.lemmatize(token))
    return " ".join(kept_lemmas)

In [None]:
# Any run of characters other than ASCII letters, digits or '.'.
regex_special_chars = re.compile(r'[^A-Za-z0-9.]+')
# A '.' that has no digit immediately before it and no digit immediately after it,
# i.e. punctuation rather than part of a number such as 1.5.
# Raw string: '\d' inside a normal string literal is an invalid escape sequence.
regex_decimal_digits = re.compile(r'(?<!\d)\.(?!\d)')
# Any run of whitespace.
regex_white_space = re.compile(r'\s+')

# The buyer's positive / negative emotion matters, so the negation words are kept
# out of the stop-word list.
# Reference : https://stackabuse.com/removing-stop-words-from-strings-in-python/
# A set is used (rather than a list with .remove() calls) so membership tests are cheap.
all_stopwords = set(stopwords.words("english")).difference(("no", "nor", "not"))


def process_text_data(sent):
    '''
    Description -> Cleans a single text value and returns the preprocessed string.
                   Steps, in order:
                   1. the literal two-character sequences "backslash r" and "backslash n" become a space
                   2. every run of characters other than letters, digits and '.' becomes a space
                   3. every '.' that is not adjacent to a digit becomes a space
                   4. whitespace runs are collapsed, the ends are trimmed, the text is lowercased
                   5. stop words are removed and the remaining words are lemmatized
    '''
    # Escaped line breaks that are stored as text
    cleaned = sent.replace('\\r', ' ').replace('\\n', ' ')

    # special characters -> stray periods -> whitespace runs, each replaced by a single space
    # https://bobbyhadz.com/blog/python-remove-whitespace-regex
    for pattern in (regex_special_chars, regex_decimal_digits, regex_white_space):
        cleaned = pattern.sub(' ', cleaned)

    # lowercase first so the tokens can match the (lowercase) stop-word set
    return perform_lemma(cleaned.strip().lower(), all_stopwords)

In [None]:
# Run the full cleaning pipeline over both text columns.
df_test['name'] = df_test['name'].apply(process_text_data)
df_test['text'] = df_test['text'].apply(process_text_data)

In [None]:
def text_encoder_testdata(train_data,test_data,type,params):
    '''
    Description -> Fits a text vectorizer on the train data and returns the transformed test data.

    Parameters:
        train_data: iterable of documents the vectorizer is fitted on.
        test_data : iterable of documents to transform.
        type      : "BOW", "TFIDF" or "CNTVECT" (the name shadows the builtin; kept for callers).
        params    : depends on type
                    "BOW"     -> sequence (ngram_range, min_df, max_df, max_features)
                    "TFIDF"   -> int, the largest n-gram size (ngram_range = (1, params))
                    "CNTVECT" -> the fixed vocabulary handed to CountVectorizer

    Returns:
        (test_transform, feat_names): the transformed test data and, for "TFIDF" only,
        the fitted feature names; feat_names is '' for "BOW" and "CNTVECT".

    Raises:
        ValueError: if type is not one of the supported encodings.
    '''
    # Fail fast with a clear message; an unsupported type used to surface later as an
    # UnboundLocalError on `vectorizer`.
    if type not in ("BOW", "TFIDF", "CNTVECT"):
        raise ValueError(f"Unsupported encoder type {type!r}; expected 'BOW', 'TFIDF' or 'CNTVECT'")

    if type == "BOW":
        vectorizer = CountVectorizer(ngram_range = params[0],min_df = params[1],max_df = params[2],max_features = params[3])
    elif type == "TFIDF":
        vectorizer = TfidfVectorizer(max_features = 100000,
                                     ngram_range = (1, params),
                                     strip_accents = 'unicode',
                                     analyzer = 'word',
                                     token_pattern = r'\w+')
    else:  # "CNTVECT"
        vectorizer = CountVectorizer(vocabulary=params, lowercase=False, binary=True)

    #Vectorize on train data and transform on test data
    vectorizer.fit(train_data)
    test_transform = vectorizer.transform(test_data)

    if type == "TFIDF":
        feat_names = vectorizer.get_feature_names_out()
        # release the fitted vectorizer before returning
        del vectorizer
        gc.collect()
        return test_transform, feat_names

    return test_transform, ''

In [None]:
# TF-IDF features fitted on the training columns: params=1 -> unigrams for the name,
# params=2 -> unigrams + bigrams for the combined text. The returned feature names are unused.
X_test_name, _ = text_encoder_testdata(
    train_data=df_train['name'], test_data=df_test['name'], type="TFIDF", params=1)
X_test_text, _ = text_encoder_testdata(
    train_data=df_train['text'], test_data=df_test['text'], type="TFIDF", params=2)

In [None]:
# Sanity check: both matrices should have one row per test record.
X_test_name.shape,X_test_text.shape

((3460725, 85394), (3460725, 100000))

In [None]:
def one_hot_encoder_testdata(train_data,test_data,handle_unknown='error'):
    '''
    Description -> Fits a OneHotEncoder on the train data and returns the one-hot encoded test data.

    Parameters:
        train_data    : 2-D array-like the encoder is fitted on.
        test_data     : 2-D array-like to transform.
        handle_unknown: forwarded to OneHotEncoder. The default 'error' keeps the previous
                        behaviour (a category absent from train_data raises); pass 'ignore'
                        to encode such categories as all zeros instead.

    Returns:
        The transformed test data as returned by OneHotEncoder.transform.
    '''
    ohe_encoder = OneHotEncoder(handle_unknown=handle_unknown)
    #Vectorize on train data and transform on test data
    ohe_encoder.fit(train_data)
    return ohe_encoder.transform(test_data)

In [None]:
# The encoder expects 2-D input, so each column is reshaped to (n_rows, 1).
X_test_shipping = one_hot_encoder_testdata(
    df_train['shipping'].values.reshape(-1, 1),
    df_test['shipping'].values.reshape(-1, 1),
)
X_test_item_condition = one_hot_encoder_testdata(
    df_train['item_condition_id'].values.reshape(-1, 1),
    df_test['item_condition_id'].values.reshape(-1, 1),
)

In [None]:
# Column layout: name TF-IDF | text TF-IDF | shipping one-hot | item-condition one-hot.
# Presumably this must match the layout of the training matrix -- TODO confirm.
testframe = hstack([
    X_test_name,
    X_test_text,
    X_test_shipping,
    X_test_item_condition,
]).tocsr().astype('float32')

In [None]:
# Rows = test records; columns = combined width of the four stacked feature blocks.
testframe.shape

(3460725, 185401)

In [None]:
# Persist the stacked sparse test matrix to Drive.
joblib.dump(testframe, '/content/gdrive/MyDrive/binary_files/testframe_26Jan23_v1.joblib')


['/content/gdrive/MyDrive/binary_files/testframe_26Jan23_v1.joblib']

In [None]:
# Persist the preprocessed test DataFrame (cleaned 'name' / 'text' columns) next to the matrix.
joblib.dump(df_test, '/content/gdrive/MyDrive/binary_files/df_test_26Jan23_v1.joblib')

['/content/gdrive/MyDrive/binary_files/df_test_26Jan23_v1.joblib']