## Import libraries

In [1]:
! pip install wordninja

Collecting wordninja
  Downloading wordninja-2.0.0.tar.gz (541 kB)
[K     |████████████████████████████████| 541 kB 605 kB/s 
[?25hBuilding wheels for collected packages: wordninja
  Building wheel for wordninja (setup.py) ... [?25l- \ done
[?25h  Created wheel for wordninja: filename=wordninja-2.0.0-py3-none-any.whl size=541553 sha256=845a92f607cba7301cf2bd738e255369165f055e717d0d2cdf04e6eaa549e788
  Stored in directory: /root/.cache/pip/wheels/dd/3f/eb/a2692e3d2b9deb1487b09ba4967dd6920bd5032bfd9ff7acfc
Successfully built wordninja
Installing collected packages: wordninja
Successfully installed wordninja-2.0.0


In [2]:
import gc
import pickle
import operator
import itertools
import numpy as np
import pandas as pd
from tqdm import tqdm

import re
import nltk
import spacy
import string
import wordninja
from textblob import TextBlob
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tqdm.pandas()

## Load source datasets

In [3]:
train_df = pd.read_csv("../input/mh-sentiment-classification/train.csv")
train_df = train_df[~((train_df['Review']=='10-Oct')|(train_df['Review']=='0'))].copy()
train_df.drop(['author'], inplace=True, axis=1)
train_df.set_index("ID", inplace=True)
print(f"train_df: {train_df.shape}")
train_df.head()

train_df: (44095, 2)


Unnamed: 0_level_0,Review,Sentiment
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
39467,Today I'm working on my &quot;Quirky Q&quot; c...,2
30154,@ShannonElizab dont ya know? people love the h...,1
16767,ughhh rejected from the 09 mediation program. ...,0
9334,@petewentz im so jealous. i want an octo drive,0
61178,I remember all the hype around this movie when...,0


In [4]:
test_df = pd.read_csv("../input/mh-sentiment-classification/test.csv")
test_df.drop(['author'], inplace=True, axis=1)
test_df.set_index("ID", inplace=True)
print(f"test_df: {test_df.shape}")
test_df.head()

test_df: (18900, 1)


Unnamed: 0_level_0,Review
ID,Unnamed: 1_level_1
29536,@amyswarren ahhh yay! I'm getting into it. Kno...
13442,@DeliverImHungry You are right. If you ordere...
54697,I'd heard a lot of bad things about this film ...
7007,"I miss the old... HA, HA. I can't tell that pe..."
34410,@dharshana anytime


## Extract target label

In [5]:
train_df.groupby(['Sentiment']).size().reset_index().rename(columns={0:'Count'})

Unnamed: 0,Sentiment,Count
0,0,19298
1,1,6066
2,2,18731


In [6]:
temp_df = train_df.groupby(['Sentiment']).size().reset_index().rename(columns={0:'count'})
total_count = np.sum(temp_df['count'].values)
temp_df['class%'] = (temp_df['count'] / total_count) * 100
lowest_pct = min(temp_df['class%'])
temp_df['class_weight'] = lowest_pct / temp_df['class%']
class_weight = temp_df[['Sentiment', 'class_weight']].to_dict()['class_weight']
class_weight

{0: 0.3143330915120738, 1: 1.0, 2: 0.3238481661416902}

In [7]:
Ytrain = train_df['Sentiment'].values
Ytrain_oh = pd.get_dummies(train_df['Sentiment']).values
train_df.drop(['Sentiment'], inplace=True, axis=1)
print(f"Ytrain: {Ytrain.shape} \nYtrain_oh: {Ytrain_oh.shape}")

Ytrain: (44095,) 
Ytrain_oh: (44095, 3)


## Combine reviews from train & test datasets

In [8]:
combined_df = train_df.append(test_df, sort=False, ignore_index=False)

del train_df
del test_df
gc.collect()

combined_df.head()

Unnamed: 0_level_0,Review
ID,Unnamed: 1_level_1
39467,Today I'm working on my &quot;Quirky Q&quot; c...
30154,@ShannonElizab dont ya know? people love the h...
16767,ughhh rejected from the 09 mediation program. ...
9334,@petewentz im so jealous. i want an octo drive
61178,I remember all the hype around this movie when...


## Extract statistical features

### Helper Functions

In [9]:
def contraction_count(sent):
    count = 0
    count += re.subn(r"won\'t", '', sent)[1]
    count += re.subn(r"can\'t", '', sent)[1]
    count += re.subn(r"n\'t", '', sent)[1]
    count += re.subn(r"\'re", '', sent)[1]
    count += re.subn(r"\'s", '', sent)[1]
    count += re.subn(r"\'d", '', sent)[1]
    count += re.subn(r"\'ll", '', sent)[1]
    count += re.subn(r"\'t", '', sent)[1]
    count += re.subn(r"\'ve", '', sent)[1]
    count += re.subn(r"\'m", '', sent)[1]
    return count

In [10]:
def pos_count(sent):
    nn_count = 0   #Noun
    pr_count = 0   #Pronoun
    vb_count = 0   #Verb
    jj_count = 0   #Adjective
    uh_count = 0   #Interjection
    cd_count = 0   #Numerics
    
    sent = nltk.word_tokenize(sent)
    sent = nltk.pos_tag(sent)

    for token in sent:
        if token[1] in ['NN','NNP','NNS']:
            nn_count += 1

        if token[1] in ['PRP','PRP$']:
            pr_count += 1

        if token[1] in ['VB','VBD','VBG','VBN','VBP','VBZ']:
            vb_count += 1

        if token[1] in ['JJ','JJR','JJS']:
            jj_count += 1

        if token[1] in ['UH']:
            uh_count += 1

        if token[1] in ['CD']:
            cd_count += 1
    
    return pd.Series([nn_count, pr_count, vb_count, jj_count, uh_count, cd_count])

In [11]:
def dialog_parser(text):
    
    tokenized = nltk.word_tokenize(text)
    
    # let's set up some lists to hold our pieces of narrative and dialog
    parsed_dialog = []
    parsed_narrative = []
    
    # and this list will be a bucket for the text we're currently exploring
    current = []

    # now let's set up values that will help us loop through the text
    length = len(tokenized)
    found_q = False
    counter = 0
    quote_open, quote_close = '``', "''"

    # now we'll start our loop saying that as long as our sentence is...
    while counter < length:
        word = tokenized[counter]

        # until we find a quotation mark, we're working with narrative
        if quote_open not in word and quote_close not in word:
            current.append(word)

        # here's what we do when we find a closed quote
        else:
            # we append the narrative we've collected & clear our our
            # current variable
            parsed_narrative.append(current)
            current = []
            
            # now current is ready to hold dialog and we're working on
            # a piece of dialog
            current.append(word)
            found_q = True

            # while we're in the quote, we're going to increment the counter
            # and append to current in this while loop
            while found_q and counter < length-1:
                counter += 1
                if quote_close not in tokenized[counter]:
                    current.append(tokenized[counter])
                else:
                    # if we find a closing quote, we add our dialog to the
                    # appropriate list, clear current and flip our found_q
                    # variable to False
                    current.append(tokenized[counter])
                    parsed_dialog.append(current)
                    current = []
                    found_q = False

        # increment the counter to move us through the text
        counter += 1
    
    if len(parsed_narrative) == 0:
        parsed_narrative.append(current)
    
    mean_dialog_word_len = 0
    
    if len(parsed_dialog) > 0:
        for text in parsed_dialog:
            join_text = " ".join(text)
            join_text = join_text.replace('"','')
            join_text = join_text.replace("''","")
            mean_dialog_word_len += len(join_text.split())
        
        mean_dialog_word_len /= float(len(parsed_dialog))
    
    mean_narrative_word_len = 0
    
    if len(parsed_narrative) > 0:
        for text in parsed_narrative:
            join_text = " ".join(text)
            join_text = join_text.replace('"','')
            join_text = join_text.replace("''","")
            mean_narrative_word_len += len(join_text.split())
        
        mean_narrative_word_len /= float(len(parsed_narrative))

    return pd.Series([len(parsed_dialog), len(parsed_narrative), mean_dialog_word_len, mean_narrative_word_len])

In [12]:
combined_df["Review_num_words"] = combined_df["Review"].progress_apply(lambda x: len(str(x).split()))
print("Review_num_words...Completed")

combined_df["Review_num_unique_words"] = combined_df["Review"].progress_apply(lambda x: len(set(str(x).split())))
print("Review_num_unique_words...Completed")

combined_df["Review_num_chars"] = combined_df["Review"].progress_apply(lambda x: len(str(x)))
print("Review_num_chars...Completed")

combined_df["Review_num_stopwords"] = combined_df["Review"].progress_apply(lambda x: len([w for w in str(x).lower().split() if w in stopwords.words('english')]))
print("Review_num_stopwords...Completed")

combined_df["Review_num_@"] = combined_df["Review"].progress_apply(lambda x: len(re.findall("(?<![@\w])@(\w{1,25})", x)))
print("Review_num_@...Completed")

combined_df["Review_num_#"] = combined_df["Review"].progress_apply(lambda x: len(re.findall("(?<![#\w])#(\w{1,25})", x)))
print("Review_num_#...Completed")

combined_df["Review_num_urls"] = combined_df["Review"].progress_apply(lambda x: len(re.findall("https?://\S+|www\.\S+", x)))
print("Review_num_urls...Completed")

combined_df["Review_num_tags"] = combined_df["Review"].progress_apply(lambda x: len(re.findall("<.*?>", x)))
print("Review_num_tags...Completed")

combined_df["Review_num_punctuations"] =combined_df['Review'].progress_apply(lambda x: len([c for c in str(x) if c in list(string.punctuation)]))
print("Review_num_punctuations...Completed")

combined_df["Review_num_words_upper"] = combined_df["Review"].progress_apply(lambda x: len([w for w in str(x).split() if w.isupper()]))
print("Review_num_words_upper...Completed")

combined_df["Review_num_words_title"] = combined_df["Review"].progress_apply(lambda x: len([w for w in str(x).split() if w.istitle()]))
print("Review_num_words_title...Completed")

combined_df["Review_mean_word_len"] = combined_df["Review"].progress_apply(lambda x: np.mean([len(w) for w in str(x).split()]))
print("Review_mean_word_len...Completed")

combined_df["Review_num_paragraphs"] = combined_df["Review"].progress_apply(lambda x: len(x.split('\n')))
print("Review_num_paragraphs...Completed")

combined_df["Review_num_contractions"] = combined_df["Review"].progress_apply(contraction_count)
print("Review_num_contractions...Completed")

combined_df[["Review_num_dialog",
             "Review_num_narrative",
             "Review_dialog_mean_word_len",
             "Review_narrative_mean_word_len"]] = combined_df["Review"].progress_apply(dialog_parser)
print("Dialog Parser...Completed")

combined_df['Review_polarity'] = combined_df['Review'].progress_apply(lambda x: TextBlob(x).sentiment[0])
print("Review_polarity...Completed")

combined_df['Review_subjectivity'] = combined_df['Review'].progress_apply(lambda x: TextBlob(x).sentiment[1])
print("Review_subjectivity...Completed")

combined_df[['nn_count','pr_count','vb_count','jj_count','uh_count','cd_count']] = combined_df['Review'].progress_apply(pos_count)
print("POS Count...Completed")

print(f"\ncombined_df: {combined_df.shape}")
combined_df.head()

100%|██████████| 62995/62995 [00:00<00:00, 126748.38it/s]
 10%|█         | 6363/62995 [00:00<00:00, 63622.48it/s]

Review_num_words...Completed


100%|██████████| 62995/62995 [00:01<00:00, 61199.97it/s]
100%|██████████| 62995/62995 [00:00<00:00, 413080.91it/s]
  0%|          | 0/62995 [00:00<?, ?it/s]

Review_num_unique_words...Completed
Review_num_chars...Completed


100%|██████████| 62995/62995 [11:03<00:00, 94.87it/s] 
  4%|▎         | 2264/62995 [00:00<00:02, 22574.31it/s]

Review_num_stopwords...Completed


100%|██████████| 62995/62995 [00:02<00:00, 21512.83it/s]
  4%|▎         | 2306/62995 [00:00<00:02, 22963.17it/s]

Review_num_@...Completed


100%|██████████| 62995/62995 [00:02<00:00, 21403.11it/s]
 10%|▉         | 6230/62995 [00:00<00:00, 62298.87it/s]

Review_num_#...Completed


100%|██████████| 62995/62995 [00:01<00:00, 62516.63it/s]
 41%|████      | 25571/62995 [00:00<00:00, 255708.41it/s]

Review_num_urls...Completed


100%|██████████| 62995/62995 [00:00<00:00, 247851.33it/s]
  0%|          | 116/62995 [00:00<00:54, 1145.91it/s]

Review_num_tags...Completed


100%|██████████| 62995/62995 [00:40<00:00, 1558.98it/s]
 12%|█▏        | 7553/62995 [00:00<00:00, 75507.21it/s]

Review_num_punctuations...Completed


100%|██████████| 62995/62995 [00:00<00:00, 74525.69it/s]
 11%|█         | 6719/62995 [00:00<00:00, 67158.20it/s]

Review_num_words_upper...Completed


100%|██████████| 62995/62995 [00:00<00:00, 66794.68it/s]
  3%|▎         | 2086/62995 [00:00<00:02, 20850.18it/s]

Review_num_words_title...Completed


100%|██████████| 62995/62995 [00:03<00:00, 19920.84it/s]
100%|██████████| 62995/62995 [00:00<00:00, 412005.86it/s]
  0%|          | 0/62995 [00:00<?, ?it/s]

Review_mean_word_len...Completed
Review_num_paragraphs...Completed


100%|██████████| 62995/62995 [00:01<00:00, 48093.43it/s]
  0%|          | 55/62995 [00:00<01:54, 549.54it/s]

Review_num_contractions...Completed


100%|██████████| 62995/62995 [01:40<00:00, 629.03it/s]
  0%|          | 43/62995 [00:00<02:26, 428.86it/s]

Dialog Parser...Completed


100%|██████████| 62995/62995 [00:56<00:00, 1114.35it/s]
  0%|          | 85/62995 [00:00<01:15, 833.26it/s]

Review_polarity...Completed


100%|██████████| 62995/62995 [00:56<00:00, 1105.96it/s]
  0%|          | 2/62995 [00:00<1:08:48, 15.26it/s]

Review_subjectivity...Completed


100%|██████████| 62995/62995 [07:22<00:00, 142.25it/s]

POS Count...Completed

combined_df: (62995, 27)





Unnamed: 0_level_0,Review,Review_num_words,Review_num_unique_words,Review_num_chars,Review_num_stopwords,Review_num_@,Review_num_#,Review_num_urls,Review_num_tags,Review_num_punctuations,...,Review_dialog_mean_word_len,Review_narrative_mean_word_len,Review_polarity,Review_subjectivity,nn_count,pr_count,vb_count,jj_count,uh_count,cd_count
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
39467,Today I'm working on my &quot;Quirky Q&quot; c...,15,15,83,6,0,0,0,0,5,...,0.0,22.0,0.0,0.0,7,3,4,0,0,0
30154,@ShannonElizab dont ya know? people love the h...,9,9,58,1,1,0,0,0,2,...,0.0,11.0,0.25,0.35,5,0,2,2,0,0
16767,ughhh rejected from the 09 mediation program. ...,8,8,55,2,0,0,0,0,2,...,0.0,10.0,0.0,0.0,3,0,1,0,0,1
9334,@petewentz im so jealous. i want an octo drive,9,9,46,3,1,0,0,0,2,...,0.0,11.0,0.0,0.0,5,0,1,2,0,0
61178,I remember all the hype around this movie when...,574,339,3190,272,0,0,0,18,152,...,67.333333,95.0,0.095471,0.478622,223,40,100,38,0,7


## Load word embeddings

In [13]:
with open("../input/nlp-word-embeddings/Glove_Embeddings.txt", 'rb') as handle: 
    data = handle.read()

processed_data = pickle.loads(data)
glove_embeddings_index = processed_data['glove_embeddings_index']
print(f'Glove Word vectors found: {len(glove_embeddings_index)}')

del processed_data
gc.collect()

Glove Word vectors found: 2196017


26

In [14]:
with open("../input/nlp-word-embeddings/FastText_Embeddings.txt", 'rb') as handle: 
    data = handle.read()

processed_data = pickle.loads(data)
fasttext_embeddings_index = processed_data['fasttext_embeddings_index']
print(f'FastText Word vectors found: {len(fasttext_embeddings_index)}')

del processed_data
gc.collect()

FastText Word vectors found: 1000000


0

In [15]:
with open("../input/nlp-word-embeddings/Para_Embeddings.txt", 'rb') as handle: 
    data = handle.read()

processed_data = pickle.loads(data)
para_embeddings_index = processed_data['para_embeddings_index']
print(f'Paragram Word vectors found: {len(para_embeddings_index)}')

del processed_data
gc.collect()

Paragram Word vectors found: 1703755


0

## Text preprocessing

### Helper Functions

In [16]:
def build_vocab(texts):
    sentences = texts.apply(lambda x: x.split()).values
    vocab = {}
    for sentence in tqdm(sentences):
        for word in sentence:
            try:
                vocab[word] += 1
            except KeyError:
                vocab[word] = 1
    return vocab

In [17]:
def check_coverage(vocab, embeddings_index):
    known_words = {}
    unknown_words = {}
    nb_known_words = 0
    nb_unknown_words = 0
    
    for word in vocab.keys():
        try:
            known_words[word] = embeddings_index[word]
            nb_known_words += vocab[word]
        except:
            unknown_words[word] = vocab[word]
            nb_unknown_words += vocab[word]
            pass

    print('Found embeddings for {:.2%} of vocab'.format(len(known_words) / len(vocab)))
    print('Found embeddings for  {:.2%} of all text'.format(nb_known_words / (nb_known_words + nb_unknown_words)))
    
    unknown_words = sorted(unknown_words.items(), key=operator.itemgetter(1))[::-1]
    return unknown_words

In [18]:
def add_lower(embedding, vocab):
    count = 0
    
    for word in vocab:
        if word in embedding and word.lower() not in embedding:  
            embedding[word.lower()] = embedding[word]
            count += 1
            
    print(f"Added {count} words to embedding")
    return embedding

In [19]:
def clean_contractions(text):
    specials = ["’", "‘", "´", "`"]
    mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have" }

    for s in specials:
        text = text.replace(s, "'")
        
    text = ' '.join([mapping[t] if t in mapping else t for t in text.split(" ")])
    return text

In [20]:
def unknown_punct(embed):
    punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\×™√²—–&'
    unknown = ''
    
    for p in punct:
        if p not in embed:
            unknown += p
            unknown += ' '
            
    return unknown

In [21]:
def clean_special_chars(text):
    punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\×™√²—–&'
    mapping = {"‘": "'", "₹": "e", "´": "'", "°": "", "€": "e", "™": "tm", "√": " sqrt ", "×": "x", "²": "2", "—": "-", "–": "-", "’": "'", "_": "-", "`": "'", '“': '"', '”': '"', '“': '"', "£": "e", '∞': 'infinity', 'θ': 'theta', '÷': '/', 'α': 'alpha', '•': '.', 'à': 'a', '−': '-', 'β': 'beta', '∅': '', '³': '3', 'π': 'pi', }    
    
    for p in mapping:
        text = text.replace(p, mapping[p])
    
    for p in punct:
        text = text.replace(p, f' {p} ')
    
    specials = {'\u200b': ' ', '…': ' ... ', '\ufeff': '', 'करना': '', 'है': '', '':'', '·':'', '¿':'', '¨':'', '»':'', '«':''}
    for s in specials:
        text = text.replace(s, specials[s])
    
    return text

In [22]:
def sep_combined(text, glove_embed, fasttext_embed, para_embed):
    new_text = []
    
    for word in text.split():
        if not((word in glove_embed) or (word in fasttext_embed) or (word in para_embed)):
            wn_token = wordninja.split(word)
        
            if len(wn_token) > 1:
                for w in wn_token:
                    new_text.append(w)
            else:
                new_text.append(word)
        
        else:
            new_text.append(word)
    
    return " ".join(new_text)

### Get initial word coverage

In [23]:
vocab = build_vocab(combined_df['Review'])

print("\nGlove : ")
oov_glove = check_coverage(vocab, glove_embeddings_index)

print("\nParagram : ")
oov_paragram = check_coverage(vocab, para_embeddings_index)

print("\nFastText : ")
oov_fasttext = check_coverage(vocab, fasttext_embeddings_index)

100%|██████████| 62995/62995 [00:01<00:00, 33848.62it/s]



Glove : 
Found embeddings for 30.39% of vocab
Found embeddings for  87.34% of all text

Paragram : 
Found embeddings for 18.63% of vocab
Found embeddings for  77.71% of all text

FastText : 
Found embeddings for 28.64% of vocab
Found embeddings for  86.14% of all text


In [24]:
print("Glove : ")
glove_embeddings_index = add_lower(glove_embeddings_index, vocab)

print("\nParagram : ")
para_embeddings_index = add_lower(para_embeddings_index, vocab)

print("\nFastText : ")
fasttext_embeddings_index = add_lower(fasttext_embeddings_index, vocab)

Glove : 
Added 5126 words to embedding

Paragram : 
Added 4 words to embedding

FastText : 
Added 9876 words to embedding


In [25]:
combined_df['processed_review'] = combined_df['Review'].progress_apply(lambda x: x.lower())
vocab = build_vocab(combined_df['processed_review'])

print("\nGlove : ")
oov_glove = check_coverage(vocab, glove_embeddings_index)

print("\nParagram : ")
oov_paragram = check_coverage(vocab, para_embeddings_index)

print("\nFastText : ")
oov_fasttext = check_coverage(vocab, fasttext_embeddings_index)

100%|██████████| 62995/62995 [00:00<00:00, 419258.02it/s]
100%|██████████| 62995/62995 [00:01<00:00, 34312.94it/s]



Glove : 
Found embeddings for 27.67% of vocab
Found embeddings for  87.47% of all text

Paragram : 
Found embeddings for 27.89% of vocab
Found embeddings for  87.49% of all text

FastText : 
Found embeddings for 26.26% of vocab
Found embeddings for  86.26% of all text


### Remove contractions

In [26]:
combined_df['processed_review'] = combined_df['processed_review'].progress_apply(lambda x: clean_contractions(x))
vocab = build_vocab(combined_df['processed_review'])

print("\nGlove : ")
oov_glove = check_coverage(vocab, glove_embeddings_index)

print("\nParagram : ")
oov_paragram = check_coverage(vocab, para_embeddings_index)

print("\nFastText : ")
oov_fasttext = check_coverage(vocab, fasttext_embeddings_index)

100%|██████████| 62995/62995 [00:01<00:00, 38431.81it/s]
100%|██████████| 62995/62995 [00:01<00:00, 34341.98it/s]



Glove : 
Found embeddings for 27.70% of vocab
Found embeddings for  88.10% of all text

Paragram : 
Found embeddings for 27.93% of vocab
Found embeddings for  88.11% of all text

FastText : 
Found embeddings for 26.29% of vocab
Found embeddings for  87.88% of all text


### Handle punctuations

In [27]:
print("Glove :")
print(unknown_punct(glove_embeddings_index))

print("\nParagram :")
print(unknown_punct(para_embeddings_index))

print("\nFastText :")
print(unknown_punct(fasttext_embeddings_index))

Glove :
₹ 

Paragram :
₹ 

FastText :
_ ` 


In [28]:
combined_df['processed_review'] = combined_df['processed_review'].progress_apply(lambda x: clean_special_chars(x))
vocab = build_vocab(combined_df['processed_review'])

print("\nGlove : ")
oov_glove = check_coverage(vocab, glove_embeddings_index)

print("\nParagram : ")
oov_paragram = check_coverage(vocab, para_embeddings_index)

print("\nFastText : ")
oov_fasttext = check_coverage(vocab, fasttext_embeddings_index)

100%|██████████| 62995/62995 [00:03<00:00, 20991.39it/s]
100%|██████████| 62995/62995 [00:01<00:00, 34482.97it/s]



Glove : 
Found embeddings for 71.74% of vocab
Found embeddings for  99.47% of all text

Paragram : 
Found embeddings for 74.31% of vocab
Found embeddings for  99.52% of all text

FastText : 
Found embeddings for 64.77% of vocab
Found embeddings for  99.31% of all text


### Separate combined words

In [29]:
combined_df['processed_review'] = combined_df['processed_review'].progress_apply(lambda x: sep_combined(x, glove_embeddings_index, 
                                                                                                        fasttext_embeddings_index, 
                                                                                                        para_embeddings_index))
vocab = build_vocab(combined_df['processed_review'])

print("\nGlove : ")
oov_glove = check_coverage(vocab, glove_embeddings_index)

print("\nParagram : ")
oov_paragram = check_coverage(vocab, para_embeddings_index)

print("\nFastText : ")
oov_fasttext = check_coverage(vocab, fasttext_embeddings_index)

100%|██████████| 62995/62995 [00:06<00:00, 10469.68it/s]
100%|██████████| 62995/62995 [00:01<00:00, 32733.40it/s]



Glove : 
Found embeddings for 95.67% of vocab
Found embeddings for  99.93% of all text

Paragram : 
Found embeddings for 99.26% of vocab
Found embeddings for  99.99% of all text

FastText : 
Found embeddings for 85.65% of vocab
Found embeddings for  99.75% of all text


### Remove numbers

In [30]:
combined_df['processed_review'] = combined_df['processed_review'].progress_apply(lambda x: re.sub('[0-9]+', '', x))
vocab = build_vocab(combined_df['processed_review'])

print("\nGlove : ")
oov_glove = check_coverage(vocab, glove_embeddings_index)

print("\nParagram : ")
oov_paragram = check_coverage(vocab, para_embeddings_index)

print("\nFastText : ")
oov_fasttext = check_coverage(vocab, fasttext_embeddings_index)

100%|██████████| 62995/62995 [00:01<00:00, 47165.64it/s]
100%|██████████| 62995/62995 [00:01<00:00, 34330.61it/s]



Glove : 
Found embeddings for 95.78% of vocab
Found embeddings for  99.93% of all text

Paragram : 
Found embeddings for 99.41% of vocab
Found embeddings for  99.99% of all text

FastText : 
Found embeddings for 85.87% of vocab
Found embeddings for  99.76% of all text


### Miscellaneous

In [31]:
# Remove double spaces
combined_df['processed_review'] = combined_df['processed_review'].progress_apply(lambda x: re.sub('\s+',  ' ', x))

# Remove repetitive characters
combined_df['processed_review'] = combined_df['processed_review'].progress_apply(lambda x: ''.join(''.join(s)[:2] for _, s in itertools.groupby(x)))

vocab = build_vocab(combined_df['processed_review'])

print("\nGlove : ")
oov_glove = check_coverage(vocab, glove_embeddings_index)

print("\nParagram : ")
oov_paragram = check_coverage(vocab, para_embeddings_index)

print("\nFastText : ")
oov_fasttext = check_coverage(vocab, fasttext_embeddings_index)

100%|██████████| 62995/62995 [00:03<00:00, 18752.58it/s]
100%|██████████| 62995/62995 [00:13<00:00, 4751.17it/s]
100%|██████████| 62995/62995 [00:02<00:00, 29182.66it/s]



Glove : 
Found embeddings for 95.77% of vocab
Found embeddings for  99.93% of all text

Paragram : 
Found embeddings for 99.40% of vocab
Found embeddings for  99.99% of all text

FastText : 
Found embeddings for 86.38% of vocab
Found embeddings for  99.76% of all text


In [32]:
MAX_LEN = combined_df['Review_num_words'].max() + 1
text_list = combined_df['processed_review'].tolist()
print(f"Total number of reviews: {len(text_list)} \nMAX_LEN: {MAX_LEN}")

Total number of reviews: 62995 
MAX_LEN: 1840


## Generate word embeddings

In [33]:
def sent2vec(text, embeddings_index):
    words = nltk.word_tokenize(text)
    words = [w for w in words if w.isalpha()]
    
    M = []
    for w in words:
        wn_token = wordninja.split(w)
        
        if len(wn_token) > 1:
            for token in wn_token:
                try:
                    M.append(embeddings_index[token])
                except:
                    continue
        
        else:
            try:
                M.append(embeddings_index[w])
            except:
                continue
    
    M = np.array(M)
    v = M.sum(axis=0)
    if type(v) != np.ndarray:
        return np.zeros(300)
    
    return v / np.sqrt((v ** 2).sum())

In [34]:
glove_vec = [sent2vec(x, glove_embeddings_index) for x in tqdm(combined_df["processed_review"].values)]
col_list = ['glove_'+str(i) for i in range(300)]
glove_vec_df = pd.DataFrame(np.array(glove_vec), columns=col_list, index=combined_df.index)
print(f"glove_vec_df: {glove_vec_df.shape}")

combined_df = pd.merge(combined_df, glove_vec_df, 
                       how="inner", on="ID", sort=False)

del glove_vec, glove_vec_df
gc.collect()

print(f"combined_df: {combined_df.shape}")
combined_df.head()

100%|██████████| 62995/62995 [05:14<00:00, 200.50it/s]


glove_vec_df: (62995, 300)
combined_df: (62995, 328)


Unnamed: 0_level_0,Review,Review_num_words,Review_num_unique_words,Review_num_chars,Review_num_stopwords,Review_num_@,Review_num_#,Review_num_urls,Review_num_tags,Review_num_punctuations,...,glove_290,glove_291,glove_292,glove_293,glove_294,glove_295,glove_296,glove_297,glove_298,glove_299
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
39467,Today I'm working on my &quot;Quirky Q&quot; c...,15,15,83,6,0,0,0,0,5,...,-0.025311,-0.029377,0.01282,-0.036219,0.060338,-0.000761,0.012377,-0.010671,-0.003606,0.027269
30154,@ShannonElizab dont ya know? people love the h...,9,9,58,1,1,0,0,0,2,...,-0.030972,-0.020399,-0.036216,-0.057353,0.076628,0.018635,-0.0471,-0.008823,-0.033038,0.018111
16767,ughhh rejected from the 09 mediation program. ...,8,8,55,2,0,0,0,0,2,...,0.083348,-0.017254,0.028144,-0.01956,0.04573,-0.055121,0.023674,-0.003762,-0.025701,0.018935
9334,@petewentz im so jealous. i want an octo drive,9,9,46,3,1,0,0,0,2,...,0.013177,-0.032647,-0.010192,-0.053474,0.104402,0.00321,0.013642,0.016696,0.031464,0.000176
61178,I remember all the hype around this movie when...,574,339,3190,272,0,0,0,18,152,...,-0.057244,0.004598,-0.008276,-0.028325,0.014552,0.00751,-0.018981,-0.003701,0.003828,0.019957


In [35]:
fasttext_vec = [sent2vec(x, fasttext_embeddings_index) for x in tqdm(combined_df["processed_review"].values)]
col_list = ['fasttext_'+str(i) for i in range(300)]
fasttext_vec_df = pd.DataFrame(np.array(fasttext_vec), columns=col_list, index=combined_df.index)
print(f"fasttext_vec_df: {fasttext_vec_df.shape}")

combined_df = pd.merge(combined_df, fasttext_vec_df, 
                       how="inner", on="ID", sort=False)

del fasttext_vec, fasttext_vec_df
gc.collect()

print(f"combined_df: {combined_df.shape}")
combined_df.head()

100%|██████████| 62995/62995 [05:14<00:00, 200.19it/s]


fasttext_vec_df: (62995, 300)
combined_df: (62995, 628)


Unnamed: 0_level_0,Review,Review_num_words,Review_num_unique_words,Review_num_chars,Review_num_stopwords,Review_num_@,Review_num_#,Review_num_urls,Review_num_tags,Review_num_punctuations,...,fasttext_290,fasttext_291,fasttext_292,fasttext_293,fasttext_294,fasttext_295,fasttext_296,fasttext_297,fasttext_298,fasttext_299
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
39467,Today I'm working on my &quot;Quirky Q&quot; c...,15,15,83,6,0,0,0,0,5,...,0.048533,-0.011358,0.009693,-0.001183,-0.010735,-0.015227,-0.024497,0.106152,0.070438,-0.026371
30154,@ShannonElizab dont ya know? people love the h...,9,9,58,1,1,0,0,0,2,...,0.015877,-0.024766,-0.002248,0.031481,0.024051,0.019621,-0.019488,0.069116,-0.015891,0.021862
16767,ughhh rejected from the 09 mediation program. ...,8,8,55,2,0,0,0,0,2,...,-0.005525,-0.035302,0.007879,0.035808,-0.039154,0.014443,0.01869,0.103939,-0.011758,-0.010508
9334,@petewentz im so jealous. i want an octo drive,9,9,46,3,1,0,0,0,2,...,0.027068,-0.005114,-0.009848,0.050676,0.040232,0.009613,-0.053034,0.069739,0.047785,-0.008926
61178,I remember all the hype around this movie when...,574,339,3190,272,0,0,0,18,152,...,0.005544,-0.011245,-0.000872,-0.001933,0.005125,0.002126,-0.019694,0.120882,0.013285,-0.00885


In [36]:
para_vec = [sent2vec(x, para_embeddings_index) for x in tqdm(combined_df["processed_review"].values)]
col_list = ['para_'+str(i) for i in range(300)]
para_vec_df = pd.DataFrame(np.array(para_vec), columns=col_list, index=combined_df.index)
print(f"para_vec_df: {para_vec_df.shape}")

combined_df = pd.merge(combined_df, para_vec_df, 
                       how="inner", on="ID", sort=False)

del para_vec, para_vec_df
gc.collect()

combined_df.drop(['Review','processed_review'], axis=1, inplace=True)
print(f"combined_df: {combined_df.shape}")
combined_df.head()

100%|██████████| 62995/62995 [05:13<00:00, 200.94it/s]


para_vec_df: (62995, 300)
combined_df: (62995, 926)


Unnamed: 0_level_0,Review_num_words,Review_num_unique_words,Review_num_chars,Review_num_stopwords,Review_num_@,Review_num_#,Review_num_urls,Review_num_tags,Review_num_punctuations,Review_num_words_upper,...,para_290,para_291,para_292,para_293,para_294,para_295,para_296,para_297,para_298,para_299
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
39467,15,15,83,6,0,0,0,0,5,0,...,0.060111,-0.005075,0.045606,-0.006415,0.120375,-0.04041,0.051157,-0.070714,-0.016637,0.049064
30154,9,9,58,1,1,0,0,0,2,0,...,0.015699,-0.012129,-0.013994,-0.077154,0.160594,-0.003633,-0.064221,-0.074613,-0.030375,0.017418
16767,8,8,55,2,0,0,0,0,2,1,...,0.154424,-0.007336,0.037052,-0.003967,0.084572,0.002754,0.066411,-0.061503,0.04761,0.080746
9334,9,9,46,3,1,0,0,0,2,0,...,0.002801,-0.031743,0.002923,-0.050673,0.086777,0.026475,0.023857,0.009329,0.059255,0.02946
61178,574,339,3190,272,0,0,0,18,152,8,...,0.006492,0.003607,-0.020648,-0.030807,0.109541,0.022207,0.012477,-0.068012,0.013718,0.033139


In [37]:
Xtrain = combined_df[:Ytrain.shape[0]].copy()
Xtest = combined_df[Ytrain.shape[0]:].copy()
print(f"Xtrain: {Xtrain.shape} \nXtest: {Xtest.shape}")

del combined_df
gc.collect()

Xtrain: (44095, 926) 
Xtest: (18900, 926)


20

## Generate word sequences

In [38]:
def lemmatize_words(text):
    lemmatizer = WordNetLemmatizer()
    wordnet_map = {
        "N": wordnet.NOUN, 
        "V": wordnet.VERB, 
        "J": wordnet.ADJ, 
        "R": wordnet.ADV
    }
    pos_tagged_text = nltk.pos_tag(text.split())
    return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_tagged_text])

In [39]:
nlp = spacy.load('en_core_web_lg', disable=['parser','ner','tagger'])
nlp.vocab.add_flag(lambda s: s.lower() in spacy.lang.en.stop_words.STOP_WORDS, spacy.attrs.IS_STOP)
docs = nlp.pipe(text_list, n_threads=2)

In [40]:
word_dict = {}
word_index = 1
lemma_dict = {}
word_sequences = []

for doc in tqdm(docs):
    word_seq = []
    
    for token in doc:
        if (token.text not in word_dict) and (token.pos_ is not "PUNCT"):
            word_dict[token.text] = word_index
            word_index += 1
            lemma_dict[token.text] = token.lemma_
            
        if token.pos_ is not "PUNCT":
            word_seq.append(word_dict[token.text])
            
    word_sequences.append(word_seq)

del docs
gc.collect()

62995it [00:59, 1063.38it/s]


27

In [41]:
train_word_sequences = word_sequences[:Ytrain.shape[0]]
test_word_sequences = word_sequences[Ytrain.shape[0]:]

train_word_sequences = pad_sequences(train_word_sequences, maxlen=MAX_LEN, padding='post')
test_word_sequences = pad_sequences(test_word_sequences, maxlen=MAX_LEN, padding='post')

del word_sequences
gc.collect()

20

## Generate embedding matrix

In [42]:
def word_embeddings(word_dict, lemma_dict, embeddings_index):
    embed_size = 300
    nb_words = len(word_dict)+1
    embedding_matrix = np.zeros((nb_words, embed_size), dtype=np.float32)
    unknown_vector = np.zeros((embed_size,), dtype=np.float32) - 1.
    unknown_words = []
    
    for key in tqdm(word_dict):
        word = key
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[word_dict[key]] = embedding_vector
            continue
        
        word = key.lower()
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[word_dict[key]] = embedding_vector
            continue
        
        word = key.upper()
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[word_dict[key]] = embedding_vector
            continue
        
        word = key.capitalize()
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[word_dict[key]] = embedding_vector
            continue
        
        word = lemma_dict[key]
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[word_dict[key]] = embedding_vector
            continue
        
        embedding_matrix[word_dict[key]] = unknown_vector
        unknown_words.append(key)
        
    return embedding_matrix, nb_words, unknown_words

In [43]:
embed_matrix_glove, nb_words, unknown_words_glove = word_embeddings(word_dict, lemma_dict, glove_embeddings_index)
embed_matrix_fasttext, nb_words, unknown_words_fasttext = word_embeddings(word_dict, lemma_dict, fasttext_embeddings_index)
embed_matrix_para, nb_words, unknown_words_para = word_embeddings(word_dict, lemma_dict, para_embeddings_index)

embedding_matrix1 = np.concatenate((embed_matrix_glove, embed_matrix_fasttext, embed_matrix_para), axis=1)
embedding_matrix2 = (embed_matrix_glove * 0.7) + (embed_matrix_fasttext * 0.3)
embedding_matrix3 = (embed_matrix_glove * 0.6) + (embed_matrix_para * 0.4)
embedding_matrix4 = (embed_matrix_glove * 0.4) + (embed_matrix_para * 0.4) + (embed_matrix_fasttext * 0.2)

unknown_words = list(set(unknown_words_glove + unknown_words_fasttext + unknown_words_para))
print(f"Unknown words: {len(unknown_words)}")

del glove_embeddings_index, fasttext_embeddings_index, para_embeddings_index
del embed_matrix_glove, embed_matrix_fasttext, embed_matrix_para
del unknown_words_glove, unknown_words_fasttext, unknown_words_para
gc.collect()

100%|██████████| 76543/76543 [00:00<00:00, 193293.01it/s]
100%|██████████| 76543/76543 [00:00<00:00, 152685.20it/s]
100%|██████████| 76543/76543 [00:00<00:00, 165476.68it/s]


Unknown words: 4918


98

## Save processed datasets

In [44]:
data_dict = {
    'train_word_sequences': train_word_sequences,
    'test_word_sequences': test_word_sequences,
    'nb_words': nb_words,
    'MAX_LEN': MAX_LEN
}

file = open("./MH_New_Dawn_Set1.txt", 'wb')
pickle.dump(data_dict, file)
file.close()

In [45]:
data_dict = {
    'Xtrain': Xtrain,
    'Xtest': Xtest,
    'Ytrain': Ytrain,
    'Ytrain_oh': Ytrain_oh,
    'class_weight': class_weight
}

file = open("./MH_New_Dawn_Set2.txt", 'wb')
pickle.dump(data_dict, file)
file.close()

In [46]:
data_dict = {
    'embedding_matrix1': embedding_matrix1,
    'embedding_matrix2': embedding_matrix2,
    'embedding_matrix3': embedding_matrix3,
    'embedding_matrix4': embedding_matrix4
}

file = open("./MH_New_Dawn_Set3.txt", 'wb')
pickle.dump(data_dict, file)
file.close()