In [8]:
#!pip install wordcloud
#!pip install langdetect
#!pip install googletrans
#!pip install textblob
#!pip install spacy
#!python -m spacy download en
#!python -m spacy download en_core_web_sm
#!python -m spacy download en_core_web_md
#!python -m spacy download en_core_web_lg
#!pip install -U spacy-lookups-data
#!pip install langid
#!pip install google_trans_new
#!pip uninstall googletrans

In [9]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import re
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud, STOPWORDS 
from langdetect import detect
from itertools import cycle
#import googletrans
#from googletrans import Translator
from google_trans_new import google_translator 
from multiprocessing.dummy import Pool as ThreadPool
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import metrics
from sklearn.metrics import accuracy_score,f1_score,recall_score,precision_score, confusion_matrix, classification_report
from sklearn import preprocessing
#from textblob import TextBlob
#from textblob.translate import NotTranslated
import random
import operator
import math
import tqdm
import time
import spacy
import json
import langid
from bs4 import BeautifulSoup
from string import digits
from functools import partial

### This section contains useful functions
- As we find new functions, we will add them here.

In [None]:
# Candidate pivot languages: one of these codes is picked at random for every
# augmentation round and passed to the translator as the target language.
language = ["es", "de", "fr", "ar", "te", "hi", "ja", "fa", "sq", "bg", "nl", "gu", "ig", "kk", "mt", "ps"]

def data_augmentation_lang_translation(message, language, aug_range=1, translate=None):
    """Create paraphrases of ``message`` by back-translation.

    For each of ``aug_range`` rounds the text is translated into a randomly
    chosen pivot language and the pivot text is translated back to English.

    Parameters
    ----------
    message : str or bytes
        Text to augment. Objects with a ``decode`` method are decoded as UTF-8.
    language : sequence of str
        Pivot language codes to choose from.
    aug_range : int, default 1
        Number of augmented variants to produce.
    translate : callable, optional
        ``translate(text, lang_tgt) -> str``. Defaults to a
        ``google_translator(timeout=15)`` request; pass a stub to run offline.

    Returns
    -------
    list of str
        One back-translated text per round; ``[]`` for blank input or when
        ``aug_range`` is not positive.

    Raises
    ------
    Whatever the translator raises (network or service errors are not swallowed).
    """
    # Local import so the timer does not depend on what the notebook-global
    # name `time` currently refers to.
    import time as _time

    if hasattr(message, "decode"):
        message = message.decode("utf-8")

    # Nothing to translate: avoid sending empty requests.
    if not isinstance(message, str) or not message.strip():
        return []

    if translate is None:
        def translate(text, lang_tgt):
            # A fresh client per request, as before.
            return google_translator(timeout=15).translate(text.strip(), lang_tgt)

    augmented_messages = []
    for _ in range(aug_range):
        lang_tgt = random.choice(language)

        # Forward pass: the whole message in one request. Mapping a thread pool
        # over a str would translate it one character at a time.
        time1 = _time.time()
        pivot_text = str(translate(message, lang_tgt))
        time2 = _time.time()
        print("Translating %s Descriptions to %s, a total of %s s"%("placeholder",lang_tgt,time2 - time1))

        # Backward pass: translate the pivot text (not the original) to English.
        augmented_messages.append(str(translate(pivot_text, 'en')))

    return augmented_messages


#pool = ThreadPool(20)
#def f(a, b, c):
#    print("{} {} {}".format(a, b, c))

#def main():
#    iterable = [1, 2, 3, 4, 5]
#    a = "hi"
#    b = "there"
#    func = partial(f, a, b)
#    pool.map(func, iterable)
#    pool.close()
#    pool.join()

#if __name__ == "__main__":
#    main()

# In this notebook we try out whether we can augment data using a translation technique

In [None]:
# Load the working dataset from CSV; the Excel read below is a disabled alternative source.
mydata = pd.read_csv('datasets/input_data_after_preprocessing.csv')
#mydata = pd.read_excel("datasets/input_data.xlsx")

In [None]:
# Preview the first 20 rows of the loaded data.
mydata.head(20)

### Data Augmentation using Language Translation

In [None]:
## Dictionary mapping each distinct description to its number of occurrences
## 'Combined Description Cleaned' is the column name
combined_description_count = mydata['Combined Description Cleaned'].value_counts().to_dict()
combined_description_count

In [None]:
## Highest occurrence count; less frequent descriptions are augmented up to this size

max_combined_description_count = max(combined_description_count.values())
max_combined_description_count

In [None]:
## Loop to iterate over all combined descriptions and top each one up, via
## back-translation, to the size of the most frequent description.
##
## The parts are collected in lists and concatenated once: DataFrame.append was
## removed in pandas 2.0, and growing a frame row by row is quadratic.
##
## NOTE(review): the grouping key and the text live in the same column, so the
## assignment `dummy2[...] = combined_description` replaces every translated
## sentence with the original text. This looks like a template written for a
## separate label column -- confirm which column should hold the class.
balanced_parts = []
for combined_description, count in combined_description_count.items() :
    count_diff = max_combined_description_count - count    ## Difference to fill
    multiplication_count = math.ceil((count_diff)/count)  ## Number of variants to request per existing row
    class_rows = mydata[mydata["Combined Description Cleaned"] == combined_description]
    if (multiplication_count) :
        old_frames = []
        new_frames = []
        for message in tqdm.tqdm(class_rows["Combined Description Cleaned"]) :
            ## Extracting existing minority class batch
            dummy1 = pd.DataFrame([message], columns=['Combined Description Cleaned'])
            dummy1["Combined Description Cleaned"] = combined_description
            old_frames.append(dummy1)

            ## Creating new augmented batch from existing minority class
            new_messages = data_augmentation_lang_translation(message, language, multiplication_count)
            dummy2 = pd.DataFrame(new_messages, columns=['Combined Description Cleaned'])
            dummy2["Combined Description Cleaned"] = combined_description
            new_frames.append(dummy2)

        old_message_df = pd.concat(old_frames)
        new_message_df = pd.concat(new_frames)

        ## Select random data points from augmented data (at most count_diff of them)
        new_message_df = new_message_df.take(np.random.permutation(len(new_message_df))[:count_diff])

        ## Merge existing and augmented data points
        balanced_parts.extend([old_message_df, new_message_df])
    else :
        ## Already at the maximum count: keep the original rows untouched
        balanced_parts.append(class_rows)

newdf = pd.concat(balanced_parts) if balanced_parts else pd.DataFrame()

### Data Augmentation using spaCy

In [None]:
# Load NLTK's English stop words into `stop`; synonyms should not be looked up for these words.
stop = nltk.corpus.stopwords.words('english')

In [None]:
# Build a tokenizer from the cleaned descriptions.
# NOTE(review): make_tokenizer is not defined in the cells shown here -- confirm where it comes from.
tokenizer = make_tokenizer(mydata['Full Description Cleaned'])    ## 'Full Description Cleaned' is the column name

# Convert every description into a sequence via the tokenizer.
X = tokenizer.texts_to_sequences(mydata['Full Description Cleaned'])

# 70 is passed as the second positional argument of pad_sequences
# (presumably the maximum sequence length -- verify against the Keras signature).
from keras.preprocessing.sequence import pad_sequences
X = pad_sequences(X, 70)

In [None]:
# Display the padded sequences.
X

In [None]:
## Reverse lookup built from the tokenizer: word index -> word
index_word = {position: token for token, position in tokenizer.word_index.items()}

In [None]:
# Display the index -> word mapping.
index_word

In [None]:
## All words known to the tokenizer, in the order stored in index_word
words = list(index_word.values())

In [None]:
# Display the word list.
words

In [None]:
 def most_similar(word, topn=10):
     """Return the surface forms of the vocabulary entries most similar to ``word``.

     Parameters
     ----------
     word : object exposing ``vocab`` (iterable of candidates) and
         ``similarity(other)`` -- presumably a spaCy lexeme; confirm at the call site.
     topn : int, default 10
         Number of entries to return.

     Returns
     -------
     list
         ``orth_`` of the ``topn`` candidates with the highest similarity,
         most similar first.
     """
     import heapq

     # nlargest gives the same items and order as sorted(..., reverse=True)[:topn]
     # without sorting the whole vocabulary.
     nearest = heapq.nlargest(topn, word.vocab, key=word.similarity)
     return [w.orth_ for w in nearest]

In [None]:
## Synonym dictionary: word -> tuple of lower-cased synonyms
## NOTE(review): get_word_synonym and nlp are not defined in the cells shown
## here -- confirm they exist before running this cell on a fresh kernel.
synonym_dict = {}

for word in words:
    #if (not check_oos(word)) :
    # Look the synonyms up once per word and reuse the result for both the
    # dictionary entry and the progress print.
    word_synonyms = [w.lower_ for w in get_word_synonym(nlp.vocab[word])]
    synonym_dict[word] = tuple(word_synonyms)
    print(word, " : ", word_synonyms)

In [None]:
# Display the word -> synonyms mapping.
synonym_dict

In [None]:
## Keep only the words whose synonym tuple is not shared with any other word
import collections

# How many words map to each synonym tuple
value_occurrences = collections.Counter(synonym_dict.values())

filtered_synonym = dict(
    (entry, entry_synonyms)
    for entry, entry_synonyms in synonym_dict.items()
    if value_occurrences[entry_synonyms] == 1
)

In [None]:
# Display the filtered synonym mapping.
filtered_synonym

In [None]:
## Dictionary mapping each distinct full description to its number of occurrences
## 'Full Description Cleaned' is the column name
full_description_count = mydata['Full Description Cleaned'].value_counts().to_dict()
full_description_count

In [None]:
## Highest occurrence count; less frequent descriptions are augmented up to this size
#import operator

max_full_description_count = max(full_description_count.values())
max_full_description_count

In [None]:
## Loop to iterate over all full descriptions and top each one up, via synonym
## replacement, to the size of the most frequent description.
##
## The parts are collected in lists and concatenated once: DataFrame.append was
## removed in pandas 2.0, and growing a frame row by row is quadratic.
##
## NOTE(review): data_augmentation_spaCy is not defined in the cells shown here.
## NOTE(review): the grouping key and the text live in the same column, so the
## assignment `dummy2[...] = full_description` replaces every augmented
## sentence with the original text -- confirm which column should hold the class.
balanced_parts = []
for full_description, count in full_description_count.items() :
    count_diff = max_full_description_count - count    ## Difference to fill
    multiplication_count = math.ceil((count_diff)/count)  ## Number of variants to request per existing row
    class_rows = mydata[mydata["Full Description Cleaned"] == full_description]
    if (multiplication_count) :
        old_frames = []
        new_frames = []
        for message in tqdm.tqdm(class_rows["Full Description Cleaned"]) :
            ## Extracting existing minority class batch
            dummy1 = pd.DataFrame([message], columns=["Full Description Cleaned"])
            dummy1["Full Description Cleaned"] = full_description
            old_frames.append(dummy1)

            ## Creating new augmented batch from existing minority class
            new_messages = data_augmentation_spaCy(message, multiplication_count)
            dummy2 = pd.DataFrame(new_messages, columns=["Full Description Cleaned"])
            dummy2["Full Description Cleaned"] = full_description
            new_frames.append(dummy2)

        old_message_df = pd.concat(old_frames)
        new_message_df = pd.concat(new_frames)

        ## Select random data points from augmented data (at most count_diff of them)
        new_message_df = new_message_df.take(np.random.permutation(len(new_message_df))[:count_diff])

        ## Merge existing and augmented data points
        balanced_parts.extend([old_message_df, new_message_df])
    else :
        ## Already at the maximum count: keep the original rows untouched
        balanced_parts.append(class_rows)

newdf = pd.concat(balanced_parts) if balanced_parts else pd.DataFrame()

In [None]:
# Summary statistics of the augmented frame.
newdf.describe()

In [None]:
# Show the first 461 rows. NOTE(review): 461 is a magic number -- confirm what it corresponds to.
newdf.head(461)

In [None]:
# Persist the augmented frame; the index is written too, since index=False is not passed.
newdf.to_csv('datasets/spaCy_Augmented_Data.csv') 

In [None]:
# Save the full word -> synonyms mapping as pretty-printed JSON with sorted keys.
with open('datasets/spaCy_synonyms.json', 'w') as synonyms_file:
    json.dump(synonym_dict, synonyms_file, indent=4, sort_keys=True)

In [None]:

# Save the filtered word -> synonyms mapping as pretty-printed JSON with sorted keys.
with open('datasets/spaCy_filtered_synonyms.json', 'w') as filtered_file:
    json.dump(filtered_synonym, filtered_file, indent=4, sort_keys=True)

###### Data Augmentation using Word Embedding

In [None]:


# Build the tokenizer from the cleaned full descriptions.
# NOTE(review): make_tokenizer is not defined in the cells shown here -- confirm where it comes from.
tokenizer = make_tokenizer(mydata['Full Description Cleaned'])

In [None]:

## Reverse lookup built from the tokenizer: word index -> word
index_word = {position: token for token, position in tokenizer.word_index.items()}

## Forward lookup: word -> word index
vocab_dict = tokenizer.word_index

In [None]:

## Loading word embedding
## Timed with the `time` module imported at the top of the notebook.
## `from time import time` would rebind the global name `time` to a function and
## break every later `time.time()` call, e.g. when an earlier cell is re-run.
## NOTE(review): loadEmbeddingMatrix is not defined in the cells shown here.
start = time.time()
embed_mat = loadEmbeddingMatrix("glove840B300D", vocab_dict)
end = time.time()
print("Embedding loaded in ", (end-start)/60, "min")

In [None]:
from sklearn.neighbors import NearestNeighbors

synonyms_number = 5    # neighbours kept per word
word_number = 20000    # only rows 1 .. word_number-1 of the embedding matrix are queried

# One extra neighbour is requested because a row normally finds itself first.
nn = NearestNeighbors(n_neighbors=synonyms_number+1).fit(embed_mat)

# Row 0 is skipped; row i of neighbours_mat belongs to embedding row i + 1.
neighbours_mat = nn.kneighbors(embed_mat[1:word_number])[1]

# Key by the real row id, not by the first returned neighbour: when several
# rows are identical (presumably the case for out-of-vocabulary words -- verify
# against loadEmbeddingMatrix) the query row is not guaranteed to come back
# first, and keying by row[0] then makes entries collide or go missing.
synonyms = {
    word_id: row[row != word_id][:synonyms_number]
    for word_id, row in enumerate(neighbours_mat, start=1)
}

In [None]:
## Finding nearby words -- these are not true synonyms, just the nearest
## neighbours of the target word in the embedding space.
import nltk
from nltk.corpus import wordnet

synonym = {}
for x in range(0,100):
    try :
        synonym[index_word[x]] = [index_word[synonyms[x][i]] for i in range(synonyms_number-1)]
    except (KeyError, IndexError) :
        # Ids missing from index_word / synonyms are skipped on purpose; any
        # other error is a real problem and is no longer swallowed.
        continue

In [None]:
## Use this synonym list to replace words with their variations.
## The code below is a draft, but its logic can be used to complete the task.

In [None]:
## Only words of selected parts of speech are changed, to preserve the semantic meaning.

import nltk
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize

def get_pos_tag (word, tagged) :
    """Return the tag of the first ``(token, tag)`` pair in ``tagged`` whose token equals ``word``.

    Raises IndexError when ``word`` does not occur in ``tagged``.
    """
    tags_for_word = []
    for token, tag in tagged:
        if token == word:
            tags_for_word.append(tag)
    return tags_for_word[0]

# Load NLTK's pre-trained Punkt tokenizer for English (a sentence splitter, not a neural net).
# NOTE(review): this rebinds `tokenizer`, which earlier cells use for the object returned by make_tokenizer.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [None]:
# Draft: for every description, print the words that have a WordNet lemma which
# is also one of the word's nearest embedding neighbours.

# Loop-invariant, so loaded once; a set gives constant-time membership tests.
english_stopwords = set(nltk.corpus.stopwords.words('english'))

# Universal POS tags eligible for replacement. Each is also the name of the
# matching WordNet POS constant (wordnet.ADJ, wordnet.ADV, ...).
replaceable_pos = ("ADJ", "ADV", "NOUN", "VERB")

for desc in mydata["Full Description Cleaned"]:
    print(desc)
    # Tokenize the text (result currently unused by the draft)
    tokenized = tokenizer.tokenize(desc)

    # Get the list of words from the entire text
    words = word_tokenize(desc)

    # Identify the parts of speech
    tagged = nltk.pos_tag(words, tagset="universal")

    replacements = []   # placeholder for the planned replacement step

    for word in words:
        synonym = []
        antonyms = []
        word_index = vocab_dict.get(word, None)

        pos_tag = get_pos_tag(word, tagged)
        if (word_index and pos_tag in replaceable_pos and word not in english_stopwords) :
            # Neighbour words are resolved once per word, not once per lemma;
            # ids without an entry are skipped rather than raising KeyError.
            neighbour_ids = synonyms.get(word_index, ())
            nearby_words = {
                index_word[neighbour_id]
                for neighbour_id in neighbour_ids[:synonyms_number-1]
                if neighbour_id in index_word
            }
            # getattr replaces eval("wordnet." + pos_tag): same attribute
            # lookup without evaluating a constructed string.
            for syn in wordnet.synsets(word, getattr(wordnet, pos_tag)):
                for lemma in syn.lemmas() :
                    if lemma.name() in nearby_words:
                        synonym.append(lemma.name())
                    #if lemma.antonyms():
                    #    antonyms.append(lemma.antonyms()[0].name())

        if (synonym) :
            print(word, set(synonym))