In [4]:
# Train a word2vec model on a large sentence dataset: import the PayPal feedback data and remove stopwords.
# Import the required libraries.

# importing modules needed for comments' cleanup and model training
import gzip
import gensim 
import logging
import pandas as pd
from nltk.corpus import stopwords
from nltk import WordPunctTokenizer
import numpy as np
import pickle
import contractions
from nltk import word_tokenize, sent_tokenize
import re, string, unicodedata
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Configure root logging at INFO level with a "timestamp : level : message" layout.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [5]:
# Load the PayPal landing-page feedback export and keep only English rows
# that actually contain a comment.

feedback_path = './LandingPage.csv'
data = pd.read_csv(feedback_path)

# Assumes the column layout is the same for feedback exported from every domain.

data = data[['Comments', 'Overall Rating', 'countryCode', 'languageCode',
              'Device Type', 'Browser Name', 'IP Address', 'Submission Date',
              'session_replay', 'Email Address', 'tealeafId', 'guid', 'custId']]

language = 'en'
# Both conditions are evaluated on the same frame and combined into one mask,
# so the filter does not depend on index alignment between two different frames.
is_english = data['languageCode'] == language
has_comment = data['Comments'].notnull()
feedback_data = data.loc[is_english & has_comment]

# The path gets its own name so it no longer shadows the `stopwords` corpus
# imported from nltk at the top of the notebook.
stopwords_path = './stopwords.csv'
# A flat set makes each `token in stopset` lookup O(1) instead of a scan over a
# 2-D array; membership results for string tokens are unchanged.
stopset = set(pd.read_csv(stopwords_path).values.ravel())

In [7]:
# Preprocessing of feedback comments

def remove_between_square_brackets(text):
    """Strip bracketed and quoted asides from a comment.

    Removes, in this order, every ``(...)`` group, every ``[...]`` group and
    every ``"..."`` group, delimiters included.

    Each pattern excludes its own closing delimiter, so a match stops at the
    nearest closer. Text between two separate groups is therefore kept
    (``"(a) keep (b)"`` -> ``" keep "``) instead of being swallowed by one
    greedy match running from the first opener to the last closer.

    Parameters
    ----------
    text : str
        Raw comment text.

    Returns
    -------
    str
        The text with all such groups removed; surrounding whitespace is left
        untouched.
    """
    # Raw strings avoid invalid-escape warnings for `\(` and `\[`.
    text = re.sub(r'\([^)]*\)', '', text)
    text = re.sub(r'\[[^\]]*\]', '', text)
    text = re.sub(r'"[^"]*"', '', text)
    return text
    
def replace_contractions(text):
    """Return `text` after passing it through `contractions.fix`."""
    expanded = contractions.fix(text)
    return expanded

def doc_clean(feedback):
    """Tokenize a comment and keep only meaningful lower-cased word tokens.

    A token is kept when it is purely alphabetic, longer than two characters,
    and its lower-cased form is not in the module-level `stopset`.

    Parameters
    ----------
    feedback : str
        Comment text to tokenize.

    Returns
    -------
    list
        Lower-cased text tokens, in their original order.
    """
    clean = []
    for token in WordPunctTokenizer().tokenize(feedback):
        # Cheap structural checks first; the stopword lookup runs last.
        if len(token) <= 2 or not token.isalpha():
            continue
        lowered = token.lower()
        if lowered in stopset:
            continue
        # Byte strings (the Python 2 `str` type) are decoded to text so the
        # later unicode normalization step can handle them. Tokens that are
        # already text are left alone: calling `unicode(...)` on them raised
        # TypeError on Python 2, and the name does not exist on Python 3.
        if isinstance(lowered, bytes):
            lowered = lowered.decode('utf-8')
        clean.append(lowered)
    return clean

def remove_non_ascii(words):
    """Return a new list with every word reduced to its ASCII characters.

    Each word is NFKD-normalized, encoded to ASCII with unencodable
    characters ignored, then decoded back to text. The output has one entry
    per input word (an entry may be an empty string).
    """
    def _to_ascii(word):
        decomposed = unicodedata.normalize('NFKD', word)
        return decomposed.encode('ascii', 'ignore').decode('utf-8', 'ignore')

    return [_to_ascii(word) for word in words]

def remove_punctuation(words):
    """Delete every character that is neither a word character nor whitespace.

    Words that end up empty after the substitution are left out of the result.
    """
    stripped = (re.sub(r'[^\w\s]', '', word) for word in words)
    return [word for word in stripped if word != '']

def stem_words(words):
    """Return the `PorterStemmer` stem of each word, in input order."""
    porter = PorterStemmer()
    return [porter.stem(word) for word in words]

def lemmatize_verbs(words):
    """Lemmatize each word with `WordNetLemmatizer` using the verb part of speech.

    Returns a new list of lemmas in input order.
    """
    wordnet = WordNetLemmatizer()
    return [wordnet.lemmatize(word, pos='v') for word in words]


def preprocessing(comment):
    """Run one raw comment through the full cleaning pipeline.

    Text-level steps come first (`remove_between_square_brackets`, then
    `replace_contractions`), followed by tokenization with `doc_clean`. The
    token list is then passed through `remove_non_ascii`,
    `remove_punctuation` and `lemmatize_verbs`, in that order.

    Returns the resulting list of tokens.
    """
    text = replace_contractions(remove_between_square_brackets(comment))
    tokens = doc_clean(text)
    for step in (remove_non_ascii, remove_punctuation, lemmatize_verbs):
        tokens = step(tokens)
    return tokens

    

In [8]:
# Build the word2vec training corpus from the PayPal feedback rows.
# Column 0 of each row is the comment text; the remaining columns are kept so
# every processed comment can be matched back to its original record.
raw_comments = np.array(feedback_data[['Comments', 'Overall Rating', 'IP Address', 'Submission Date'
                                       , 'session_replay', 'tealeafId', 'guid', 'custId']])

clean_comments = []
unclean_comments = []
for row in raw_comments:
    tokens = preprocessing(row[0])
    # Rows whose comment has no tokens left after cleaning are skipped, which
    # keeps the two lists aligned index by index.
    if tokens:
        clean_comments.append(tokens)
        unclean_comments.append(row)


print(len(clean_comments))

9646


In [9]:
# Persist the filtered feedback table and both comment lists.

feedback_data.to_csv("feedback_data_en.csv")

# Each list is pickled to its own file, opened in binary write mode.
for pickle_path, payload in (('processed_comments', clean_comments),
                             ('unprocessed_comments', unclean_comments)):
    with open(pickle_path, 'wb') as handle:
        pickle.dump(payload, handle)



In [16]:
# Run this cell only when the feedback corpus is large enough to train on;
# otherwise use Google's pretrained word2vec vectors instead.

train = clean_comments
word2vec_options = dict(size=300, window=12, min_count=1, workers=10)
# NOTE(review): per the gensim documentation, passing the corpus to the
# constructor already builds the vocabulary and trains once; the explicit
# train() call below then adds 60 more epochs - confirm this is intended.
model = gensim.models.Word2Vec(train, **word2vec_options)
model.train(train, total_examples=len(train), epochs=60)
model.save("word2vec_model.pkl")

In [17]:
# Sanity-check the trained model by listing the ten nearest neighbours of a probe word.

check_word = 'error'
w2 = [check_word]
model.wv.most_similar(positive=w2, topn=10)