In [1]:
import pandas as pd
import nltk
from gensim.models import Word2Vec, KeyedVectors
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet
from nltk import pos_tag, word_tokenize
from nltk.corpus.reader.wordnet import WordNetError
from gensim import corpora
from gensim.corpora import Dictionary
from gensim.corpora import Dictionary, MmCorpus
from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel
from gensim.models import Phrases
from gensim.models.phrases import Phrases, Phraser
from collections import Counter
from gensim import corpora
from gensim import models
import pyLDAvis.gensim as gensimvis
import pyLDAvis
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

### 1. Google navigation app
- In this subset we explored Google’s application across languages other than 
Indonesian and English. 
- Since our entire data set includes reviews from 140 different countries, 
we focused on the 14 languages that captured the highest number of reviews 
after Indonesian and English. 
- Languages include Spanish, Chinese, Portuguese, Arabic, Russian, French, Thai, 
Italian, Turkish, German, Polish, USA, Vietnamese, Japanese, Chinese (Traditional).

In [2]:
# One-time setup: fetch the NLTK resources this notebook relies on
# (WordNet for lemmatization, the perceptron tagger for POS tags,
# punkt for word_tokenize, and the stop-word lists).
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/dianabursac/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/dianabursac/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/dianabursac/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/dianabursac/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Load the full review data set from the working directory.
# NOTE(review): this cell emits a warning on load (its text is truncated in the
# saved output) — possibly a mixed-dtype warning; confirm and pass explicit
# dtypes if so.
final = pd.read_csv('final_2 4.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Drop the two unnamed index-like columns. Reassigning (instead of
# inplace=True) and ignoring columns that are already gone keeps this cell
# safe to re-run: the original raised KeyError on a second execution.
final = final.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'], errors='ignore')

In [5]:
# Mark reviews without text using the 'NaN' sentinel string. The filled column
# is assigned back explicitly: calling fillna(inplace=True) on a column
# selection is chained assignment and does not update the frame under pandas
# Copy-on-Write, which would make the filter below keep every row.
final['all_text_NOemoji'] = final['all_text_NOemoji'].fillna('NaN')
# Keep a handle on the unfiltered frame before dropping the empty reviews.
final_all = final
final = final.loc[final['all_text_NOemoji'] != 'NaN']

In [6]:
# Restrict the analysis to rows whose 'App Name' is exactly 'Google Maps'.
google = final.loc[final['App Name'] == 'Google Maps', :]

In [7]:
# Exclude the 'English' and 'Indonesian' groups (the `Country` column is
# compared against language names here).
google2 = google.loc[(google.Country != 'English') & (google.Country != 'Indonesian'), :]
# Among the 20 most frequent remaining groups, keep those with > 1000 reviews.
countries = google2.Country.value_counts().head(20)
countriesIndx = countries[countries > 1000].index
# .copy() makes google_data2 an independent frame, so later column updates do
# not raise SettingWithCopyWarning.
google_data2 = google2[google2.Country.isin(countriesIndx)].copy()
# Rating distribution of the selected subset (shown as the cell output).
google_data2.Rating.value_counts()

5.0    38369
1.0    14034
4.0     8276
3.0     6481
2.0     4885
Name: Rating, dtype: int64

In [8]:
# Replace missing cleaned text with the 'NaN' sentinel string. Building a new
# frame with assign() avoids the chained in-place fillna, which raised
# SettingWithCopyWarning and may leave the frame unchanged.
google_data2 = google_data2.assign(cleaned_text=google_data2['cleaned_text'].fillna('NaN'))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


### 2. Data Pre-Processing
- Stratified train test split was applied to ensure that the train and test sets 
have approximately the same percentage of different rating levels. 
- Since there is a large imbalance in the Google data subset across different ratings 
categories, the train data set was balanced by using random undersampling methodology. 
- As a result, Rating 1 and Rating 5 in the train data set contain an equal number 
of reviews.

In [9]:
def randomsampler(X, y):
    """Undersample the 5-star class down to the size of the lowest rating class.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature columns.
    y : pandas.Series
        Ratings; must be named ``Rating`` so the joined frame exposes
        ``data.Rating``.

    Returns
    -------
    pandas.DataFrame
        Every row whose rating is not 5.0, followed by a random sample
        (``random_state=42``) of the 5.0 rows. The sample size is the number
        of rows carrying the smallest rating value, capped at the number of
        5.0 rows available.
    """
    data = pd.concat([X, y], axis=1)
    is_rating5 = data.Rating == 5.0
    class_rating5 = data[is_rating5]
    class_others = data[~is_rating5]

    # Size of the lowest rating class. Computed directly so the function works
    # for any number of distinct rating levels (the previous 5-way tuple
    # unpacking crashed unless exactly five levels were present).
    lowest_rating = data.Rating.min()
    n_target = int((data.Rating == lowest_rating).sum())
    # sample() raises if asked for more rows than exist.
    n_target = min(n_target, len(class_rating5))

    class_rating5_resample = class_rating5.sample(n_target, random_state=42)
    return pd.concat([class_others, class_rating5_resample], axis=0)

In [10]:
# Separate the features from the rating target.
X_data = google_data2.drop(["Rating"], axis =1)
y_data = google_data2["Rating"]
# Stratified split (33% test, fixed seed) so both parts keep the rating mix.
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data,random_state = 42,test_size =0.33, stratify = y_data)
# Only the training part is undersampled; the test part is left as split.
google_sampled_train = randomsampler(X_train, y_train)
google_test = pd.concat([X_test, y_test], axis = 1)

### 3. Text Pre-processing, words filtering
- General NLP techniques are used such as part of speech tagging, removal of standard English stop words, removal of the most common words, lemmatization
- Stop words are excluded, as well as additional words that did not carry insightful meaning, such as: 
   	'would', 'also', 'see', 'something', 'please', 'everything'
- In addition, words with similar meaning were replaced by a single word. For example, ‘perfect’, ‘excellent’, ‘great’, and ‘super’ were all replaced with ‘great’.

In [11]:
from collections import Counter
def count_most_common(p, vector):
    """Return the ``p`` most frequent tokens over all token lists in ``vector``.

    The result is a list of ``(token, count)`` pairs, most frequent first.
    """
    tally = Counter()
    for tokens in vector:
        tally.update(tokens)
    top_tokens = tally.most_common(p)
    return top_tokens

In [12]:
def count_total_all_words(vector):
    """Count every token across all token lists in ``vector``.

    Returns a ``Counter`` mapping each token to its total number of occurrences.
    """
    totals = Counter()
    for tokens in vector:
        totals.update(tokens)
    return totals

In [13]:
# Tokenize every training review; str() lets non-string cells pass through
# word_tokenize.
vector = [nltk.word_tokenize(str(review)) for review in google_sampled_train['cleaned_text'].values]
stop = stopwords.words('english')
# Set membership is O(1); scanning the stop-word list for every token is
# needlessly slow. `stop` itself stays a list for the rest of the notebook.
stop_lookup = set(stop)
vectorNoStop = [[item for item in row if item not in stop_lookup] for row in vector]

In [14]:
# Top-20 tokens of the stop-word-filtered training text, split into parallel
# tuples of words and their counts. (The redundant list() conversion and the
# repeated zip were removed; the resulting variables are unchanged.)
most_common = count_most_common(20, vectorNoStop)
words_common, freq = zip(*most_common)
# 1. stop words
stop = stopwords.words('english')

In [15]:
lemmatizer = WordNetLemmatizer()


def get_wordnet_pos(word):
    """Return the WordNet POS constant for ``word`` tagged on its own.

    The first letter of the tag from ``nltk.pos_tag`` selects adjective (J),
    noun (N), verb (V) or adverb (R); any other tag falls back to noun.
    """
    _, penn_tag = nltk.pos_tag([word])[0]
    initial = penn_tag[0].upper()
    lookup = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }
    if initial in lookup:
        return lookup[initial]
    return wordnet.NOUN

In [16]:
# Synonym-merging rules consumed by replacement(). Each entry is
# (source, target): source is either a single token or a tuple of tokens, and
# every matching token is rewritten to target.
items_to_replace = [
    (('perfect', 'excellent', 'great', 'super', 'ممتاز', 'excellent_excellent',
      'wonderful', 'great_app', 'best'), 'great'),
    (('app', 'aplicativo', 'application'), 'app'),
    # 'work_well' used to carry a leading space, so it could never equal a
    # token coming out of word_tokenize.
    # NOTE(review): 'muy buena' contains a space and therefore will not match a
    # single token either — confirm whether 'muy_buena' was intended.
    (('good', 'cool', 'nice', 'fine', 'well', 'work_well', 'buena', 'good_good',
      'muy buena'), 'good'),
    (('love', 'like', 'love_love'), 'love'),
    ('muito', 'much'),
    (('comment_comment', 'use_comment', 'comment_use', 'heard_comment',
      'comment_know'), 'comment'),
    ('use_use', 'use'),
    ('open_open', 'open'),
]

In [17]:
import re
def replacement(vector, rules=None):
    """Rewrite tokens according to the synonym rules in one pass over the corpus.

    Parameters
    ----------
    vector : list of list of str
        Tokenized documents.
    rules : list of (source, target), optional
        ``source`` is a token or a tuple of tokens to be rewritten to
        ``target``. Defaults to the module-level ``items_to_replace``.

    Returns
    -------
    list of list of str
        New token lists; the input is not modified.

    Rules are applied in order, so a token rewritten by one rule can be
    rewritten again by a later rule, exactly as when each rule is applied to
    the whole corpus in turn.
    """
    if rules is None:
        rules = items_to_replace

    # Flatten the rules into an ordered list of (source, target) pairs.
    pairs = []
    for sources, target in rules:
        if isinstance(sources, tuple):
            pairs.extend((word, target) for word in sources)
        elif isinstance(sources, str):
            pairs.append((sources, target))

    def _resolve(token):
        # Run one token through every rule in order (keeps chained rewrites).
        for source, target in pairs:
            if token == source:
                token = target
        return token

    # Only tokens equal to some source can ever change, so resolving the
    # sources once gives a complete lookup table. This replaces one full
    # corpus rebuild per source word with a single pass.
    mapping = {source: _resolve(source) for source, _ in pairs}
    return [[mapping.get(item, item) for item in line] for line in vector]

### 4.Identifying corpus and dictionary in LDA model

- Two different values for min-frequency were explored: the dictionary with a minimum 
word frequency of one contains 19,948 unique words, while the dictionary with a 
minimum word frequency of three (which also drops words appearing in more than 50% of the reviews) contains 5,463 unique words. 
- By setting min-frequency to 1, all the unique words from the training data set are captured.

In [18]:
def prep_corpus(data, additional_stopwords, no_below, no_above):
    """Clean and lemmatize reviews, then build a gensim dictionary and BoW corpus.

    Steps: tokenize -> drop stop words -> lemmatize -> apply ``replacement`` ->
    keep tokens longer than 3 characters -> build ``Dictionary`` ->
    ``filter_extremes(no_below, no_above)`` -> ``doc2bow`` per document.

    Parameters
    ----------
    data : pandas.Series
        Review texts (each value is passed through ``str``).
    additional_stopwords : iterable of str
        Extra tokens to drop on top of the global ``stop`` list.
    no_below, no_above
        Forwarded positionally to ``Dictionary.filter_extremes``.

    Returns
    -------
    (dictionary, corpus)
        The filtered dictionary and one bag-of-words list per review. The
        sizes of both are printed.

    NOTE(review): stop words are removed before lemmatization, so an inflected
    form whose lemma is a stop word is kept — confirm this is intended.
    """
    vector = [nltk.word_tokenize(str(review)) for review in data.values]

    # One set lookup per token instead of scanning the stop-word list.
    blocked = set(stop).union(additional_stopwords)
    vector_noStop = [[item for item in row if item not in blocked] for row in vector]

    # get_wordnet_pos tags each word on its own, so a word's lemma depends only
    # on the word and can be computed once per distinct token.
    lemma_cache = {}

    def _lemma(word):
        if word not in lemma_cache:
            lemma_cache[word] = lemmatizer.lemmatize(word, get_wordnet_pos(word))
        return lemma_cache[word]

    lemmas = [[_lemma(w) for w in line] for line in vector_noStop]
    vector_lemmas = replacement(lemmas)
    vector_lemmas = [[item for item in line if len(item) > 3] for line in vector_lemmas]

    dictionary = corpora.Dictionary(vector_lemmas)
    dictionary.filter_extremes(no_below, no_above)
    corpus = [dictionary.doc2bow(text) for text in vector_lemmas]
    print('Number of unique tokens: %d' % len(dictionary))
    print('Number of documents: %d' % len(corpus))
    return dictionary, corpus


In [19]:
def prep_corpus_test(data, additional_stopwords, dictionary):
    """Clean and lemmatize reviews and encode them with an existing dictionary.

    Applies the same text steps as the training pipeline (tokenize, drop stop
    words, lemmatize, ``replacement``, keep tokens longer than 3 characters)
    and converts each document to bag-of-words with ``dictionary.doc2bow``;
    the dictionary is not refitted here.

    Parameters
    ----------
    data : pandas.Series
        Review texts (each value is passed through ``str``).
    additional_stopwords : iterable of str
        Extra tokens to drop on top of the global ``stop`` list.
    dictionary : gensim.corpora.Dictionary
        Already-built dictionary used for the encoding.

    Returns
    -------
    list
        One bag-of-words list per review. The dictionary size and the number
        of documents are printed.
    """
    vector = [nltk.word_tokenize(str(review)) for review in data.values]

    # One set lookup per token instead of scanning the stop-word list.
    blocked = set(stop).union(additional_stopwords)
    vector_noStop = [[item for item in row if item not in blocked] for row in vector]

    # get_wordnet_pos tags each word on its own, so a word's lemma depends only
    # on the word and can be computed once per distinct token.
    lemma_cache = {}

    def _lemma(word):
        if word not in lemma_cache:
            lemma_cache[word] = lemmatizer.lemmatize(word, get_wordnet_pos(word))
        return lemma_cache[word]

    lemmas = [[_lemma(w) for w in line] for line in vector_noStop]
    vector_lemmas = replacement(lemmas)
    vector_lemmas = [[item for item in line if len(item) > 3] for line in vector_lemmas]

    corpus = [dictionary.doc2bow(text) for text in vector_lemmas]
    print('Number of unique tokens: %d' % len(dictionary))
    print('Number of documents: %d' % len(corpus))
    return corpus

In [20]:
# Re-tokenize the training reviews and lemmatize every token (no stop-word
# filtering at this stage).
vector = [nltk.word_tokenize(str(review)) for review in google_sampled_train['cleaned_text'].values]
vector_lemmas = [[lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in line] for line in vector]
# Keep only lemmas longer than 3 characters.
vector_lemmas = [[item for item in line if len(item)>3] for line in vector_lemmas]
# The 30 most frequent lemmas (words_common) are later treated as stop words;
# this overwrites the words_common/freq computed in the earlier cell.
most_common = count_most_common(30, vector_lemmas)
words_common, freq = zip(*most_common)
# Full lemma frequency table.
total = count_total_all_words(vector_lemmas)

In [21]:
# 3. additional_words and most common
# Hand-picked extra words to drop, merged with the most common lemmas below.
words1 = [
    'life', 'come', 'think', 'thank', 'back', 'yeah', 'chicken',
    'would', 'also', 'see', 'something', 'please', 'everything',
    'feel', 'thing', 'every', 'kind', 'google', 'possible', 'really', 'among',
]

In [22]:
# Union of the most common lemmas and the hand-picked words, as a set.
words_common1 =set(list(words_common)+words1)

In [23]:
# Dictionary/corpus with no_below=1 and no_above=1 (the printed output below
# reports 19948 unique tokens).
dictionary1, corpus1 = prep_corpus(google_sampled_train['cleaned_text'],additional_stopwords = words_common1, no_below = 1, no_above=1 )

Number of unique tokens: 19948
Number of documents: 31966


In [24]:
# Stricter dictionary/corpus with no_below=3 and no_above=0.5 (the printed
# output below reports 5463 unique tokens); this is the one used for LDA.
dictionary2, corpus2 = prep_corpus(google_sampled_train['cleaned_text'],additional_stopwords = words_common1, no_below = 3, no_above= 0.5)

Number of unique tokens: 5463
Number of documents: 31966


### 5. Build LDA models with 5 different topics
- For each review in the train and the test data set we determined the Dominant Topic (out of 5 LDA topics), topic keywords and dominant topic % contribution
- We use dictionary / corpus with 5463 unique tokens (min frequency = 3)

In [25]:
def format_topics_sentences(ldamodel, corpus, texts, ID, Rating_value, Reviews, app):
    """Build a per-document table of the dominant LDA topic plus metadata.

    Parameters
    ----------
    ldamodel
        Model supporting ``ldamodel[corpus]`` (an iterable of
        ``[(topic_id, probability), ...]`` rows, one per document) and
        ``ldamodel.show_topic(topic_id)`` (``[(word, weight), ...]``).
    corpus
        Bag-of-words corpus passed to ``ldamodel[...]``.
    texts, ID, Rating_value, Reviews, app
        Per-document sequences, each wrapped in a ``pd.Series`` and appended
        as extra columns in the order texts, ID, Rating_value, app, Reviews.

    Returns
    -------
    pandas.DataFrame
        Columns ``Dominant_Topic`` (int), ``Perc_Contribution`` (probability
        rounded to 4 decimals) and ``Topic_Keywords`` (comma-joined topic
        words), followed by the five unnamed metadata columns.
    """
    keyword_cache = {}  # topic id -> comma-joined keywords, computed once per topic
    records = []
    for doc_topics in ldamodel[corpus]:
        if not doc_topics:
            # Keep one output row per document so the metadata columns stay
            # aligned even when the model returns no topic for a document.
            records.append((float('nan'), float('nan'), None))
            continue
        # max() returns the first highest-probability pair, the same element a
        # stable descending sort would place first.
        topic_num, prop_topic = max(doc_topics, key=lambda pair: pair[1])
        if topic_num not in keyword_cache:
            keyword_cache[topic_num] = ", ".join(
                word for word, _ in ldamodel.show_topic(topic_num)
            )
        records.append((int(topic_num), round(prop_topic, 4), keyword_cache[topic_num]))

    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # copied the whole frame on every call.
    sent_topics_df = pd.DataFrame(
        records, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    )

    # Add original text and metadata to the end of the output.
    extras = [
        pd.Series(texts),
        pd.Series(ID),
        pd.Series(Rating_value),
        pd.Series(app),
        pd.Series(Reviews),
    ]
    return pd.concat([sent_topics_df] + extras, axis=1)

In [26]:
# Token strings kept by dictionary2 for each training document, plus the
# per-review metadata as plain lists (same row order as corpus2).
text_all = [[dictionary2[token_id] for token_id, _count in bow] for bow in corpus2]
reviewID = list(google_sampled_train['Review ID'].values)
Rating = list(google_sampled_train['Rating'].values)
appName = list(google_sampled_train["App Name"].values)
reviews = list(google_sampled_train['cleaned_text'].values)

In [27]:
# Encode the held-out reviews with the training dictionary (dictionary2).
test_corpus2 = prep_corpus_test(google_test['cleaned_text'], additional_stopwords = words_common1, dictionary= dictionary2)

Number of unique tokens: 5463
Number of documents: 23775


In [28]:
# Token strings kept by dictionary2 for each test document, plus the
# per-review metadata as plain lists (same row order as test_corpus2).
text_all_T = [[dictionary2[token_id] for token_id, _count in bow] for bow in test_corpus2]
reviewID_T = list(google_test['Review ID'].values)
Rating_T = list(google_test['Rating'].values)
appName_T = list(google_test["App Name"].values)
reviews_T = list(google_test['cleaned_text'].values)

In [29]:
# LDA model with 5 topics, trained on the min-frequency-3 corpus
# (20 passes, chunks of 4000 documents, fixed random_state).

lda5S = models.ldamodel.LdaModel(corpus=corpus2, id2word=dictionary2, num_topics=5, passes=20,chunksize=4000,random_state=43)
# Top 20 words per topic, kept for inspection.
topics5S = lda5S.print_topics(num_words=20)

In [30]:
# 5 topics, train data set: dominant topic per training review.

df_topic_sents_keywords = format_topics_sentences(ldamodel=lda5S, corpus=corpus2, texts=text_all, ID = reviewID, Rating_value = Rating, app = appName, Reviews = reviews)
# reset_index() turns the row number into the first column (Document_No);
# the nine columns are then given readable names.
df_dominant_topic5S = df_topic_sents_keywords.reset_index()
df_dominant_topic5S.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text', 'Review ID', "Rating", "App Name", "Review"]

In [31]:
# 5 topics, test data set: dominant topic per held-out review, assigned by the
# model trained on the training corpus.

# Note: df_topic_sents_keywords is reused here and overwrites the training result.
df_topic_sents_keywords = format_topics_sentences(ldamodel=lda5S, corpus=test_corpus2, texts=text_all_T, ID = reviewID_T, Rating_value = Rating_T, app = appName_T, Reviews = reviews_T)
df_dominant_topic5S_T = df_topic_sents_keywords.reset_index()
df_dominant_topic5S_T.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text', 'Review ID', "Rating", "App Name", "Review"]

### Validation of LDA using supervised learning
- Conversion to document term matrices was employed on the corpus with min frequency = 3 (5463 unique words)
- Validation of the LDA model with 5 topics was done on both the train and test data sets by 
using tfidf matrices and Dominant Topic as the target variable
- A linear SVM (SGDClassifier with hinge loss) and Multinomial Naive Bayes were used for classification

In [32]:
def create_Xtfidf(min, max, train, test):
    """Fit a TF-IDF vectorizer on ``train`` and transform both splits.

    ``min`` and ``max`` are passed to ``TfidfVectorizer`` as ``min_df`` and
    ``max_df``. Returns ``(X_train, X_test)``; the vocabulary and IDF weights
    come from ``train`` only.
    """
    vectorizer = TfidfVectorizer(
        min_df=min,
        max_df=max,
        binary=True,
        use_idf=True,
        sublinear_tf=True,
        norm='l2',
    )
    train_matrix = vectorizer.fit_transform(train)
    test_matrix = vectorizer.transform(test)
    return train_matrix, test_matrix

In [33]:
def prep_corpus_tfidf(data, additional_stopwords):
    """Clean and lemmatize reviews and return the token lists (no BoW encoding).

    Steps: tokenize -> drop stop words -> lemmatize -> apply ``replacement`` ->
    keep tokens longer than 3 characters.

    Parameters
    ----------
    data : pandas.Series
        Review texts (each value is passed through ``str``).
    additional_stopwords : iterable of str
        Extra tokens to drop on top of the global ``stop`` list.

    Returns
    -------
    list of list of str
        One cleaned token list per review.
    """
    vector = [nltk.word_tokenize(str(review)) for review in data.values]

    # One set lookup per token instead of scanning the stop-word list.
    blocked = set(stop).union(additional_stopwords)
    vector_noStop = [[item for item in row if item not in blocked] for row in vector]

    # get_wordnet_pos tags each word on its own, so a word's lemma depends only
    # on the word and can be computed once per distinct token.
    lemma_cache = {}

    def _lemma(word):
        if word not in lemma_cache:
            lemma_cache[word] = lemmatizer.lemmatize(word, get_wordnet_pos(word))
        return lemma_cache[word]

    lemmas = [[_lemma(w) for w in line] for line in vector_noStop]
    vector_lemmas = replacement(lemmas)
    vector_lemmas = [[item for item in line if len(item) > 3] for line in vector_lemmas]
    return vector_lemmas

In [34]:
# Re-clean the raw review text of both splits, then join the tokens back into
# space-separated strings, the input format TfidfVectorizer is given below.
lemmas_train5S = prep_corpus_tfidf(df_dominant_topic5S['Review'], additional_stopwords = words_common1)
lemmas_test5S = prep_corpus_tfidf(df_dominant_topic5S_T['Review'], additional_stopwords = words_common1)
data_train5S = [' '.join(x) for x in lemmas_train5S]
data_test5S = [' '.join(x) for x in lemmas_test5S]

In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [36]:
# TF-IDF matrices for both splits with min_df=3 and max_df=500; integer
# values are document counts per the TfidfVectorizer documentation.
X_train5S, X_test5S = create_Xtfidf(min =3, max = 500, train = data_train5S, test = data_test5S)

In [37]:
from sklearn.linear_model import SGDClassifier

In [38]:
def tfidf_score(X_train, X_test, y_train, y_test):
    """Fit a hinge-loss SGD classifier and score it with micro-averaged F1.

    Returns ``(f_train, f_test)``: the F1 score on the training data and on
    the test data.
    """
    svm = SGDClassifier(loss='hinge', penalty='l2', alpha=0.01, random_state=42)
    svm.fit(X_train, y_train)

    train_predictions = svm.predict(X_train)
    test_predictions = svm.predict(X_test)

    train_f1 = f1_score(y_train, train_predictions, average='micro')
    test_f1 = f1_score(y_test, test_predictions, average='micro')
    return train_f1, test_f1

In [39]:
from sklearn.metrics import classification_report
def tfidf_scoreReport_NB(X_train, X_test, y_train, y_test):
    """Fit Multinomial Naive Bayes and return the test-set classification report.

    The model is trained on ``(X_train, y_train)``; the returned value is the
    output of ``classification_report(y_test, predictions_on_X_test)``.
    """
    clf = MultinomialNB()
    clf.fit(X_train, y_train)
    # Only the test predictions feed the report; the training-set prediction
    # that was computed and never used has been dropped.
    y_test_pred = clf.predict(X_test)
    return classification_report(y_test, y_test_pred)

In [40]:
def tfidf_scoreReport_SV(X_train, X_test, y_train, y_test):
    """Fit a hinge-loss SGD classifier and return the test-set classification report.

    The model is trained on ``(X_train, y_train)``; the returned value is the
    output of ``classification_report(y_test, predictions_on_X_test)``.
    """
    clf = SGDClassifier(loss='hinge', penalty='l2', alpha=0.01, random_state=42)
    clf.fit(X_train, y_train)
    # Only the test predictions feed the report; the training-set prediction
    # that was computed and never used has been dropped.
    y_test_pred = clf.predict(X_test)
    return classification_report(y_test, y_test_pred)

In [41]:
def tfidf_scoreNB(X_train, X_test, y_train, y_test):
    """Fit Multinomial Naive Bayes and score it with micro-averaged F1.

    Returns ``(f_train, f_test)``: the F1 score on the training data and on
    the test data.
    """
    naive_bayes = MultinomialNB()
    naive_bayes.fit(X_train, y_train)

    train_predictions = naive_bayes.predict(X_train)
    test_predictions = naive_bayes.predict(X_test)

    train_f1 = f1_score(y_train, train_predictions, average='micro')
    test_f1 = f1_score(y_test, test_predictions, average='micro')
    return train_f1, test_f1

In [42]:
from sklearn.naive_bayes import MultinomialNB

In [45]:
import numpy as np
# Classification targets: the dominant LDA topic assigned to each review.
y_train5S = np.array(df_dominant_topic5S['Dominant_Topic'])
y_test5S = np.array(df_dominant_topic5S_T['Dominant_Topic'])


In [47]:
from sklearn.metrics import f1_score
# Micro-F1 (train, test) for the hinge-loss SGD classifier and for Naive Bayes,
# both predicting the dominant topic from the TF-IDF features.
f5_tf_S, f5T_tf_S = tfidf_score(X_train5S, X_test5S, y_train5S, y_test5S)
f5_tf_S_nb, f5T_tf_S_nb = tfidf_scoreNB(X_train5S, X_test5S, y_train5S, y_test5S)

In [48]:
# Display the SGD classifier's micro-F1 as (train, test).
f5_tf_S, f5T_tf_S

(0.7065006569480072, 0.6731440588853838)