## 0. Basic Packages

In [1]:
import pandas as pd
#Set copy warning to off
pd.set_option('mode.chained_assignment', None)
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

##### We import all the functions used in the previous notebooks

In [2]:
from nlp_preprocess import *

##### We add accents to stop_words and exclude the words we want

In [3]:
from nltk.corpus import stopwords

#we select the spanish stop words (by default all the words with accents have them)
stop_words = stopwords.words('spanish')

#now we have the words with accents and the same ones without accents
stop_words = add_non_accents(stop_words)

#list to exclude in stop_words (excluded based in "count_stop_words" fn)
stop_w_list = ["no", "mucho", "muchos", "tiene", "tienen", "otras", "sin", "nada", "algo"
               "tanto", "alguna", "estamos", "tengan", "tenemos", "nuestros", "fuera", "algunos", "hasta"]

#exclude previous list of words from stop_words
stop_words.difference_update(stop_w_list)

## 1. Data Wrangling

In [4]:
f1 = pd.read_excel("comments.xlsx", [0])[0]
f2 = pd.read_excel("comments.xlsx", [1])[1]

df = pd.concat([f1, f2]) 
df.rename(columns={"Initial Classification":"Initial_Classification"}, inplace=True)
print(df.shape)
df.head()

(584, 4)


Unnamed: 0,ID,Area,Comment,Initial_Classification
0,1,Dept 1,Cerrar la brecha entre los que deciden el trab...,Negativo
1,3,Dept 1,"Sobretodo, que se miren las cargas de trabajo ...",Negativo
2,4,Dept 1,1. Dar coaching a algunos Gerentes (que no sab...,Negativo
3,19,Dept 1,s,Negativo
4,29,Dept 1,Mas oportunidades de crecimiento y major salar...,Negativo


### Checking for nulls

In [5]:
#cecking for nulls
df.isna().sum()

ID                         0
Area                       0
Comment                   11
Initial_Classification     0
dtype: int64

In [6]:
#checking data type
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 584 entries, 0 to 291
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   ID                      584 non-null    int64 
 1   Area                    584 non-null    object
 2   Comment                 573 non-null    object
 3   Initial_Classification  584 non-null    object
dtypes: int64(1), object(3)
memory usage: 22.8+ KB


In [7]:
#cheking the null comments
df[df.Comment.isna()]

Unnamed: 0,ID,Area,Comment,Initial_Classification
12,120,Dept 1,,Negativo
48,561,Dept 4,,Negativo
109,1280,Dept 7,,Negativo
129,1475,Dept 9,,Negativo
158,1768,Dept 9,,Negativo
227,2481,Dept 9,,Negativo
240,2587,Dept 9,,Negativo
274,2850,Dept 10,,Negativo
277,2887,Dept 10,,Negativo
227,2481,Dept 9,,Positivo


### Updating nulls with NA

In [8]:
#we will replace with the term "NA"
df.Comment[df.Comment.isna()] = "NA"

### Lowering comments

In [9]:
df.Comment = df.Comment.str.lower()

## 2. Preprocess

### Detecting language

In [10]:
#we create a new column with the detected lang
#choose between langdetect or cld3. By default cld3, which works bettter (for es)
df["language"] = dect_lang(df.Comment, detector="langdetect")

In [11]:
#we check the non-snpanish ones
for c, l in zip(df.Comment[df.language!="es"], df.language[df.language!="es"]):
    print("Sentence:", c, "---->", "Lang:", l)

Sentence: s ----> Lang: sv
Sentence: prefiero no comentar ----> Lang: it
Sentence: na ----> Lang: tl
Sentence: more alignment in priorities to focus more and accomplish better results. we try to do too much at the same time, limiting the chances of success.  ----> Lang: en
Sentence: reduce legal and compliance bureaucracy. there is a pervasive view that lawyers and risk managers can manage the business from afar. client facing employees are handcuffed and more worrisome, at times prefer not to look for better alternatives for clients afraid that they might run afoul of one of the many new rules. also, innovation is stifled by all the roadblocks and permissions needed to launch a product or service.  ----> Lang: en
Sentence: no cambiaria nada. ----> Lang: pt
Sentence: ninguna. ----> Lang: tl
Sentence: los salarios justos  ----> Lang: lt
Sentence: nada ----> Lang: so
Sentence: mejorar la paga ----> Lang: id
Sentence: na ----> Lang: tl
Sentence: no cambiaria nada. ----> Lang: pt
Sentence:

Once we have checked the previous output we can see the english comments are well identified, and the rest are either, errors / sentences with no meaning, or spanish sentences identified as catalan, italian or other language

Therefore it is fair to consider, non-english comments as spanish comments in this case

In [12]:
#we replace non-english comments for "es" term
df.language[df.language!="en"] = "es"

In [13]:
#we re-check the list again
for c, l in zip(df.Comment[df.language!="es"], df.language[df.language!="es"]):
    print("Sentence:", c, "---->", "Lang:", l)

Sentence: more alignment in priorities to focus more and accomplish better results. we try to do too much at the same time, limiting the chances of success.  ----> Lang: en
Sentence: reduce legal and compliance bureaucracy. there is a pervasive view that lawyers and risk managers can manage the business from afar. client facing employees are handcuffed and more worrisome, at times prefer not to look for better alternatives for clients afraid that they might run afoul of one of the many new rules. also, innovation is stifled by all the roadblocks and permissions needed to launch a product or service.  ----> Lang: en
Sentence: tienen que mejorar e incentivar el work-life balance o work-life integration en vez de incentivar el burnout.  ----> Lang: en
Sentence: nothing specific ----> Lang: en
Sentence: some businesses within la empresa should be evaluated/treated differently than the rest of the company... because they are vey different in nature. employees / businesses should not be puni

### Creating Dataframe for each Language

In [14]:
df_es = df[df.language=="es"]
df_en = df[df.language=="en"]

### Positive vs Negative Comments, Preprocess & Clean

In [15]:
#separate comments by type
neg_comments_es = df_es.Comment.values[df_es.Initial_Classification=="Negativo"]
pos_comments_es = df_es.Comment.values[df_es.Initial_Classification=="Positivo"]

#clean comments
n_clean_comments_es = clean_comments(neg_comments_es)
p_clean_comments_es = clean_comments(pos_comments_es)


#1. set root to "no" if you don't want the root (lemma) of the word, but the original world (default="yes")
#2. set stop_w to "yes" if you want to include stop_words (default="no")
n_pre_comments_es = preprocess(n_clean_comments_es, root="no", stop_w="no", stop_words=stop_words)
p_pre_comments_es = preprocess(p_clean_comments_es, root="no", stop_w="no", stop_words=stop_words)

#flatten comments in a list
n_flatten_comments_es = [" ".join(comments) for comments in n_pre_comments_es]
p_flatten_comments_es = [" ".join(comments) for comments in p_pre_comments_es]

### Matrix to fit sklearn LDA

In [16]:
def make_matrix(vectorizer=str, ngrams_range=(int, int), arr=np.ndarray):
    if vectorizer == "CountVectorizer":
        vectorizer = CountVectorizer
    elif vectorizer == "TfidfVectorizer":
        vectorizer = TfidfVectorizer
    else:
        vectorizer = None
    
    #the fn raises an error if the vectorizer is not count or tf-idf
    if vectorizer != CountVectorizer and vectorizer != TfidfVectorizer:
        raise ValueError("Please select between CountVectorizer / TfidfVectorizer")
    
    #setting params
    vectorize = vectorizer(ngram_range=ngrams_range) 
    #getting matrix of counts / tf-idf
    matrix = vectorize.fit_transform(arr)
    
    return matrix, vectorize

In [17]:
tokens = input("Select between the group of comments\n'n_flatten_comments_es' (negative)\n'p_flatten_comments_es' (positive)\n")

while tokens != "n_flatten_comments_es" and tokens != "n_flatten_comments_es":
    tokens = input("Select between the one of them")
    
if tokens == "n_flatten_comments_es":
    tokens = n_flatten_comments_es
elif tokens == "p_flatten_comments_es":
    tokens = p_flatten_comments_es
    

vectors, vectorizer = make_matrix(vectorizer="CountVectorizer", ngrams_range=(3, 3), arr=tokens)

Select between the group of comments
'n_flatten_comments_es' (negative)
'p_flatten_comments_es' (positive)
n_flatten_comments_es


## 3. Topic Modeling

In [18]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV

#lda model
lda = LatentDirichletAllocation(learning_method="batch", max_iter=30, n_jobs=-1)

#params to test in grid search
search_params = {"n_components":list(range(1, 21)), "learning_decay":[.5, .7, .9]}

topic_model = GridSearchCV(lda, param_grid=search_params, n_jobs=-1)

topic_model.fit(vectors)

#best model
best_lda_model = topic_model.best_estimator_

#model parameters
print("Best Model's Params: ", topic_model.best_params_)

#log likelihood Score
print("Best Log Likelihood Score: ", topic_model.best_score_)

# Perplexity
print("Model Perplexity: ", best_lda_model.perplexity(vectors))

Best Model's Params:  {'learning_decay': 0.5, 'n_components': 1}
Best Log Likelihood Score:  -6271.5745606376495
Model Perplexity:  1155.7975069308372


In [25]:
import pyLDAvis
import pyLDAvis.sklearn

pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(best_lda_model, vectors, vectorizer, mds='tsne')

ValueError: Found array with 1 sample(s) (shape=(1, 1)) while a minimum of 2 is required.