In [1]:
import pandas as pd
pd.set_option("display.max_colwidth", 40)
import numpy as np
import re
import spacy
import nltk
from nltk import FreqDist
import gensim
from gensim import corpora

# libraries for visualization
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

## Load data

In [5]:
# The reviews are stored as a ';'-separated CSV whose first column is used as the row index.
data = pd.read_csv(
    "comments_lemmentized.csv",
    sep=';',
    index_col=0,
)

In [6]:
data.shape

(1303, 1)

AttributeError: 'DataFrame' object has no attribute 'language'

In [8]:
data.head()

Unnamed: 0,lemmantized_reviews
0,"['premium', 'cottage', 'renovate', '..."
1,"['nice', 'environment', 'peaceful', ..."
2,"['expectation', 'high', 'read', 'rev..."
3,"['quite', 'center', 'parcs', 'france..."
4,"['new', 'year', 'trip', 'family', 's..."


In [9]:
# read_csv loads each review as the *string* form of a list (e.g. "['premium', 'cottage']").
# Strip the brackets and quotes, then split on ", " to recover a real list of tokens.
# regex=True must be explicit: since pandas 2.0, str.replace treats the pattern as a
# literal string by default, which would leave every bracket and quote in place.
data['lemmantized_reviews'] = (
    data['lemmantized_reviews']
    .str.replace(r"[\[\]\']", "", regex=True)
    .str.split(", ")
)

In [7]:
data['lemmantized_reviews'][0][:10]  # first 10 tokens of the review labelled 0: confirms the column now holds lists, not strings

['premium',
 'cottage',
 'renovate',
 'out',
 'ring',
 'park',
 'mean',
 'dome',
 'shop',
 'minute']

In [10]:
# Drop rows whose review is missing (NaN). inplace=True modifies `data` itself.
data.dropna(subset=['lemmantized_reviews'], inplace=True)

In [11]:
# Row count after dropping missing reviews; compare with the shape shown right after loading.
data.shape

(1303, 1)

## Remove stop words

In [12]:
# Tokens to discard before topic modelling: generic words ("good", "great", ...),
# what look like tokenisation leftovers ("nt", "u2013", the empty string),
# and the tokens "center" / "parcs".
list_stop_words = [
    "good", "great", "one",
    "nt", "u2013", "",
    "would", "get",
    "center", "parcs",
]

In [13]:
def remove_stop_word(x, list_stop_words=list_stop_words):
    """Return the tokens of ``x`` that are not stop words, keeping their order.

    Args:
        x: iterable of tokens (one review).
        list_stop_words: collection of tokens to discard; defaults to the
            notebook-level ``list_stop_words``.

    Returns:
        A new list with every element of ``x`` that is not in ``list_stop_words``.
    """
    kept_tokens = []
    for token in x:
        if token in list_stop_words:
            continue
        kept_tokens.append(token)
    return kept_tokens

In [14]:
# Filter every review with the default stop-word list; the column is overwritten with the result.
data['lemmantized_reviews'] = data['lemmantized_reviews'].apply(remove_stop_word)

In [15]:
data.head()

Unnamed: 0,lemmantized_reviews
0,"[premium, cottage, renovate, ring, p..."
1,"[nice, environment, peaceful, facili..."
2,"[expectation, high, read, review, we..."
3,"[quite, france, normandy, old, franc..."
4,"[new, year, trip, family, stay, week..."


In [16]:
# Stop-word removal only filters tokens inside each review, so the row count should be unchanged.
data.shape

(1303, 1)

## Building an LDA model

In [17]:
# Build the gensim dictionary from all tokenised reviews (maps each unique token to an integer id).
dictionary = corpora.Dictionary(data['lemmantized_reviews'])

In [18]:
# Vocabulary size: number of distinct tokens in the dictionary.
len(dictionary)

10911

In [19]:
# Convert every review to its bag-of-words form via the dictionary (one doc2bow result per review).
doc_term_matrix = list(map(dictionary.doc2bow, data['lemmantized_reviews']))

In [20]:
import warnings
# Blanket filter: silences every warning from this point on, not only those raised during training.
warnings.filterwarnings('ignore')

# Alias for gensim's LdaModel class
LDA = gensim.models.ldamodel.LdaModel

# Build the LDA model on the bag-of-words corpus
num_topics = 5
lda_model = LDA(corpus=doc_term_matrix, id2word=dictionary, 
                num_topics=num_topics, 
                alpha=[0.0001] * num_topics,  # explicit prior: one value per topic
                eta=[0.0001] * len(dictionary),  # explicit prior: one value per dictionary token
                chunksize=2000,
                passes=4,
                random_state=100,  # fixed seed so repeated runs give the same topics
               )

In [21]:
# Show each topic as a weighted combination of its 8 top words.
lda_model.print_topics(num_words=8)

[(0,
  '0.013*"cottage" + 0.008*"time" + 0.007*"restaurant" + 0.007*"kid" + 0.006*"pool" + 0.006*"family" + 0.006*"day" + 0.006*"staff"'),
 (1,
  '0.008*"area" + 0.008*"park" + 0.007*"day" + 0.007*"lot" + 0.007*"activity" + 0.007*"pool" + 0.006*"site" + 0.006*"restaurant"'),
 (2,
  '0.014*"cottage" + 0.008*"pool" + 0.008*"area" + 0.007*"park" + 0.007*"staff" + 0.007*"time" + 0.006*"child" + 0.006*"restaurant"'),
 (3,
  '0.009*"time" + 0.008*"activity" + 0.008*"kid" + 0.008*"staff" + 0.007*"much" + 0.006*"park" + 0.006*"also" + 0.006*"english"'),
 (4,
  '0.013*"cottage" + 0.008*"time" + 0.007*"really" + 0.007*"kid" + 0.006*"activity" + 0.006*"pool" + 0.006*"park" + 0.006*"child"')]

## Topic visualization

In [22]:
# Visualize the topics
# Raise pandas' column display width (it was set to 40 in the first cell).
pd.options.display.max_colwidth = 2000
# Prepare the pyLDAvis data from the fitted model, the corpus and the dictionary; mds='tsne' selects t-SNE for the topic layout.
# NOTE(review): newer pyLDAvis releases expose this module as `pyLDAvis.gensim_models`; confirm against the installed version.
viz = pyLDAvis.gensim.prepare(lda_model, doc_term_matrix, dictionary, mds='tsne')

In [23]:
# Turn on notebook rendering, then display the prepared visualization as the cell output.
pyLDAvis.enable_notebook()
viz
# Alternative kept for reference (presumably for use outside a notebook; verify): pyLDAvis.show(viz)