In [1]:
# Libraries
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle 

# Sklearn
import pyLDAvis
import pyLDAvis.sklearn
from pyLDAvis import sklearn as sklearn_lda
pyLDAvis.enable_notebook()
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Gensim 
from gensim.utils import simple_preprocess
import spacy

# nltk
import nltk
from nltk.corpus import stopwords
from tqdm.notebook import tqdm
import nltk

sns.set_style('whitegrid')
%matplotlib inline

ModuleNotFoundError: No module named 'seaborn'

In [None]:
df = pd.read_csv('Data/airline_final.csv')

In [None]:
df.head()

In [None]:
df.columns

In [None]:
df = df['content']

In [None]:
# Print out first 5 rows
df.head()

In [None]:
# Remove punctuation (disabled: the substitution below is commented out, so this cell currently does nothing)
#df = df.map(lambda x: re.sub('[\-,\.!?]', '', x))

In [None]:
# Stopwords
# Read the English list through `nltk.corpus.stopwords` instead of the imported
# `stopwords` name: this cell rebinds `stopwords` to a plain list, so calling
# `stopwords.words(...)` a second time would raise AttributeError on re-run.
stopwords = nltk.corpus.stopwords.words('english')
# Extra terms to ignore on top of the NLTK list.
stopwords.extend(['flight','from','or','the','go','in','get','make','never','want','could','even','good','burg','jo','kona','ey'])

In [None]:
# Remove non-english words
words = set(nltk.corpus.words.words())


def _keep_english_tokens(text):
    """Re-join the tokens of `text`, dropping alphabetic tokens that are not in `words`.

    Tokens that are not purely alphabetic (digits, punctuation, ...) are kept as they are.
    """
    return " ".join(
        tok
        for tok in nltk.wordpunct_tokenize(text)
        if tok.lower() in words or not tok.isalpha()
    )


# Build a fresh Series instead of assigning `df[i] = ...` inside the loop:
# `i` from enumerate() is a position while `df[i]` is a label lookup, so the two
# only agree for a default 0..n-1 index. Iterating `df` directly also lets tqdm
# know the total and show real progress.
df = pd.Series(
    [_keep_english_tokens(sen) for sen in tqdm(df, total=len(df))],
    index=df.index,
    name=df.name,
)

In [None]:
df[100]

In [None]:
len(df)

In [None]:
sent = df[100]
sent = " ".join(w for w in nltk.wordpunct_tokenize(sent) if w.lower() in words or not w.isalpha())
sent

In [None]:
# Probe the dictionary filter with a hand-written list of tokens
sent = "burg jo kona cape cork town curacao ey polo tripoli"
sent = " ".join(
    tok
    for tok in nltk.wordpunct_tokenize(sent)
    if not tok.isalpha() or tok.lower() in words
)
sent

### Convert to document-term matrix

In [None]:
# Helper function for plotting
def plot_10_most_common_words(count_data, count_vectorizer):
    """Draw a bar chart of the ten vocabulary terms with the largest column totals.

    Parameters
    ----------
    count_data : sparse matrix
        Matrix whose columns are summed per term; assumed to be the output of
        ``count_vectorizer.fit_transform`` so that columns line up with the
        vectorizer's feature names -- TODO confirm at call sites.
    count_vectorizer : fitted vectorizer
        Object exposing ``get_feature_names_out()`` (or, on older scikit-learn,
        ``get_feature_names()``).

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer the newer
    # accessor and fall back only when it does not exist.
    feature_names = (
        getattr(count_vectorizer, "get_feature_names_out", None)
        or count_vectorizer.get_feature_names
    )
    vocabulary = feature_names()

    # One vectorised column sum instead of densifying the matrix row by row.
    total_counts = np.asarray(count_data.sum(axis=0)).ravel()

    # sorted() is stable, so tied terms keep their vocabulary order.
    top_terms = sorted(zip(vocabulary, total_counts), key=lambda x: x[1], reverse=True)[0:10]
    words = [term for term, _ in top_terms]
    counts = [total for _, total in top_terms]
    x_pos = np.arange(len(words))

    plt.figure(2, figsize=(15, 15/1.6180))
    plt.subplot(title='10 most common words')
    sns.set_context("notebook", font_scale=1.5, rc={"lines.linewidth": 2.5})
    # x and y must be keywords: seaborn 0.12+ no longer accepts them positionally.
    sns.barplot(x=x_pos, y=counts, palette='husl')
    plt.xticks(x_pos, words, rotation=90)
    plt.xlabel('words')
    plt.ylabel('counts')
    plt.show()

In [None]:
# Initialise the TF-IDF vectorizer with the extended English stop-word list
# NOTE(review): LDA is usually fitted on raw term counts, and CountVectorizer is
# imported in the first cell -- confirm that TF-IDF weighting is intended here.
tvec = TfidfVectorizer(stop_words=stopwords)

In [None]:
# Fit the vectorizer on the text in `df` and transform it into a document-term matrix.
# Despite its name, `count_data` holds the TfidfVectorizer output, not raw counts.
count_data = tvec.fit_transform(df)

In [None]:
# Visualise the 10 terms with the largest column totals
plot_10_most_common_words(count_data, tvec)

### Fit Latent Dirichlet Allocation Models

In [None]:
import warnings
# Suppress DeprecationWarning messages from this point on.
warnings.simplefilter("ignore", DeprecationWarning)

# Load the LDA model from sk-learn
# (the same class is already imported as LatentDirichletAllocation in the first
# cell; this line only adds the shorter alias `LDA` used below)
from sklearn.decomposition import LatentDirichletAllocation as LDA

In [None]:
# Helper function
def print_topics(model, tvec, n_top_words):
    """Print the highest-weighted vocabulary terms of every row in ``model.components_``.

    Args:
        model: Object exposing ``components_``, iterated row by row; each row
            is treated as one topic's per-term weights.
        tvec: Fitted vectorizer exposing ``get_feature_names_out()`` (or, on
            older scikit-learn, ``get_feature_names()``); its feature order is
            assumed to match the columns of ``components_`` -- TODO confirm.
        n_top_words: Number of terms to print per topic.

    Returns:
        None. Output goes to stdout: a ``Topic #k:`` header, then the terms
        separated by spaces, highest weight first.
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer the newer
    # accessor and fall back only when it does not exist.
    feature_names = getattr(tvec, "get_feature_names_out", None) or tvec.get_feature_names
    words = feature_names()
    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic #%d:" % topic_idx)
        # argsort() is ascending, so slice from the end backwards to get the
        # n_top_words largest weights in descending order.
        top_term_ids = topic.argsort()[:-n_top_words - 1:-1]
        print(" ".join(words[i] for i in top_term_ids))

In [None]:
# Tweak the two parameters below
number_topics = 5
number_words = 10

In [None]:
# Create and fit the LDA model
lda = LDA(n_components=number_topics, n_jobs=-1)
lda.fit(count_data)

In [None]:
# Print the topics found by the LDA model
print("Topics found via LDA:")
print_topics(lda, tvec, number_words)

### Visualizing the models with pyLDAvis

In [None]:
# Build the pyLDAvis view from the fitted model, the document-term matrix and the
# vectorizer; as the last expression of the cell, the result is displayed inline.
# NOTE(review): newer pyLDAvis releases reportedly expose this module as
# `pyLDAvis.lda_model` -- confirm against the installed version.
pyLDAvis.sklearn.prepare(lda, count_data, tvec)