In [2]:
import pandas as pd

In [6]:
# Read the raw corpus file; each line of the file is treated as one document.
with open('dataset/45k.txt') as f:
    all_data = f.readlines()

# Trim surrounding whitespace (including the trailing newline) from every line.
final_data = [raw_line.strip() for raw_line in all_data]
count = len(final_data)
lst = final_data

# Skip the first line (presumably a header row -- confirm against the data file)
# and load the remaining lines into a single-column DataFrame (column label 0).
df = pd.DataFrame(lst[1:])

In [8]:
# Copy the row labels into an explicit 'index' column (this mutates df in place).
df['index'] = df.index

In [18]:
# Alias, not a copy: `documents` and `df` refer to the same DataFrame object.
documents = df

# Data Pre-processing

We will perform the following steps:

- Tokenization: Split the text into sentences and the sentences into words. Lowercase the words and remove punctuation.
- Words that have 3 or fewer characters are removed (only tokens longer than 3 characters are kept).
- All stopwords are removed.
- Words are lemmatized — words in third person are changed to first person and verbs in past and future tenses are changed into present.
- Words are stemmed — words are reduced to their root form.

In [26]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
# Seed NumPy's global random number generator.
# NOTE(review): presumably intended to make the later model fits repeatable -- confirm.
np.random.seed(2018)
import nltk
# Shared English Snowball stemmer, used by lemmatize_stemming.
stemmer = SnowballStemmer("english")

In [27]:
def lemmatize_stemming(text):
    """Lemmatize ``text`` as a verb with WordNet, then stem the resulting lemma.

    Stemming is performed by the notebook-level ``stemmer`` object.
    """
    lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(lemma)
def preprocess(text):
    """Tokenize ``text`` and return its filtered, normalized tokens as a list.

    Tokens come from ``gensim.utils.simple_preprocess``. A token is kept only
    when it is longer than 3 characters and is not in gensim's ``STOPWORDS``;
    every kept token is passed through ``lemmatize_stemming``.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(word)
        for word in gensim.utils.simple_preprocess(text)
        if len(word) > 3 and word not in stopwords
    ]

In [37]:
# Run preprocess over every entry of column 0 (the text lines), giving one token list per row.
processed_docs = documents[0].map(preprocess)

# Bag of Words on the Data set

- Create a dictionary from ‘processed_docs’ containing the number of times a word appears in the training set.

In [33]:
# Build the gensim Dictionary (the vocabulary) from the tokenized documents.
dictionary = gensim.corpora.Dictionary(processed_docs)

# Filter out tokens that appear in

- fewer than 15 documents (an absolute number), or
- more than 50% of the documents (`no_above=0.5` is a fraction of the total corpus size, not an absolute number).
- After the above two steps, keep only the 100000 most frequent tokens.

In [34]:
# Prune the vocabulary (the result is not reassigned, so `dictionary` itself is modified):
# drop tokens found in fewer than 15 documents or in more than 50% of documents,
# then keep at most the 100000 most frequent remaining tokens.
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

# Gensim doc2bow

For each document we create a bag-of-words representation reporting which dictionary
words appear in it and how many times each of those words appears. Save this to ‘bow_corpus’; the entry for any selected document can then be inspected.

In [35]:
# Convert each tokenized document to its bag-of-words form using the filtered dictionary.
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

# TF-IDF

Create tf-idf model object using models.TfidfModel on ‘bow_corpus’ and save it to ‘tfidf’, then apply transformation to the entire corpus and call it ‘corpus_tfidf’. Finally we preview TF-IDF scores for our first document

In [54]:
from gensim import corpora, models
# Fit the TF-IDF model on the bag-of-words corpus.
tfidf = models.TfidfModel(bow_corpus)
# Apply the fitted model to the whole corpus.
corpus_tfidf = tfidf[bow_corpus]

# Running LDA using Bag of Words
- Train our lda model using gensim.models.LdaMulticore and save it to ‘lda_model’

In [62]:
def get_lda_topics(model, num_topics, topn=20):
    """Tabulate the top words of each topic of a trained LDA model.

    Parameters
    ----------
    model : object
        Anything exposing ``show_topic(topic_id, topn=...)`` that returns a
        sequence of ``(word, weight)`` pairs, e.g. a gensim LDA model.
    num_topics : int
        Number of topics to tabulate; topics ``0 .. num_topics - 1`` are queried.
    topn : int, default 20
        Number of words requested per topic.

    Returns
    -------
    pandas.DataFrame
        One column per topic, labelled ``'Topic # 01'``, ``'Topic # 02'``, ...,
        holding that topic's words in the order returned by ``show_topic``.
    """
    word_dict = {}
    for topic_id in range(num_topics):
        top_terms = model.show_topic(topic_id, topn=topn)
        label = 'Topic # ' + '{:02d}'.format(topic_id + 1)
        # Keep only the word from each (word, weight) pair.
        word_dict[label] = [term[0] for term in top_terms]
    return pd.DataFrame(word_dict)

In [59]:
# Train a 2-topic LDA model on the bag-of-words corpus (passes=2, workers=2).
# NOTE(review): no random_state is passed here, so the topics may differ between runs -- confirm whether that is acceptable.
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=2, id2word=dictionary, passes=2, workers=2)

For each topic, we will explore the words occurring in that topic, ranked by their relative weight.

In [63]:
# Display the top words of both topics of the bag-of-words LDA model.
get_lda_topics(lda_model,2)

Unnamed: 0,Topic # 01,Topic # 02
0,bodyposit,bodi
1,metoo,love
2,loveyourself,feel
3,plussiz,like
4,love,want
5,plussizefashion,know
6,selflov,peopl
7,share,look
8,beauti,think
9,metoomov,time


# Running LDA using TF-IDF

In [65]:
# Train a second 2-topic LDA model, this time on the TF-IDF-weighted corpus (passes=2).
# NOTE(review): workers=4 here versus workers=2 for the bag-of-words model -- confirm the difference is intentional.
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=2, id2word=dictionary, passes=2, workers=4)

In [67]:
# Display the top words of both topics of the TF-IDF LDA model.
get_lda_topics(lda_model_tfidf,2)

Unnamed: 0,Topic # 01,Topic # 02
0,bodyposit,love
1,plussiz,bodi
2,loveyourself,like
3,metoo,feel
4,bodi,need
5,plussizefashion,know
6,selflov,want
7,love,peopl
8,honormycurv,think
9,beauti,thing


# NMF

In [69]:
from sklearn.decomposition import NMF

In [76]:

import numpy as np
 
from sklearn.datasets import fetch_20newsgroups
 
from sklearn.feature_extraction.text import TfidfVectorizer
 
from sklearn.decomposition import NMF


In [79]:
# Raw text lines (everything after the first line of the file); unlike the LDA
# pipeline, the vectorizer does its own tokenization of these strings.
data = lst[1:]
vectorizer = TfidfVectorizer(max_features=20000, min_df=10, stop_words='english')
X = vectorizer.fit_transform(data)

# Vocabulary as an array indexed by feature id.
# scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
# get_feature_names(), so prefer the new accessor and fall back on old releases.
if hasattr(vectorizer, "get_feature_names_out"):
    idx_to_word = np.asarray(vectorizer.get_feature_names_out())
else:
    idx_to_word = np.array(vectorizer.get_feature_names())

# Fit a 2-component NMF with the multiplicative-update solver.
nmf = NMF(n_components=2, solver="mu")
W = nmf.fit_transform(X)  # transformed data returned by the fit
H = nmf.components_       # fitted components; get_nmf_topics ranks words by these

In [87]:
def get_nmf_topics(model, n_top_words, feature_names=None):
    """Tabulate the highest-weighted words of every topic of a fitted NMF model.

    Parameters
    ----------
    model : object
        Fitted model exposing ``components_``, a 2-D array with one row of
        word weights per topic.
    n_top_words : int
        Number of words to list per topic.
    feature_names : sequence of str, optional
        Maps a column index of ``components_`` to its word. When omitted, the
        names are taken from the notebook-level ``vectorizer``.

    Returns
    -------
    pandas.DataFrame
        One column per topic, labelled ``'Topic # 01'``, ``'Topic # 02'``, ...,
        each holding that topic's words in order of decreasing weight.
    """
    if feature_names is None:
        # scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
        # get_feature_names(); prefer the new accessor, fall back on old releases.
        if hasattr(vectorizer, "get_feature_names_out"):
            feature_names = vectorizer.get_feature_names_out()
        else:
            feature_names = vectorizer.get_feature_names()

    word_dict = {}
    # Iterate over every component instead of assuming exactly two topics.
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the first n_top_words ids.
        top_ids = weights.argsort()[::-1][:n_top_words]
        label = 'Topic # ' + '{:02d}'.format(topic_idx + 1)
        word_dict[label] = [feature_names[word_id] for word_id in top_ids]

    # Rows are already limited to n_top_words; topic columns are never truncated.
    return pd.DataFrame(word_dict)

In [90]:
# Display the top words per NMF topic (20 requested).
get_nmf_topics(nmf,20)

Unnamed: 0,Topic # 01,Topic # 02
0,body,story
1,love,believewomen
2,just,believesurvivors
3,like,metoomeredith
4,feel,metoo
5,people,enoughisenough
6,know,whyididntreport
7,make,anonymously
8,time,iamasurvivor
9,look,itsnotyourfault
