## **Loading Data**

In [1]:
# mounting my google drive 

from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
!cp 'drive/My Drive/Datasets/archive.zip' '/content/'

In [None]:
# unzipping the dataset

!unzip archive.zip

Archive:  archive.zip
  inflating: blogtext.csv            


In [None]:
import pandas as pd
train = pd.read_csv("blogtext.csv")

In [None]:
train.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...


## **Data Preprocessing**

### **removing unnecessary columns**

In [None]:
train = train[['topic','text']]

In [None]:
train.head()

Unnamed: 0,topic,text
0,Student,"Info has been found (+/- 100 pages,..."
1,Student,These are the team members: Drewe...
2,Student,In het kader van kernfusie op aarde...
3,Student,testing!!! testing!!!
4,InvestmentBanking,Thanks to Yahoo!'s Toolbar I can ...


### **steps**:
* Tokenization: Split the text into sentences and the sentences into words. Lowercase the words and remove punctuation.
* Words that have fewer than 3 characters are removed.
* All stopwords are removed.
* Words are lemmatized — words in third person are changed to first person and verbs in past and future tenses are changed into present.
* Words are stemmed — words are reduced to their root form.

In [2]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(13)
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [3]:
stemmer = SnowballStemmer('english')

In [4]:
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [None]:
doc_sample = train['text'][0]
print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['', '', '', '', '', '', '', '', '', '', '', 'Info', 'has', 'been', 'found', '(+/-', '100', 'pages,', 'and', '4.5', 'MB', 'of', '.pdf', 'files)', 'Now', 'i', 'have', 'to', 'wait', 'untill', 'our', 'team', 'leader', 'has', 'processed', 'it', 'and', 'learns', 'html.', '', '', '', '', '', '', '', '', '']


 tokenized and lemmatized document: 
['info', 'page', 'file', 'wait', 'until', 'team', 'leader', 'process', 'learn', 'html']


In [None]:
from tqdm import tqdm
tqdm.pandas()

In [None]:
train['clean_doc'] = train['text'].progress_map(preprocess)

In [None]:
train.head(10)

Unnamed: 0,topic,text,clean_doc
0,Student,"Info has been found (+/- 100 pages,...","[info, page, file, wait, until, team, leader, ..."
1,Student,These are the team members: Drewe...,"[team, member, drew, laag, urllink, mail, ruiy..."
2,Student,In het kader van kernfusie op aarde...,"[kader, kernfusi, aard, maak, eigen, waterstof..."
3,Student,testing!!! testing!!!,"[test, test]"
4,InvestmentBanking,Thanks to Yahoo!'s Toolbar I can ...,"[thank, yahoo, toolbar, captur, url, popup, me..."
5,InvestmentBanking,I had an interesting conversation...,"[interest, convers, morn, talk, korean, money,..."
6,InvestmentBanking,Somehow Coca-Cola has a way of su...,"[coca, cola, sum, thing, earli, flagship, jing..."
7,InvestmentBanking,"If anything, Korea is a country o...","[korea, countri, extrem, base, think, come, ko..."
8,InvestmentBanking,Take a read of this news article ...,"[read, news, articl, urllink, joongang, ilbo, ..."
9,InvestmentBanking,I surf the English news sites a l...,"[surf, english, news, sit, look, tidbit, korea..."


In [None]:
train = train.drop(['text'], axis = 1)

In [None]:
train.head(10)

Unnamed: 0,topic,clean_doc
0,Student,"[info, page, file, wait, until, team, leader, ..."
1,Student,"[team, member, drew, laag, urllink, mail, ruiy..."
2,Student,"[kader, kernfusi, aard, maak, eigen, waterstof..."
3,Student,"[test, test]"
4,InvestmentBanking,"[thank, yahoo, toolbar, captur, url, popup, me..."
5,InvestmentBanking,"[interest, convers, morn, talk, korean, money,..."
6,InvestmentBanking,"[coca, cola, sum, thing, earli, flagship, jing..."
7,InvestmentBanking,"[korea, countri, extrem, base, think, come, ko..."
8,InvestmentBanking,"[read, news, articl, urllink, joongang, ilbo, ..."
9,InvestmentBanking,"[surf, english, news, sit, look, tidbit, korea..."


In [None]:
train.to_csv('/content/drive/My Drive/preprocessed_blog.csv')

### **Exploratory Analysis**

In [2]:
# slow method

# train = pd.read_csv("/content/drive/My Drive/preprocessed_blog.csv")
# print(train.head(10))

# from ast import literal_eval
# train['clean_doc'] = train['clean_doc'].map(literal_eval)

In [None]:
train.to_pickle('/content/drive/My Drive/preprocessed_blog.pkl')

In [5]:
import pandas as pd

train = pd.read_pickle('/content/drive/My Drive/preprocessed_blog.pkl')

In [6]:
print(train['clean_doc'][0])

['info', 'page', 'file', 'wait', 'until', 'team', 'leader', 'process', 'learn', 'html']


In [7]:
train.count()

Unnamed: 0    681284
topic         681284
clean_doc     681284
dtype: int64

In [8]:
train['topic'].value_counts()

indUnk                     251015
Student                    153903
Technology                  42055
Arts                        32449
Education                   29633
Communications-Media        20140
Internet                    16006
Non-Profit                  14700
Engineering                 11653
Law                          9040
Publishing                   7753
Science                      7269
Government                   6907
Consulting                   5862
Religion                     5235
Fashion                      4851
Marketing                    4769
Advertising                  4676
BusinessServices             4500
Banking                      4049
Chemicals                    3928
Telecommunications           3891
Accounting                   3832
Military                     3128
Museums-Libraries            3096
Sports-Recreation            3038
HumanResources               3010
RealEstate                   2870
Transportation               2326
Manufacturing 

In [9]:
len(train['topic'].value_counts())

40

## **Bag of Words on the clean docs**

### **dictionary**

In [28]:
dictionary = gensim.corpora.Dictionary(train['clean_doc'])

In [10]:
for i in range(20):
    print(i, dictionary[i])

0 file
1 html
2 info
3 leader
4 learn
5 page
6 process
7 team
8 until
9 wait
10 bryan
11 drew
12 mail
13 member
14 urllink
15 abl
16 absolut
17 accident
18 accomplish
19 accord


### **Filter out tokens that appear in**
* less than 15 documents (absolute number) or
* more than 0.5 documents (fraction of total corpus size, not absolute number).
* after the above two steps, keep only the first 100000 most frequent tokens.

In [11]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [12]:
for i in range(20):
    print(i, dictionary[i])

0 file
1 html
2 info
3 leader
4 learn
5 page
6 process
7 team
8 until
9 wait
10 bryan
11 drew
12 mail
13 member
14 urllink
15 abl
16 absolut
17 accident
18 accomplish
19 accord


In [13]:
with open('/content/drive/My Drive/dictionary.pickle', 'wb') as handle:
    pickle.dump(dictionary, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [10]:
import pickle

with open('/content/drive/My Drive/dictionary.pickle', 'rb') as handle:
    dictionary = pickle.load(handle)

### **Gensim doc2bow**

In [11]:
bow_corpus = [dictionary.doc2bow(doc) for doc in train['clean_doc']]

In [40]:
with open('/content/drive/My Drive/bow_corpus.pickle', 'wb') as handle:
    pickle.dump(bow_corpus, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [15]:
with open('/content/drive/My Drive/bow_corpus.pickle', 'rb') as handle:
    bow_corpus = pickle.load(handle)

In [12]:
from pprint import pprint

In [13]:
print('Preprocessed Text')
print(train['clean_doc'][500])
for i in range(len(bow_corpus[500])):
    print("Word {} (\"{}\") appears {} time.".format(bow_corpus[500][i][0], 
                                                    dictionary[bow_corpus[500][i][0]], 
                                                    bow_corpus[500][i][1]))

Preprocessed Text
['monday', 'start', 'possibl', 'process', 'buy', 'place', 'realtor', 'paper', 'sign', 'go', 'commit', 'scari', 'thing', 'idea', 'home', 'ownership', 'sign', 'away', 'life', 'year', 'good', 'idea', 'financi', 'mental', 'good', 'singl', 'girlfriend', 'buy', 'place', 'friend', 'richmond', 'buy', 'hous', 'think', 'ball', 'home', 'ownership', 'deal', 'hous', 'deal', 'person', 'mow', 'prune', 'roof', 'replac', 'window', 'wash', 'bare', 'handl', 'keep', 'room', 'apart', 'order', 'shape', 'imagin', 'letter', 'neighbor', 'concern', 'jungl', 'certain', 'grow', 'outsid', 'home', 'condo', 'condo', 'sound', 'good', 'like', 'apart', 'paint', 'wall', 'deduct', 'condo', 'search', 'hard', 'begin', 'stag', 'see', 'person', 'bunch', 'onlin', 'catch', 'think', 'wait', 'perfect', 'fall', 'picki', 'open', 'mind', 'tell', 'realtor', 'interest', 'citi', 'properti', 'construct', 'want', 'live', 'wear', 'wise', 'place', 'nice', 'tri', 'sell', 'place', 'realtor', 'take', 'note', 'check', 'ring'

### **TF-IDF**

In [None]:
from gensim import corpora, models

tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

In [None]:
corpus_tfidf[0]

[(0, 0.3298123436868355),
 (1, 0.3558990332081455),
 (2, 0.3585221676972244),
 (3, 0.3338008779074768),
 (4, 0.21615377189705884),
 (5, 0.25751837545870365),
 (6, 0.29673467081264626),
 (7, 0.27147372926271435),
 (8, 0.46649600154635723),
 (9, 0.17942237454157067)]

## **Running LDA using Bag of Words**

In [14]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=22, id2word=dictionary, passes=1, workers=3)

In [15]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.032*"go" + 0.025*"like" + 0.016*"good" + 0.014*"today" + 0.012*"yeah" + 0.010*"come" + 0.009*"think" + 0.009*"watch" + 0.009*"gonna" + 0.009*"play"
Topic: 1 
Words: 0.026*"say" + 0.017*"tell" + 0.017*"year" + 0.015*"famili" + 0.014*"go" + 0.013*"know" + 0.013*"time" + 0.013*"like" + 0.012*"think" + 0.011*"want"
Topic: 2 
Words: 0.015*"like" + 0.012*"look" + 0.009*"wear" + 0.009*"hair" + 0.006*"black" + 0.006*"room" + 0.006*"hous" + 0.006*"white" + 0.006*"water" + 0.005*"walk"
Topic: 3 
Words: 0.786*"nbsp" + 0.004*"think" + 0.004*"like" + 0.003*"time" + 0.003*"know" + 0.003*"go" + 0.002*"want" + 0.002*"good" + 0.002*"come" + 0.002*"look"
Topic: 4 
Words: 0.032*"peopl" + 0.023*"think" + 0.016*"like" + 0.016*"thing" + 0.013*"work" + 0.011*"go" + 0.009*"fuck" + 0.009*"money" + 0.007*"know" + 0.007*"problem"
Topic: 5 
Words: 0.224*"urllink" + 0.029*"movi" + 0.013*"film" + 0.009*"pictur" + 0.006*"watch" + 0.006*"charact" + 0.006*"bring" + 0.005*"http" + 0.005*"photo" + 0.0

In [16]:
# Compute Coherence Score using c_v
from gensim.models.coherencemodel import CoherenceModel

coherence_model_lda = CoherenceModel(model=lda_model, texts=train['clean_doc'], dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.47214424226038093


In [17]:
with open('/content/drive/My Drive/lda_model.pickle', 'wb') as handle:
    pickle.dump(lda_model, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
with open('/content/drive/My Drive/lda_model.pickle', 'rb') as handle:
    lda_model = pickle.load(handle)

In [19]:
unseen_document = 'How a Pentagon deal became an identity crisis for Google'

# preprocessing and to dictionary
bow_vector = dictionary.doc2bow(preprocess(unseen_document))

# topic scores
for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.6309009194374084	 Topic: 0.009*"bush" + 0.009*"say" + 0.008*"state" + 0.007*"american" + 0.007*"peopl"
Score: 0.21758389472961426	 Topic: 0.034*"blog" + 0.027*"post" + 0.021*"read" + 0.014*"book" + 0.013*"write"
