## **Loading Data**

In [None]:
# Mount Google Drive at /content/drive/ so files stored there can be read and written.

from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
!cp 'drive/My Drive/Datasets/archive.zip' '/content/'

In [None]:
# unzipping the dataset

!unzip archive.zip

Archive:  archive.zip
  inflating: blogtext.csv            


In [None]:
import pandas as pd
# Load the unzipped blog corpus; the path is relative to the current working directory.
train = pd.read_csv("blogtext.csv")

In [None]:
train.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...


## **Data Preprocessing**

### **removing unnecessary columns**

In [None]:
# Keep only the label and the raw text. The explicit .copy() makes `train` an
# independent frame, so assigning a new column to it later does not trigger
# pandas' SettingWithCopyWarning.
train = train[['topic','text']].copy()

In [None]:
train.head()

Unnamed: 0,topic,text
0,Student,"Info has been found (+/- 100 pages,..."
1,Student,These are the team members: Drewe...
2,Student,In het kader van kernfusie op aarde...
3,Student,testing!!! testing!!!
4,InvestmentBanking,Thanks to Yahoo!'s Toolbar I can ...


### **steps**:
* Tokenization: Split the text into sentences and the sentences into words. Lowercase the words and remove punctuation.
* Words that have 3 or fewer characters are removed (only tokens longer than 3 characters are kept).
* All stopwords are removed.
* Words are lemmatized — words in third person are changed to first person and verbs in past and future tenses are changed into present.
* Words are stemmed — words are reduced to their root form.

In [None]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(13)
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
# English Snowball stemmer, used by lemmatize_stemming.
stemmer = SnowballStemmer('english')

In [None]:
# Built once and reused instead of constructing a new lemmatizer for every token.
_LEMMATIZER = WordNetLemmatizer()


def lemmatize_stemming(text):
    """Reduce a single token to its stemmed verb lemma.

    The token is first lemmatized as a verb (``pos='v'``) with WordNet and the
    result is then passed through the module-level ``stemmer``.

    Args:
        text: A single word token.

    Returns:
        The stemmed lemma as returned by ``stemmer.stem``.
    """
    return stemmer.stem(_LEMMATIZER.lemmatize(text, pos='v'))
def preprocess(text):
    """Tokenize ``text`` and return its normalized tokens.

    Tokens produced by gensim's ``simple_preprocess`` are dropped when they are
    gensim stopwords or have 3 or fewer characters; every remaining token is
    passed through ``lemmatize_stemming``.

    Args:
        text: The raw document string.

    Returns:
        A list of normalized tokens in document order.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(token)
        for token in gensim.utils.simple_preprocess(text)
        if token not in stopwords and len(token) > 3
    ]

In [None]:
# Compare the raw space-split form of the first post with its preprocessed form.
doc_sample = train['text'][0]
print('original document: ')
words = doc_sample.split(' ')
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['', '', '', '', '', '', '', '', '', '', '', 'Info', 'has', 'been', 'found', '(+/-', '100', 'pages,', 'and', '4.5', 'MB', 'of', '.pdf', 'files)', 'Now', 'i', 'have', 'to', 'wait', 'untill', 'our', 'team', 'leader', 'has', 'processed', 'it', 'and', 'learns', 'html.', '', '', '', '', '', '', '', '', '']


 tokenized and lemmatized document: 
['info', 'page', 'file', 'wait', 'until', 'team', 'leader', 'process', 'learn', 'html']


In [None]:
from tqdm import tqdm
tqdm.pandas()

In [None]:
# Run preprocess on every post and store the token lists in a new column.
# progress_map is the Series.map variant made available by tqdm.pandas().
train['clean_doc'] = train['text'].progress_map(preprocess)

In [None]:
train.head(10)

Unnamed: 0,topic,text,clean_doc
0,Student,"Info has been found (+/- 100 pages,...","[info, page, file, wait, until, team, leader, ..."
1,Student,These are the team members: Drewe...,"[team, member, drew, laag, urllink, mail, ruiy..."
2,Student,In het kader van kernfusie op aarde...,"[kader, kernfusi, aard, maak, eigen, waterstof..."
3,Student,testing!!! testing!!!,"[test, test]"
4,InvestmentBanking,Thanks to Yahoo!'s Toolbar I can ...,"[thank, yahoo, toolbar, captur, url, popup, me..."
5,InvestmentBanking,I had an interesting conversation...,"[interest, convers, morn, talk, korean, money,..."
6,InvestmentBanking,Somehow Coca-Cola has a way of su...,"[coca, cola, sum, thing, earli, flagship, jing..."
7,InvestmentBanking,"If anything, Korea is a country o...","[korea, countri, extrem, base, think, come, ko..."
8,InvestmentBanking,Take a read of this news article ...,"[read, news, articl, urllink, joongang, ilbo, ..."
9,InvestmentBanking,I surf the English news sites a l...,"[surf, english, news, sit, look, tidbit, korea..."


In [None]:
# The raw text column is dropped now that clean_doc holds the processed tokens.
train = train.drop(columns=['text'])

In [None]:
train.head(10)

Unnamed: 0,topic,clean_doc
0,Student,"[info, page, file, wait, until, team, leader, ..."
1,Student,"[team, member, drew, laag, urllink, mail, ruiy..."
2,Student,"[kader, kernfusi, aard, maak, eigen, waterstof..."
3,Student,"[test, test]"
4,InvestmentBanking,"[thank, yahoo, toolbar, captur, url, popup, me..."
5,InvestmentBanking,"[interest, convers, morn, talk, korean, money,..."
6,InvestmentBanking,"[coca, cola, sum, thing, earli, flagship, jing..."
7,InvestmentBanking,"[korea, countri, extrem, base, think, come, ko..."
8,InvestmentBanking,"[read, news, articl, urllink, joongang, ilbo, ..."
9,InvestmentBanking,"[surf, english, news, sit, look, tidbit, korea..."


In [None]:
# Checkpoint the preprocessed frame to Drive. index=False keeps the row index out
# of the file, so reloading it does not add a spurious "Unnamed: 0" column.
train.to_csv('/content/drive/My Drive/preprocessed_blog.csv', index=False)

### **Exploratory Analysis**

In [None]:
# NOTE(review): pandas is imported a second time here, presumably so this section
# can be started on its own — confirm.
import pandas as pd
# Reload the CSV checkpoint; the clean_doc lists come back as their string form.
train = pd.read_csv("/content/drive/My Drive/preprocessed_blog.csv")

In [None]:
train.head(10)

Unnamed: 0.1,Unnamed: 0,topic,clean_doc
0,0,Student,"['info', 'page', 'file', 'wait', 'until', 'tea..."
1,1,Student,"['team', 'member', 'drew', 'laag', 'urllink', ..."
2,2,Student,"['kader', 'kernfusi', 'aard', 'maak', 'eigen',..."
3,3,Student,"['test', 'test']"
4,4,InvestmentBanking,"['thank', 'yahoo', 'toolbar', 'captur', 'url',..."
5,5,InvestmentBanking,"['interest', 'convers', 'morn', 'talk', 'korea..."
6,6,InvestmentBanking,"['coca', 'cola', 'sum', 'thing', 'earli', 'fla..."
7,7,InvestmentBanking,"['korea', 'countri', 'extrem', 'base', 'think'..."
8,8,InvestmentBanking,"['read', 'news', 'articl', 'urllink', 'joongan..."
9,9,InvestmentBanking,"['surf', 'english', 'news', 'sit', 'look', 'ti..."


In [None]:
from ast import literal_eval
# clean_doc was read back from CSV as the text of a list literal;
# literal_eval turns each cell back into a Python list of tokens.
train['clean_doc'] = train['clean_doc'].map(literal_eval)

In [None]:
# Save a pickle checkpoint of the frame with clean_doc already parsed into lists.
train.to_pickle('/content/drive/My Drive/preprocessed_blog.pkl')

In [None]:
# Reload the pickle checkpoint written by this notebook. Only unpickle files you
# created yourself: loading a pickle can execute arbitrary code.
train = pd.read_pickle('/content/drive/My Drive/preprocessed_blog.pkl')

In [None]:
print(train['clean_doc'][0])

['info', 'page', 'file', 'wait', 'until', 'team', 'leader', 'process', 'learn', 'html']


In [None]:
train.count()

Unnamed: 0    681284
topic         681284
clean_doc     681284
dtype: int64

In [None]:
train['topic'].value_counts()

indUnk                     251015
Student                    153903
Technology                  42055
Arts                        32449
Education                   29633
Communications-Media        20140
Internet                    16006
Non-Profit                  14700
Engineering                 11653
Law                          9040
Publishing                   7753
Science                      7269
Government                   6907
Consulting                   5862
Religion                     5235
Fashion                      4851
Marketing                    4769
Advertising                  4676
BusinessServices             4500
Banking                      4049
Chemicals                    3928
Telecommunications           3891
Accounting                   3832
Military                     3128
Museums-Libraries            3096
Sports-Recreation            3038
HumanResources               3010
RealEstate                   2870
Transportation               2326
Manufacturing 

## **Bag of Words on the clean docs**

### **dictionary**

In [None]:
# Build the token <-> integer id mapping from all preprocessed documents.
dictionary = gensim.corpora.Dictionary(train['clean_doc'])

In [None]:
# Peek at the tokens assigned to the first 20 ids.
for token_id in range(20):
    print(token_id, dictionary[token_id])

0 file
1 html
2 info
3 leader
4 learn
5 page
6 process
7 team
8 until
9 wait
10 aalder
11 bryan
12 drew
13 laag
14 mail
15 member
16 ruiyu
17 urllink
18 aard
19 abl


### **Filter out tokens that appear in**
* fewer than 15 documents (absolute number) or
* more than 50% of the documents (a fraction of the total corpus size, not an absolute number).
* after the above two steps, keep only the 100000 most frequent tokens.

In [None]:
# Prune the vocabulary in place: no_below is an absolute document count,
# no_above is a fraction of the corpus, keep_n caps the vocabulary size.
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [None]:
# Peek at the first 20 ids again to see the effect of the filtering.
for token_id in range(20):
    print(token_id, dictionary[token_id])

0 file
1 html
2 info
3 leader
4 learn
5 page
6 process
7 team
8 until
9 wait
10 bryan
11 drew
12 mail
13 member
14 urllink
15 abl
16 absolut
17 accident
18 accomplish
19 accord


### **Gensim doc2bow**

In [None]:
# Convert every token list into its sparse (token_id, count) bag-of-words form.
bow_corpus = list(map(dictionary.doc2bow, train['clean_doc']))

In [None]:
from pprint import pprint

In [None]:
# Show the tokens of document 500 followed by its bag-of-words counts.
print('Preprocessed Text')
print(train['clean_doc'][500])
for token_id, count in bow_corpus[500]:
    print("Word {} (\"{}\") appears {} time.".format(token_id, dictionary[token_id], count))

Preprocessed Text
['monday', 'start', 'possibl', 'process', 'buy', 'place', 'realtor', 'paper', 'sign', 'go', 'commit', 'scari', 'thing', 'idea', 'home', 'ownership', 'sign', 'away', 'life', 'year', 'good', 'idea', 'financi', 'mental', 'good', 'singl', 'girlfriend', 'buy', 'place', 'friend', 'richmond', 'buy', 'hous', 'think', 'ball', 'home', 'ownership', 'deal', 'hous', 'deal', 'person', 'mow', 'prune', 'roof', 'replac', 'window', 'wash', 'bare', 'handl', 'keep', 'room', 'apart', 'order', 'shape', 'imagin', 'letter', 'neighbor', 'concern', 'jungl', 'certain', 'grow', 'outsid', 'home', 'condo', 'condo', 'sound', 'good', 'like', 'apart', 'paint', 'wall', 'deduct', 'condo', 'search', 'hard', 'begin', 'stag', 'see', 'person', 'bunch', 'onlin', 'catch', 'think', 'wait', 'perfect', 'fall', 'picki', 'open', 'mind', 'tell', 'realtor', 'interest', 'citi', 'properti', 'construct', 'want', 'live', 'wear', 'wise', 'place', 'nice', 'tri', 'sell', 'place', 'realtor', 'take', 'note', 'check', 'ring'

### **TF-IDF**

In [None]:
from gensim import corpora, models

# Fit a TF-IDF model on the bag-of-words corpus.
tfidf = models.TfidfModel(bow_corpus)
# Apply the fitted model to the same corpus to get TF-IDF weighted vectors.
corpus_tfidf = tfidf[bow_corpus]

In [None]:
corpus_tfidf[0]

[(0, 0.3298123436868355),
 (1, 0.3558990332081455),
 (2, 0.3585221676972244),
 (3, 0.3338008779074768),
 (4, 0.21615377189705884),
 (5, 0.25751837545870365),
 (6, 0.29673467081264626),
 (7, 0.27147372926271435),
 (8, 0.46649600154635723),
 (9, 0.17942237454157067)]

## **Running LDA using Bag of Words**

In [None]:
# Train a 10-topic LDA model on the bag-of-words corpus. random_state is pinned so
# the model initialization no longer depends on whatever global NumPy RNG state
# happens to be active when this cell runs.
# NOTE(review): with several workers, results may still vary slightly between runs — confirm.
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=1, workers=2, random_state=13)

In [None]:
# List every topic with its highest-weighted words.
for topic_id, topic_words in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(topic_id, topic_words))

Topic: 0 
Words: 0.026*"go" + 0.015*"good" + 0.013*"play" + 0.013*"like" + 0.011*"time" + 0.011*"night" + 0.009*"watch" + 0.009*"movi" + 0.009*"today" + 0.009*"come"
Topic: 1 
Words: 0.023*"go" + 0.022*"like" + 0.015*"haha" + 0.015*"today" + 0.010*"yeah" + 0.010*"come" + 0.010*"say" + 0.009*"home" + 0.009*"gonna" + 0.008*"time"
Topic: 2 
Words: 0.010*"like" + 0.009*"look" + 0.007*"go" + 0.006*"walk" + 0.006*"littl" + 0.005*"time" + 0.005*"come" + 0.005*"night" + 0.005*"head" + 0.005*"drink"
Topic: 3 
Words: 0.811*"nbsp" + 0.008*"quotejil" + 0.006*"quotejoel" + 0.002*"know" + 0.002*"think" + 0.002*"time" + 0.002*"like" + 0.002*"go" + 0.001*"kendra" + 0.001*"want"
Topic: 4 
Words: 0.026*"work" + 0.014*"time" + 0.012*"go" + 0.012*"week" + 0.009*"need" + 0.009*"think" + 0.008*"year" + 0.008*"thing" + 0.008*"today" + 0.007*"good"
Topic: 5 
Words: 0.104*"urllink" + 0.017*"blog" + 0.015*"post" + 0.013*"read" + 0.011*"book" + 0.008*"site" + 0.008*"link" + 0.007*"write" + 0.006*"pictur" + 0.006

In [None]:
unseen_document = 'How a Pentagon deal became an identity crisis for Google'

# Put the new headline through the same preprocessing and dictionary used for training.
bow_vector = dictionary.doc2bow(preprocess(unseen_document))

# Rank the topics for this document from highest to lowest score.
ranked_topics = sorted(lda_model[bow_vector], key=lambda pair: pair[1], reverse=True)
for index, score in ranked_topics:
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.6574388742446899	 Topic: 0.011*"peopl" + 0.008*"say" + 0.007*"bush" + 0.006*"american" + 0.006*"state"
Score: 0.20921683311462402	 Topic: 0.104*"urllink" + 0.017*"blog" + 0.015*"post" + 0.013*"read" + 0.011*"book"
Score: 0.01667087711393833	 Topic: 0.026*"work" + 0.014*"time" + 0.012*"go" + 0.012*"week" + 0.009*"need"
Score: 0.01666918583214283	 Topic: 0.029*"know" + 0.026*"like" + 0.025*"think" + 0.019*"want" + 0.017*"thing"
Score: 0.01666867919266224	 Topic: 0.010*"year" + 0.005*"team" + 0.005*"student" + 0.005*"time" + 0.005*"class"
Score: 0.016667895019054413	 Topic: 0.020*"love" + 0.013*"life" + 0.008*"heart" + 0.007*"world" + 0.007*"live"
Score: 0.016667209565639496	 Topic: 0.026*"go" + 0.015*"good" + 0.013*"play" + 0.013*"like" + 0.011*"time"
Score: 0.016666879877448082	 Topic: 0.811*"nbsp" + 0.008*"quotejil" + 0.006*"quotejoel" + 0.002*"know" + 0.002*"think"
Score: 0.0166668388992548	 Topic: 0.010*"like" + 0.009*"look" + 0.007*"go" + 0.006*"walk" + 0.006*"littl"
Score: