In [49]:
import pandas as pd 

data = pd.read_csv('Eluvio_DS_Challenge.csv')

data_text = data['title']

documents = data_text

n = len(documents)
train_n = int(n*0.8)

train = documents[:train_n]
validate = documents[train_n:]

In [50]:
print(len(documents))
print(len(train))
print(len(validate))

509236
407388
101848


In [6]:
documents[:5]

0                  Scores killed in Pakistan clashes
1                   Japan resumes refuelling mission
2                    US presses Egypt on Gaza border
3       Jump-start economy: Give health care to all 
4    Council of Europe bashes EU&UN terror blacklist
Name: title, dtype: object

In [12]:
import gensim 
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np 

In [14]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Elias\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [15]:
print(WordNetLemmatizer().lemmatize('went', pos = 'v'))

go


In [16]:
stemmer = SnowballStemmer('english')

original_words = ['caresses', 'flies', 'dies', 'mules', 'denied', 'died', 'agreed', 'owned',
'humbled', 'sized', 'meeting', 'stating', 'siezing', 'itemization', 'sensational', 'traditional', 'reference', 'colonizer', 'plotted']

singles = [stemmer.stem(plural) for plural in original_words]

pd.DataFrame(data = {'original word': original_words, 'stemmed': singles})


Unnamed: 0,original word,stemmed
0,caresses,caress
1,flies,fli
2,dies,die
3,mules,mule
4,denied,deni
5,died,die
6,agreed,agre
7,owned,own
8,humbled,humbl
9,sized,size


In [17]:
# function for preprocessing

def lemmatize_stem(text):

    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos = 'v'))

def preprocess(text):

    result = []

    for token in gensim.utils.simple_preprocess(text):

        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:

            result.append(lemmatize_stem(token))

    return result

In [20]:
doc_num = 3000

doc_sample = documents[:doc_num].values[0]

print("[NOTICE} riginal document..")

words = []

for word in doc_sample.split(' '):
    words.append(word)

print(words)
print('\n\nTokemized and Lemmatized document: ')
print(preprocess(doc_sample))

[NOTICE} riginal document..
['Scores', 'killed', 'in', 'Pakistan', 'clashes']


Tokemized and Lemmatized document: 
['score', 'kill', 'pakistan', 'clash']


In [51]:
train = pd.DataFrame(train)
validate = pd.DataFrame(validate)

processed_doc = train['title'].map(preprocess)

In [52]:
processed_doc[:10]

0                       [score, kill, pakistan, clash]
1                      [japan, resum, refuel, mission]
2                         [press, egypt, gaza, border]
3                 [jump, start, economi, health, care]
4            [council, europ, bash, terror, blacklist]
5    [presto, farmer, unveil, illeg, mock, tudor, c...
6    [strike, protest, gridlock, poland, ukrain, bo...
7                                  [mismanag, program]
8                 [nicola, sarkozi, threaten, ryanair]
9    [plan, missil, shield, polish, town, resist, v...
Name: title, dtype: object

In [53]:
dictionary = gensim.corpora.Dictionary(processed_doc)

In [54]:
count = 0

for k, v in dictionary.iteritems():

    print(k, v)

    count += 1

    if count > 10:
        break 

0 clash
1 kill
2 pakistan
3 score
4 japan
5 mission
6 refuel
7 resum
8 border
9 egypt
10 gaza


In [55]:
dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=100000)

In [56]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_doc]


In [57]:
bow_corpus[doc_num]

[(54, 1), (254, 1), (2238, 1), (2921, 1), (4903, 1)]

In [58]:
bow_doc_4310 = bow_corpus[doc_num]

for i in range(len(bow_doc_4310)):

    print('word {} (\' {} \') appears {} times'.format(bow_doc_4310[i][0], dictionary[bow_doc_4310[i][0]], bow_doc_4310[i][1]))

word 54 (' time ') appears 1 times
word 254 (' china ') appears 1 times
word 2238 (' tibet ') appears 1 times
word 2921 (' seal ') appears 1 times
word 4903 (' gateway ') appears 1 times


In [59]:
from gensim import corpora, models

tfidf = models.TfidfModel(bow_corpus)
print(tfidf)

TfidfModel(num_docs=407388, num_nnz=3406602)


In [60]:
corpus_tfidf = tfidf[bow_corpus]

print(corpus_tfidf[1])

[(4, 0.3015512525870183), (5, 0.4164584373227593), (6, 0.7092460874421703), (7, 0.4822853921630416)]


In [61]:
from pprint import pprint

for doc in corpus_tfidf:

    pprint(doc)

    break

[(0, 0.5083535236881371),
 (1, 0.2993426062832652),
 (2, 0.42515432580274004),
 (3, 0.6864506524642643)]


In [77]:
lda_model = gensim.models.LdaMulticore(
    bow_corpus,
    num_topics= 2,
    id2word= dictionary,
    passes= 2,
    workers = 2
)

In [78]:
for idx, topic in lda_model.print_topics(-1):

    print('Topics {} \nWords: {} '.format(idx, topic))
    print('\n')

Topics 0 
Words: 0.014*"say" + 0.010*"russia" + 0.009*"china" + 0.009*"syria" + 0.008*"kill" + 0.007*"isi" + 0.007*"russian" + 0.007*"state" + 0.007*"iran" + 0.006*"ukrain" 


Topics 1 
Words: 0.008*"polic" + 0.006*"year" + 0.006*"protest" + 0.005*"say" + 0.005*"isra" + 0.005*"death" + 0.005*"right" + 0.005*"palestinian" + 0.005*"kill" + 0.005*"arrest" 




In [79]:
processed_doc[4000]

['china', 'seal', 'gateway', 'tibet', 'time']

In [80]:
doc_num = 4000
for index, score in sorted(lda_model[bow_corpus[doc_num]], key = lambda tup: -1*tup[1]):

    print('\nScore {}\t \nTopics: {}'.format(score, lda_model.print_topic(index, 10)))


Score 0.6808820366859436	 
Topics: 0.014*"say" + 0.010*"russia" + 0.009*"china" + 0.009*"syria" + 0.008*"kill" + 0.007*"isi" + 0.007*"russian" + 0.007*"state" + 0.007*"iran" + 0.006*"ukrain"

Score 0.3191179633140564	 
Topics: 0.008*"polic" + 0.006*"year" + 0.006*"protest" + 0.005*"say" + 0.005*"isra" + 0.005*"death" + 0.005*"right" + 0.005*"palestinian" + 0.005*"kill" + 0.005*"arrest"


In [83]:
unseen = validate['title'].tolist()[0]

bow_vector = dictionary.doc2bow(preprocess(unseen))

for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):

    print('Score: {}\t Topic: {}'.format(score, lda_model.print_topic(index, 10)))

Score: 0.5964456796646118	 Topic: 0.014*"say" + 0.010*"russia" + 0.009*"china" + 0.009*"syria" + 0.008*"kill" + 0.007*"isi" + 0.007*"russian" + 0.007*"state" + 0.007*"iran" + 0.006*"ukrain"
Score: 0.40355435013771057	 Topic: 0.008*"polic" + 0.006*"year" + 0.006*"protest" + 0.005*"say" + 0.005*"isra" + 0.005*"death" + 0.005*"right" + 0.005*"palestinian" + 0.005*"kill" + 0.005*"arrest"


In [84]:
check = validate['title'].tolist()[5]

check_vec = dictionary.doc2bow(preprocess(check))

for index, score in sorted(lda_model[check_vec], key=lambda tup: -1*tup[1]):

    print('Score: {}\t Topic: {}'.format(score, lda_model.print_topic(index, 5)))

Score: 0.6998955607414246	 Topic: 0.014*"say" + 0.010*"russia" + 0.009*"china" + 0.009*"syria" + 0.008*"kill"
Score: 0.30010440945625305	 Topic: 0.008*"polic" + 0.006*"year" + 0.006*"protest" + 0.005*"say" + 0.005*"isra"
