In [74]:
import pandas as pd
import numpy as np
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2020)

import nltk
nltk.download('wordnet')
from nltk.corpus import stopwords
stopwords = stopwords.words('english')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\fatihbeyhan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [75]:
data = pd.read_csv('extracted_data.csv').drop('Unnamed: 0', axis=1)
data.columns = ['url','section','location','headline','text','label']

In [76]:
df = pd.DataFrame([data.headline+". "+data.text,data.label]).transpose()
df.columns = ['text','label']

In [77]:
documents = pd.DataFrame(df.text)
documents['index'] = df.index
documents.head()

Unnamed: 0,text,index
0,Readers mail. SC ruling on alimony The Supreme...,0
1,"Niqaab is ‘the elephant in the courtroom’, say...",1
2,Grandparents honoured. Staff and students of G...,2
3,Fishermen call off strike. The country boat fi...,3
4,"Badal meets PM, seeks clemency for Bhullar. Pu...",4


In [78]:
def lemma(news):
    l = WordNetLemmatizer()
    sentences = news.split(".")
    return [l.lemmatize(word).lower() for sentence in sentences for word in sentence.split() if word not in stopwords if word.isalpha() if len(word)> 2 if word.lower() not in ["said","the","also","government","would","one"]]

In [79]:
documents['text'] = documents['text'].map(lemma)

In [80]:
documents.text

0      [readers, mail, ruling, alimony, supreme, cour...
1      [niqaab, elephant, say, judge, person, may, gi...
2      [grandparents, honoured, staff, student, globa...
3      [fishermen, call, strike, country, boat, fishe...
4      [badal, meet, seek, clemency, bhullar, punjab,...
                             ...                        
576    [profit, motive, taking, cue, scheme, implemen...
577    [music, concert, musical, tribute, legendary, ...
578    [blind, pursuit, audience, tired, watching, ac...
579    [pakistan, kill, militant, airstrikes, pakista...
580    [centre, hold, talk, end, manipur, crisis, cen...
Name: text, Length: 581, dtype: object

In [81]:
processed_docs = documents['text']
processed_docs[:10]

0    [readers, mail, ruling, alimony, supreme, cour...
1    [niqaab, elephant, say, judge, person, may, gi...
2    [grandparents, honoured, staff, student, globa...
3    [fishermen, call, strike, country, boat, fishe...
4    [badal, meet, seek, clemency, bhullar, punjab,...
5    [shutdown, affect, life, srinagar, normal, lif...
6    [mysoora, reddy, call, roadside, protest, spec...
7    [kin, protest, hospital, family, man, died, ch...
8    [state, police, force, fully, mobilised, home,...
9    [idol, shop, closed, several, hundred, shop, k...
Name: text, dtype: object

# Bag of Words

In [111]:
dictionary = gensim.corpora.Dictionary(processed_docs)

In [112]:
len(dictionary.keys())

14066

In [113]:
dictionary.filter_extremes(no_below=15)

In [114]:
len(dictionary.keys())

746

# Gensim doc2bow


In [115]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[230]

[(7, 1),
 (23, 1),
 (40, 1),
 (49, 1),
 (75, 1),
 (152, 1),
 (193, 1),
 (207, 1),
 (263, 1),
 (271, 1),
 (309, 1),
 (490, 1),
 (666, 1)]

# TF-IDF

In [117]:
from gensim import corpora, models
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

# LDA with BOW

In [94]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=2, id2word=dictionary, passes=2, workers=2)

In [95]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.004*"district" + 0.003*"state" + 0.003*"minister" + 0.003*"police" + 0.003*"student" + 0.003*"india" + 0.002*"court" + 0.002*"two" + 0.002*"take" + 0.002*"year"
Topic: 1 
Words: 0.005*"police" + 0.004*"state" + 0.003*"minister" + 0.003*"two" + 0.003*"leader" + 0.002*"city" + 0.002*"people" + 0.002*"year" + 0.002*"time" + 0.002*"district"


# LDA with TF-IDF

In [122]:
keys = []
lda_model_tfidf = gensim.models.LdaMulticore(bow_corpus, num_topics=2, id2word=dictionary, passes=4, workers=4)
for idx, topic in lda_model_tfidf.print_topics(-1):
    keys.append(topic[:10])
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.008*"student" + 0.007*"year" + 0.007*"two" + 0.006*"india" + 0.006*"new" + 0.006*"police" + 0.006*"school" + 0.006*"district" + 0.006*"college" + 0.006*"road"
Topic: 1 Word: 0.013*"police" + 0.013*"state" + 0.010*"minister" + 0.010*"party" + 0.009*"district" + 0.009*"leader" + 0.008*"court" + 0.007*"case" + 0.006*"chief" + 0.006*"people"


In [126]:
results=dict([(i,0) for i in keys])

In [127]:
for _ in data[data.label == 0].index:
    for index, score in sorted(lda_model_tfidf[corpus_tfidf[_]], key=lambda tup: -1*tup[1]):
        key = (lda_model_tfidf.print_topic(index, 5)[:10])

        results[key] += 1
        break

In [128]:
results

{'0.008*"stu': 273, '0.013*"pol': 162}

In [130]:
scores = {}

for _ in data.index:
    s = {}
    for index, score in sorted(lda_model_tfidf[corpus_tfidf[_]], key=lambda tup: -1*tup[1]):
    
        key = (lda_model_tfidf.print_topic(index, 5)[:10])

        s[key] = score
        #print(key,score)
    scores[_] = s
    

dfscore = []
for score in scores:
    score = scores[score]
    dfscore.append([score['0.008*"stu'],score['0.013*"pol']])
    
dfscore = pd.DataFrame(dfscore)
dfscore.columns = ['no_protest','protest']

dfscore.transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,571,572,573,574,575,576,577,578,579,580
no_protest,0.215787,0.141548,0.849486,0.222383,0.409522,0.231869,0.27347,0.649805,0.10441,0.117688,...,0.822658,0.29123,0.875944,0.845006,0.86571,0.857943,0.682252,0.14278,0.359748,0.120421
protest,0.784213,0.858452,0.150514,0.777617,0.590478,0.768131,0.72653,0.350195,0.89559,0.882312,...,0.177342,0.70877,0.124056,0.154994,0.13429,0.142057,0.317748,0.85722,0.640252,0.879579


In [131]:
dfscore.transpose()[data[data.label==1].label.index]

Unnamed: 0,3,5,7,9,14,21,28,36,40,41,...,540,544,546,548,556,562,570,572,579,580
no_protest,0.222383,0.231869,0.649805,0.117688,0.117215,0.373045,0.119246,0.819095,0.179839,0.1913,...,0.11473,0.497935,0.14094,0.119967,0.853197,0.292298,0.561348,0.29123,0.359748,0.120421
protest,0.777617,0.768131,0.350195,0.882312,0.882785,0.626955,0.880754,0.180905,0.820161,0.8087,...,0.88527,0.502065,0.85906,0.880033,0.146803,0.707702,0.438652,0.70877,0.640252,0.879579


In [132]:
dfscore

Unnamed: 0,no_protest,protest
0,0.215787,0.784213
1,0.141548,0.858452
2,0.849486,0.150514
3,0.222383,0.777617
4,0.409522,0.590478
...,...,...
576,0.857943,0.142057
577,0.682252,0.317748
578,0.142780,0.857220
579,0.359748,0.640252


In [133]:
dfscore.to_csv('lda_scores.csv')