In [1]:
import pandas as pd
import numpy as np
import datetime
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
from pprint import pprint

In [20]:
# Load the politics headlines (CSV is ISO-8859-1 encoded) and parse the
# `date` column from YYYY-MM-DD strings into datetimes.
politics = (
    pd.read_csv("politics.csv", encoding="ISO-8859-1")
    .assign(date=lambda frame: pd.to_datetime(frame["date"], format="%Y-%m-%d"))
)
print(politics.head())


           source       date  \
0  business_daily 2017-10-01   
1  business_daily 2017-10-07   
2  business_daily 2017-10-10   
3  business_daily 2017-10-11   
4  business_daily 2017-10-11   

                                            headline type  uhuru  raila   ruto  
0   Jubilee law change to let Uhuru advertise freely   h2   True  False   True  
1                 Uhuru orders probe on Gikomba fire   li   True  False  False  
2  Four injured during anti-IEBC protests as Rail...   li   True   True  False  
3  Nasa legislators to boycott House business in ...   h2  False   True  False  
4  Political crisis deepens as Raila exits presid...   li  False   True  False  


In [21]:
# Summarise the parsed dates: row count, distinct days, most frequent day and range.
headline_dates = politics["date"]
headline_dates.describe()

count                   23608
unique                   2851
top       2017-10-11 00:00:00
freq                       57
first     2008-10-14 00:00:00
last      2019-08-29 00:00:00
Name: date, dtype: object

In [22]:
analyser = SentimentIntensityAnalyzer()


def compound_score(text):
    """Return VADER's compound polarity score for one headline."""
    return analyser.polarity_scores(text)["compound"]


# Headlines are scored in their original casing (no lower-casing step).
politics["sentiment_score"] = politics["headline"].map(compound_score)
politics.head(10)

Unnamed: 0,source,date,headline,type,uhuru,raila,ruto,sentiment_score
0,business_daily,2017-10-01,Jubilee law change to let Uhuru advertise freely,h2,True,False,True,0.4404
1,business_daily,2017-10-07,Uhuru orders probe on Gikomba fire,li,True,False,False,-0.34
2,business_daily,2017-10-10,Four injured during anti-IEBC protests as Rail...,li,True,True,False,-0.5574
3,business_daily,2017-10-11,Nasa legislators to boycott House business in ...,h2,False,True,False,-0.3182
4,business_daily,2017-10-11,Political crisis deepens as Raila exits presid...,li,False,True,False,-0.6249
5,business_daily,2017-10-11,Raila Odinga quits repeat poll,li,False,True,False,0.0
6,business_daily,2017-10-13,Proposed election laws now await Uhurus signature,li,True,False,False,0.1027
7,business_daily,2017-10-13,Showdown looms as Nasa defies Matiangi ban on ...,li,False,True,False,-0.6369
8,business_daily,2017-10-14,Uhuru receives proposed election law,h2,True,False,False,0.0
9,business_daily,2017-10-14,Nasa defies CS protests in CBD,li,False,True,False,-0.2263


In [23]:
def tag_pos(headline):
    """Tokenise a headline with NLTK and return its (token, POS tag) pairs."""
    tokens = nltk.word_tokenize(headline)
    tagged_tokens = nltk.pos_tag(tokens)
    return tagged_tokens

In [24]:
# POS-tag every headline. The tagger is not perfect: in the second row
# "Uhuru" is labelled as an adjective (JJ).
politics["tags_pos"] = politics["headline"].apply(tag_pos)
politics[["headline", "tags_pos"]].head()

Unnamed: 0,headline,tags_pos
0,Jubilee law change to let Uhuru advertise freely,"[(Jubilee, NNP), (law, NN), (change, NN), (to,..."
1,Uhuru orders probe on Gikomba fire,"[(Uhuru, JJ), (orders, NNS), (probe, VBP), (on..."
2,Four injured during anti-IEBC protests as Rail...,"[(Four, CD), (injured, JJ), (during, IN), (ant..."
3,Nasa legislators to boycott House business in ...,"[(Nasa, NNP), (legislators, NNS), (to, TO), (b..."
4,Political crisis deepens as Raila exits presid...,"[(Political, JJ), (crisis, NN), (deepens, NNS)..."


In [8]:
# Extract nouns (common and proper) from the POS-tagged headlines.
# The tag list previously contained "NNS" twice and no "NN", which dropped
# singular common nouns such as "law" or "fire".
# "POS" (possessive ending) is kept from the original tag list.
NOUN_TAGS = {"NN", "NNS", "NNP", "NNPS", "POS"}

# One list of noun tokens per headline, in the same order as the frame's rows.
# Iterating the column directly avoids label-based lookups by row number.
nouns_list = [
    [token for token, tag in tagged_headline if tag in NOUN_TAGS]
    for tagged_headline in politics["tags_pos"]
]

In [25]:
# Attach the extracted nouns to the frame. Passing politics.index makes the
# alignment explicit and raises if nouns_list has a different length than the
# frame (e.g. a stale list left over from an earlier run), instead of
# silently filling NaN or dropping rows.
politics["nouns"] = pd.Series(nouns_list, index=politics.index)
print(politics[["headline", "nouns"]].head())
print(politics[["headline", "nouns"]].tail())

                                            headline  \
0   Jubilee law change to let Uhuru advertise freely   
1                 Uhuru orders probe on Gikomba fire   
2  Four injured during anti-IEBC protests as Rail...   
3  Nasa legislators to boycott House business in ...   
4  Political crisis deepens as Raila exits presid...   

                                             nouns  
0                    [Jubilee, law, change, Uhuru]  
1                          [orders, Gikomba, fire]  
2                    [protests, Raila, Uhuru, row]  
3  [Nasa, legislators, House, business, poll, row]  
4                   [crisis, deepens, Raila, race]  
                                                headline  \
23603  I don't have any problem competing with Raila ...   
23604  Murkomen apologises to Miguna over Jubilee har...   
23605       I'm not in the 222 presidential race - Raila   
23606   Ruto using harambees to bribe the church - Raila   
23607  Ruto showcases his football skills at 

In [2]:
# Entity recognition: load the spaCy `en_core_web_sm` pipeline once.
# `ner` is the shared pipeline object called by the entity-extraction cells.
ner = en_core_web_sm.load()

In [None]:
# Run the pipeline on a sample headline and list each entity with its label.
headline = ner("Jubilee law change to let Uhuru advertise freely")
entity_labels = [(ent.text, ent.label_) for ent in headline.ents]
pprint(entity_labels)

In [3]:
# A made-up headline mixing a well-known name with Kenyan names.
headline = ner("Michael Jordan meets Uhuru at Kasarani Stadium")
entity_labels = [(ent.text, ent.label_) for ent in headline.ents]
pprint(entity_labels)

[('Michael Jordan', 'PERSON'), ('Uhuru', 'PERSON'), ('Kasarani Stadium', 'FAC')]


In [13]:
# Because of the training dataset used, entities specific to Kenya are
# incorrectly identified: here Uhuru is labelled GPE (geopolitical entity)
# instead of PERSON.
# The main goal is to get the entities for SNA, so the label itself is not
# particularly important.
headline = ner("Uhuru orders probe on Gikomba fire")
entity_labels = [(ent.text, ent.label_) for ent in headline.ents]
pprint(entity_labels)

[('Uhuru', 'GPE'), ('Gikomba', 'GPE')]


In [8]:
def extract_entities(text):
    """Return the text of every entity the `ner` pipeline finds in `text`.

    Entity labels are discarded; only the entity strings are kept, in the
    order the pipeline reports them. Returns an empty list when no entity
    is found.
    """
    return [span.text for span in ner(text).ents]


In [28]:
# Extract entities for every politics headline; the resulting Series already
# carries politics' index, so it can be assigned as a column directly.
headline_entities = politics["headline"].apply(extract_entities)
politics["entities"] = headline_entities
politics[["headline", "nouns", "entities"]].head()

Unnamed: 0,headline,nouns,entities
0,Jubilee law change to let Uhuru advertise freely,"[Jubilee, law, change, Uhuru]",[Uhuru]
1,Uhuru orders probe on Gikomba fire,"[orders, Gikomba, fire]","[Uhuru, Gikomba]"
2,Four injured during anti-IEBC protests as Rail...,"[protests, Raila, Uhuru, row]","[Four, Raila Uhuru row]"
3,Nasa legislators to boycott House business in ...,"[Nasa, legislators, House, business, poll, row]","[Nasa, House]"
4,Political crisis deepens as Raila exits presid...,"[crisis, deepens, Raila, race]",[Raila]


In [29]:
# Compare POS-based nouns with spaCy entities on the last rows of the frame.
preview_columns = ["headline", "nouns", "entities"]
politics[preview_columns].tail()

Unnamed: 0,headline,nouns,entities
23603,I don't have any problem competing with Raila ...,"[problem, Raila]","[Raila, 222-Ruto]"
23604,Murkomen apologises to Miguna over Jubilee har...,"[Murkomen, Miguna, Jubilee, harassmen]","[Murkomen, Miguna, Jubilee]"
23605,I'm not in the 222 presidential race - Raila,"[race, Raila]",[222]
23606,Ruto using harambees to bribe the church - Raila,"[Ruto, harambees, church, Raila]",[]
23607,Ruto showcases his football skills at KICOSCA ...,"[Ruto, football, skills, KICOSCA, games, Kericho]","[KICOSCA, Kericho]"


In [30]:
# Save the politics frame, including the columns added above, without the row index.
# NOTE(review): list-valued columns (tags_pos, nouns, entities) are presumably
# written as their string representation — confirm how downstream code parses them.
politics.to_csv("politics_with_entities.csv", index = False)

In [13]:
# Load the corruption headlines (CSV is ISO-8859-1 encoded) and parse the
# `date` column from YYYY-MM-DD strings into datetimes.
corruption_news = (
    pd.read_csv("corruption_news2.csv", encoding="ISO-8859-1")
    .assign(date=lambda frame: pd.to_datetime(frame["date"], format="%Y-%m-%d"))
)
print(corruption_news.head())


        date        source                                           headline
0 2009-02-05  daily_nation                  S Africa drops Zuma graft charges
1 2009-10-04      standard  Buried skeletons emerge to haunt corruption de...
2 2009-12-30      standard  Why Cabinet has evolved into architect of gran...
3 2010-02-02  daily_nation               US lauds Kenyan efforts on graft war
4 2010-02-13      standard     Graft: Four PSs on suspension yet to know fate


In [14]:
# Run the same entity extraction over the corruption headlines; the result
# already carries corruption_news' index, so it is assigned directly.
entities = corruption_news["headline"].apply(extract_entities)
corruption_news["entities"] = entities
corruption_news[["headline", "entities"]].head()

Unnamed: 0,headline,entities
0,S Africa drops Zuma graft charges,"[S Africa, Zuma]"
1,Buried skeletons emerge to haunt corruption de...,[]
2,Why Cabinet has evolved into architect of gran...,[Cabinet]
3,US lauds Kenyan efforts on graft war,"[US, Kenyan]"
4,Graft: Four PSs on suspension yet to know fate,"[Graft, Four]"


In [17]:
# Save the corruption headlines with their extracted entities, without the row index.
# NOTE(review): the list-valued `entities` column is presumably written as its
# string representation — confirm how downstream code parses it.
corruption_news.to_csv("corruption_entities0.csv", index = False)