<div class="alert alert-block alert-success"> <center> <h1> <b> Newsfeed Dashboard v1 </b> </h1> </center></div>

<div class="alert alert-block alert-danger"> <b> BEFORE RUNNING THIS NOTEBOOK, PLEASE MAKE SURE YOU INSTALLED CORE-NLP FOR SENTIMENT ANALYSIS.</b> </div>



**To Install CoreNLP, please follow the instructions on stack overflow: [Stanford NLP for Python](https://stackoverflow.com/questions/32879532/stanford-nlp-for-python). Note that you also need Java to be installed.** 

**Once the package is installed, you only need to run the following two commands in your terminal each time you run this notebook:**

$ cd stanford-corenlp-full-2018-10-05

$ java -mx5g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -timeout 10000

**If a package fails to import, try installing it with pip install** <i><u> package name </u></i>


### Import Libraries

In [41]:
import requests
from lxml import etree
import numpy as np
import pandas as pd
import datetime as dt
import json
import shapely
import shapely.geometry
import fiona
from pycorenlp import StanfordCoreNLP
import time

### Functions to get data from EMM

In [25]:
def datetorss(date_object):
    """Render a datetime as the timestamp string used in the EMM RSS query URL.

    The result has the form ``YYYY-MM-DDTHH%3AMM%3ASSZ``: the colons between
    hours, minutes and seconds are emitted already percent-encoded (``%3A``)
    and a literal ``Z`` is appended.

    Parameters
    ----------
    date_object : object with a ``strftime`` method (e.g. ``datetime.datetime``)

    Returns
    -------
    str
    """
    # In a strftime format '%%' is a literal percent sign, so '%%3A' yields '%3A'.
    return date_object.strftime('%Y-%m-%dT%H%%3A%M%%3A%SZ')

In [26]:
def _emm_item_to_record(element, cat):
    """Flatten one RSS ``<item>`` element into a dict keyed by result-column name."""
    title = element.find('title').text
    link = element.find('link').text
    summary = element.find('description').text
    published = element.find('pubDate').text
    language = element.find('{http://www.iso.org/3166}language').text

    # Every <category> tag except the one that was queried for.
    subcategories = [tag.text for tag in element.findall('category')
                     if tag.text != 'UNDP_' + cat]

    # The geo point is optional; a missing tag becomes NaN.
    point = element.find('{http://www.georss.org/georss}point')
    location = point.text if point is not None else np.nan

    # Each translated field falls back to its original on its own. The former
    # shared try/except appended to the title list twice when only the
    # translated description was missing, which left the column lists with
    # different lengths and broke the DataFrame construction.
    translated_title = element.find('{http://emm.jrc.it}title')
    translated_summary = element.find('{http://emm.jrc.it}description')

    entities = [child.attrib['name']
                for child in element.findall('{http://emm.jrc.it}entity')]

    return {
        'Original Title': title,
        'Link': link,
        'Original Description': summary,
        'Publication Date': published,
        'Original Language': language,
        'Category': cat,
        'Subcategory': subcategories,
        'Location': location,
        'Translated Title': translated_title.text if translated_title is not None else title,
        'Translated Description': translated_summary.text if translated_summary is not None else summary,
        'Entity': entities,
    }


def get_data_cat_date(last_date,start_date,cat):
    """Download one page of EMM articles for a category and date window.

    Parameters
    ----------
    last_date, start_date : datetime-like
        End and start of the requested window (both need ``strftime``).
    cat : str
        Category name without the ``UNDP_`` prefix.

    Returns
    -------
    pandas.DataFrame
        One row per ``<item>`` in the feed, with the columns 'Original Title',
        'Link', 'Original Description', 'Publication Date' (parsed to a
        timezone-aware datetime), 'Original Language', 'Category',
        'Subcategory', 'Location', 'Translated Title',
        'Translated Description' and 'Entity'.

    Raises
    ------
    AttributeError
        If an item lacks one of the mandatory tags (title, link, description,
        pubDate, language).
    """
    print('Running:',cat,'from:',start_date,'to:',last_date)

    columns = ['Original Title', 'Link', 'Original Description', 'Publication Date',
               'Original Language', 'Category', 'Subcategory', 'Location',
               'Translated Title', 'Translated Description', 'Entity']

    # NOTE(review): 'datefrom' and 'dateto' each appear twice in the query
    # string (once with a time, once date-only). Kept as is because the
    # server-side precedence is not visible here - confirm which one is used.
    link = 'http://emm.newsbrief.eu/rss/rss?language=all&type=search&mode=advanced&dateto='+datetorss(last_date)+'&datefrom='+datetorss(start_date)+'&category=UNDP_'+cat+'&datefrom='+start_date.strftime('%Y-%m-%d')+'&dateto='+last_date.strftime('%Y-%m-%d')
    cont = requests.get(link)
    root = etree.XML(cont.content)

    records = [_emm_item_to_record(element, cat) for element in root.iter('item')]
    # Passing the column list keeps the frame well-formed when the feed is empty.
    data = pd.DataFrame(records, columns=columns)

    # pubDate looks like 'Fri, 29 Mar 2019 11:10:00 +0100'; drop the weekday
    # token before parsing.
    data['Publication Date'] = data['Publication Date'].apply(lambda date: dt.datetime.strptime(date.split(' ',1)[1],'%d %b %Y %H:%M:%S %z'))
    return data


In [27]:
def loop(cat,last_date,start_date,page_size=100):
    """Collect every article of one category in a date window, page by page.

    Each request covers ``[start_date, last_date]``. After a full page,
    ``last_date`` is moved back to the oldest publication date just received
    and the request is repeated, until a short page arrives or the window is
    exhausted.

    Parameters
    ----------
    cat : str
        Category name without the ``UNDP_`` prefix.
    last_date, start_date : datetime.datetime (naive)
        End and start of the window.
        NOTE(review): the feed appears to read these as UTC (``datetorss``
        appends 'Z') - confirm.
    page_size : int, optional
        Number of rows that marks a "full" page. Presumably the feed caps a
        response at 100 items, which is the value the original check used.

    Returns
    -------
    pandas.DataFrame
        All rows fetched, sorted by 'Publication Date' (newest first, stable
        for ties) with a fresh 0..n-1 index. Publication dates are left
        timezone-aware in every case.

    Notes
    -----
    Consecutive pages may repeat the articles stamped exactly at the cursor;
    they are not de-duplicated here.
    """
    columns = ['Original Title', 'Link', 'Original Description', 'Publication Date',
               'Original Language', 'Category', 'Subcategory', 'Location',
               'Translated Title', 'Translated Description', 'Entity']
    # The empty template guarantees the expected columns even with no pages.
    pages = [pd.DataFrame({name: [] for name in columns})]
    total = 0
    i = 0
    while start_date<last_date:
        batch = get_data_cat_date(last_date,start_date,cat)
        pages.append(batch)
        total += len(batch)
        print('Loop number:',i+1,'--- Data size:',total)
        print('----------------------')

        # A short page means nothing older is left. This must look at the
        # batch, not at the running total, otherwise the test can never
        # succeed again after the first full page.
        if batch.empty or len(batch) < page_size:
            break

        # Next cursor: the oldest article seen. Convert to UTC *before*
        # dropping the offset; simply stripping tzinfo would shift the cursor
        # by the feed's UTC offset.
        oldest = batch['Publication Date'].min()
        if oldest.tzinfo is not None:
            oldest = oldest.astimezone(dt.timezone.utc).replace(tzinfo=None)

        if oldest >= last_date:
            # The cursor did not move (a whole page shares one timestamp).
            # Step back one hour if there is room, otherwise give up.
            if last_date - start_date >= dt.timedelta(days = 1):
                oldest = last_date - dt.timedelta(hours = 1)
            else:
                break
        last_date = oldest
        i+=1

    data = pd.concat(pages)
    # mergesort is stable, so rows with equal timestamps keep feed order.
    data = data.sort_values(by = 'Publication Date',ascending = False,kind = 'mergesort')
    data.index = range(len(data))
    return data


In [28]:
def _prompt_datetime(prompt, retry_prompt):
    """Ask on stdin until the answer parses as yyyy/mm/dd/hh/mm/ss; return the datetime."""
    answer = input(prompt)
    while True:
        try:
            return dt.datetime.strptime(answer,'%Y/%m/%d/%H/%M/%S')
        except ValueError:
            # Only a malformed date triggers a re-prompt.
            answer = input(retry_prompt)


def get_dataset():
    """Interactively ask for a date window and download every category in it.

    The start and end of the window are read from stdin in the format
    ``yyyy/mm/dd/hh/mm/ss``; each prompt repeats until the entry parses.
    ``loop`` is then called once per category and the results are stacked.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the per-category frames. The index is not reset, so
        it may contain repeated labels.
    """
    start_date = _prompt_datetime(
        "Enter the begining date (yyyy/mm/dd/hh/mm/ss): ",
        "-\nWrong entry format.\nEnter the begining date (yyyy/mm/dd/hh/mm/ss): ")
    # The retry text used to ask for the "begining date" here as well, although
    # it is the ending date that is being re-entered.
    last_date = _prompt_datetime(
        "Enter the ending date (yyyy/mm/dd/hh/mm/ss): ",
        "-\nWrong entry format.\nEnter the ending date (yyyy/mm/dd/hh/mm/ss): ")

    categories = ['Conflict','Drought','Ecology','Genocide','HumanitarianAid','ManMadeDisasters','NaturalDisasters','PoliticalUnrest',
             'ScienceandTechnology','Security','Society','TerroristAttack','WaterConflict']
    columns = ['Original Title', 'Link', 'Original Description', 'Publication Date',
               'Original Language', 'Category', 'Subcategory', 'Location',
               'Translated Title', 'Translated Description', 'Entity']

    # The empty template comes first so the expected columns exist even when
    # no category returns anything; everything is concatenated once at the end.
    frames = [pd.DataFrame({name: [] for name in columns})]
    for cat in categories:
        frames.append(loop(cat,last_date,start_date))
    return pd.concat(frames)
        

## Processing the scraped dataset

### Get Article Relevance (Count of articles)

In [29]:
def get_relevant(data):
    """Collapse repeated titles into one row each and attach their frequency.

    For every distinct 'Original Title' the number of rows with a non-null
    'Link' is computed. The first row of each title is kept and the tally is
    joined on as a 'Count' column. The input frame is not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs at least the columns 'Original Title' and 'Link'.

    Returns
    -------
    pandas.DataFrame
        De-duplicated rows with an extra 'Count' column and a fresh index.
    """
    per_title = (
        data.groupby('Original Title')['Link']
            .count()
            .rename('Count')
            .reset_index()
    )
    first_occurrences = data.drop_duplicates(subset = 'Original Title')
    return first_occurrences.merge(per_title, on = 'Original Title', how = 'left')


### Get Country

In [30]:
def distance(lat1,lon1,lat2,lon2):
    """Great-circle distance between two points, using the haversine formula.

    Parameters
    ----------
    lat1, lon1, lat2, lon2 : float
        Latitude and longitude of the two points, in degrees.

    Returns
    -------
    float
        Distance in kilometres (Earth radius taken as 6373 km). NaN inputs
        propagate to a NaN result.
    """
    from math import sin, cos, sqrt, atan2, radians
    # approximate radius of earth in km
    R = 6373.0

    lat1 = radians(lat1)
    lon1 = radians(lon1)
    lat2 = radians(lat2)
    lon2 = radians(lon2)

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    # Rounding can push `a` a hair outside [0, 1] for (near-)antipodal points,
    # which would make sqrt(1 - a) raise ValueError. Comparisons are used
    # rather than min()/max() so that NaN passes through untouched.
    if a > 1.0:
        a = 1.0
    elif a < 0.0:
        a = 0.0
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    return R * c

def get_country(lat, lon, borders,coord):
    """Return the name of the country whose border polygon contains a point.

    Candidates are tried in order of increasing distance between the point and
    the reference coordinate stored for each country in ``coord``, so the
    containment test usually succeeds after very few polygons.

    Parameters
    ----------
    lat, lon : float
        Point to locate, in degrees.
    borders : indexable collection of features
        ``borders[k]['geometry']`` must be a GeoJSON-like geometry and
        ``borders[k]['properties']['NAME']`` the country name.
    coord : dict
        Maps each key of ``borders`` to a ``(lat, lon)`` reference point.

    Returns
    -------
    str or float
        The country name, or NaN when the point is missing, no polygon
        contains it, or the lookup fails.
    """
    # NaN is the only value that differs from itself; a missing coordinate can
    # never match, so skip the distance ranking altogether.
    if lat != lat or lon != lon:
        return np.nan
    try:
        ranked = sorted(coord, key=lambda k: distance(lat, lon, coord[k][0], coord[k][1]))
        # Shapely points are (x, y), i.e. (lon, lat). Built once, not per candidate.
        point = shapely.geometry.Point(lon, lat)
        for key in ranked:
            # shape() replaces asShape(), which no longer exists in Shapely 2.0.
            # With asShape() the resulting AttributeError was swallowed below
            # and every row silently came back as NaN.
            border_polygon = shapely.geometry.shape(borders[key]['geometry'])
            if border_polygon.contains(point):
                return borders[key]['properties']['NAME']
        return np.nan
    except Exception:
        # Best effort: a broken feature or bad input yields "unknown", but
        # KeyboardInterrupt/SystemExit are no longer caught.
        return np.nan

def get_lat(x):
    """Extract the latitude from a ``'lat lon'`` location string.

    Parameters
    ----------
    x : object
        Usually a string such as ``'31.837038 35.440538'``; anything else is
        converted with ``str`` first.

    Returns
    -------
    float
        The first whitespace-separated token as a float, or NaN when there is
        no such token or it is not a number.
    """
    try:
        # split() without an argument tolerates leading and repeated
        # whitespace, which split(' ') turned into empty tokens.
        return float(str(x).split()[0])
    except (ValueError, IndexError):
        return np.nan
    
def get_lon(x):
    """Extract the longitude from a ``'lat lon'`` location string.

    Parameters
    ----------
    x : object
        Usually a string such as ``'31.837038 35.440538'``; anything else is
        converted with ``str`` first.

    Returns
    -------
    float
        The second whitespace-separated token as a float, or NaN when there is
        no such token or it is not a number.
    """
    try:
        # split() without an argument tolerates leading and repeated
        # whitespace, which split(' ') turned into empty tokens.
        return float(str(x).split()[1])
    except (ValueError, IndexError):
        return np.nan
        

def add_country(data,countries_borders, coord):
    """Add 'Latitude', 'Longitude' and 'Country' columns derived from 'Location'.

    The frame is modified in place and also returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'Location' column.
    countries_borders, coord
        Passed unchanged to ``get_country`` for every row.

    Returns
    -------
    pandas.DataFrame
        The same object as ``data``, with the three columns added.
    """
    def _country_of(row):
        # Row-wise lookup from the coordinates computed just below.
        return get_country(row['Latitude'], row['Longitude'], countries_borders, coord)

    locations = data['Location']
    data['Latitude'] = locations.apply(get_lat)
    data['Longitude'] = locations.apply(get_lon)
    data['Country'] = data.apply(_country_of, axis = 1)
    return data

### Get Sentiment

In [32]:
def coreNLPanalysis(text,nlp):
    """Return the CoreNLP sentiment distribution for a piece of text.

    The text is annotated as a single sentence and the
    ``sentimentDistribution`` of that sentence is returned.

    Parameters
    ----------
    text : str
        Text to analyse. Any non-string value (e.g. None or NaN from a missing
        description) is not sent to the server.
    nlp : object
        Client exposing ``annotate(text, properties=...)``.

    Returns
    -------
    list
        The five values of the sentiment distribution, or five NaNs when the
        text is not a string or the response does not have the expected shape.
        Errors raised by ``nlp.annotate`` itself (such as a connection failure)
        are not caught here.
    """
    missing = [np.nan]*5
    # Guard first: sending a non-string to the server would raise, and that
    # error is indistinguishable from "server down" for the caller.
    if not isinstance(text, str):
        return missing

    res = nlp.annotate(text,
                   properties={
                       'annotators': 'sentiment',
                       'outputFormat': 'json',
                       'timeout': 500000, 'ssplit.isOneSentence': True
                   })
    try:
        return res['sentences'][0]['sentimentDistribution']
    except (KeyError, IndexError, TypeError):
        # Unexpected payload: missing key, no sentence, or a non-dict response.
        return missing
    
def get_sentiment(data,nlp):
    """Score every row's 'Translated Description' and store the five values.

    The frame is modified in place (columns 'Very Neg', 'Neg', 'Neutral',
    'Pos', 'Very Pos' are written) and also returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'Translated Description' column.
    nlp : object
        Client forwarded to ``coreNLPanalysis``.

    Returns
    -------
    pandas.DataFrame
        The same object as ``data``.
    """
    sentiment_columns = ['Very Neg', 'Neg', 'Neutral', 'Pos', 'Very Pos']

    def _score(row):
        return coreNLPanalysis(row['Translated Description'], nlp)

    # result_type='expand' spreads each returned list over separate columns.
    data[sentiment_columns] = data.apply(_score, axis = 1, result_type = 'expand')
    return data

### Get Emotions

In [33]:
def get_emotion_dict(nrc="./External_files/NRC-emotion-lexicon-wordlevel-alphabetized-v0.92.txt", header_lines=46):
    """Load a word -> emotions mapping from a tab-separated lexicon file.

    Every data line is expected to hold ``word<TAB>emotion<TAB>flag``; a pair
    is kept when the flag equals 1.

    Parameters
    ----------
    nrc : str, optional
        Path of the lexicon file. Defaults to the location used by this notebook.
    header_lines : int, optional
        Number of leading lines to skip before the data starts (46 by default,
        the value previously hard-coded).

    Returns
    -------
    dict
        Maps each word to the list of emotions flagged for it, in file order.

    Raises
    ------
    OSError
        If the file cannot be opened.
    ValueError
        If the flag field of a data line is not an integer.
    """
    emotion_dict=dict()
    with open(nrc,'r') as f:
        for line_number, line in enumerate(f):
            if line_number < header_lines:
                continue
            fields = line.strip().split('\t')
            # Blank or truncated lines used to raise IndexError; skip them.
            if len(fields) < 3:
                continue
            word, emotion, flag = fields[0], fields[1], fields[2]
            if int(flag) == 1:
                emotion_dict.setdefault(word, []).append(emotion)
    return emotion_dict

def emotion_analyzer(text,emotion_dict=None):
    """Score a text against an emotion lexicon.

    Each whitespace-separated word found in ``emotion_dict`` adds
    ``1 / number_of_words`` to every emotion listed for it.

    Parameters
    ----------
    text : str
        Text to analyse. Matching is exact (case- and punctuation-sensitive).
        NOTE(review): confirm that exact matching is intended.
    emotion_dict : dict, optional
        Word -> list of emotions. When omitted, the lexicon is loaded with
        ``get_emotion_dict()`` on first use and cached; it used to be loaded
        while the function was being defined, so a missing file broke the
        definition itself.

    Returns
    -------
    list of float
        Scores in the fixed order sadness, joy, surprise, anticipation, fear,
        negative, anger, trust, positive, disgust, followed by any other
        emotion present in ``emotion_dict`` in alphabetical order. The order
        used to come from iterating a set of strings, which is not stable
        between interpreter runs. A non-string ``text`` yields NaN for every
        emotion.
    """
    if emotion_dict is None:
        cached = getattr(emotion_analyzer, '_default_emotion_dict', None)
        if cached is None:
            cached = get_emotion_dict()
            emotion_analyzer._default_emotion_dict = cached
        emotion_dict = cached

    # Assumes the lexicon uses these lowercase labels - TODO confirm against
    # the lexicon file.
    canonical = ('sadness', 'joy', 'surprise', 'anticipation', 'fear',
                 'negative', 'anger', 'trust', 'positive', 'disgust')
    found = {emotion for emotions in emotion_dict.values() for emotion in emotions}
    extras = sorted(found.difference(canonical))
    emotion_count = {emotion: 0 for emotion in canonical + tuple(extras)}

    if not isinstance(text, str):
        return [float('nan')] * len(emotion_count)

    #Analyze the text and normalize by total number of words
    words = text.split()
    total_words = len(words)
    for word in words:
        for emotion in emotion_dict.get(word) or ():
            emotion_count[emotion] += 1/total_words
    return list(emotion_count.values())

def get_emotion(data):
    """Add one score column per emotion, computed from 'Translated Description'.

    ``emotion_analyzer`` is run on every row and its ten values are labelled
    Sadness, Joy, Surprise, Anticipation, Fear, Negative, Anger, Trust,
    Positive, Disgust. The Positive and Negative scores are discarded; the
    other eight are written to ``data`` in place.

    The scores are assembled in a separate frame first. Previously the
    "negative" emotion was written to a column called 'Neg' and then dropped,
    which overwrote and removed the sentiment column of the same name.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'Translated Description' column.

    Returns
    -------
    pandas.DataFrame
        The same object as ``data``, with the eight emotion columns added.
    """
    emotion_dict = get_emotion_dict()
    labels = ['Sadness', 'Joy', 'Surprise', 'Anticipation', 'Fear',
              'Negative', 'Anger', 'Trust', 'Positive', 'Disgust']

    rows = [emotion_analyzer(text, emotion_dict=emotion_dict)
            for text in data['Translated Description']]
    scores = pd.DataFrame(rows, columns=labels)

    for label in labels:
        if label in ('Positive', 'Negative'):
            continue
        # to_numpy() assigns by position, so a non-unique index on `data`
        # cannot cause misalignment.
        data[label] = scores[label].to_numpy()
    return data


### Final Function

In [37]:
def final_data(max_retries=None):
    """Run the whole pipeline: download, de-duplicate, geolocate, score.

    Steps, in order: ``get_dataset`` (prompts for the date window),
    ``get_relevant``, ``add_country`` (using the border shapefile),
    ``get_sentiment`` (needs a CoreNLP server on http://localhost:9000) and
    ``get_emotion``.

    Parameters
    ----------
    max_retries : int or None, optional
        How many times the sentiment step is retried after a failure, with a
        20 second pause between attempts. ``None`` (default) retries forever,
        as before.

    Returns
    -------
    pandas.DataFrame
        The enriched article table.

    Raises
    ------
    Exception
        The last sentiment error, once ``max_retries`` is exhausted.
    """
    data = get_dataset()
    print('---------------------------------------------------')
    print('Running Relevance of data')
    data = get_relevant(data)
    print('---------------------------------------------------')
    print('Running countries')
    # Read every feature once and close the file straight away; the collection
    # used to stay open for good and was limited to a hard-coded 246 features.
    with fiona.open('./External_files/Country_borders/Borders.shp') as source:
        countries_borders = list(source)
    coord = dict()
    for i, feature in enumerate(countries_borders):
        lat = float(feature['properties']['Info de _1'])
        # NOTE(review): this field name looks mis-encoded/truncated; it is kept
        # verbatim because it has to match the shapefile's attribute table.
        lon = float(feature['properties']['Info de gÃ'])
        coord[i] = (lat,lon)
    data = add_country(data,countries_borders,coord)
    print('---------------------------------------------------')

    nlp = StanfordCoreNLP('http://localhost:9000')
    failures = 0
    while True:
        print('Running Sentiment')
        try:
            data = get_sentiment(data,nlp)
        except Exception as error:
            # `except Exception` lets KeyboardInterrupt through, so the wait
            # can be interrupted; the bare except used to swallow it.
            failures += 1
            print('Unable. Please connect to CoreNLP server')
            print('Reason:', repr(error))
            if max_retries is not None and failures > max_retries:
                raise
            time.sleep(20)
        else:
            # Reported only once the step has actually finished.
            print("Success")
            break
    print('---------------------------------------------------')
    print('Running Emotions')
    data = get_emotion(data)
    return data

In [38]:
# Run the full pipeline. The date window is asked for interactively, and the
# sentiment step keeps retrying until it succeeds.
data = final_data()

Enter the begining date (yyyy/mm/dd/hh/mm/ss): 2019/29/03/10/00/00
-
Wrong entry format.
Enter the begining date (yyyy/mm/dd/hh/mm/ss): 2019/03/29/10/00/00
Enter the ending date (yyyy/mm/dd/hh/mm/ss): 2019/03/29/10/10/00
Running: Conflict from: 2019-03-29 10:00:00 to: 2019-03-29 10:10:00
Loop number: 1 --- Data size: 10
----------------------
Running: Drought from: 2019-03-29 10:00:00 to: 2019-03-29 10:10:00
Loop number: 1 --- Data size: 1
----------------------
Running: Ecology from: 2019-03-29 10:00:00 to: 2019-03-29 10:10:00
Loop number: 1 --- Data size: 5
----------------------
Running: Genocide from: 2019-03-29 10:00:00 to: 2019-03-29 10:10:00
Loop number: 1 --- Data size: 0
----------------------
Running: HumanitarianAid from: 2019-03-29 10:00:00 to: 2019-03-29 10:10:00
Loop number: 1 --- Data size: 1
----------------------
Running: ManMadeDisasters from: 2019-03-29 10:00:00 to: 2019-03-29 10:10:00
Loop number: 1 --- Data size: 34
----------------------
Running: NaturalDisasters 

In [40]:
# Display the enriched article table produced by final_data().
data

Unnamed: 0,Original Title,Link,Original Description,Publication Date,Original Language,Category,Subcategory,Location,Translated Title,Translated Description,...,Pos,Very Pos,Sadness,Joy,Surprise,Anticipation,Fear,Anger,Trust,Disgust
0,Mobilisation palestinienne à haut risque samed...,https://www.corsematin.com/article/france-mond...,Par Adel Zaanoun. Gaza City (Palestinian Terri...,2019-03-29 11:10:00+01:00,fr,Conflict,[Conflict],31.837038 35.440538,Palestinian Mobilisation high-risk Saturday to...,Palestinian Mobilisation high-risk Saturday to...,...,0.097546,0.013919,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,الآلاف يحيون مراسم تكريم ضحايا اعتداء كرايست ت...,https://www.dw.com/ar/%D8%A7%D9%84%D8%A2%D9%84...,بحضور رئيسة وزراء نيوزيلندا جاسيندا أرديرن ونظ...,2019-03-29 11:10:00+01:00,ar,Conflict,[Conflict],48.2021 16.321,Thousands mark a ceremony honoring the victims...,Thousands mark a ceremony honoring the victims...,...,0.141569,0.024454,0.100000,0.000000,0.000000,0.000000,0.000000,0.100000,0.000000,0.000000
2,10h30 Mobilisation palestinienne à haut risque...,https://www.la-croix.com/Monde/Mobilisation-pa...,Les Palestiniens de Gaza sont appelés à se ras...,2019-03-29 11:09:00+01:00,fr,Conflict,[Conflict],31.52425 34.445808,10.30 palestinian Mobilisation high-risk Satur...,10.30 palestinian Mobilisation high-risk Satur...,...,0.062046,0.022424,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,"Trois morts lors d'une fusillade aux Comores, ...",https://la1ere.francetvinfo.fr/trois-morts-lor...,autour de la principale caserne militaire de M...,2019-03-29 11:08:00+01:00,fr,Conflict,[Conflict],-11.7005 43.2434,"Three dead in a shooting to Comoros, in full p...","Three dead in a shooting to Comoros, in full p...",...,0.014559,0.007426,0.090909,0.090909,0.000000,0.000000,0.000000,0.000000,0.000000,0.090909
4,"Charaktere, Häuser, Staffeln - &#034;Game of T...",https://www.focus.de/kultur/kino_tv/game-of-th...,"""Game of Thrones"" ist eine der erfolgreichsten...",2019-03-29 11:04:00+01:00,de,Conflict,[Conflict],30.305941 -93.66259,"Silent, houses, staggered - & # 034; Game of T...","Silent, houses, staggered - & # 034; Game of T...",...,0.220585,0.054863,0.043478,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
5,At War: Afghan War Casualty Report: March 22-28,https://www.nytimes.com/2019/03/29/magazine/af...,The following report compiles all significant ...,2019-03-29 11:04:00+01:00,en,Conflict,"[Conflict, TerroristAttack]",36.76519 68.798782,"Silent, houses, staggered - & # 034; Game of T...","Silent, houses, staggered - & # 034; Game of T...",...,0.220585,0.054863,0.043478,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
6,Afganistan'da saldırılarda 90 sivil hayatını k...,https://www.ntv.com.tr/dunya/afganistandaki-sa...,Afganistan'da son bir ayda düzenlenen hava sal...,2019-03-29 11:02:00+01:00,tr,Conflict,"[Security, Conflict, ManMadeDisasters, Terrori...",34.5309 69.136757,Palestinian Mobilisation high-risk Saturday to...,Palestinian Mobilisation high-risk Saturday to...,...,0.097546,0.013919,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
7,Gaz kaçağını çakmakla kontrol eden çift ağır y...,https://www.ntv.com.tr/turkiye/gaz-kacagini-ca...,Tüp bebek tedavisi için geldikleri Erzurum'da ...,2019-03-29 11:02:00+01:00,tr,Conflict,"[Conflict, ManMadeDisasters]",39.9044 41.2918,Palestinian Mobilisation high-risk Saturday to...,Palestinian Mobilisation high-risk Saturday to...,...,0.097546,0.013919,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
8,"2030 yılından geldiğini iddia etmişti, kehanet...",https://www.sondakika.com/fotogaleri/2030-yili...,"Noah, 2019 yılının ilk aylarında dünya genelin...",2019-03-29 11:01:00+01:00,tr,Conflict,[Conflict],38.9051 -77.0162,Palestinian Mobilisation high-risk Saturday to...,Palestinian Mobilisation high-risk Saturday to...,...,0.097546,0.013919,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
9,2018'de 62 milyon kişi olumsuz hava koşulların...,https://www.ntv.com.tr/dunya/bm-2018de-62-mily...,Birleşmiş Milletler'e (BM) bağlı Dünya Meteoro...,2019-03-29 11:02:00+01:00,tr,Drought,"[UNbodies, Ecology, NaturalDisasters]",-25.9622 32.5737,2018'de 62 milyon kişi olumsuz hava koşulların...,Birleşmiş Milletler'e (BM) bağlı Dünya Meteoro...,...,0.002817,0.008724,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
