## Take the given dirty raw data and refine it.

In [224]:
import pickle as pkl
import pandas as pd
import numpy as np
import bson
import datetime
from dateutil.parser import parse
from datetime import timedelta

In [210]:
# 1 is Pro, -1 is Anti
def getLabeledData(csvFile, pklFile):
    """Collapse per-statement stance predictions into one row per article ID.

    The CSV must provide the columns ``ID``, ``Label`` and ``Statement``.
    ``PRO`` labels count as +1 and ``ANTI`` labels as -1; the sign of the
    per-ID total decides the article stance.

    Args:
        csvFile: Path of the predictions CSV to read.
        pklFile: Not used by this function; kept so existing calls still work.

    Returns:
        DataFrame indexed by ``ID`` with two columns: ``Label`` (1, 0 or -1)
        and ``Texts`` (list of that ID's statements).
    """
    frame = pd.read_csv(csvFile)

    # Turn the textual stance into a signed vote, one label word at a time.
    for word, vote in (('PRO', 1), ('ANTI', -1)):
        frame['Label'] = frame['Label'].replace([word], vote)

    perArticle = frame.groupby(['ID'])
    totals = perArticle['Label'].sum()

    # Majority vote: positive total -> 1, tie -> 0, negative total -> -1.
    verdict = np.select(
        condlist=[totals > 0, totals == 0, totals < 0],
        choicelist=[1, 0, -1],
        default=np.nan,
    )

    summary = pd.DataFrame({'Label': verdict}, index=totals.index)
    summary['Texts'] = perArticle['Statement'].agg(list)
    return summary

In [212]:
## Match each labelled article against the big corpus to recover the full article;
## articles that cannot be found in the corpus are disregarded.
def getFields(article_texts, bsonFile):
    """Look up the full corpus document behind each group of sentences.

    A corpus document matches when every sentence of the group is a substring
    of the document's ``text`` field; the first matching document wins.

    Args:
        article_texts: Iterable of sentence lists, one list per article.
        bsonFile: Path of a BSON dump whose documents carry the keys ``text``,
            ``articleTitle``, ``publishedDate`` and ``articleUrl``.

    Returns:
        Tuple ``(texts, titles, urls, dates)`` of lists aligned with
        ``article_texts``. Articles without a match get ``''`` for text,
        title and URL and ``np.nan`` for the date.
    """
    with open(bsonFile, 'rb') as f:
        corpus = bson.decode_all(f.read())

    texts, titles, urls, dates = [], [], [], []

    for sents in article_texts:
        match = None
        # An empty sentence list would vacuously match the first document,
        # so treat it as "not found" instead.
        if len(sents) > 0:
            # Generator expressions let all() stop at the first missing
            # sentence and next() stop at the first matching document.
            match = next(
                (doc for doc in corpus
                 if all(sent in doc['text'] for sent in sents)),
                None,
            )

        if match is None:
            texts.append('')
            titles.append('')
            dates.append(np.nan)
            urls.append('')
        else:
            texts.append(match['text'])
            titles.append(match['articleTitle'])
            dates.append(match['publishedDate'])
            urls.append(match['articleUrl'])

    return texts, titles, urls, dates

In [254]:
def cleanUp(df):
    """Return a copy of ``df`` restricted to usable, stance-bearing articles.

    The ``Texts`` column is dropped, and rows are kept only when ``Label`` is
    not 0 and ``text`` is not the empty string. The input frame is not
    modified.
    """
    trimmed = df.drop(columns=['Texts'])
    keep = (trimmed['Label'] != 0) & (trimmed['text'] != '')
    return trimmed.loc[keep]

## Articles collected, and mapped to a simulated timeline
def joinDfs(df1, df2, numDays=90, startDate=datetime.date(2011, 7, 1),
            randomState=None):
    """Merge two cleaned frames, shuffle them and lay them on a fake timeline.

    Both inputs need the columns ``Label``, ``text``, ``title``, ``url`` and
    ``date``. Labels of -1 are rewritten to 0 and each label is wrapped in a
    one-element list. The original ``date`` column is kept as ``actualDate``;
    the new ``date`` column spreads the shuffled rows evenly, in row order,
    over ``numDays`` consecutive days beginning at ``startDate``.

    Args:
        df1: First frame to merge.
        df2: Second frame to merge.
        numDays: Length of the simulated timeline in days (default 90).
        startDate: First day of the simulated timeline (default 2011-07-01).
        randomState: Seed forwarded to ``DataFrame.sample``; leave as ``None``
            for a different shuffle on every call, set it for reproducibility.

    Returns:
        DataFrame with a fresh 0..n-1 index and the columns
        ``id, Label, text, date, title, url, actualDate``.
    """
    df = pd.concat([df1, df2])
    df['Label'] = df['Label'].replace([-1], 0)
    df = df.rename(columns={'date': 'actualDate'})

    # Shuffle so that the two sources are interleaved on the timeline.
    df = df.sample(frac=1, random_state=randomState).reset_index(drop=True)

    # Row i lands on day floor(i * numDays / total); integer arithmetic keeps
    # the day boundaries exact instead of relying on float division.
    total = len(df)
    df['date'] = [
        startDate + timedelta(days=(i * numDays) // total)
        for i in range(total)
    ]

    # Transform the data structure to the expected output format.
    df.insert(0, 'id', df.index)
    df['Label'] = [[label] for label in df['Label'].astype(int)]
    return df[['id', 'Label', 'text', 'date', 'title', 'url', 'actualDate']]

In [211]:
# Load the per-article stance tables for both topics.
demon, aadhar = (
    getLabeledData("demon-predictions.csv", 'demon.pkl'),
    getLabeledData("aadhar-predictions.csv", 'aadhar.pkl'),
)

In [213]:
# getFields returns (texts, titles, urls, dates); store each list as a column.
(demon['text'], demon['title'], demon['url'], demon['date']) = getFields(
    demon['Texts'], '../Base Implementation/data/demonetization-all.bson')
(aadhar['text'], aadhar['title'], aadhar['url'], aadhar['date']) = getFields(
    aadhar['Texts'], '../Base Implementation/data/aadhar-all.bson')

In [255]:
# Apply the same clean-up to both topic frames.
demonCl, aadharCl = map(cleanUp, (demon, aadhar))

In [261]:
# Resulting columns: id, Label, text, date, title, url, actualDate
joined = joinDfs(demonCl, aadharCl)
joined.head(n=5)

Unnamed: 0,id,Label,text,date,title,url,actualDate
0,0,[1],Union Minister for Electronics & Information T...,2011-07-01,100 government websites to be differently-able...,http://indianexpress.com/article/india/india-n...,2016-08-24
1,1,[0],By giving them all the pro-poor subsidies and ...,2011-07-01,"Rs 36,000 crore saved in one year through Jan ...",http://indianexpress.com/article/india/india-n...,2016-11-14
2,2,[1],"'UID data collection accurate'New Delhi, Jan 2...",2011-07-01,'UID data collection accurate',http://www.deccanherald.com/content/221680/uid...,2012-01-23
3,3,[0],Punjab Congress chief Captain Amarinder Singh ...,2011-07-01,PM Modi Not Serious about Farmers' Woes: Capta...,http://www.hindustantimes.com/punjab/pm-modi-n...,2017-01-02
4,4,[1],NEW DELHI: Four days before Prime Minister Nar...,2011-07-01,Aadhaar gets a lifeline as Nandan Nilekani imp...,http://timesofindia.indiatimes.com//india/Aadh...,2014-11-21


In [262]:
# Convert the joined frame to a plain array of rows; columns keep the order
# id, Label, text, date, title, url, actualDate.
data = joined.values

In [266]:
# Write the array to disk; the context manager closes (and flushes) the file
# even if pickling fails.
with open("data.pkl", "wb") as outFile:
    pkl.dump(data, outFile)