# I. Data extraction/preparation

In [1]:
import json
import numpy as np
import pandas as pd

import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('max_colwidth', 800)
from ast import literal_eval


import matplotlib.pyplot as plt
%matplotlib inline

In [25]:
data_path = './Data'

In [26]:
file= data_path +'/osfstorage-archive/replicationdata/experiment_data.json'

In [27]:
# Load the replication data set. The file is opened in binary mode so that
# json.load detects the encoding (UTF-8/16/32) from the bytes instead of
# decoding with the platform's default text encoding.
with open(file, 'rb') as f:
    data = json.load(f)

In [28]:
data.keys()

dict_keys(['meta_split', 'holdin_instances', 'holdin_labels', 'meta_holdin_indices', 'holdout_instances', 'holdout_labels', 'meta_holdout_indices', 'meta_stats_alldata', 'meta_stats_holdin', 'meta_stats_holdout'])

In [29]:
data['meta_split']

{'function': 'sklearn.model_selection.train_test_split',
 'random_state': 92,
 'test_size': 0.1,
 'shuffle': True,
 'date': '2017-12-15_16:14:1513350895'}

### All data

In [30]:
list(data['meta_stats_alldata']['label_count'].keys())

['-1',
 'Profit',
 'Dividend',
 'MergerAcquisition',
 'SalesVolume',
 'BuyRating',
 'QuarterlyResults',
 'TargetPrice',
 'ShareRepurchase',
 'Turnover',
 'Debt']

### Holdin data: Sentence-level classification

In [31]:
type(data['holdin_instances']),len(data['holdin_instances'])

(list, 8943)

In [32]:
data['holdin_instances'][:3]

['It will not say what it has spent on the project , but it is unlikely to be a large share of the annual pound(s)3bn capex bill .',
 "Sir John Bond , chairman , told the bank 's annual meeting that recent rises in interest rates and a slowing housing market were affecting consumer confidence and the level of bad debts .",
 'Unilever was criticised by shareholders at its annual meeting in London last month for the lack of diversity on its board .']

In [33]:
type(data['holdin_instances']),len(data['holdin_instances'])

(list, 8943)

In [34]:
type(data['holdin_labels']),len(data['holdin_labels'])

(list, 8943)

In [35]:
data['holdin_labels'][:20]

[-1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 -1,
 ['ShareRepurchase'],
 -1,
 -1,
 -1,
 -1,
 -1,
 ['Profit'],
 ['QuarterlyResults', 'Profit'],
 -1,
 -1,
 ['Dividend']]

In [36]:
# Gather the distinct label names of the hold-in split, in order of first
# appearance. Entries that are not lists are expected to be the integer -1;
# anything else is reported with an 'error' line.
label_types = []
for sentence_labels in data['holdin_labels']:
    if type(sentence_labels) != list:
        if sentence_labels != -1:
            print('error')
        continue
    for label_name in sentence_labels:
        if label_name not in label_types:
            label_types.append(label_name)

In [37]:
label_types

['ShareRepurchase',
 'Profit',
 'QuarterlyResults',
 'Dividend',
 'BuyRating',
 'SalesVolume',
 'Turnover',
 'MergerAcquisition',
 'TargetPrice',
 'Debt']

In [38]:
len(data['meta_holdin_indices'])

8943

In [39]:
data['meta_holdin_indices'][:5]

[5750, 4305, 7684, 5360, 1887]

### Combine data

In [40]:
# Stack the hold-in split first and the hold-out split second, so the three
# lists below stay aligned position by position.
sentences = [*data['holdin_instances'], *data['holdout_instances']]
labels = [*data['holdin_labels'], *data['holdout_labels']]
# One split tag per label entry, in the same hold-in-then-hold-out order.
datatype = ['holdin'] * len(data['holdin_labels']) + ['holdout'] * len(data['holdout_labels'])

In [41]:
df = pd.DataFrame({'sentence':sentences,'label':labels,'datatype':datatype})

In [42]:
df.head()

Unnamed: 0,sentence,label,datatype
0,"It will not say what it has spent on the project , but it is unlikely to be a large share of the annual pound(s)3bn capex bill .",-1,holdin
1,"Sir John Bond , chairman , told the bank 's annual meeting that recent rises in interest rates and a slowing housing market were affecting consumer confidence and the level of bad debts .",-1,holdin
2,Unilever was criticised by shareholders at its annual meeting in London last month for the lack of diversity on its board .,-1,holdin
3,"Mr Sarin 's statement came as he announced that he planned to step down as chief executive in July from the world 's largest mobile operator by revenue , and that Vittorio Colao , Vodafone 's deputy chief executive , would succeed him .",-1,holdin
4,BASF was advised by Deutsche Bank .,-1,holdin


### Add the article id, publication date and full article text belonging to each sentence, using the full articles annotated with the Brat tool.

In [44]:
import os

# Folder holding the Brat annotation export; only its .txt files are read here.
brat_dir = data_path + "/osfstorage-archive/bratannotationfiles"

# Four index-aligned lists: entry k of each describes the same article.
title = []
file_content = []
date = []
file_id = []
# NOTE(review): os.listdir returns entries in arbitrary order, so the file_id
# values depend on the directory listing — confirm nothing relies on them
# being identical across machines.
# The loop variable is deliberately not called `file`, so the JSON path stored
# in `file` earlier in the notebook is not overwritten by this cell.
for i, entry_name in enumerate(os.listdir(brat_dir)):
    if not entry_name.endswith(".txt"):
        continue
    # Work from the bare file name instead of splitting the joined path on
    # '/', which fails where os.path.join uses a different separator.
    stem = entry_name[:-4]
    # Everything before the last underscore is the title (underscores become
    # spaces); the part after it is the date string.
    stem_parts = stem.rpartition('_')
    title.append(stem_parts[0].replace('_', ' ').strip())
    date.append(stem_parts[2])
    # The id is the position in the full listing (skipped entries included),
    # so ids are unique but not necessarily consecutive.
    file_id.append(i)
    with open(os.path.join(brat_dir, entry_name)) as f:
        # Join the lines with a space and drop the newline characters so each
        # article becomes a single line of text.
        file_content.append(' '.join(f.readlines()).replace('\n', ''))

In [45]:
df_full_corpus=pd.DataFrame({'file_id':file_id,'title':title,'date':date,'text':file_content})

In [46]:
df_full_corpus.head(2)

Unnamed: 0,file_id,title,date,text
0,1,british land looking to rebuild its portfolio,18-11-2009,"British Land looking to rebuild its portfolio REAL ESTATE An improvement in retail rents offset a fall in demand for office space at British Land in the first half , as the property company that launched a pound(s)740m rights issue in February insisted it was back in acquisitive mode . Chris Grigg , chief executive , said yesterday that a fall in the number of retailers facing administration had helped boost net income from its retail portfolio by 2.7 per cent in the six months to September 30 , although a 2.3 per cent decline in office income reduced overall like-for-like rental income growth to 0.7 per cent over the period . The company , which last year wrote off more than pound(s)3bn from the value of its estate , saw its net asset value slide by 6.5 per cent over the six months . ..."
1,3,vodafone prepares for smart move,6-11-2010,"Vodafone prepares for smart move MOBILE & TELECOMS ; News analysis ; Smartphones and sell-offs will head the group 's strategic review , writes Andrew Parker Vodafone will next Tuesday unveil the outcome of an eagerly-awaited strategic review of the mobile phone operator . Vittorio Colao , Vodafone 's chief executive , is expected to give a strong signal that he will sell more of the group 's minority stakes in overseas mobile operators He is also due to provide investors with Vodafone 's answer to the biggest question hanging over all mobile operators : how to secure long-term revenue growth from the surging consumer demand for smartphones . Mr Colao is enjoying better relations with shareholders than at the start of the year , when some were questioning his strategy . Investors are h..."


Link document metadata to sentences

In [48]:
def _lookup_doc_field(sentence, values):
    """Return the entry of ``values`` for the first article containing ``sentence``.

    ``values`` is indexed with the position of the matching text in the
    module-level ``file_content`` list. When no article text contains the
    sentence, a notice is printed and ``''`` is returned.
    """
    for i, doc_text in enumerate(file_content):
        if sentence in doc_text:
            return values[i]
    print('counld not find document')
    return ''


def find_doc(sentence):
    """Return the title of the first article whose text contains ``sentence`` ('' if none)."""
    return _lookup_doc_field(sentence, title)


def find_doc2(sentence):
    """Return the date of the first article whose text contains ``sentence`` ('' if none)."""
    return _lookup_doc_field(sentence, date)


def find_doc3(sentence):
    """Return the file id of the first article whose text contains ``sentence`` ('' if none)."""
    return _lookup_doc_field(sentence, file_id)

In [51]:
# Search the corpus once per sentence (find_doc3) and derive title and date
# from the matched file id, instead of repeating the same substring scan three
# times per sentence.
title_by_file_id = dict(zip(file_id, title))
date_by_file_id = dict(zip(file_id, date))
matched_file_id = df['sentence'].apply(find_doc3)
# find_doc3 returns '' for a sentence found in no article; .get keeps '' for
# those rows, which is what find_doc / find_doc2 return in that case.
df['title'] = matched_file_id.map(lambda fid: title_by_file_id.get(fid, ''))
df['publication_date'] = matched_file_id.map(lambda fid: date_by_file_id.get(fid, ''))
df['file_id'] = matched_file_id

In [52]:
df.head()

Unnamed: 0,sentence,label,datatype,title,publication_date,file_id
0,"It will not say what it has spent on the project , but it is unlikely to be a large share of the annual pound(s)3bn capex bill .",-1,holdin,tesco,25-09-2013,833
1,"Sir John Bond , chairman , told the bank 's annual meeting that recent rises in interest rates and a slowing housing market were affecting consumer confidence and the level of bad debts .",-1,holdin,FT other HSBC,28-05-2005,393
2,Unilever was criticised by shareholders at its annual meeting in London last month for the lack of diversity on its board .,-1,holdin,unilever to broaden mix of board food producers,24-06-2006,569
3,"Mr Sarin 's statement came as he announced that he planned to step down as chief executive in July from the world 's largest mobile operator by revenue , and that Vittorio Colao , Vodafone 's deputy chief executive , would succeed him .",-1,holdin,sarin sees growth for vodafone in africa and asia as he heads for exit,28-05-2008,950
4,BASF was advised by Deutsche Bank .,-1,holdin,basf continues acquisition drive with $3bn agreed offer for ciba,16-09-2008,980


In [53]:
list_unique_labels = list(data['meta_stats_alldata']['label_count'].keys())

In [54]:
list_unique_labels

['-1',
 'Profit',
 'Dividend',
 'MergerAcquisition',
 'SalesVolume',
 'BuyRating',
 'QuarterlyResults',
 'TargetPrice',
 'ShareRepurchase',
 'Turnover',
 'Debt']

In [55]:
# Give the frame one indicator column per label name, every row set to 0.
for label_name in list_unique_labels:
    df[label_name] = 0

In [56]:
def check_labels(row):
    """Set the indicator entries of a row that correspond to its ``label`` value.

    Parameters
    ----------
    row : pandas.Series
        One row of the sentence frame. ``row['label']`` is either a list of
        label names or the integer ``-1``.

    Returns
    -------
    pandas.Series
        A copy of ``row`` in which every entry named in the label list is set
        to 1, or the ``'-1'`` entry is set to 1 when the label is ``-1``.
        The input row itself is left unchanged, because ``DataFrame.apply``
        does not support functions that mutate the object passed to them.
    """
    encoded = row.copy()
    labels = encoded['label']
    if isinstance(labels, list):
        for label_name in labels:
            encoded[label_name] = 1
    elif labels == -1:
        encoded['-1'] = 1
    return encoded

In [57]:
df_new = df.apply(check_labels,axis=1)

### Quick Checks

In [58]:
# Indicator columns to tally in the quick check: the '-1' column followed by
# the ten label names.
multi_labels = [
    '-1',
    'Profit',
    'Dividend',
    'MergerAcquisition',
    'SalesVolume',
    'BuyRating',
    'QuarterlyResults',
    'TargetPrice',
    'ShareRepurchase',
    'Turnover',
    'Debt',
]

In [59]:
df_new[multi_labels].apply(pd.Series.value_counts)

Unnamed: 0,-1,Profit,Dividend,MergerAcquisition,SalesVolume,BuyRating,QuarterlyResults,TargetPrice,ShareRepurchase,Turnover,Debt
0,2114,9286,9755,9684,9450,9710,9670,9843,9876,9697,9877
1,7823,651,182,253,487,227,267,94,61,240,60


In [60]:
df_new.head(2)

Unnamed: 0,sentence,label,datatype,title,publication_date,file_id,-1,Profit,Dividend,MergerAcquisition,SalesVolume,BuyRating,QuarterlyResults,TargetPrice,ShareRepurchase,Turnover,Debt
0,"It will not say what it has spent on the project , but it is unlikely to be a large share of the annual pound(s)3bn capex bill .",-1,holdin,tesco,25-09-2013,833,1,0,0,0,0,0,0,0,0,0,0
1,"Sir John Bond , chairman , told the bank 's annual meeting that recent rises in interest rates and a slowing housing market were affecting consumer confidence and the level of bad debts .",-1,holdin,FT other HSBC,28-05-2005,393,1,0,0,0,0,0,0,0,0,0,0


In [61]:
# Write the combined corpus into the data folder, reusing data_path instead of
# a second hard-coded 'Data/...' string so the two locations cannot drift apart.
# NOTE(review): the list-valued 'label' column is written as its string form —
# presumably readers parse it back (e.g. with ast.literal_eval); confirm.
df_new.to_csv(data_path + '/jacobs_corpus.csv', index=False)