# INM432: Big Data - Coursework (Part II): 
## Employing pySpark to UK parliament reports of House of Lords and Commons to study the effects on GBP exchange rates

### Alexandros Dimitrios Nalmpantis; Georgios Kyriakopoulos (2017)

* The file scraping process requires additional modules.
* These can be installed with the following commands in a terminal:
    pip install <"name of module"> eg: pip install tqdm
    or with conda install <"name of module"> eg: conda install tqdm

In [1]:
# Import modules for spark ML, math and operators
import re
from operator import add
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.mllib.util import MLUtils
from pyspark import SparkContext
from math import log
import time
from pprint import pprint
import sys
# Import modules for downloading links
import wget
import pandas as pd
# Import modules for scraping links
from bs4 import BeautifulSoup
import urllib.request
import re
import datetime
from datetime import date,timedelta
import os
# Import modules for parsing PDFs, progress bars and handling errors
import warnings
from tqdm import tnrange, tqdm_notebook
from tika import parser
warnings.filterwarnings("ignore", category=UserWarning, module='bs4')

### 1 Download the required pdf practicals and convert to text

###     1.1 Wrapped procedures of scraping, downloading and converting to text files

In [2]:
# Data control function that controls whether the data will be scraped or were provided by the students
def data_control(page,start_date,trg):
    """Download the daily parliament report PDFs unless the data were provided.

    Parameters
    ----------
    page : str
        URL of the web page that lists the daily report links. Only read
        when ``trg`` is not ``'yes'``.
    start_date : datetime.date
        First day to download; one download per house is attempted for every
        day from ``start_date`` up to and including today.
    trg : str
        ``'yes'`` means the data were already supplied: nothing is downloaded
        and the current folder stays the working folder. Any other value
        triggers scraping and downloading.

    Downloaded PDFs are stored in ``<current working directory>/parliament_practicals``.
    """
    if trg == 'yes':
        # os.getcwd must be *called*; passing the function object itself raises TypeError
        os.chdir(os.getcwd())
        return

    # Request the page and let BeautifulSoup extract the links it contains.
    # The parser is named explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    with urllib.request.urlopen(page) as html_page:
        soup = BeautifulSoup(html_page, 'html.parser')

    # Keep only the hyperlinks whose target starts with http://qna
    hl = [hyperlink.get('href')
          for hyperlink in soup.findAll('a', attrs={'href': re.compile("^http://qna")})]

    # Cut the house name, date and extension off the second link to get the common prefix.
    # NOTE(review): assumes hl[1] ends in 'Lords-YYYY-MM-DD.pdf' (20 characters) — confirm against the live page
    base_url = hl[1][:-20]

    # Number of days between the start date and today, used as the loop bound
    dt = date.today() - start_date

    # Folder that receives the downloaded files (created if it does not exist yet)
    download_dir = os.path.join(os.getcwd(), 'parliament_practicals')
    os.makedirs(download_dir, exist_ok=True)

    # Walk the interval one day at a time for each house and build the report URL
    for house in ('Lords', 'Commons'):
        ul = base_url + house + '-'
        for i in tnrange(dt.days + 1, desc='Downloading: ' + house):
            report_url = ul + str(start_date + timedelta(days=i)) + '.pdf'
            try:
                wget.download(report_url, download_dir)
            except Exception:
                # Best effort: there are dates on which a house does not convene
                # and the request fails, so skip that day. Exception (rather
                # than a bare except) keeps Ctrl-C able to stop the loop.
                continue
    
# function to convert the downloaded pdfs to text files
def convert_pdf_to_text(trg):
    """Convert every PDF in ``parliament_practicals`` to a text file in ``textfiles``.

    Parameters
    ----------
    trg : str
        ``'yes'`` means the data were already given in the appropriate
        format, so only a notice is printed. Any other value runs the
        conversion.

    Both folders are resolved relative to the current working directory.
    Each ``<name>.pdf`` produces ``<name>.txt``; an existing text file is
    overwritten so that re-running the conversion does not duplicate text.
    """
    if trg == 'yes': # user input in case data are already given in appropriate format
        print ('====Data were given====')
        return

    pdf_dir = os.path.join(os.getcwd(), 'parliament_practicals')
    txt_dir = os.path.join(os.getcwd(), 'textfiles')
    os.makedirs(txt_dir, exist_ok=True) # create the output folder if it is missing

    list_of_files = os.listdir(pdf_dir) # candidate files to be converted
    for i in tnrange(len(list_of_files), desc='Converting pdf to txt'): # progress bar over the files
        pdf_name = list_of_files[i]
        if not pdf_name.endswith(".pdf"): # anything other than a pdf is skipped
            continue
        parsedPDF = parser.from_file(os.path.join(pdf_dir, pdf_name)) # parse pdf file
        content = parsedPDF.get("content")
        if not content:
            # NOTE(review): presumably tika reports no content for PDFs without
            # extractable text — writing None would raise TypeError, so skip
            continue
        txt_path = os.path.join(txt_dir, pdf_name[:-4] + '.txt') # same name with extension .txt
        # 'w' (not 'a') so a second run replaces the file instead of appending to it;
        # the context manager closes the file even if the write fails
        with open(txt_path, 'w', encoding='utf-8') as text_file:
            text_file.write(content)

In [2]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

In [77]:
# Start a Chrome browser session controlled through Selenium
driver = webdriver.Chrome()
# Open the Bank of England rates page (the query string carries into=GBP)
driver.get("http://www.bankofengland.co.uk/boeapps/iadb/Rates.asp?into=GBP")
# Current date and time; the following cells read its day, month and year
today=datetime.datetime.today()    

In [74]:
# Select day 1 from dropdown start date
# The element with id "TD" is driven by typing characters into it; which
# character and how many keystrokes depends on today's day of the month.
# NOTE(review): presumably each keystroke jumps to the next option starting
# with that character — confirm against the page.
# NOTE(review): find_element_by_id may not exist in newer Selenium releases —
# confirm the installed version.
select_day = driver.find_element_by_id("TD");
if today.day==1:
    select_day.send_keys("0")
elif today.day>19 or today.day<9:
    select_day.send_keys("1")
else:
    # Only reached for days 9 to 19: "1" is sent (21 - today.day) times
    key=19-(today.day-2)
    # This assignment is overwritten immediately by the loop variable below
    i=0
    for i in range(1,key+1):
        select_day.send_keys("1")

In [75]:
# Select month june from dropdown month
# The element named "TM" is driven by typing characters into it; the
# keystrokes sent depend on whether today is before, in, or after June.
# NOTE(review): presumably each "j" keystroke moves to the next month whose
# name starts with "j" — confirm against the page.
select_month = driver.find_element_by_name("TM");
if today.month<6:
    select_month.send_keys("j")
elif today.month==6:
    select_month.send_keys("0")
else:
    # range(1, i+1) is evaluated once while i == 2, so "j" is sent twice
    i=2
    for i in range(1,i+1):
        select_month.send_keys("j")

In [76]:
# Type "2" into the element named "TY"
# NOTE(review): presumably this is the year dropdown — confirm against the page
select_year = driver.find_element_by_name("TY");
select_year.send_keys("2")

In [79]:
# First day of the period of interest: 1 June 2016
start_date = date(2016, 6, 1)
# Bare expression so that the notebook echoes the year as the cell output
start_date.year

2016

In [None]:
# Build the Bank of England query URL from today's date and the start date.
# The date components are integers, so each one is converted with str()
# before concatenation (str + int raises TypeError).
# NOTE(review): TM is fixed to "Apr" regardless of today's month — confirm this is intended.
page = (
    "http://www.bankofengland.co.uk/boeapps/iadb/index.asp?Travel=NIxRSx"
    + "&TD=" + str(today.day)
    + "&TM=Apr"
    + "&TY=" + str(today.year)
    + "&into=GBP"
    + "&CurrMonth=" + str(today.month)
    + "&startDD=" + str(start_date.day)
    + "&startMM=" + str(start_date.month)
    + "&startYYYY=" + str(start_date.year)
    + "&From=Rates&C=C8J&G0Xtop.x=1&G0Xtop.y=1"
)

###     1.2 User input of parameters and function calling

In [3]:
# If the data are already given (to save time), set the following parameter to 'yes'
trg='no'
# Link to the parliament daily questions and answers reports
page="http://www.parliament.uk/business/publications/written-questions-answers-statements/daily-reports/" # page that lists the daily report links
# Date from which reports are downloaded, up to the current date (year, month, day)
start_date = date(2016, 6, 1)

# Call the function to either download the data or keep the current folder as working folder. Please make sure
# that if the data are given, they are stored in the folder: 'parliament_practicals'
# We suggest running the scraping function, since it only takes about 2 minutes to download a year of reports

data_control(page,start_date,trg) # download the reports (or keep the provided data)
convert_pdf_to_text(trg) # convert the downloaded PDFs to text files







2017-04-11 20:31:39,522 [MainThread  ] [INFO ]  Retrieving http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server/1.14/tika-server-1.14.jar to /var/folders/0c/dkmpfbdd6h96whwytkxqnp880000gp/T/tika-server.jar.
2017-04-11 20:31:53,407 [MainThread  ] [INFO ]  Retrieving http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server/1.14/tika-server-1.14.jar.md5 to /var/folders/0c/dkmpfbdd6h96whwytkxqnp880000gp/T/tika-server.jar.md5.





### 2 Load spark and read and split files to f,w tuples

In [4]:
# try this in case of "sc undefined" errors, also works outside the Notebook environment
# getOrCreate() reuses a SparkContext that is already running and only creates
# one when none exists, so this cell can be re-run without failing.
sc = SparkContext.getOrCreate()

In [6]:
def splitFileWords(file_text): # function (a) builds (file, word) tuples from (file, text) tuples
    """Split the text of one file into lower-cased words paired with the file.

    Parameters
    ----------
    file_text : tuple
        A ``(file, text)`` pair.

    Returns
    -------
    list
        One ``(file, word)`` tuple per piece produced by splitting the text
        on runs of non-word characters. Text that starts or ends with a
        non-word character yields an empty-string word, which is kept.
    """
    f, t = file_text # unpack the (file, text) input
    # Raw string for the pattern: '\W' in a plain literal is an invalid escape sequence
    return [(f, w.lower()) for w in re.split(r'\W+', t)]

def read_file_word_RDD(argDir): # function (b) builds (file, word) tuples using function (a)
    """Read all files under ``argDir`` and return an RDD of (file, word) tuples.

    The files are loaded as (file, text) pairs with the module-level
    SparkContext ``sc``; each pair is then expanded into its (file, word)
    tuples by ``splitFileWords``.
    """
    return sc.wholeTextFiles(argDir).flatMap(splitFileWords)

# Apply function (b) to the text corpus for the analysis. The corpus is read from the
# 'textfiles' folder inside the current working directory, which is where
# convert_pdf_to_text() writes its output, instead of a path tied to one user's machine.
file_word_RDD = read_file_word_RDD(os.path.join(os.getcwd(), 'textfiles'))
pprint(file_word_RDD.take(2)) # print (file, word) tuples indicatively

[('file:/Users/squirel/Desktop/BD_Project/textfiles/Written-Questions-Answers-Statements-Daily-Report-Commons-2016-06-03.txt',
  ''),
 ('file:/Users/squirel/Desktop/BD_Project/textfiles/Written-Questions-Answers-Statements-Daily-Report-Commons-2016-06-03.txt',
  'daily')]
