# Test Dataset Preparation

In [1]:
import pickle

import pandas as pd
import requests
import scrapy
from scrapy import Selector
from scrapy.crawler import CrawlerProcess

In [10]:
with open('date_dict.pickle', 'rb') as handle:
    date_dict = pickle.load(handle)
with open('place_dict.pickle', 'rb') as handle:
    place_dict = pickle.load(handle)
with open('url_dict.pickle', 'rb') as handle:
    url_dict = pickle.load(handle)    
with open('text_dict.pickle', 'rb') as handle:
    text_dict = pickle.load(handle)    

In [14]:
source_url = 'https://archivepmo.nic.in/drmanmohansingh/all-speeches.php'
html = requests.get(source_url).content
sel = Selector( text = html )

In [15]:
# Collecting urls which couldn't be scraped
url_prefix = 'https://archivepmo.nic.in/drmanmohansingh/'
url_suffixes = sel.xpath('//div[@class = "speechPan"]/ul//li').xpath('./a/@href').extract()
urls = [url_prefix +url_suffix for url_suffix in url_suffixes]

missed_urls = list(set(urls) - set(url_dict.values()))
len(missed_urls),missed_urls[:5]

(21,
 ['https://archivepmo.nic.in/drmanmohansingh/speech-details.php?nodeid=734',
  'https://archivepmo.nic.in/drmanmohansingh/speech-details.php?nodeid=922',
  'https://archivepmo.nic.in/drmanmohansingh/speech-details.php?nodeid=1016',
  'https://archivepmo.nic.in/drmanmohansingh/speech-details.php?nodeid=664',
  'https://archivepmo.nic.in/drmanmohansingh/speech-details.php?nodeid=873'])

In [16]:
# Checking if speeches can be scraped, and if scraped are they of sufficent length. if not urls dropped 
for url_speech in missed_urls:
    html_speech = requests.get( url_speech ).content
    sel = Selector(text = html_speech)
    
    try:
        ((sel.xpath('//*[@class = "innerHead"]/text()')).extract_first())
        ((sel.xpath('//div[@class = "contentInner"]//h2[@class = "date"]/text()'))[0].extract())
        ((sel.xpath('//div[@class = "contentInner"]//h2[@class = "date"]/text()'))[1].extract())
        
        text = "".join((sel.css('div.contentInner div.rt')).css('p::text').extract())
        if len(text)<500:
            missed_urls.remove(url_speech)
            print("Something went wrong. Removing the url")
    except:
        print("Something went wrong. Removing the url")
        missed_urls.remove(url_speech)

Something went wrong. Removing the url
Something went wrong. Removing the url
Something went wrong. Removing the url


In [17]:
len(missed_urls)

18

In [18]:
#Creating another spider to collect the missed speeches so that they can be made into a test set
class MMS_Missed_Speech_Spider( scrapy.Spider ):
    name = 'mms_missed_speeches_spider'
    def start_requests( self ):
        for url in missed_urls:
            yield scrapy.Request( url = url, callback = self.parse )
    def parse( self, response ):
        #Extracting url
        speech_url = response.url
        
        #Extracting title of speech
        title = (response.xpath('//*[@class = "innerHead"]/text()')).extract_first()
        
        #Extracting date of speech
        date = (response.xpath('//div[@class = "contentInner"]//h2[@class = "date"]/text()'))[0].extract()
        
        #Extracting place of speech
        place = (response.xpath('//div[@class = "contentInner"]//h2[@class = "date"]/text()'))[1].extract()
        
        #Extracting speech text
        text  = "".join((response.css('div.contentInner div.rt')).css('p::text').extract())
        
        #Storing in respective dict_testionaries
        place_dict_test[title] = place
        date_dict_test[title] = date
        url_dict_test[title] = speech_url
        text_dict_test[title] = text

In [19]:
date_dict_test = {}
place_dict_test = {}
text_dict_test = {}
url_dict_test = {}
process = CrawlerProcess()
process.crawl(MMS_Missed_Speech_Spider)
process.start()

2020-04-09 12:17:13 [scrapy.utils.log] INFO: Scrapy 1.8.0 started (bot: scrapybot)
2020-04-09 12:17:13 [scrapy.utils.log] INFO: Versions: lxml 4.2.1.0, libxml2 2.9.8, cssselect 1.1.0, parsel 1.5.2, w3lib 1.21.0, Twisted 19.10.0, Python 3.6.5 |Anaconda custom (64-bit)| (default, Mar 29 2018, 13:32:41) [MSC v.1900 64 bit (AMD64)], pyOpenSSL 17.5.0 (OpenSSL 1.1.1d  10 Sep 2019), cryptography 2.4.2, Platform Windows-10-10.0.18362-SP0
2020-04-09 12:17:13 [scrapy.crawler] INFO: Overridden settings: {}
2020-04-09 12:17:13 [scrapy.extensions.telnet] INFO: Telnet Password: b26f59253623eb65
2020-04-09 12:17:14 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.logstats.LogStats']
2020-04-09 12:17:16 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrap

In [20]:
len(date_dict_test),len(place_dict_test),len(text_dict_test),len(url_dict_test)

(13, 13, 13, 13)

In [21]:
#Creating test dataset
date_df = pd.DataFrame.from_dict(date_dict_test, orient='index',columns= ['date'])
place_df = pd.DataFrame.from_dict(place_dict_test, orient='index', columns= ['place'])
text_df = pd.DataFrame.from_dict(text_dict_test, orient='index', columns= ['text'])
url_df = pd.DataFrame.from_dict(url_dict_test, orient='index', columns= ['url'])

df_combined_test = pd.concat([date_df, place_df,url_df,text_df], axis=1, sort=False, join = 'inner')
df_combined_test.index.rename('title',inplace=True)
df_combined_test.reset_index(drop = False,inplace= True)

df_combined_test.head()

Unnamed: 0,title,date,place,url,text
0,PM's address to IPS Probationers,"January 5, 2010",New Delhi,https://archivepmo.nic.in/drmanmohansingh/spee...,"From your introductions, I am very encouraged ..."
1,Excerpts of address by the PM at the Combined ...,"September 13, 2010",New Delhi,https://archivepmo.nic.in/drmanmohansingh/spee...,The Nation is proud of the selfless devotion t...
2,PM's opening remarks at the All Party Meeting,"November 30, 2008",New Delhi,https://archivepmo.nic.in/drmanmohansingh/spee...,"""Esteemed Chairperson UPA, respected colleague..."
3,PM's address to the Nation,"May 17, 2014",New Delhi,https://archivepmo.nic.in/drmanmohansingh/spee...,"\r\n\tMy Fellow Citizens,\r\n\tI address you t..."
4,PM's opening remarks at the Full Planning Comm...,"April 21, 2011",New Delhi,https://archivepmo.nic.in/drmanmohansingh/spee...,"""This meeting of the Planning Commission has b..."


In [22]:
#Storing test dataset into a file
df_combined_test.to_excel('PM_MMS_Speech_test.xlsx',index=False)