In [None]:
import os
import time
from ssl import *
from urllib.parse import unquote

import numpy as np
import pandas as pd
import requests
import wikipedia
import wikipediaapi
from bs4 import BeautifulSoup
from wikipedia.exceptions import PageError, DisambiguationError

# Browser-like HTTP headers, passed as `headers=` when fetching pages to scrape.
headers_Get = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:49.0) Gecko/20100101 Firefox/49.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate',
    'DNT': '1',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
}

# 0. Helper functions

In [None]:
def _parse_event_text(text, month_map):
    """Split one scraped event string into ``(month, day, content)``.

    The text is expected to look like ``"Jan 1 Something happened"``: an
    abbreviated month, a day, then the description, separated by spaces.

    Returns ``None`` when the text has no day token or does not start with
    a month abbreviation present in ``month_map``.
    """
    parts = text.strip().split(" ", 2)
    if len(parts) < 2 or parts[0] not in month_map:
        return None
    content = parts[2] if len(parts) == 3 else ""
    return month_map[parts[0]], parts[1], content


def _scrape_year(year, month_map, timeout=30):
    """Download the onthisday.com page of ``year`` and return its events
    as a list of ``{'Year', 'Month', 'Day', 'Content'}`` dicts."""
    url = 'https://www.onthisday.com/date/{}'.format(year)
    # One session per year, closed as soon as the page has been downloaded.
    with requests.Session() as session:
        response = session.get(url, headers=headers_Get, timeout=timeout)
    if not response.ok:
        # An error page contains no events; report it rather than fail silently.
        print("Skipping {}: HTTP {} for {}".format(year, response.status_code, url))
        return []

    soup = BeautifulSoup(response.text, "html.parser")
    rows = []
    # Every event of the page is an <li class="event"> element.
    for item in soup.find_all('li', {'class': 'event'}):
        parsed = _parse_event_text(item.get_text(), month_map)
        if parsed is None:
            # Entry that does not start with "<Mon> <day>": cannot be dated.
            continue
        month, day, content = parsed
        rows.append({'Year': year, 'Month': month, 'Day': day, 'Content': content})
    return rows


def scrap_events(df, years=None):
    """ Gather events data from the website https://www.onthisday.com/,
        reading every element of the yearly pages whose class is `event`.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame the scraped events are appended to. It is not modified.
        years : iterable of int, optional
            Years to scrape. Defaults to 1965 through 2015 inclusive.

        Returns
        -------
        pandas.DataFrame
            A new frame holding the rows of `df` followed by one row per
            scraped event (`Year`, `Month` as full name, `Day` as scraped
            text, `Content`). `df` itself is returned when nothing was
            scraped.
    """
    # Map the abbreviated month names used by the site to full month names
    month_map = {'Jan': 'January', 'Feb': 'February', 'Mar': 'March', 'Apr': 'April',
                 'May': 'May', 'Jun': 'June', 'Jul': 'July', 'Aug': 'August',
                 'Sep': 'September', 'Oct': 'October', 'Nov': 'November', 'Dec': 'December'}
    if years is None:
        years = range(1965, 2016)

    # Collect plain records and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    records = []
    for year in years:
        records.extend(_scrape_year(year, month_map))

    if not records:
        return df
    return pd.concat([df, pd.DataFrame(records)], ignore_index=True)

In [None]:
def fill_wikipedia_values(df, max_retries=3, retry_delay=1.0):
    """ Fill the `Wikipedia` (url) and `Summary` columns with the related
        Wikipedia article of each event.

        For every row where `Wikipedia` or `Summary` is missing, the first
        result of a Wikipedia search for "<Day> <Month> <Year> <Content>"
        is loaded and its url and summary are written to that row.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame with the columns `Year`, `Month`, `Day`, `Content`,
            `Wikipedia` and `Summary`. It is modified in place.
        max_retries : int, optional
            Number of attempts per row when a lookup fails with an
            unexpected error before the row is left unfilled.
        retry_delay : float, optional
            Seconds to wait between two attempts on the same row.

        Returns
        -------
        pandas.DataFrame
            The same `df` object. Rows without a search result, rows whose
            result raises `PageError` / `DisambiguationError`, and rows that
            keep failing are left untouched.
    """
    # Columns holding only NaN are numeric; make them able to hold text
    # before strings are written into them.
    for col in ('Wikipedia', 'Summary'):
        if pd.api.types.is_numeric_dtype(df[col]):
            df[col] = df[col].astype(object)

    for idx, row in df.iterrows():
        if not (pd.isna(row['Wikipedia']) or pd.isna(row['Summary'])):
            continue
        query = "{} {} {} {}".format(row['Day'], row['Month'], row['Year'], row['Content'])

        # Retry only this row, a bounded number of times, instead of
        # restarting the whole scan forever on any error.
        for attempt in range(1, max_retries + 1):
            try:
                hits = wikipedia.search(query, results=1)
                if hits:
                    page = wikipedia.page(hits[0])
                    # Read both values before writing so a failure cannot
                    # leave the row with a url but no summary.
                    url, summary = page.url, page.summary
                    # Write to this row only: the query depends on the date,
                    # so rows sharing the same text must not be overwritten.
                    df.loc[idx, 'Wikipedia'] = url
                    df.loc[idx, 'Summary'] = summary
                break
            except (PageError, DisambiguationError):
                # No usable article for this result; leave the row empty.
                break
            except Exception as exc:  # KeyboardInterrupt is not caught
                if attempt == max_retries:
                    print("Giving up on row {} after {} attempts: {!r}".format(
                        idx, max_retries, exc))
                else:
                    time.sleep(retry_delay)
    return df

# 1. Fill in url and summary

In [None]:
try:
    # Reuse the dataset written by a previous run when it is available.
    df_event = pd.read_csv('datasets/events/events_full.csv', sep=',', index_col=0)
except FileNotFoundError:
    # Only a missing file triggers the scrape; any other failure (corrupt
    # file, interrupt, ...) surfaces instead of starting a long re-scrape.
    # We create an empty dataframe
    columns = ['Year', 'Month', 'Day', 'Content', 'Wikipedia', 'Summary']
    df_event = pd.DataFrame(columns=columns)
    df_event = scrap_events(df_event)
    df_event = fill_wikipedia_values(df_event)

# 2. Get the wikipedia url by hand

For the entries that are still missing a summary, we looked up the Wikipedia URL by hand. We then extract the article's summary for each of those URLs.

In [None]:
# Keep the events that have no summary yet and report how many there are.
summaries_nan = df_event.loc[df_event['Summary'].isna()]
print("We have {} missing summaries".format(summaries_nan.shape[0]))

In [None]:
wiki_wiki = wikipediaapi.Wikipedia('en')

# For each event without a summary, derive the article title from its
# Wikipedia URL and copy the article's summary into df_event.
for _, row in summaries_nan.iterrows():
    url = row.loc['Wikipedia']
    # Rows without a URL hold NaN (a float); there is nothing to look up.
    if not isinstance(url, str) or '/wiki/' not in url:
        continue
    # The title is what follows '/wiki/', without any '#section' fragment
    # and with percent-escapes (e.g. %27) decoded back to characters.
    title = unquote(url.split('/wiki/', 1)[1].split('#', 1)[0])
    if not title:
        continue
    page = wiki_wiki.page(title)
    try:
        # `exists` is a method: without the call the test is always true.
        if page.exists():
            df_event.loc[df_event['Wikipedia'] == url, 'Summary'] = page.summary
    except KeyError:
        # Kept from the original best-effort handling: skip this page when
        # reading it raises KeyError.
        continue

In [None]:
# Recompute the events without a summary and report the remaining count.
summaries_nan = df_event.loc[df_event['Summary'].isna()]
print("We have {} missing summaries".format(summaries_nan.shape[0]))

In [None]:
# Create the target folder first: on a first run it may not exist yet, and
# to_csv would otherwise fail after all the scraping has been done.
os.makedirs("datasets/events", exist_ok=True)
df_event.to_csv("datasets/events/events_full.csv", sep=',')