## Get Data

* Download page from Wikipedia
* Parse table data into a proper DataFrame
* Get latitude and longitude data from Google Maps API (using the location names)
* Get latitude and longitude data from Wikipedia using locations pages (extract `.geo-dec`)
* Compare lat and lon data, ensure they are consistent, get missing ones manually
* Save DataFrame as a clean csv file, ready to use

In [3]:
import datetime
import re
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [4]:
page = 'https://en.wikipedia.org/wiki/List_of_events_named_massacres'
resp = requests.get(page)
soup = BeautifulSoup(resp.text, 'lxml')

### Dates

TODO: 

* Fix the first date to be BC somehow


#### Dates

In [5]:
beg_of_rows =  list(range(0, 412, 5)) + list(range(414, 1500, 5))
table_data = soup.select('td')[2:1500]
dates = []
for i, row in enumerate(beg_of_rows):
    if i == 277:
        break
    if i == 82:
        continue
    if table_data[row].text == '1350mid-14th century':
        dates.append(datetime.datetime(1350,1,1, 0, 0, 0))
        continue
#         print(table_data[row].text)
    try:
#         print(re.findall('[0-9]{4}-\d\d-\d\d', table_data[row].text)[0])
        date = datetime.datetime.strptime((re.findall('[0-9]{4}-\d\d-\d\d', table_data[row].text)[0]), '%Y-%m-%d') 
#         print(i, date, type(date))
        dates.append(date)
    except Exception:
#         print(i, table_data[row].text, type(table_data[row].text))
        dates.append(table_data[row].text)
print(len(dates))
print(set([type(x) for x in dates]))
dates[:5]

276
{<class 'datetime.datetime'>}


[datetime.datetime(9912, 1, 1, 0, 0),
 datetime.datetime(61, 1, 1, 0, 0),
 datetime.datetime(390, 1, 1, 0, 0),
 datetime.datetime(627, 1, 1, 0, 0),
 datetime.datetime(782, 1, 1, 0, 0)]

#### Locations

In [20]:
len(dates), len(locations), len(loc_links), len(names), len(name_links), len(deaths), len(descriptions)

(276, 276, 276, 276, 276, 276, 276)

In [6]:
beg_of_rows = list(range(0, 412, 5)) + list(range(414, 1500, 5))
table_data = soup.select('td')[2:1500]
locations = []
for i, row in enumerate(beg_of_rows):
    if i == 82:
        continue
    if i ==277:
        break
    try:
        location = table_data[row+1].contents[1].text
#         print('\n', location, '\n=========')
        locations.append(location)
    except Exception as e:
#         print(table_data[row+1].text)
        locations.append(table_data[row+1].text)
#         print(i, e)

print(len(locations))
locations[:5]

276


['Kingdom of Pontus',
 'Anglesey, Britannia',
 'Thessaloniki, Macedonia',
 'Fortress of Banu Qurayza, Saudi Arabia',
 'Verden, Lower Saxony, Germany']

#### Location links

In [7]:
beg_of_rows = list(range(0, 412, 5)) + list(range(414, 1500, 5))
table_data = soup.select('td')[2:1500]
loc_links = []
link_exceptions = []
for i, row in enumerate(beg_of_rows):
    if i == 82:
        continue
    if i == 277:
        break
    try:
        loc_lnk = table_data[row+1].contents[1].a['href']
#         print('\n', loc_lnk, '\n=========')
        loc_links.append(loc_lnk)
    except Exception as e:
#         print(table_data[row+1].text)
        loc_links.append(table_data[row+1].text)
        link_exceptions.append(i)
#         print(i, e)
#         print('\n==========')

print(len(loc_links))
loc_links[:5]

276


['/wiki/Kingdom_of_Pontus',
 '/wiki/Anglesey',
 '/wiki/Thessaloniki',
 '/wiki/Saudi_Arabia',
 '/wiki/Verden_an_der_Aller']

#### Names

In [8]:
beg_of_rows = list(range(0, 412, 5)) + list(range(414, 1500, 5))
table_data = soup.select('td')[2:1500]

names = []
name_exceptions = []
for i, row in enumerate(beg_of_rows):
    if i == 82:
        continue
    if i == 277:
        break
    try:
        name = table_data[row+2].contents[0].text
#         print('\n', name, '\n=========')
        names.append(name)
    except Exception as e:
#         print(table_data[row+2].text)
        names.append(table_data[row+2].text)
        name_exceptions.append(i)
#         print(i, e)
#         print('\n========')

print(len(names))
names[:5]

276


['Asiatic Vespers',
 'Menai massacre',
 'Massacre of Thessaloniki',
 'Massacre of Banu Qurayza',
 'Massacre of Verden']

#### Name links

In [9]:
beg_of_rows = list(range(0, 412, 5)) + list(range(414, 1500, 5))
table_data = soup.select('td')[2:1500]


name_links = []
name_link_exceptions = []
for i, row in enumerate(beg_of_rows):
    if i == 82:
        continue
    if i == 277:
        break

    try:
        name_link = table_data[row+2].contents[0]['href']
#         print('\n', name_link, '\n=========')
        name_links.append(name_link)
    except Exception as e:
#         print(table_data[row+2].text)
        name_links.append(table_data[row+2].text)
        name_link_exceptions.append(i)
#         print(i, e)

print(len(name_links))
name_links[:5]

276


['/wiki/Asiatic_Vespers',
 '/wiki/Menai_massacre',
 '/wiki/Massacre_of_Thessaloniki',
 '/wiki/Invasion_of_Banu_Qurayza',
 '/wiki/Massacre_of_Verden']

#### Deaths

In [10]:
beg_of_rows = list(range(0, 412, 5)) + list(range(414, 1500, 5))
table_data = soup.select('td')[2:1500]


deaths = []
deaths_exceptions = []
for i, row in enumerate(beg_of_rows):
    if i == 82:
        continue
    if i == 277:
        break
    if i == 25:
        deaths.append(1_000_000)
        continue
    try:
        death = int(table_data[row+3].contents[0].text.replace(',', '').replace('♠', ''))
#         print('\n', death, '\n=========')
        deaths.append(death)
    except Exception as e:
#         print(table_data[row+3])
        deaths.append(table_data[row+3])
        deaths_exceptions.append(i)
#         print(i, e)
print(set([type(x) for x in deaths]))
print(len(deaths))
deaths[:5]

{<class 'int'>}
276


[80000, 1, 7000, 600, 4500]

#### Descriptions

In [11]:
beg_of_rows = list(range(0, 412, 5)) + list(range(414, 1500, 5))
table_data = soup.select('td')[2:1500]


descriptions = []
desc_exceptions = []
for i, row in enumerate(beg_of_rows):
    if i == 82:
        continue
    if i == 277:
        break

    try:
        desc = table_data[row+4].text
#         print('\n', desc, '\n=========')
        descriptions.append(desc)
    except Exception as e:
#         print('ERROR##########################################')
#         print(table_data[row+4])
        descriptions.append(table_data[row+4])
        desc_exceptions.append(i)
#         print(i, e)

print(len(descriptions))
descriptions[:3]

276


['Wholesale massacre of all Roman and Italic citizens in Asia Minor, starting the Mithridatic Wars.',
 'Gaius Suetonius Paulinus ordered the Roman army to destroy the Celtic Druid stronghold on Anglesey in Britain, sacking Druidic colleges and sacred groves. The massacre helped impose Roman religion on Britain and sent Druidism into a decline from which it never recovered.[5][6]',
 'Emperor Theodosius I of Rome ordered the executions after the citizens of Thessaloniki murdered a top-level military commander during a violent protest against the arrest of a popular charioteer.[8][9]']

In [17]:
massacres_df =  pd.DataFrame({
    'date': dates,
    'location': locations,
    'location_link': loc_links,
    'name': names,
    'name_link': name_links,
    'deaths': deaths,
    'description': descriptions
})
print(massacres_df.shape)
massacres_df.to_csv('massacres.csv', index=False)
massacres_df.head(3)


(276, 7)


Unnamed: 0,date,deaths,description,location,location_link,name,name_link
0,9912-01-01 00:00:00,80000,Wholesale massacre of all Roman and Italic cit...,Kingdom of Pontus,/wiki/Kingdom_of_Pontus,Asiatic Vespers,/wiki/Asiatic_Vespers
1,0061-01-01 00:00:00,1,Gaius Suetonius Paulinus ordered the Roman arm...,"Anglesey, Britannia",/wiki/Anglesey,Menai massacre,/wiki/Menai_massacre
2,0390-01-01 00:00:00,7000,Emperor Theodosius I of Rome ordered the execu...,"Thessaloniki, Macedonia",/wiki/Thessaloniki,Massacre of Thessaloniki,/wiki/Massacre_of_Thessaloniki


### Lat / Lon Extraction

#### Google Maps API

In [30]:
massacres_df = pd.read_csv('massacres.csv')
massacres_df['date'] = [datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S') for x in massacres_df['date']]
set([type(x) for x in massacres_df['date']])

{datetime.datetime}

In [173]:
%%writefile api_key.py 

GOOGLE_API_KEY = 'AIzaSyCDLNYahchpE02xUD3wSxFz39N06yzePN8'

Writing api_key.py


In [31]:
from api_key import GOOGLE_API_KEY

BASE_URL = 'https://maps.googleapis.com/maps/api/geocode/json?key=' + GOOGLE_API_KEY + '&address='

responses = []
resp_exceptions = []


for i, loc in enumerate(massacres_df['location']):
    url = BASE_URL + loc
    try:
        resp = requests.get(url)
        responses.append(resp)
        if not i % 25:
            print(i, '/', len(massacres_df))
    except Exception as e:
        responses.append(url)
        resp_exceptions.append((url, e))

0 / 276
25 / 276
50 / 276
75 / 276
100 / 276
125 / 276
150 / 276
175 / 276
200 / 276
225 / 276
250 / 276
275 / 276


In [66]:
lat_long = []

ERRORS_COUNT = 0
resp_json_payload = [resp.json() if not isinstance(resp, str) else '' for resp in responses]
for json in resp_json_payload:
    try:
        lat_long.append((json['results'][0]['geometry']['location']['lng'],
                         json['results'][0]['geometry']['location']['lat'])
                        )
#         print(('lat:', json['results'][0]['geometry']['location']['lat'],
#               'lng:', json['results'][0]['geometry']['location']['lng'], '\n'))
#         print('========')
    except Exception:
        ERRORS_COUNT += 1
        lat_long.append(('', ''))
#         print('ERROR ########################')
#         print(json)
#         print('########################')


ERRORS_COUNT

38

In [108]:
massacres_df['longitude'] = [x[0] for x in lat_long]
massacres_df['latitude'] = [x[1] for x in lat_long]

massacres_latlong = massacres_df[massacres_df['latitude'].astype(str) != '']
massacres_latlong['latitude'] = massacres_latlong['latitude'].astype(float)
massacres_latlong['longitude'] = massacres_latlong['longitude'].astype(float)
massacres_latlong.dtypes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


date              object
deaths             int64
description       object
location          object
location_link     object
name              object
name_link         object
latitude         float64
longitude        float64
wiki_lat          object
wiki_lon          object
dtype: object

In [73]:
geodec_responses = []
geo_exceptions = []
wiki_base = 'https://en.wikipedia.org'

for i, city in enumerate(loc_links):
    if not i%25:
        print(i, '/', len(loc_links))
    try:
        resp = requests.get(wiki_base + city)
        geodec_responses.append(resp)
    except Exception as e:
        geodec_responses.append('')
        geo_exceptions.append([city, e])


0 / 276
25 / 276
50 / 276
75 / 276
100 / 276
125 / 276
150 / 276
175 / 276
200 / 276
225 / 276
250 / 276
275 / 276


In [101]:
wiki_lat_long = []
for i, resp in enumerate(geodec_responses):
    if not i% 25:
        print(i, '/', len(geodec_responses))
    if resp == '':
        wiki_lat_long.append('')
        continue
    try:
        soup = BeautifulSoup(resp.text, 'lxml')
        wiki_lat_long.append(soup.select('.geo-dec')[0])
    except Exception as e:
        wiki_lat_long.append('')


0 / 276
25 / 276
50 / 276
75 / 276
100 / 276
125 / 276
150 / 276
175 / 276
200 / 276
225 / 276
250 / 276
275 / 276


In [171]:
wiki_lat_long_text =  [x.text if x else '' for x in wiki_lat_long]
wiki_lat_long_text = [x.replace('°', '') if x else '' for x in wiki_lat_long_text]
wiki_lat_long_text = [x.split() if x else '' for x in wiki_lat_long_text]
wiki_lat_long_text[:15]

In [103]:
import re

wiki_lat = [float(re.sub('S|N', '', x[0])) if x else '' for x in wiki_lat_long_text]
wiki_lon = [float(re.sub('W|E', '', x[1])) if x else '' for x in wiki_lat_long_text]

In [172]:
len(wiki_lat), len(wiki_lat_long_text), len(wiki_lat_long)
[x.text if x else '' for x in wiki_lat_long[:10]]

In [106]:
massacres_df['wiki_lat'] = wiki_lat
massacres_df['wiki_lon'] = wiki_lon


In [115]:
# massacres_df.to_csv('massacres.csv', index=False)
massacres_df = pd.read_csv('massacres.csv')
massacres_df['date'] = [datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S') for x in massacres_df['date']]
massacres_df.head(3)

Unnamed: 0,date,deaths,description,location,location_link,name,name_link,latitude,longitude,wiki_lat,wiki_lon
0,9912-01-01 00:00:00,80000,Wholesale massacre of all Roman and Italic cit...,Kingdom of Pontus,/wiki/Kingdom_of_Pontus,Asiatic Vespers,/wiki/Asiatic_Vespers,,,,
1,0061-01-01 00:00:00,1,Gaius Suetonius Paulinus ordered the Roman arm...,"Anglesey, Britannia",/wiki/Anglesey,Menai massacre,/wiki/Menai_massacre,53.265325,-4.42914,,
2,0390-01-01 00:00:00,7000,Emperor Theodosius I of Rome ordered the execu...,"Thessaloniki, Macedonia",/wiki/Thessaloniki,Massacre of Thessaloniki,/wiki/Massacre_of_Thessaloniki,40.640063,22.944419,40.65,22.9


In [169]:
massacres_df['lat_clean'] = [lat if not pd.isna(lat) else wikilat for lat, wikilat in 
                             zip(massacres_df['latitude'], massacres_df['wiki_lat']) ]
massacres_df['lon_clean'] = [lon if not pd.isna(lon) else wikilon for lon, wikilon in 
                             zip(massacres_df['longitude'], massacres_df['wiki_lon']) ]
massacres_df.to_csv('massacres.csv', index=False)

In [168]:
massacres_df[massacres_df['lon_clean'].isna()][['name', 'latitude', 'wiki_lat', 'lat_clean','lon_clean']]

Unnamed: 0,name,latitude,wiki_lat,lat_clean,lon_clean
0,Asiatic Vespers,,,,
58,Bear River massacre,,,,
74,Adana massacre,,,,
133,Abu Shusha massacre,,,,
134,Tantura massacre,,,,
162,Binh Tai Massacre,,,,
166,Phong Nhi and Phong Nhat massacre,,,,
263,Uror massacre,,,,


In [170]:
massacres_df = pd.read_csv('massacres.csv')
massacres_df.head()

Unnamed: 0,date,deaths,description,location,location_link,name,name_link,latitude,longitude,wiki_lat,wiki_lon,lat_clean,lon_clean
0,9912-01-01 00:00:00,80000,Wholesale massacre of all Roman and Italic cit...,Kingdom of Pontus,/wiki/Kingdom_of_Pontus,Asiatic Vespers,/wiki/Asiatic_Vespers,,,,,,
1,0061-01-01 00:00:00,1,Gaius Suetonius Paulinus ordered the Roman arm...,"Anglesey, Britannia",/wiki/Anglesey,Menai massacre,/wiki/Menai_massacre,53.265325,-4.42914,,,53.265325,-4.42914
2,0390-01-01 00:00:00,7000,Emperor Theodosius I of Rome ordered the execu...,"Thessaloniki, Macedonia",/wiki/Thessaloniki,Massacre of Thessaloniki,/wiki/Massacre_of_Thessaloniki,40.640063,22.944419,40.65,22.9,40.640063,22.944419
3,0627-01-01 00:00:00,600,Muhammad ordered his followers to attack the B...,"Fortress of Banu Qurayza, Saudi Arabia",/wiki/Saudi_Arabia,Massacre of Banu Qurayza,/wiki/Invasion_of_Banu_Qurayza,,,24.65,46.767,24.65,46.767
4,0782-01-01 00:00:00,4500,"Charlemagne ordered the massacre of 4,500 impr...","Verden, Lower Saxony, Germany",/wiki/Verden_an_der_Aller,Massacre of Verden,/wiki/Massacre_of_Verden,52.934798,9.232965,52.92333,9.235,52.934798,9.232965
