# Getting the data 

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs 
import numpy as np
import datetime as dt

In [2]:
# Forex Factory economic calendar. The `month=last` query presumably selects the
# month before the current one, so the scraped data depends on when this is run.
url = 'https://www.forexfactory.com/calendar?month=last'

In [3]:
# Download the calendar page.
# - timeout: without one, requests can wait indefinitely on a stalled connection.
# - raise_for_status(): surfaces an HTTP error (e.g. a 403 block page) here
#   instead of letting an error page be parsed and fail later with a confusing
#   "NoneType has no attribute ..." message.
response = requests.get(url, timeout=30)
response.raise_for_status()

In [4]:
# Parse the downloaded HTML text with the lxml parser
data = response.text
soup = bs(markup=data, features='lxml')

In [5]:
# Locate the calendar <table> by its CSS class and print its markup for inspection
table = soup.find('table', attrs={'class': 'calendar__table'})
print(table.prettify())

<table class="calendar__table">
 <thead>
  <tr class="calendar__header--desktop subhead">
   <th class="calendar__date">
    Date
   </th>
   <th class="calendar__time">
    <a href="timezone.php" title="Time Options">
     9:28pm
    </a>
   </th>
   <th class="calendar__currency">
    Currency
   </th>
   <th class="calendar__impact">
    Impact
   </th>
   <th class="calendar__event">
   </th>
   <th class="calendar__detail">
    Detail
   </th>
   <th class="calendar__actual">
    Actual
   </th>
   <th class="calendar__forecast">
    Forecast
   </th>
   <th class="calendar__previous">
    Previous
   </th>
   <th class="calendar__graph">
    Graph
   </th>
  </tr>
  <tr class="calendar__header--mobile subhead">
   <th colspan="4">
    <a class="calendar__header-time" href="timezone.php" title="Time Options">
     9:28pm
    </a>
   </th>
   <th>
    Actual
   </th>
  </tr>
 </thead>
 <tr class="calendar__borderfix borderfix">
  <td>
  </td>
 </tr>
 <tr class="calendar__row calend

In [6]:
# Collect the text of the wanted cells for every event row of the calendar.
# Event rows are the <tr> elements that carry a data-eventid attribute.
# Only the cell classes listed here are kept (date, currency, event, actual,
# forecast, previous); no filtering by event impact takes place.
wanted_cell_classes = [
    'calendar__cell calendar__date date',
    'calendar__cell calendar__currency currency',
    'calendar__cell calendar__event event',
    'calendar__cell calendar__actual actual',
    'calendar__cell calendar__forecast forecast',
    'calendar__cell calendar__previous previous',
]

# One inner list of cell texts per event row
list_of_rows = [
    [cell.text for cell in row.find_all('td', class_=wanted_cell_classes)]
    for row in table.find_all('tr', {'data-eventid': True})
]



In [7]:
# Build the frame from the scraped rows, one column per scraped cell type
column_names = ['Date', 'Country', 'Event', 'Actual', 'Forecast', 'Previous']
df_calendar = pd.DataFrame(list_of_rows, columns=column_names)

# Keep only the second newline-separated piece of the currency cell text
# (column position 1 is 'Country')
df_calendar.iloc[:, 1] = df_calendar.iloc[:, 1].str.split('\n').str[1]

# Use the raw date text as the row index
df_calendar = df_calendar.set_index('Date')
df_calendar

Unnamed: 0_level_0,Country,Event,Actual,Forecast,Previous
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
WedApr 1,AUD,Commodity Prices y/y,-10.2%,,-6.0%
,,,,,
,EUR,German Retail Sales m/m,1.2%,0.1%,1.0%
,,,,,
,EUR,Spanish Manufacturing PMI,45.7,44.0,50.4
...,...,...,...,...,...
,,,,,
,JPY,Final Manufacturing PMI,41.9,43.7,43.7
,,,,,
,AUD,PPI q/q,0.2%,,0.3%


In [8]:
# Display the frame again; nothing has changed since the previous cell
df_calendar

Unnamed: 0_level_0,Country,Event,Actual,Forecast,Previous
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
WedApr 1,AUD,Commodity Prices y/y,-10.2%,,-6.0%
,,,,,
,EUR,German Retail Sales m/m,1.2%,0.1%,1.0%
,,,,,
,EUR,Spanish Manufacturing PMI,45.7,44.0,50.4
...,...,...,...,...,...
,,,,,
,JPY,Final Manufacturing PMI,41.9,43.7,43.7
,,,,,
,AUD,PPI q/q,0.2%,,0.3%


In [9]:
# Alias only, not a copy: both names refer to the same DataFrame object
# until new_calendar is rebound to a new frame.
new_calendar = df_calendar

## Cleaning the table 

In [10]:
# Discard the rows in which every column is missing
new_calendar = new_calendar.dropna(axis='index', how='all')
new_calendar

Unnamed: 0_level_0,Country,Event,Actual,Forecast,Previous
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
WedApr 1,AUD,Commodity Prices y/y,-10.2%,,-6.0%
,EUR,German Retail Sales m/m,1.2%,0.1%,1.0%
,EUR,Spanish Manufacturing PMI,45.7,44.0,50.4
,CHF,Manufacturing PMI,43.7,42.3,49.5
,EUR,Italian Manufacturing PMI,40.3,41.1,48.7
...,...,...,...,...,...
,JPY,Tokyo Core CPI y/y,-0.1%,0.1%,0.4%
,JPY,Monetary Policy Meeting Minutes,,,
,CNY,Bank Holiday,,,
,JPY,Final Manufacturing PMI,41.9,43.7,43.7


In [11]:
# Move Date out of the index into an ordinary column; rows get a fresh 0..n-1 index
new_calendar = new_calendar.reset_index(drop=False)

In [12]:
# Cut the first four characters off every date label. This removes the weekday
# prefix (displayed 'WedApr 1' becomes 'Apr 1 '); labels shorter than that
# become ''. Nothing is parsed into a real date in this cell.
new_calendar['Date'] = new_calendar['Date'].map(lambda label: label[4:])

In [13]:
# List the distinct date labels present after stripping the weekday prefix
pd.unique(new_calendar['Date'])

array(['Apr 1 ', '', 'Apr 2 ', 'Apr 3 ', 'Apr 4 ', 'Apr 5 ', 'Apr 6 ',
       'Apr 7 ', 'Apr 8 ', 'Apr 9 ', 'Apr 10 ', 'Apr 11 ', 'Apr 12 ',
       'Apr 13 ', 'Apr 14 ', 'Apr 15 ', 'Apr 16 ', 'Apr 17 ', 'Apr 18 ',
       'Apr 19 ', 'Apr 20 ', 'Apr 21 ', 'Apr 22 ', 'Apr 23 ', 'Apr 24 ',
       'Apr 25 ', 'Apr 26 ', 'Apr 27 ', 'Apr 28 ', 'Apr 29 ', 'Apr 30 '],
      dtype=object)

In [14]:
# Inspect the frame: only the first row of each day carries a date label so far
new_calendar

Unnamed: 0,Date,Country,Event,Actual,Forecast,Previous
0,Apr 1,AUD,Commodity Prices y/y,-10.2%,,-6.0%
1,,EUR,German Retail Sales m/m,1.2%,0.1%,1.0%
2,,EUR,Spanish Manufacturing PMI,45.7,44.0,50.4
3,,CHF,Manufacturing PMI,43.7,42.3,49.5
4,,EUR,Italian Manufacturing PMI,40.3,41.1,48.7
...,...,...,...,...,...,...
393,,JPY,Tokyo Core CPI y/y,-0.1%,0.1%,0.4%
394,,JPY,Monetary Policy Meeting Minutes,,,
395,,CNY,Bank Holiday,,,
396,,JPY,Final Manufacturing PMI,41.9,43.7,43.7


In [15]:
# A blank date cell belongs to the most recent dated row above it:
# turn the blanks into NaN, then forward-fill the last seen label.
# Series.ffill() replaces fillna(method='ffill'), which is deprecated in
# current pandas releases; the result is the same.
new_calendar['Date'] = new_calendar['Date'].replace('', np.nan).ffill()

In [16]:
# Check that every row now carries a date label
new_calendar.Date

0       Apr 1 
1       Apr 1 
2       Apr 1 
3       Apr 1 
4       Apr 1 
        ...   
393    Apr 30 
394    Apr 30 
395    Apr 30 
396    Apr 30 
397    Apr 30 
Name: Date, Length: 398, dtype: object

In [19]:
# Append the two-digit year so the labels can be parsed with a '%b %d %y' format.
# The URL requests "last month", so the year is that of the month preceding
# today; a hard-coded '20' is only correct while that month falls in 2020.
# NOTE(review): this relies on the local clock at run time. Set the year
# explicitly when working from previously downloaded HTML.
last_month_end = dt.date.today().replace(day=1) - dt.timedelta(days=1)
new_calendar['Date'] = new_calendar['Date'] + last_month_end.strftime('%y')

In [22]:
# Parse labels such as 'Apr 1 20' (month abbreviation, day, two-digit year)
date_format = '%b %d %y'
new_calendar['Date'] = pd.to_datetime(new_calendar['Date'], format=date_format)

In [23]:
# Inspect the frame after the Date column has been converted to datetimes
new_calendar

Unnamed: 0,Date,Country,Event,Actual,Forecast,Previous
0,2020-04-01,AUD,Commodity Prices y/y,-10.2%,,-6.0%
1,2020-04-01,EUR,German Retail Sales m/m,1.2%,0.1%,1.0%
2,2020-04-01,EUR,Spanish Manufacturing PMI,45.7,44.0,50.4
3,2020-04-01,CHF,Manufacturing PMI,43.7,42.3,49.5
4,2020-04-01,EUR,Italian Manufacturing PMI,40.3,41.1,48.7
...,...,...,...,...,...,...
393,2020-04-30,JPY,Tokyo Core CPI y/y,-0.1%,0.1%,0.4%
394,2020-04-30,JPY,Monetary Policy Meeting Minutes,,,
395,2020-04-30,CNY,Bank Holiday,,,
396,2020-04-30,JPY,Final Manufacturing PMI,41.9,43.7,43.7


In [28]:
# Preview the table indexed by Date. The result is only displayed, not assigned,
# so new_calendar itself keeps its integer row index and its Date column.
new_calendar.set_index(keys='Date')

Unnamed: 0_level_0,Country,Event,Actual,Forecast,Previous
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-04-01,AUD,Commodity Prices y/y,-10.2%,,-6.0%
2020-04-01,EUR,German Retail Sales m/m,1.2%,0.1%,1.0%
2020-04-01,EUR,Spanish Manufacturing PMI,45.7,44.0,50.4
2020-04-01,CHF,Manufacturing PMI,43.7,42.3,49.5
2020-04-01,EUR,Italian Manufacturing PMI,40.3,41.1,48.7
...,...,...,...,...,...
2020-04-30,JPY,Tokyo Core CPI y/y,-0.1%,0.1%,0.4%
2020-04-30,JPY,Monetary Policy Meeting Minutes,,,
2020-04-30,CNY,Bank Holiday,,,
2020-04-30,JPY,Final Manufacturing PMI,41.9,43.7,43.7


## Insert into the database

In [27]:
import pymongo

In [31]:
# Connection string for a MongoDB server on this machine (default port 27017)
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(host=conn)

In [32]:
# Get a handle on the database by name.
# NOTE(review): 'economuics_db' looks like a typo for 'economics_db'. It is left
# unchanged here because a different name would address a different database.
db = client['economuics_db']

In [34]:
# Handle on the collection that will hold the event records
collection_events = db['event_data']

In [38]:
# Convert the dataframe to a dictionary: {row index: {column name: value}}
event_data_dict = new_calendar.to_dict(orient="index")

In [39]:
# Inspect the nested dictionary (row index -> record) before inserting it
event_data_dict

{0: {'Date': Timestamp('2020-04-01 00:00:00'),
  'Country': 'AUD',
  'Event': '  Commodity Prices y/y  ',
  'Actual': '-10.2%',
  'Forecast': '',
  'Previous': '-6.0%'},
 1: {'Date': Timestamp('2020-04-01 00:00:00'),
  'Country': 'EUR',
  'Event': '  German Retail Sales m/m  ',
  'Actual': '1.2%',
  'Forecast': '0.1%',
  'Previous': '1.0%'},
 2: {'Date': Timestamp('2020-04-01 00:00:00'),
  'Country': 'EUR',
  'Event': '  Spanish Manufacturing PMI  ',
  'Actual': '45.7',
  'Forecast': '44.0',
  'Previous': '50.4'},
 3: {'Date': Timestamp('2020-04-01 00:00:00'),
  'Country': 'CHF',
  'Event': '  Manufacturing PMI  ',
  'Actual': '43.7',
  'Forecast': '42.3',
  'Previous': '49.5'},
 4: {'Date': Timestamp('2020-04-01 00:00:00'),
  'Country': 'EUR',
  'Event': '  Italian Manufacturing PMI  ',
  'Actual': '40.3',
  'Forecast': '41.1',
  'Previous': '48.7'},
 5: {'Date': Timestamp('2020-04-01 00:00:00'),
  'Country': 'EUR',
  'Event': '  French Final Manufacturing PMI  ',
  'Actual': '43.2',


In [40]:
# Store one document per row, each shaped {"<row number>": {column: value, ...}}.
# NOTE(review): nesting each record under its row number gives every document a
# different top-level key; confirm this layout is intended before querying by
# field. Re-running this cell inserts the same rows again.
documents = [{str(row_id): record} for row_id, record in event_data_dict.items()]
collection_events.insert_many(documents)

<pymongo.results.InsertManyResult at 0x1fa3ca0d608>

In [41]:
# Read back every stored document to confirm the insert
for stored_event in collection_events.find():
    print(stored_event)

{'_id': ObjectId('5eb33cb2aab8b110daff9bfa'), '0': {'Date': datetime.datetime(2020, 4, 1, 0, 0), 'Country': 'AUD', 'Event': '  Commodity Prices y/y  ', 'Actual': '-10.2%', 'Forecast': '', 'Previous': '-6.0%'}}
{'_id': ObjectId('5eb33cb2aab8b110daff9bfb'), '1': {'Date': datetime.datetime(2020, 4, 1, 0, 0), 'Country': 'EUR', 'Event': '  German Retail Sales m/m  ', 'Actual': '1.2%', 'Forecast': '0.1%', 'Previous': '1.0%'}}
{'_id': ObjectId('5eb33cb2aab8b110daff9bfc'), '2': {'Date': datetime.datetime(2020, 4, 1, 0, 0), 'Country': 'EUR', 'Event': '  Spanish Manufacturing PMI  ', 'Actual': '45.7', 'Forecast': '44.0', 'Previous': '50.4'}}
{'_id': ObjectId('5eb33cb2aab8b110daff9bfd'), '3': {'Date': datetime.datetime(2020, 4, 1, 0, 0), 'Country': 'CHF', 'Event': '  Manufacturing PMI  ', 'Actual': '43.7', 'Forecast': '42.3', 'Previous': '49.5'}}
{'_id': ObjectId('5eb33cb2aab8b110daff9bfe'), '4': {'Date': datetime.datetime(2020, 4, 1, 0, 0), 'Country': 'EUR', 'Event': '  Italian Manufacturing PMI