In [33]:
import datetime
import numpy as np
import pandas as pd

from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.initializers import Constant
from keras.layers import Conv1D, Dense, Dropout, Embedding, GlobalMaxPooling1D, Input, MaxPooling1D
from keras.models import Model, load_model
from keras.optimizers import Adam
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer

from sklearn.preprocessing import MinMaxScaler

from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS

from google.colab import drive
drive.mount('/content/drive')
# Mandatory files to have, and their paths:
DAILY_NEWS_PATH = "/content/drive/My Drive/NLP_DL/resources/daily_news.csv"  # path to the .csv file containing the daily news data
FOREX_PATH = "/content/drive/My Drive/NLP_DL/resources/forex_data.csv"  # path to the .csv file containing the (preprocessed) foreign exchange rate data
GLOVE_PATH = "/content/drive/My Drive/NLP_DL/resources/glove.6B.100d.txt"  # path to the .txt file containing the pre-trained word embedding model (Glove)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Data loading, preprocessing
to-be-detailed

Daily news data: 2008-08-08 to 2016-07-01

Forex data: 2010-11-14 to 2019-09-30

Strategy for merge: for each daily news data, check if there is a forex data on the following day in the forex dataset -> if no, remove those days

This logic will implicitly synchronize the datasets, therefore we do not have to worry about the different time periods they contain

Loading FOREX data

In [34]:
forex_df = pd.read_csv(FOREX_PATH)
print("Do we have data for each day? {}\n".format(all(forex_df)))
print("First date: {}".format(forex_df['Date'].iloc[0]))
print("Last date: {}\n\n".format(forex_df['Date'].iloc[-1]))
forex_df

Do we have data for each day? True

First date: 2010-11-14
Last date: 2019-09-30




Unnamed: 0,Date,BidQuote,AskQuote
0,2010-11-14,276.124167,276.274167
1,2010-11-15,276.518511,276.668511
2,2010-11-16,277.247576,277.397576
3,2010-11-17,277.254591,277.404591
4,2010-11-18,275.087612,275.237612
...,...,...,...
2721,2019-09-24,334.777242,334.893755
2722,2019-09-25,334.387243,334.492309
2723,2019-09-26,334.917537,335.031652
2724,2019-09-27,335.554946,335.670201


Loading daily news data

In [35]:
daily_news_df = pd.read_csv(DAILY_NEWS_PATH)
print("Do we have data for each day? {}\n".format(all(daily_news_df)))
print("First date: {}".format(daily_news_df['Date'].iloc[0]))
print("Last date: {}\n\n".format(daily_news_df['Date'].iloc[-1]))
daily_news_df

Do we have data for each day? True

First date: 2008-08-08
Last date: 2016-07-01




Unnamed: 0,Date,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,Top9,Top10,Top11,Top12,Top13,Top14,Top15,Top16,Top17,Top18,Top19,Top20,Top21,Top22,Top23,Top24,Top25
0,2008-08-08,"b""Georgia 'downs two Russian warplanes' as cou...",b'BREAKING: Musharraf to be impeached.',b'Russia Today: Columns of troops roll into So...,b'Russian tanks are moving towards the capital...,"b""Afghan children raped with 'impunity,' U.N. ...",b'150 Russian tanks have entered South Ossetia...,"b""Breaking: Georgia invades South Ossetia, Rus...","b""The 'enemy combatent' trials are nothing but...",b'Georgian troops retreat from S. Osettain cap...,b'Did the U.S. Prep Georgia for War with Russia?',b'Rice Gives Green Light for Israel to Attack ...,b'Announcing:Class Action Lawsuit on Behalf of...,"b""So---Russia and Georgia are at war and the N...","b""China tells Bush to stay out of other countr...",b'Did World War III start today?',b'Georgia Invades South Ossetia - if Russia ge...,b'Al-Qaeda Faces Islamist Backlash',"b'Condoleezza Rice: ""The US would not act to p...",b'This is a busy day: The European Union has ...,"b""Georgia will withdraw 1,000 soldiers from Ir...",b'Why the Pentagon Thinks Attacking Iran is a ...,b'Caucasus in crisis: Georgia invades South Os...,b'Indian shoe manufactory - And again in a se...,b'Visitors Suffering from Mental Illnesses Ban...,"b""No Help for Mexico's Kidnapping Surge"""
1,2008-08-11,b'Why wont America and Nato help us? If they w...,b'Bush puts foot down on Georgian conflict',"b""Jewish Georgian minister: Thanks to Israeli ...",b'Georgian army flees in disarray as Russians ...,"b""Olympic opening ceremony fireworks 'faked'""",b'What were the Mossad with fraudulent New Zea...,b'Russia angered by Israeli military sale to G...,b'An American citizen living in S.Ossetia blam...,b'Welcome To World War IV! Now In High Definit...,"b""Georgia's move, a mistake of monumental prop...",b'Russia presses deeper into Georgia; U.S. say...,b'Abhinav Bindra wins first ever Individual Ol...,b' U.S. ship heads for Arctic to define territ...,b'Drivers in a Jerusalem taxi station threaten...,b'The French Team is Stunned by Phelps and the...,b'Israel and the US behind the Georgian aggres...,"b'""Do not believe TV, neither Russian nor Geor...",b'Riots are still going on in Montreal (Canada...,b'China to overtake US as largest manufacturer',b'War in South Ossetia [PICS]',b'Israeli Physicians Group Condemns State Tort...,b' Russia has just beaten the United States ov...,b'Perhaps *the* question about the Georgia - R...,b'Russia is so much better at war',"b""So this is what it's come to: trading sex fo..."
2,2008-08-12,b'Remember that adorable 9-year-old who sang a...,"b""Russia 'ends Georgia operation'""","b'""If we had no sexual harassment we would hav...","b""Al-Qa'eda is losing support in Iraq because ...",b'Ceasefire in Georgia: Putin Outmaneuvers the...,b'Why Microsoft and Intel tried to kill the XO...,b'Stratfor: The Russo-Georgian War and the Bal...,"b""I'm Trying to Get a Sense of This Whole Geor...","b""The US military was surprised by the timing ...",b'U.S. Beats War Drum as Iran Dumps the Dollar',"b'Gorbachev: ""Georgian military attacked the S...",b'CNN use footage of Tskhinvali ruins to cover...,b'Beginning a war as the Olympics were opening...,b'55 pyramids as large as the Luxor stacked in...,b'The 11 Top Party Cities in the World',b'U.S. troops still in Georgia (did you know t...,b'Why Russias response to Georgia was right',"b'Gorbachev accuses U.S. of making a ""serious ...","b'Russia, Georgia, and NATO: Cold War Two'",b'Remember that adorable 62-year-old who led y...,b'War in Georgia: The Israeli connection',b'All signs point to the US encouraging Georgi...,b'Christopher King argues that the US and NATO...,b'America: The New Mexico?',"b""BBC NEWS | Asia-Pacific | Extinction 'by man..."
3,2008-08-13,b' U.S. refuses Israel weapons to attack Iran:...,"b""When the president ordered to attack Tskhinv...",b' Israel clears troops who killed Reuters cam...,b'Britain\'s policy of being tough on drugs is...,b'Body of 14 year old found in trunk; Latest (...,b'China has moved 10 *million* quake survivors...,"b""Bush announces Operation Get All Up In Russi...",b'Russian forces sink Georgian ships ',"b""The commander of a Navy air reconnaissance s...","b""92% of CNN readers: Russia's actions in Geor...",b'USA to send fleet into Black Sea to help Geo...,"b""US warns against Israeli plan to strike agai...","b""In an intriguing cyberalliance, two Estonian...",b'The CNN Effect: Georgia Schools Russia in In...,b'Why Russias response to Georgia was right',b'Elephants extinct by 2020?',b'US humanitarian missions soon in Georgia - i...,"b""Georgia's DDOS came from US sources""","b'Russian convoy heads into Georgia, violating...",b'Israeli defence minister: US against strike ...,b'Gorbachev: We Had No Choice',b'Witness: Russian forces head towards Tbilisi...,b' Quarter of Russians blame U.S. for conflict...,b'Georgian president says US military will ta...,b'2006: Nobel laureate Aleksander Solzhenitsyn...
4,2008-08-14,b'All the experts admit that we should legalis...,b'War in South Osetia - 89 pictures made by a ...,b'Swedish wrestler Ara Abrahamian throws away ...,b'Russia exaggerated the death toll in South O...,b'Missile That Killed 9 Inside Pakistan May Ha...,"b""Rushdie Condemns Random House's Refusal to P...",b'Poland and US agree to missle defense deal. ...,"b'Will the Russians conquer Tblisi? Bet on it,...",b'Russia exaggerating South Ossetian death tol...,b' Musharraf expected to resign rather than fa...,b'Moscow Made Plans Months Ago to Invade Georgia',b'Why Russias response to Georgia was right',b'Nigeria has handed over the potentially oil-...,b'The US and Poland have agreed a preliminary ...,b'Russia apparently is sabotaging infrastructu...,b'Bank analyst forecast Georgian crisis 2 days...,"b""Georgia confict could set back Russia's US r...",b'War in the Caucasus is as much the product o...,"b'""Non-media"" photos of South Ossetia/Georgia ...",b'Georgian TV reporter shot by Russian sniper ...,b'Saudi Arabia: Mother moves to block child ma...,b'Taliban wages war on humanitarian aid workers',"b'Russia: World ""can forget about"" Georgia\'s...",b'Darfur rebels accuse Sudan of mounting major...,b'Philippines : Peace Advocate say Muslims nee...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1984,2016-06-27,Barclays and RBS shares suspended from trading...,Pope says Church should ask forgiveness from g...,Poland 'shocked' by xenophobic abuse of Poles ...,"There will be no second referendum, cabinet ag...","Scotland welcome to join EU, Merkel ally says",Sterling dips below Friday's 31-year low amid ...,No negative news about South African President...,Surge in Hate Crimes in the U.K. Following U.K...,Weapons shipped into Jordan by the CIA and Sau...,Angela Merkel said the U.K. must file exit pap...,In a birth offering hope to a threatened speci...,Sky News Journalist Left Speechless As Leave M...,Giant panda in Macau gives birth to twins,Get out now: EU leader tells Britain it must i...,Sea turtle 'beaten and left for dead' on beach...,German lawyers to probe Erdogan over alleged w...,"Boris Johnson says the UK will continue to ""in...",Richard Branson is calling on the UK governmen...,Turkey 'sorry for downing Russian jet',Edward Snowden lawyer vows new push for pardon...,Brexit opinion poll reveals majority don't wan...,"Conservative MP Leave Campaigner: ""The leave c...","Economists predict UK recession, further weake...","New EU 'superstate plan by France, Germany: Cr...",Pakistani clerics declare transgender marriage...
1985,2016-06-28,"2,500 Scientists To Australia: If You Want To ...","The personal details of 112,000 French police ...",S&amp;P cuts United Kingdom sovereign credit r...,Huge helium deposit found in Africa,CEO of the South African state broadcaster qui...,"Brexit cost investors $2 trillion, the worst o...",Hong Kong democracy activists call for return ...,Brexit: Iceland president says UK can join 'tr...,UK's Osborne: 'Absolutely' going to have to cu...,'Do not let Scotland down now' : Scottish MEP ...,British pound could hit history-making dollar ...,"Merkel vows to strengthen EU, tells UK no 'che...","""Ryanair will not deploy new aircraft on route...","People, ever more greedy and stupid, destroy t...",Siemens freezes new UK wind power investment f...,"US, Canada and Mexico pledge 50% of power from...",There is increasing evidence that Australia is...,"Richard Branson, the founder of Virgin Group, ...","37,000-yr-old skull from Borneo reveals surpri...",Palestinians stone Western Wall worshipers; po...,Jean-Claude Juncker asks Farage: Why are you h...,"""Romanians for Remainians"" offering a new home...",Brexit: Gibraltar in talks with Scotland to st...,8 Suicide Bombers Strike Lebanon,Mexico's security forces routinely use 'sexual...
1986,2016-06-29,Explosion At Airport In Istanbul,Yemeni former president: Terrorism is the offs...,UK must accept freedom of movement to access E...,Devastated: scientists too late to captive bre...,British Labor Party leader Jeremy Corbyn loses...,A Muslim Shop in the UK Was Just Firebombed Wh...,Mexican Authorities Sexually Torture Women in ...,UK shares and pound continue to recover,Iceland historian Johannesson wins presidentia...,99-Million-Yr-Old Bird Wings Found Encased in ...,A chatbot programmed by a British teenager has...,The Philippine president-elect said Monday he ...,Former Belgian Prime Minister ridicules Nigel ...,Brexiteer Nigel Farage To EU: 'You're Not Laug...,Islamic State bombings in southern Yemen kill ...,"Escape Tunnel, Dug by Hand, Is Found at Holoca...",The land under Beijing is sinking by as much a...,Car bomb and Anti-Islamic attack on Mosque in ...,Emaciated lions in Taiz Zoo are trapped in blo...,Rupert Murdoch describes Brexit as 'wonderful'...,More than 40 killed in Yemen suicide attacks,Google Found Disastrous Symantec and Norton Vu...,Extremist violence on the rise in Germany: Dom...,BBC News: Labour MPs pass Corbyn no-confidence...,Tiny New Zealand town with 'too many jobs' lau...
1987,2016-06-30,Jamaica proposes marijuana dispensers for tour...,Stephen Hawking says pollution and 'stupidity'...,Boris Johnson says he will not run for Tory pa...,Six gay men in Ivory Coast were abused and for...,Switzerland denies citizenship to Muslim immig...,Palestinian terrorist stabs israeli teen girl ...,Puerto Rico will default on $1 billion of debt...,Republic of Ireland fans to be awarded medal f...,Afghan suicide bomber 'kills up to 40' - BBC News,US airstrikes kill at least 250 ISIS fighters ...,Turkish Cop Who Took Down Istanbul Gunman Hail...,Cannabis compounds could treat Alzheimer's by ...,Japan's top court has approved blanket surveil...,CIA Gave Romania Millions to Host Secret Prisons,Groups urge U.N. to suspend Saudi Arabia from ...,Googles free wifi at Indian railway stations i...,Mounting evidence suggests 'hobbits' were wipe...,The men who carried out Tuesday's terror attac...,Calls to suspend Saudi Arabia from UN Human Ri...,More Than 100 Nobel Laureates Call Out Greenpe...,British pedophile sentenced to 85 years in US ...,"US permitted 1,200 offshore fracks in Gulf of ...",We will be swimming in ridicule - French beach...,UEFA says no minutes of silence for Istanbul v...,Law Enforcement Sources: Gun Used in Paris Ter...


Synchronizing daily news and forex data

In [36]:
def increase_date_by_one_day(series_element):
    year, month, day = series_element.split('-')
    date = datetime.datetime(int(year), int(month), int(day))
    next_date = date + datetime.timedelta(days=1)

    return next_date.strftime("%Y") + "-" + next_date.strftime("%m") + "-" + next_date.strftime("%d")

def decrease_date_by_one_day(series_element):
    year, month, day = series_element.split('-')
    date = datetime.datetime(int(year), int(month), int(day))
    next_date = date - datetime.timedelta(days=1)

    return next_date.strftime("%Y") + "-" + next_date.strftime("%m") + "-" + next_date.strftime("%d")

daily_news_df['Date'] = daily_news_df['Date'].apply(func=increase_date_by_one_day)
merged_df = daily_news_df.merge(forex_df, how='outer', left_on='Date', right_on='Date')
merged_df.sort_values('Date', axis=0, ascending=True, inplace=True)
merged_df.dropna(axis=0, how='any', inplace=True)

merged_df['Date'] = merged_df['Date'].apply(func=decrease_date_by_one_day)
merged_df.reset_index(drop=True, inplace=True)
merged_df

Unnamed: 0,Date,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,Top9,Top10,Top11,Top12,Top13,Top14,Top15,Top16,Top17,Top18,Top19,Top20,Top21,Top22,Top23,Top24,Top25,BidQuote,AskQuote
0,2010-11-15,Boris Johnson: George W. Bush cant fight for f...,"Taliban Commander: ""There are no al-Qa'ida fi...",Settlers torch West Bank olive trees - Israeli...,Maariyamma is likely to be killed by her child...,Ireland's young flee abroad as economic meltdo...,"""A two-year-old girl suffering from leukemia d...",Police in Egypt have been accused of beating a...,"""Hong Kong is officially the world's most wast...",Netanyahu presents security cabinet with Clint...,An Entire Village Flees Mexican Drug Violence,A Russian Banker And His 7-Man Team Beat Up A ...,Greece's Deficit Revised to Largest in EU as D...,'Unlawful sex': lovers sentenced to 100 lashes...,U.K. Politician Arrested for 'Glib' Twitter Th...,Hard questions about the new American handouts...,The Internet's new billion: New web users in c...,Interpol hunts two Israelis for Kosovo organ t...,UK Peer: Israel feeds global terrorism - A lon...,Mexico Arrests 12-Year-Old Cartel Hitman,TSA Caves On Molesting Pilots,Impending indictments in Hariri killing could ...,At least 32 killed in building collapse in Ind...,Shanghai high-rise on fire,Letter From Iran - \nSatellite TV channel co-o...,British wife of a millionaire businessman kill...,277.247576,277.397576
1,2010-11-16,"While you're getting groped by the TSA, here's...",Second squadron of 20 F-35s is an offer hard t...,R. Congresswoman Ros-Lehtinen opposes transfer...,"McDonald's, KFC, PepsiCo &amp; Mars Invited to...",Death sentence for migrant worker - Reddit! I ...,"Preparing for 2016 Olympics, Brazil invades sl...",Guantnamo Bay detainees to be paid millions in...,"Haitians riot, blame UN troops for cholera: Pr...","Mexican paper reports drug war, despite threats",My eyes are sweating: Hero dog who survived th...,Haiti rioters attack UN troops - Anti-UN riots...,More and more Irish people are leaving the str...,Mexico's $80M boom industry: Bulletproof cars,The toll in the collapse of a five-story resid...,How I Became An Oligarch - \nSpeech from Russi...,The difference between life and death in Haiti...,Pakistan vs. Haiti - Which Received More Aid? ...,Extradition of 'arms dealer' Viktor Bout goes ...,Bunga-Bunga Nation: Berlusconi's Italy Hurts W...,Sultan of the Muslim World | Modern Turkey bor...,While the US government is entertaining mind b...,UK government settles out of court with 16 Gua...,Google says China is breaking net laws: \nComp...,UK: London Metropolitan Police force closes do...,Ireland isn't working: Celtic Tiger becomes si...,277.254591,277.404591
2,2010-11-17,"Major spy scandal as Norway, Denmark, Finland,...",Freedom Of Speech 1 - Police 0. British police...,Evangelical Christians in Brazil have banned t...,UK Government comes out against net neutrality...,BBC picks up the TSA touching my junk story,Ireland: ANGRY protesters gathered at the home...,Tigers Near Extinction: The total number world...,Canada's Senate kills climate change bill\n,Madagascar military officers claim they have t...,Europe's corruption capital: How corruption in...,Australia wants to censor all images of smokin...,How China swallowed 15% of 'Net traffic for 18...,China's 'Me Generation' Sends Divorce Rate Soa...,Muslims set fire to at least 10 houses belongi...,"""Darfur, Zimbabwe, Burma, North Korea, anywher...",The suffering of Iraq's Christians - Some call...,Kim Jong-Un purging senior party and military ...,Sudan registers to vote to split country: Thou...,Get ready to kiss goodbye to net neutrality in...,Israeli retreat on Lebanon border may split vi...,Organs and bones were illegally harvested from...,World's newest nation would start almost from ...,Prime Minister of Madagascar vows to squash re...,Yemen's New Escort Service - Starved for reven...,"Rogue Trader Jerome Kerviel, sentenced to five...",275.087612,275.237612
3,2010-11-18,Remand request issued for WikiLeaks founder Ju...,Almost all borrowers in one of Indias largest ...,"13 dead, as strange disease spreads in Uganda:...",Here we go again. Bomb in Luggage found in Nam...,Dutch government wants to ban tourists from bu...,Labour MP Tom Watson has tabled a motion on ne...,Julian Assange challenges Swedish arrest ruling,Chinese woman sentenced to a year in labour ca...,"Here is a link to the blog by Anna Arden, who ...",What's the newest propaganda for occupying Afg...,Why 300 million more people are suddenly poor,"Suu Kyi, UN Secretary General Ban call for rel...",Drug Cartel Violence Erupts in Mexico's Border...,Arrest order for Wikileaks' Julian Assange iss...,China sentences woman to labor camp for Twitte...,French Embassy in Tehran Attacked By Iranian R...,"Pentagon says 2014 Afghan deadline ""aspiration...",India's microcredit industry may be near to co...,"Is China drunk on its new great-power status, ...",South Africas largest circulation weekly newsp...,Hong Kong diagnoses first bird flu case in sev...,Chinas Censors Misfire in Abuse-of-Power Case ...,"Economic crash to drive 100,000 out of Ireland",Unemployed English Girl to Wed Soldier from We...,Japan is ageing faster than any country in his...,274.197428,274.347428
4,2010-11-22,"Tim Berners-Lee, creator of the world wide web...",BBC News - Tigers threatened by extinction as ...,Vladimir Putin and World Bank chief stage a ca...,A Russia Today news crew has been detained in ...,WTF...Vatican directive on child protection wo...,29 Miners trapped in a coal mine in New Zealan...,At least 339 people killed in stampede in Camb...,'These are human beings we are dealing with' -...,Gallery of photos from November 15th apartment...,[6 year old] A-Long doesnt actually believe th...,Ireland is having a Lehmans moment: The next 2...,Iraq has run out of money to pay for widows' b...,Ireland fears civil unrest as bank crisis deep...,Cell ringtone sparks Indonesian tribal war,Cuba cutting everything but security - Cuba is...,"Chihuahua, Mexico - Cartels threaten to hurt s...",Japan's justice minister resigns after causing...,TSA Forces Woman To Cut Off Her Nipple Rings W...,US Embassy: Beijing Air Quality Is 'Crazy Bad'...,Republic of Ireland confirms EU financial resc...,WHO: A billion people cannot afford any health...,Israeli Soldiers Avoid Jail in Human Shield Ca...,Debt-Crippled Ireland Asks EU for Massive Bail...,Football betting now legal in Vietnam - The Mi...,BBC News - North Korea nuclear plant confirms ...,275.578741,275.728741
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1124,2016-06-23,Today The United Kingdom decides whether to re...,"E-cigarettes should not be banned in public, m...",Report: China is still harvesting organs from ...,"Man opens fire at cinema complex in Germany, s...","Erdoan: Europe, you dont want us because were ...",Asian millionaires now control more wealth tha...,A Japanese porn industry association has apolo...,University students are being warned when clas...,Afghan interpreters 'betrayed' by UK and US,Contagious cancer cells are spreading between ...,51 Killed in China by Powerful Tornado,Teacher Killings Ignite Calls for Revolution i...,Solar plane lands in Spain after three-day Atl...,Brexit supporters urged to take own pens to po...,Cities forge world's largest alliance to curb ...,"Colombia, FARC announce full ceasefire, 'last ...",Gunmen kill Sufi devotional singer Amjad Sabri...,India launches 20 satellites in single mission,F-16s to be manufactured soon in an assembly l...,Australia's gun laws stopped mass shootings an...,French cement company in Syria buys oil from I...,Pope to visit Armenia after irking Turkey with...,Merkel says NATO must be strengthened,"China cracks down on online comments, click-ba...",The prime minister of India is set to get a br...,317.277425,317.490940
1125,2016-06-27,Barclays and RBS shares suspended from trading...,Pope says Church should ask forgiveness from g...,Poland 'shocked' by xenophobic abuse of Poles ...,"There will be no second referendum, cabinet ag...","Scotland welcome to join EU, Merkel ally says",Sterling dips below Friday's 31-year low amid ...,No negative news about South African President...,Surge in Hate Crimes in the U.K. Following U.K...,Weapons shipped into Jordan by the CIA and Sau...,Angela Merkel said the U.K. must file exit pap...,In a birth offering hope to a threatened speci...,Sky News Journalist Left Speechless As Leave M...,Giant panda in Macau gives birth to twins,Get out now: EU leader tells Britain it must i...,Sea turtle 'beaten and left for dead' on beach...,German lawyers to probe Erdogan over alleged w...,"Boris Johnson says the UK will continue to ""in...",Richard Branson is calling on the UK governmen...,Turkey 'sorry for downing Russian jet',Edward Snowden lawyer vows new push for pardon...,Brexit opinion poll reveals majority don't wan...,"Conservative MP Leave Campaigner: ""The leave c...","Economists predict UK recession, further weake...","New EU 'superstate plan by France, Germany: Cr...",Pakistani clerics declare transgender marriage...,317.050972,317.174937
1126,2016-06-28,"2,500 Scientists To Australia: If You Want To ...","The personal details of 112,000 French police ...",S&amp;P cuts United Kingdom sovereign credit r...,Huge helium deposit found in Africa,CEO of the South African state broadcaster qui...,"Brexit cost investors $2 trillion, the worst o...",Hong Kong democracy activists call for return ...,Brexit: Iceland president says UK can join 'tr...,UK's Osborne: 'Absolutely' going to have to cu...,'Do not let Scotland down now' : Scottish MEP ...,British pound could hit history-making dollar ...,"Merkel vows to strengthen EU, tells UK no 'che...","""Ryanair will not deploy new aircraft on route...","People, ever more greedy and stupid, destroy t...",Siemens freezes new UK wind power investment f...,"US, Canada and Mexico pledge 50% of power from...",There is increasing evidence that Australia is...,"Richard Branson, the founder of Virgin Group, ...","37,000-yr-old skull from Borneo reveals surpri...",Palestinians stone Western Wall worshipers; po...,Jean-Claude Juncker asks Farage: Why are you h...,"""Romanians for Remainians"" offering a new home...",Brexit: Gibraltar in talks with Scotland to st...,8 Suicide Bombers Strike Lebanon,Mexico's security forces routinely use 'sexual...,316.879670,316.990106
1127,2016-06-29,Explosion At Airport In Istanbul,Yemeni former president: Terrorism is the offs...,UK must accept freedom of movement to access E...,Devastated: scientists too late to captive bre...,British Labor Party leader Jeremy Corbyn loses...,A Muslim Shop in the UK Was Just Firebombed Wh...,Mexican Authorities Sexually Torture Women in ...,UK shares and pound continue to recover,Iceland historian Johannesson wins presidentia...,99-Million-Yr-Old Bird Wings Found Encased in ...,A chatbot programmed by a British teenager has...,The Philippine president-elect said Monday he ...,Former Belgian Prime Minister ridicules Nigel ...,Brexiteer Nigel Farage To EU: 'You're Not Laug...,Islamic State bombings in southern Yemen kill ...,"Escape Tunnel, Dug by Hand, Is Found at Holoca...",The land under Beijing is sinking by as much a...,Car bomb and Anti-Islamic attack on Mosque in ...,Emaciated lions in Taiz Zoo are trapped in blo...,Rupert Murdoch describes Brexit as 'wonderful'...,More than 40 killed in Yemen suicide attacks,Google Found Disastrous Symantec and Norton Vu...,Extremist violence on the rise in Germany: Dom...,BBC News: Labour MPs pass Corbyn no-confidence...,Tiny New Zealand town with 'too many jobs' lau...,316.388392,316.502362


In [37]:
# Check the maximum difference between the dates
max_diff = 0
first_date = ""
second_date = ""
dates = merged_df['Date']

for i in range(len(dates)-1):
    current_year, current_month, current_day = dates.iloc[i].split('-')
    next_year, next_month, next_day = dates.iloc[i+1].split('-')
    current_date = datetime.datetime(int(current_year), int(current_month), int(current_day))
    next_date = datetime.datetime(int(next_year), int(next_month), int(next_day))

    date_diff = next_date - current_date
    date_diff = abs(float(date_diff.days))
    if (i == 0) or (date_diff > max_diff):
        max_diff = date_diff
        first_date = dates.iloc[i]
        second_date = dates.iloc[i+1]

print("Maximum missing days between two following data points: {} days\n".format(max_diff-1))  # Need to substract 1 to get the number of missing days from the "day difference"
print("Between: {} and {}\n\n".format(first_date, second_date))
print("First date of the merged dataset: {}\n".format(merged_df['Date'].iloc[0]))
print("Last date of the merged dataset: {}\n".format(merged_df['Date'].iloc[-1]))

Maximum missing days between two following data points: 5.0 days

Between: 2012-10-25 and 2012-10-31


First date of the merged dataset: 2010-11-15

Last date of the merged dataset: 2016-06-30



In [38]:
forex_data = merged_df[['BidQuote', 'AskQuote']]
daily_news_data = merged_df.drop(['Date', 'BidQuote', 'AskQuote'], axis=1)

forex_data

Unnamed: 0,BidQuote,AskQuote
0,277.247576,277.397576
1,277.254591,277.404591
2,275.087612,275.237612
3,274.197428,274.347428
4,275.578741,275.728741
...,...,...
1124,317.277425,317.490940
1125,317.050972,317.174937
1126,316.879670,316.990106
1127,316.388392,316.502362


In [39]:
daily_news_data

Unnamed: 0,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,Top9,Top10,Top11,Top12,Top13,Top14,Top15,Top16,Top17,Top18,Top19,Top20,Top21,Top22,Top23,Top24,Top25
0,Boris Johnson: George W. Bush cant fight for f...,"Taliban Commander: ""There are no al-Qa'ida fi...",Settlers torch West Bank olive trees - Israeli...,Maariyamma is likely to be killed by her child...,Ireland's young flee abroad as economic meltdo...,"""A two-year-old girl suffering from leukemia d...",Police in Egypt have been accused of beating a...,"""Hong Kong is officially the world's most wast...",Netanyahu presents security cabinet with Clint...,An Entire Village Flees Mexican Drug Violence,A Russian Banker And His 7-Man Team Beat Up A ...,Greece's Deficit Revised to Largest in EU as D...,'Unlawful sex': lovers sentenced to 100 lashes...,U.K. Politician Arrested for 'Glib' Twitter Th...,Hard questions about the new American handouts...,The Internet's new billion: New web users in c...,Interpol hunts two Israelis for Kosovo organ t...,UK Peer: Israel feeds global terrorism - A lon...,Mexico Arrests 12-Year-Old Cartel Hitman,TSA Caves On Molesting Pilots,Impending indictments in Hariri killing could ...,At least 32 killed in building collapse in Ind...,Shanghai high-rise on fire,Letter From Iran - \nSatellite TV channel co-o...,British wife of a millionaire businessman kill...
1,"While you're getting groped by the TSA, here's...",Second squadron of 20 F-35s is an offer hard t...,R. Congresswoman Ros-Lehtinen opposes transfer...,"McDonald's, KFC, PepsiCo &amp; Mars Invited to...",Death sentence for migrant worker - Reddit! I ...,"Preparing for 2016 Olympics, Brazil invades sl...",Guantnamo Bay detainees to be paid millions in...,"Haitians riot, blame UN troops for cholera: Pr...","Mexican paper reports drug war, despite threats",My eyes are sweating: Hero dog who survived th...,Haiti rioters attack UN troops - Anti-UN riots...,More and more Irish people are leaving the str...,Mexico's $80M boom industry: Bulletproof cars,The toll in the collapse of a five-story resid...,How I Became An Oligarch - \nSpeech from Russi...,The difference between life and death in Haiti...,Pakistan vs. Haiti - Which Received More Aid? ...,Extradition of 'arms dealer' Viktor Bout goes ...,Bunga-Bunga Nation: Berlusconi's Italy Hurts W...,Sultan of the Muslim World | Modern Turkey bor...,While the US government is entertaining mind b...,UK government settles out of court with 16 Gua...,Google says China is breaking net laws: \nComp...,UK: London Metropolitan Police force closes do...,Ireland isn't working: Celtic Tiger becomes si...
2,"Major spy scandal as Norway, Denmark, Finland,...",Freedom Of Speech 1 - Police 0. British police...,Evangelical Christians in Brazil have banned t...,UK Government comes out against net neutrality...,BBC picks up the TSA touching my junk story,Ireland: ANGRY protesters gathered at the home...,Tigers Near Extinction: The total number world...,Canada's Senate kills climate change bill\n,Madagascar military officers claim they have t...,Europe's corruption capital: How corruption in...,Australia wants to censor all images of smokin...,How China swallowed 15% of 'Net traffic for 18...,China's 'Me Generation' Sends Divorce Rate Soa...,Muslims set fire to at least 10 houses belongi...,"""Darfur, Zimbabwe, Burma, North Korea, anywher...",The suffering of Iraq's Christians - Some call...,Kim Jong-Un purging senior party and military ...,Sudan registers to vote to split country: Thou...,Get ready to kiss goodbye to net neutrality in...,Israeli retreat on Lebanon border may split vi...,Organs and bones were illegally harvested from...,World's newest nation would start almost from ...,Prime Minister of Madagascar vows to squash re...,Yemen's New Escort Service - Starved for reven...,"Rogue Trader Jerome Kerviel, sentenced to five..."
3,Remand request issued for WikiLeaks founder Ju...,Almost all borrowers in one of Indias largest ...,"13 dead, as strange disease spreads in Uganda:...",Here we go again. Bomb in Luggage found in Nam...,Dutch government wants to ban tourists from bu...,Labour MP Tom Watson has tabled a motion on ne...,Julian Assange challenges Swedish arrest ruling,Chinese woman sentenced to a year in labour ca...,"Here is a link to the blog by Anna Arden, who ...",What's the newest propaganda for occupying Afg...,Why 300 million more people are suddenly poor,"Suu Kyi, UN Secretary General Ban call for rel...",Drug Cartel Violence Erupts in Mexico's Border...,Arrest order for Wikileaks' Julian Assange iss...,China sentences woman to labor camp for Twitte...,French Embassy in Tehran Attacked By Iranian R...,"Pentagon says 2014 Afghan deadline ""aspiration...",India's microcredit industry may be near to co...,"Is China drunk on its new great-power status, ...",South Africas largest circulation weekly newsp...,Hong Kong diagnoses first bird flu case in sev...,Chinas Censors Misfire in Abuse-of-Power Case ...,"Economic crash to drive 100,000 out of Ireland",Unemployed English Girl to Wed Soldier from We...,Japan is ageing faster than any country in his...
4,"Tim Berners-Lee, creator of the world wide web...",BBC News - Tigers threatened by extinction as ...,Vladimir Putin and World Bank chief stage a ca...,A Russia Today news crew has been detained in ...,WTF...Vatican directive on child protection wo...,29 Miners trapped in a coal mine in New Zealan...,At least 339 people killed in stampede in Camb...,'These are human beings we are dealing with' -...,Gallery of photos from November 15th apartment...,[6 year old] A-Long doesnt actually believe th...,Ireland is having a Lehmans moment: The next 2...,Iraq has run out of money to pay for widows' b...,Ireland fears civil unrest as bank crisis deep...,Cell ringtone sparks Indonesian tribal war,Cuba cutting everything but security - Cuba is...,"Chihuahua, Mexico - Cartels threaten to hurt s...",Japan's justice minister resigns after causing...,TSA Forces Woman To Cut Off Her Nipple Rings W...,US Embassy: Beijing Air Quality Is 'Crazy Bad'...,Republic of Ireland confirms EU financial resc...,WHO: A billion people cannot afford any health...,Israeli Soldiers Avoid Jail in Human Shield Ca...,Debt-Crippled Ireland Asks EU for Massive Bail...,Football betting now legal in Vietnam - The Mi...,BBC News - North Korea nuclear plant confirms ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1124,Today The United Kingdom decides whether to re...,"E-cigarettes should not be banned in public, m...",Report: China is still harvesting organs from ...,"Man opens fire at cinema complex in Germany, s...","Erdoan: Europe, you dont want us because were ...",Asian millionaires now control more wealth tha...,A Japanese porn industry association has apolo...,University students are being warned when clas...,Afghan interpreters 'betrayed' by UK and US,Contagious cancer cells are spreading between ...,51 Killed in China by Powerful Tornado,Teacher Killings Ignite Calls for Revolution i...,Solar plane lands in Spain after three-day Atl...,Brexit supporters urged to take own pens to po...,Cities forge world's largest alliance to curb ...,"Colombia, FARC announce full ceasefire, 'last ...",Gunmen kill Sufi devotional singer Amjad Sabri...,India launches 20 satellites in single mission,F-16s to be manufactured soon in an assembly l...,Australia's gun laws stopped mass shootings an...,French cement company in Syria buys oil from I...,Pope to visit Armenia after irking Turkey with...,Merkel says NATO must be strengthened,"China cracks down on online comments, click-ba...",The prime minister of India is set to get a br...
1125,Barclays and RBS shares suspended from trading...,Pope says Church should ask forgiveness from g...,Poland 'shocked' by xenophobic abuse of Poles ...,"There will be no second referendum, cabinet ag...","Scotland welcome to join EU, Merkel ally says",Sterling dips below Friday's 31-year low amid ...,No negative news about South African President...,Surge in Hate Crimes in the U.K. Following U.K...,Weapons shipped into Jordan by the CIA and Sau...,Angela Merkel said the U.K. must file exit pap...,In a birth offering hope to a threatened speci...,Sky News Journalist Left Speechless As Leave M...,Giant panda in Macau gives birth to twins,Get out now: EU leader tells Britain it must i...,Sea turtle 'beaten and left for dead' on beach...,German lawyers to probe Erdogan over alleged w...,"Boris Johnson says the UK will continue to ""in...",Richard Branson is calling on the UK governmen...,Turkey 'sorry for downing Russian jet',Edward Snowden lawyer vows new push for pardon...,Brexit opinion poll reveals majority don't wan...,"Conservative MP Leave Campaigner: ""The leave c...","Economists predict UK recession, further weake...","New EU 'superstate plan by France, Germany: Cr...",Pakistani clerics declare transgender marriage...
1126,"2,500 Scientists To Australia: If You Want To ...","The personal details of 112,000 French police ...",S&amp;P cuts United Kingdom sovereign credit r...,Huge helium deposit found in Africa,CEO of the South African state broadcaster qui...,"Brexit cost investors $2 trillion, the worst o...",Hong Kong democracy activists call for return ...,Brexit: Iceland president says UK can join 'tr...,UK's Osborne: 'Absolutely' going to have to cu...,'Do not let Scotland down now' : Scottish MEP ...,British pound could hit history-making dollar ...,"Merkel vows to strengthen EU, tells UK no 'che...","""Ryanair will not deploy new aircraft on route...","People, ever more greedy and stupid, destroy t...",Siemens freezes new UK wind power investment f...,"US, Canada and Mexico pledge 50% of power from...",There is increasing evidence that Australia is...,"Richard Branson, the founder of Virgin Group, ...","37,000-yr-old skull from Borneo reveals surpri...",Palestinians stone Western Wall worshipers; po...,Jean-Claude Juncker asks Farage: Why are you h...,"""Romanians for Remainians"" offering a new home...",Brexit: Gibraltar in talks with Scotland to st...,8 Suicide Bombers Strike Lebanon,Mexico's security forces routinely use 'sexual...
1127,Explosion At Airport In Istanbul,Yemeni former president: Terrorism is the offs...,UK must accept freedom of movement to access E...,Devastated: scientists too late to captive bre...,British Labor Party leader Jeremy Corbyn loses...,A Muslim Shop in the UK Was Just Firebombed Wh...,Mexican Authorities Sexually Torture Women in ...,UK shares and pound continue to recover,Iceland historian Johannesson wins presidentia...,99-Million-Yr-Old Bird Wings Found Encased in ...,A chatbot programmed by a British teenager has...,The Philippine president-elect said Monday he ...,Former Belgian Prime Minister ridicules Nigel ...,Brexiteer Nigel Farage To EU: 'You're Not Laug...,Islamic State bombings in southern Yemen kill ...,"Escape Tunnel, Dug by Hand, Is Found at Holoca...",The land under Beijing is sinking by as much a...,Car bomb and Anti-Islamic attack on Mosque in ...,Emaciated lions in Taiz Zoo are trapped in blo...,Rupert Murdoch describes Brexit as 'wonderful'...,More than 40 killed in Yemen suicide attacks,Google Found Disastrous Symantec and Norton Vu...,Extremist violence on the rise in Germany: Dom...,BBC News: Labour MPs pass Corbyn no-confidence...,Tiny New Zealand town with 'too many jobs' lau...


Separate train / validation / test data

In [40]:
VALID_SPLIT = 0.15
TEST_SPLIT = 0.1

nb_samples = daily_news_data.shape[0]
valid_size = int(nb_samples*(1-TEST_SPLIT-VALID_SPLIT))
test_size = int(nb_samples*(1-TEST_SPLIT))
X_train, Y_train = daily_news_data[:valid_size], forex_data[:valid_size]
X_valid, Y_valid = daily_news_data[valid_size:test_size], forex_data[valid_size:test_size]
X_test, Y_test = daily_news_data[test_size:], forex_data[test_size:]

print("X train shape: {}".format(X_train.shape))
print("X valid shape: {}".format(X_valid.shape))
print("X test shape: {}".format(X_test.shape))
print("Y train shape: {}".format(Y_train.shape))
print("Y valid shape: {}".format(Y_valid.shape))
print("Y test shape: {}".format(Y_test.shape))

X train shape: (846, 25)
X valid shape: (170, 25)
X test shape: (113, 25)
Y train shape: (846, 2)
Y valid shape: (170, 2)
Y test shape: (113, 2)


Prepare forex data (target values for regression): min-max scaling

In [0]:
y_train_scaler = MinMaxScaler().fit(Y_train)
Y_train = y_train_scaler.transform(Y_train)

y_valid_scaler = MinMaxScaler().fit(Y_valid)
Y_valid = y_valid_scaler.transform(Y_valid)

y_test_scaler = MinMaxScaler().fit(Y_test)
Y_test = y_test_scaler.transform(Y_test)

In [42]:
print(Y_train)
print("\n")
print(Y_valid)
print("\n")
print(Y_test)

[[0.23932076 0.2392677 ]
 [0.23943979 0.2393863 ]
 [0.20266878 0.20274926]
 ...
 [0.7681021  0.76703579]
 [0.75496188 0.75343368]
 [0.7305579  0.72873231]]


[[0.5992391  0.60147973]
 [0.62297615 0.62585461]
 [0.50803807 0.51007221]
 [0.46205627 0.46326435]
 [0.50326808 0.50627741]
 [0.42885302 0.43121767]
 [0.41950874 0.42205707]
 [0.42742663 0.43301605]
 [0.40423746 0.40593047]
 [0.31815848 0.31893864]
 [0.30988888 0.30934816]
 [0.42706391 0.42723716]
 [0.48664942 0.48724231]
 [0.41673953 0.41923279]
 [0.38107365 0.38246955]
 [0.50686005 0.51058485]
 [0.38604468 0.38985477]
 [0.33249388 0.3352116 ]
 [0.44383386 0.44604608]
 [0.34865854 0.35508775]
 [0.35064672 0.35738257]
 [0.30891617 0.31519011]
 [0.35333947 0.3546282 ]
 [0.23805678 0.24551485]
 [0.11152401 0.11377392]
 [0.15731508 0.1614876 ]
 [0.18052572 0.18351509]
 [0.15903172 0.16224849]
 [0.11105724 0.11208332]
 [0.14121927 0.14278733]
 [0.13501964 0.14429081]
 [0.08998789 0.10745438]
 [0.02264005 0.02310806]
 [0.06024144 0.06

Prepare daily news data (input values for regression): 
- merge the headlines into one for a given date
- use glove word embeddings to represent the words

In [45]:
trainheadlines = []
for row in range(0, len(X_train.index)):
    trainheadlines.append(' '.join(str(x) for x in X_train.iloc[row,1:len(X_train.columns)]))

validheadlines = []
for row in range(0,len(X_valid.index)):
    validheadlines.append(' '.join(str(x) for x in X_valid.iloc[row,1:len(X_valid.columns)]))

testheadlines = []
for row in range(0,len(X_test.index)):
    testheadlines.append(' '.join(str(x) for x in X_test.iloc[row,1:len(X_test.columns)]))

print(len(trainheadlines))
print(len(validheadlines))
print(len(testheadlines))

846
170
113


In [0]:
nlp = English()

def filter_headlines(headlines):
    filtered_headlines = []

    for headline in headlines:
        headline = nlp(headline)

        # Create list of word tokens
        token_list = []
        for token in headline:
            token_list.append(token.lemma_)

        filtered_headline = ""

        for word in token_list:
            lexeme = nlp.vocab[word]
            if (lexeme.is_stop == False) and (len(word) > 1):
                if word.startswith("b'") or word.startswith('b"'):
                    word = word[2:]
                filtered_headline = filtered_headline + " " + word

        filtered_headlines.append(filtered_headline)
    return filtered_headlines

filtered_train_headlines = filter_headlines(trainheadlines)
filtered_valid_headlines = filter_headlines(validheadlines)
filtered_test_headlines = filter_headlines(testheadlines)

In [49]:
trainheadlines[0]

'Taliban Commander:  "There are no al-Qa\'ida fighters in Afghanistan any more. I have fought in the south and in the east as well as here. In seven years of operations I have not seen a single al-Qa\'ida fighter. Not one." Settlers torch West Bank olive trees - Israeli settlers have torched hundreds of Palestinian olive trees in the occupied West Bank, in another sign that extremists are targeting a key product of the Palestinian economy. Maariyamma is likely to be killed by her children because they cannot afford her. She is just one of many old parents in a southern Indian state dying in this way. But no one blinks at these ritual murders. Ireland\'s young flee abroad as economic meltdown looms. "A two-year-old girl suffering from leukemia died while waiting for an urgent referral to an Israeli hospital. Since January 2009 a total of 33 patients have died while waiting to access hospitals outside Gaza." Police in Egypt have been accused of beating a teenager to death and dumping his

In [50]:
filtered_train_headlines[0]

" Taliban Commander al Qa'ida fighter Afghanistan fight south east good seven year operation single al Qa'ida fighter Settlers torch West Bank olive tree Israeli settler torched Palestinian olive tree occupy West Bank sign extremist target key product Palestinian economy Maariyamma likely kill child afford old parent southern Indian state dye way blink ritual murder Ireland young flee abroad economic meltdown loom year old girl suffer leukemia die wait urgent referral Israeli hospital January 2009 total 33 patient die wait access hospital outside Gaza Police Egypt accuse beat teenager death dump body canal Hong Kong officially world wasteful city generate little 6.5 tonne rubbish year Netanyahu present security cabinet Clinton incentive 20 F-35 fighter plane security guarantee exchange 90-day West Bank build moratorium Entire Village Flees Mexican Drug Violence    Russian Banker 7-Man Team Beat Dutch Energy Executive Greece Deficit Revised Largest EU Debt Tops Italy Unlawful sex lover 

In [0]:
MAX_NUM_WORDS = 30000
EMBEDDING_DIM = 100

tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(filtered_train_headlines) # add valid/test headlines for initialization too? (otherwise words that do not exist in the train data won't be present after tokenization)
word_index = tokenizer.word_index

sequences_train = tokenizer.texts_to_sequences(filtered_train_headlines)
sequences_valid = tokenizer.texts_to_sequences(filtered_valid_headlines)
sequences_test = tokenizer.texts_to_sequences(filtered_test_headlines)

In [52]:
len(word_index)

19470

In [53]:
def determine_max_headline_length(sequences):
    max = 0
    for i, sequence in enumerate(sequences):
        if i == 0:
            max = len(sequence)
        else:
            if len(sequence) > max:
                max = len(sequence)
    return max

MAX_SEQUENCE_LENGTH = determine_max_headline_length(sequences_train) # consider here valid/test headlines too?

X_train = sequence.pad_sequences(sequences_train, maxlen=MAX_SEQUENCE_LENGTH)
X_valid = sequence.pad_sequences(sequences_valid, maxlen=MAX_SEQUENCE_LENGTH)
X_test = sequence.pad_sequences(sequences_test, maxlen=MAX_SEQUENCE_LENGTH)
print(X_train.shape)
print(X_valid.shape)
print(X_test.shape)

(846, 381)
(170, 381)
(113, 381)


# Deep learning model
to-be-detailed

In [0]:
# https://nlp.stanford.edu/projects/glove/
# https://keras.io/examples/pretrained_word_embeddings/
# https://keras.io/layers/embeddings/#embedding

embeddings_index = {}
with open(GLOVE_PATH) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, 'f', sep=' ')
        embeddings_index[word] = coefs

num_words = min(MAX_NUM_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= MAX_NUM_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)


In [55]:
print(embedding_matrix)
print("\n")
print(embedding_matrix.shape)

[[ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.44233999  0.48431     0.37283999 ...  0.38253     0.47455999
  -0.6505    ]
 [-0.043959    0.18935999  0.66109997 ... -0.14168     0.92789
   0.59057999]
 ...
 [ 0.30467001 -0.29335001  0.82725    ... -0.40208    -0.19690999
  -0.46224999]
 [-0.2419     -0.14302    -0.68474001 ... -0.39947999  0.55575001
  -0.30636001]
 [ 0.48288    -0.14586    -0.051133   ...  0.35464001 -0.26153001
   0.29449999]]


(19471, 100)


In [0]:
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
x = Conv1D(128, 5, activation='relu')(embedded_sequences)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = GlobalMaxPooling1D()(x)
x = Dense(128, activation='relu')(x)
x = Dropout(0.2)(x)
x = Dense(64, activation='relu')(x)
x = Dropout(0.2)(x)
preds = Dense(2, activation='sigmoid')(x)
model = Model(sequence_input, preds)

# Use Adam as optimizer
opt = Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, amsgrad=False)

model.compile(loss='mean_squared_error',
              optimizer=opt)

In [61]:
model.summary()

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 381)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 381, 100)          1947100   
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 377, 128)          64128     
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 75, 128)           0         
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 71, 128)           82048     
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 14, 128)           0         
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 10, 128)           8204

In [59]:
# Use callbacks
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=20)
mc = ModelCheckpoint(filepath='best_model.h5', verbose=1, save_best_only=True)

# Train
model.fit(X_train, Y_train,
          batch_size=128,
          epochs=300,
          validation_data=(X_valid, Y_valid),
          callbacks=[es, mc])




Train on 846 samples, validate on 170 samples
Epoch 1/300






Epoch 00001: val_loss improved from inf to 0.05828, saving model to best_model.h5
Epoch 2/300

Epoch 00002: val_loss did not improve from 0.05828
Epoch 3/300

Epoch 00003: val_loss improved from 0.05828 to 0.05788, saving model to best_model.h5
Epoch 4/300

Epoch 00004: val_loss improved from 0.05788 to 0.05704, saving model to best_model.h5
Epoch 5/300

Epoch 00005: val_loss did not improve from 0.05704
Epoch 6/300

Epoch 00006: val_loss did not improve from 0.05704
Epoch 7/300

Epoch 00007: val_loss improved from 0.05704 to 0.05622, saving model to best_model.h5
Epoch 8/300

Epoch 00008: val_loss did not improve from 0.05622
Epoch 9/300

Epoch 00009: val_loss did not improve from 0.05622
Epoch 10/300

Epoch 00010: val_loss improved from 0.05622 to 0.05604, saving model to best_model.h5
Epoch 11/300

Epoch 00011: val_loss did not improve from 0.05604
Epoch 12/300

Epoch 00012: val_loss did not improve from 0.05604
Epoc

<keras.callbacks.History at 0x7fc8c44f8860>

In [60]:
# Test
saved_model = load_model('best_model.h5')
score = saved_model.evaluate(X_test, Y_test, batch_size=8)
score



0.06286979173268892

# Questions:
# 1. do we set the embedding matrix with only the samples of our (train) corpus, or rather with the whole glove data corpus?
# 2. tokenizer: add valid/test headlines for initialization too? (otherwise words that do not exist in the train data won't be present after tokenization)
# 3. determination of max sequence length: consider valid/test data as well?
# 4. lemme representation / stop words --> good idea to use them?