# Import

In [2]:
import numpy as np
import pandas as pd
from numpy import genfromtxt
import seaborn as sns
import matplotlib.pyplot as plt
import random
import sys
import matplotlib.dates as mdates

# Dataset

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/datasets/wsj_headlines_raw.csv', low_memory=False, on_bad_lines='skip')
df

Unnamed: 0,date,headline,url,theme,timestamp
0,2000-01-01,Who We Are,https://www.wsj.com/articles/SB126832890550959873,Opinion,11:59 PM ET
1,2000-01-01,Who We Are,https://www.wsj.com/articles/SB126832912359259879,Opinion,11:59 PM ET
2,2000-01-01,Who We Are,https://www.wsj.com/articles/SB126832933750960063,Opinion,11:59 PM ET
3,2000-01-01,Who We Are,https://www.wsj.com/articles/SB126832997709060067,Opinion,11:59 PM ET
4,2000-01-01,Who We Are,https://www.wsj.com/articles/SB126833018968160071,Opinion,11:59 PM ET
...,...,...,...,...,...
1199184,2024-01-01,Kim Jong Un’s 2024 Wish List: More Nuclear Bom...,https://www.wsj.com/world/asia/kim-jong-uns-20...,Asia,2:43 AM ET
1199185,2024-01-01,China’s Xi Is Resurrecting Mao’s ‘Continuous R...,https://www.wsj.com/world/china/chinas-xi-is-r...,China,12:08 AM ET
1199186,2024-01-01,Israeli Families Band Together to Keep Governm...,https://www.wsj.com/world/middle-east/israeli-...,Middle East,12:08 AM ET
1199187,2024-01-01,Western Anxiety About Chinese EVs Could Prove ...,https://www.wsj.com/business/autos/us-europe-a...,HEARD ON THE STREET,12:06 AM ET


In [5]:
df.isnull().sum()

date          0
headline      0
url           0
theme        14
timestamp     0
dtype: int64

In [6]:
df = df.drop_duplicates(subset=['date', 'headline'])
df

Unnamed: 0,date,headline,url,theme,timestamp
0,2000-01-01,Who We Are,https://www.wsj.com/articles/SB126832890550959873,Opinion,11:59 PM ET
44,2000-01-01,OPINIONJOURNAL FEDERATION,https://www.wsj.com/articles/SB126841668161761083,Opinion,11:59 PM ET
45,2000-01-01,The Future Is Now...,https://www.wsj.com/articles/SB944516725378711715,Millennium Edition: Futurology,11:59 PM ET
46,2000-01-01,"Through the Lens, Darkly",https://www.wsj.com/articles/SB944517129778273589,Millennium Edition: Futurology,11:59 PM ET
47,2000-01-01,The Words of Tomorrow,https://www.wsj.com/articles/SB944517141695981261,Millennium Edition: Futurology,11:59 PM ET
...,...,...,...,...,...
1199184,2024-01-01,Kim Jong Un’s 2024 Wish List: More Nuclear Bom...,https://www.wsj.com/world/asia/kim-jong-uns-20...,Asia,2:43 AM ET
1199185,2024-01-01,China’s Xi Is Resurrecting Mao’s ‘Continuous R...,https://www.wsj.com/world/china/chinas-xi-is-r...,China,12:08 AM ET
1199186,2024-01-01,Israeli Families Band Together to Keep Governm...,https://www.wsj.com/world/middle-east/israeli-...,Middle East,12:08 AM ET
1199187,2024-01-01,Western Anxiety About Chinese EVs Could Prove ...,https://www.wsj.com/business/autos/us-europe-a...,HEARD ON THE STREET,12:06 AM ET


# Text Pre-processing and Cleaning

In [7]:
import re

def clean(text):
    if isinstance(text, str):
      text = text.lower()  # lower case
      text = re.sub(r'[^\w\s]', '', text)  # remove punctuation
    return text

df['headline'] = df['headline'].apply(clean)
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['headline'] = df['headline'].apply(clean)


Unnamed: 0,date,headline,url,theme,timestamp
0,2000-01-01,who we are,https://www.wsj.com/articles/SB126832890550959873,Opinion,11:59 PM ET
44,2000-01-01,opinionjournal federation,https://www.wsj.com/articles/SB126841668161761083,Opinion,11:59 PM ET
45,2000-01-01,the future is now,https://www.wsj.com/articles/SB944516725378711715,Millennium Edition: Futurology,11:59 PM ET
46,2000-01-01,through the lens darkly,https://www.wsj.com/articles/SB944517129778273589,Millennium Edition: Futurology,11:59 PM ET
47,2000-01-01,the words of tomorrow,https://www.wsj.com/articles/SB944517141695981261,Millennium Edition: Futurology,11:59 PM ET
...,...,...,...,...,...
1199184,2024-01-01,kim jong uns 2024 wish list more nuclear bombs...,https://www.wsj.com/world/asia/kim-jong-uns-20...,Asia,2:43 AM ET
1199185,2024-01-01,chinas xi is resurrecting maos continuous revo...,https://www.wsj.com/world/china/chinas-xi-is-r...,China,12:08 AM ET
1199186,2024-01-01,israeli families band together to keep governm...,https://www.wsj.com/world/middle-east/israeli-...,Middle East,12:08 AM ET
1199187,2024-01-01,western anxiety about chinese evs could prove ...,https://www.wsj.com/business/autos/us-europe-a...,HEARD ON THE STREET,12:06 AM ET


In [8]:
df['theme'].unique()

array(['Opinion', 'Millennium Edition: Futurology',
       'Millennium Edition: Industry and Economics', ..., 'on trend',
       'Moving Targets | Joe Queenan', 'the Numbers | josh zumbrun'],
      dtype=object)

In [9]:
unique_themes = df['theme'].unique()

for theme in unique_themes:
    print(theme)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Long & Short Mailbox
The Fight for Iraq
Business Schools: Recruiters' Top Picks
How's Your Drink?
JR: Encore Jan 2006
The Microsoft Case
Author Q&A
TABS ON TECH
JR: Small Business Jan 2006
Commentary: The Weekend Interview
Market Movers
Economic Forecasting Survey
From the Archive: Weekend Interview
Here and There
JR: Technology Feb 2006
The Grapevine
Market Beat
JR: Shareholder Scoreboard 2006
SHAREHOLDER SCOREBOARD
Did It Myself
JR: NCAA Basketball 2006
THE JOURNAL REPORT: NCAA Basketball Tournament
WSJ Online/Harris Interactive Personal Finance Poll
JR: Personal Health March 2006
YOUR MONEY MATTERS
JR: Your Money Matters March 2006
Economists React
JR: Technology April 2006
JR: SMQ 1-2006
THE JOURNAL REPORT: TECHNOLOGY
Reply All
JR: MFQ 1-2006
MUTUAL FUNDS QUARTERLY REVIEW
MarketBeat
JR: Executive Compensation April 2006
THE JOURNAL REPORT: The WSJ 350: A Survey of CEO Compensation
The View From Fantasyland
JR: Automot

In [10]:
df['theme'] = df['theme'].apply(clean)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['theme'] = df['theme'].apply(clean)


# Extract Rows w/ related FX

In [10]:
pattern = r'foreign exchange|currencies|currency|FX '

filtered_df = df[df['theme'].str.contains(pattern, case=False, na=False)]

filtered_df

Unnamed: 0,date,headline,url,theme,timestamp
238,2000-01-03,currency traders likely\r\nto stay on the side...,https://www.wsj.com/articles/SB946861157559880545,foreign exchange,12:17 AM ET
358,2000-01-04,bank of japan moves\r\nto curb dollars fall,https://www.wsj.com/articles/SB946912050945363247,foreign exchange,5:58 AM ET
545,2000-01-05,dollar falls vs the euro\r\nbut climbs against...,https://www.wsj.com/articles/SB946993057516960291,foreign exchange,12:01 AM ET
707,2000-01-06,dollar hits high against yen\r\nbut other curr...,https://www.wsj.com/articles/SB947084205874848096,foreign exchange,12:01 AM ET
885,2000-01-07,dollar hits sevenweek high against yen holds o...,https://www.wsj.com/articles/SB947171768440915812,foreign exchange,12:01 AM ET
...,...,...,...,...,...
1196867,2023-11-29,binance begins again with us oversight will it...,https://www.wsj.com/finance/currencies/binance...,cryptocurrency,5:30 AM ET
1197230,2023-12-04,binance copped a 4 billion plea but is still f...,https://www.wsj.com/finance/regulation/binance...,cryptocurrency,5:30 AM ET
1198763,2023-12-24,the crypto queen pulling the strings at binance,https://www.wsj.com/finance/currencies/the-cry...,cryptocurrency,12:00 AM ET
1199004,2023-12-28,big battles loom in secs war on crypto,https://www.wsj.com/finance/regulation/big-bat...,cryptocurrency,5:30 AM ET


In [11]:
filtered_df['theme'].unique()

array(['foreign exchange', 'infogrfx stat snapshot videodvd',
       'infogrfx stat snapshot magazines',
       'infogrfx stat snapshot marketing',
       'infogrfx tabs on tech pcs hardware',
       'infogrfx tabs on tech software games',
       'infogrfx tabs on tech gadgets', 'infogrfx go figure careers',
       'infogrfx stat snapshot television',
       'infogrfx tabs on tech internet', 'infogrfx stat snapshot books',
       'infogrfx go figure mutual funds', 'infogrfx tabs on tech telecom',
       'infogrfx go figure health', 'infogrfx stat snapshot music',
       'infogrfx stat snapshot movies', 'infogrfx go figure travel',
       'infogrfx go figure autos', 'infogrfx go figure money',
       'infogrfx go figure home', 'infogrfx slide show',
       'infogrfx middle east photo', 'infogrfx photos of the day',
       'infogrfx fashion photos', 'infogrfx house of the week',
       'infogrfx slide show mini', 'currencies', 'currency trading',
       'currency markets', 'djfx trader',

In [12]:
filtered_df = filtered_df[~filtered_df['theme'].str.contains('infogrfx', case=False, na=False)]
filtered_df['theme'].unique()

array(['foreign exchange', 'currencies', 'currency trading',
       'currency markets', 'djfx trader', 'foreign exchange report',
       'fx horizons', 'dj fx trader', 'fx math', 'fx global call',
       'fx asia', 'fx view', 'fx industry roundup',
       'dj fx trader podcast', 'fx news interactive', 'fx trader podcast',
       'fx podcast', 'currency talk', 'currency', 'cryptocurrency'],
      dtype=object)

In [13]:
filtered_df

Unnamed: 0,date,headline,url,theme,timestamp
238,2000-01-03,currency traders likely\r\nto stay on the side...,https://www.wsj.com/articles/SB946861157559880545,foreign exchange,12:17 AM ET
358,2000-01-04,bank of japan moves\r\nto curb dollars fall,https://www.wsj.com/articles/SB946912050945363247,foreign exchange,5:58 AM ET
545,2000-01-05,dollar falls vs the euro\r\nbut climbs against...,https://www.wsj.com/articles/SB946993057516960291,foreign exchange,12:01 AM ET
707,2000-01-06,dollar hits high against yen\r\nbut other curr...,https://www.wsj.com/articles/SB947084205874848096,foreign exchange,12:01 AM ET
885,2000-01-07,dollar hits sevenweek high against yen holds o...,https://www.wsj.com/articles/SB947171768440915812,foreign exchange,12:01 AM ET
...,...,...,...,...,...
1196867,2023-11-29,binance begins again with us oversight will it...,https://www.wsj.com/finance/currencies/binance...,cryptocurrency,5:30 AM ET
1197230,2023-12-04,binance copped a 4 billion plea but is still f...,https://www.wsj.com/finance/regulation/binance...,cryptocurrency,5:30 AM ET
1198763,2023-12-24,the crypto queen pulling the strings at binance,https://www.wsj.com/finance/currencies/the-cry...,cryptocurrency,12:00 AM ET
1199004,2023-12-28,big battles loom in secs war on crypto,https://www.wsj.com/finance/regulation/big-bat...,cryptocurrency,5:30 AM ET


In [14]:
filtered_df.to_csv('/content/drive/MyDrive/Colab Notebooks/datasets/fx_only.csv', index=False)

In [15]:
fx = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/datasets/fx_only.csv')
fx

Unnamed: 0,date,headline,url,theme,timestamp
0,2000-01-03,currency traders likely\r\nto stay on the side...,https://www.wsj.com/articles/SB946861157559880545,foreign exchange,12:17 AM ET
1,2000-01-04,bank of japan moves\r\nto curb dollars fall,https://www.wsj.com/articles/SB946912050945363247,foreign exchange,5:58 AM ET
2,2000-01-05,dollar falls vs the euro\r\nbut climbs against...,https://www.wsj.com/articles/SB946993057516960291,foreign exchange,12:01 AM ET
3,2000-01-06,dollar hits high against yen\r\nbut other curr...,https://www.wsj.com/articles/SB947084205874848096,foreign exchange,12:01 AM ET
4,2000-01-07,dollar hits sevenweek high against yen holds o...,https://www.wsj.com/articles/SB947171768440915812,foreign exchange,12:01 AM ET
...,...,...,...,...,...
10943,2023-11-29,binance begins again with us oversight will it...,https://www.wsj.com/finance/currencies/binance...,cryptocurrency,5:30 AM ET
10944,2023-12-04,binance copped a 4 billion plea but is still f...,https://www.wsj.com/finance/regulation/binance...,cryptocurrency,5:30 AM ET
10945,2023-12-24,the crypto queen pulling the strings at binance,https://www.wsj.com/finance/currencies/the-cry...,cryptocurrency,12:00 AM ET
10946,2023-12-28,big battles loom in secs war on crypto,https://www.wsj.com/finance/regulation/big-bat...,cryptocurrency,5:30 AM ET


# Extract Relevant Themes Other than FX

In [11]:
df

Unnamed: 0,date,headline,url,theme,timestamp
0,2000-01-01,who we are,https://www.wsj.com/articles/SB126832890550959873,opinion,11:59 PM ET
44,2000-01-01,opinionjournal federation,https://www.wsj.com/articles/SB126841668161761083,opinion,11:59 PM ET
45,2000-01-01,the future is now,https://www.wsj.com/articles/SB944516725378711715,millennium edition futurology,11:59 PM ET
46,2000-01-01,through the lens darkly,https://www.wsj.com/articles/SB944517129778273589,millennium edition futurology,11:59 PM ET
47,2000-01-01,the words of tomorrow,https://www.wsj.com/articles/SB944517141695981261,millennium edition futurology,11:59 PM ET
...,...,...,...,...,...
1199184,2024-01-01,kim jong uns 2024 wish list more nuclear bombs...,https://www.wsj.com/world/asia/kim-jong-uns-20...,asia,2:43 AM ET
1199185,2024-01-01,chinas xi is resurrecting maos continuous revo...,https://www.wsj.com/world/china/chinas-xi-is-r...,china,12:08 AM ET
1199186,2024-01-01,israeli families band together to keep governm...,https://www.wsj.com/world/middle-east/israeli-...,middle east,12:08 AM ET
1199187,2024-01-01,western anxiety about chinese evs could prove ...,https://www.wsj.com/business/autos/us-europe-a...,heard on the street,12:06 AM ET


In [11]:
unique_themes = df['theme'].unique()

for theme in unique_themes:
    print(theme)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
fitness
imf  world bank in prague
open that bottle night
us business news
movies
running a business
the new economy emerging markets
other
special report world business
wine quiz
global finance
quarterly stock market review
no theme
crisis in the mideast
special report asian economic survey
special report the gehoneywell deal
word mine
hollywood journal center
the new economy europes entrepreneurs
page one
from feer
election 2000
dotcom dominoes
dotcom dominos
special report internet
under the radar center
special report asias value creators
special report asia146s value creators
special report the year in technology 2000
special report americas dysfunctional voting system
the end of irrational exuberance
capital journal center
something ventured
nikkei net interactive headlines
capital
great expectations
capital exchange
dividing the skies
career news
issue briefings
special report california power crisis
business
tales 

In [23]:
df_cleaned = df.dropna(subset=['theme'])

keywords = ['u.s.','china','asia', 'trade war', 'tariffs', 'fed talk', 'asia news', 'world markets', 'us news', 'fed'
            'chinas money trail', 'chinese stocks','asian stocks', 'foreign policy', 'fx trader', 'forex focus', 'market report news', 'markets review  outlook',
            'politics', 'markets', 'foreign affairs', 'investors', 'geopolitics', 'currency markets', 'diplomacy', 'trade'
            'china 2008', 'china 2009', 'beijing olympics 2008', 'us-china relations', 'hedge funds', 'in the money',
            'interest rates', 'currency', 'currencies', 'stock market review', 'gdp', 'economics', 'monetary policy']

filtered_themes = df_cleaned[df_cleaned['theme'].str.contains('|'.join(keywords), case=False)]
filtered_themes

Unnamed: 0,date,headline,url,theme,timestamp
49,2000-01-01,talking about\r\ntomorrow\r\npeter drucker,https://www.wsj.com/articles/SB944517413185622361,millennium edition industry and economics,11:59 PM ET
50,2000-01-01,talking about\r\ntomorrow\r\ngeorge yeo,https://www.wsj.com/articles/SB944517449201534525,millennium edition industry and economics,11:59 PM ET
51,2000-01-01,talking about\r\ntomorrow\r\nronald coase,https://www.wsj.com/articles/SB944517458696055798,millennium edition industry and economics,11:59 PM ET
52,2000-01-01,think small,https://www.wsj.com/articles/SB944517534664949434,millennium edition industry and economics,11:59 PM ET
53,2000-01-01,unreal world,https://www.wsj.com/articles/SB944517543230615223,millennium edition industry and economics,11:59 PM ET
...,...,...,...,...,...
1199138,2023-12-31,investors have cut chinas internet giants down...,https://www.wsj.com/finance/stocks/investors-h...,markets,12:20 AM ET
1199181,2024-01-01,optimism abounds on wall street this new year,https://www.wsj.com/finance/optimism-abounds-w...,markets,5:30 AM ET
1199184,2024-01-01,kim jong uns 2024 wish list more nuclear bombs...,https://www.wsj.com/world/asia/kim-jong-uns-20...,asia,2:43 AM ET
1199185,2024-01-01,chinas xi is resurrecting maos continuous revo...,https://www.wsj.com/world/china/chinas-xi-is-r...,china,12:08 AM ET


In [24]:
filtered_themes.to_csv('/content/drive/MyDrive/Colab Notebooks/datasets/extracted_themes.csv', index=False)

# SPACY

In [None]:
# Named Entity Recognition (NER) using the spaCy library to extract named entities (ie IMF, World Bank, NASDAQ, Geopolitical entities etc)

import pandas as pd
import spacy
nlp = spacy.load("en_core_web_sm")


def extract_named_entities(text):
    doc = nlp(text)
    named_entities = [ent.text for ent in doc.ents if ent.label_ in ['ORG', 'GPE']]
    return named_entities

df['headline_entities'] = df['headline'].apply(extract_named_entities)
df['theme_entities'] = df['theme'].apply(extract_named_entities)

df['all_entities'] = df['headline_entities'] + df['theme_entities']

us_china_related = df[df['all_entities'].apply(lambda x: 'United States' in x or 'China' in x)]

us_china_related

# LDA

In [21]:
from gensim import corpora, models
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import gensim

nltk.download('stopwords')
import nltk
nltk.download('punkt')

#LDA model

df_cleaned = df.dropna(subset=['theme'])

tokenized_headlines = []
for theme in df_cleaned['theme']:
    if isinstance(theme, str):
        tokenized_headlines.append(word_tokenize(theme.lower()))

stop_words = set(stopwords.words('english'))
filtered_headlines = [[word for word in headline if word not in stop_words] for headline in tokenized_headlines]

dictionary = corpora.Dictionary(filtered_headlines)
corpus = [dictionary.doc2bow(headline) for headline in filtered_headlines]

# LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus, num_topics=5, id2word=dictionary, passes=15)

topics = lda_model.print_topics(num_words=5)

for topic in topics:
    print(topic)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


(0, '0.178*"letters" + 0.137*"street" + 0.126*"heard" + 0.118*"politics" + 0.059*"economy"')
(1, '0.106*"exclusive" + 0.106*"journal" + 0.058*"finance" + 0.049*"health" + 0.045*"reports"')
(2, '0.203*"business" + 0.157*"commentary" + 0.151*"world" + 0.109*"bookshelf" + 0.034*"salt"')
(3, '0.097*"tech" + 0.084*"pro" + 0.043*"report" + 0.042*"industry" + 0.039*"bankruptcy"')
(4, '0.254*"us" + 0.216*"review" + 0.139*"outlook" + 0.113*"markets" + 0.027*"best"')
