## Imports

In [1]:
import pandas as pd
import numpy as np

import datetime as dt

import os

In [2]:
files = [f for f in os.listdir('./data/press_releases/') if 'csv' in f]
files

['apple_press_releases.csv',
 'cvs_health_press_releases.csv',
 'exxon_mobil_press_releases.csv',
 'walmart_press_releases.csv',
 'amazon_press_releases.csv']

## Finding the dates for each file and assigning the label

### Apple

In [3]:
# find the dates
file = [f for f in files if 'apple' in f]
file
df = pd.read_csv(f'./data/press_releases/{file[0]}')

for row in range(len(df)):
    link_list = [i.strip() for i in df.loc[row,'full_link'].split('/') if i.strip() != '']
    df.loc[row,'time'] = '-'.join(link_list[3:5])

df['full_text'] = df['title'] + df['body']

df.time = df.time.map(lambda x: dt.datetime.strptime(x,'%Y-%m'))
df['year'] = df.time.map(lambda x: x.year)

df

In [4]:
# assign the label

df['label'] = df.year.map(lambda x: 'after' if x >= 2020 else 'before')

In [5]:
# save it back

df.to_csv(f'./data/press_releases/{file[0]}', index = False)

### CVS Health

In [6]:
file = [f for f in files if 'cvs' in f]
df = pd.read_csv(f'./data/press_releases/{file[0]}')

years = [str(i) for i in range(2019,2022)]

for row in range(len(df)):
    para_list = [i.strip() for i in df.loc[row,'body'].split('\n') if i.strip() != '']
    df.loc[row,'time'] = '-'.join(para_list[3:6])
    
    for year in years:
        if (year in para_list[324]) & (len(para_list[324])<30):
            df.loc[row,'time'] = ','.join(para_list[324].split(',')[-2:]).strip()
            break
        
        elif (year in para_list[334]) & (len(para_list[334])<30):
            df.loc[row,'time'] = ','.join(para_list[334].split(',')[-2:]).strip()
            break
        
        else:
            df.loc[row,'time'] = None

df = df.dropna().reset_index(drop = True)

df['full_text'] = df.title + df.body

for row in range(len(df)):
    try:
        df.loc[row,'time'] = dt.datetime.strptime(df.loc[row,'time'],'%B %d, %Y')
        df['year'] = df.loc[row,'time'].year
    except:
        print(row)
        
df

In [7]:
# assign the label

df['label'] = df.year.map(lambda x: 'after' if x >= 2020 else 'before')

In [8]:
# save it back

df.to_csv(f'./data/press_releases/{file[0]}', index = False)

### Exxon Mobil

In [9]:
file = [f for f in files if 'exxon' in f]
df = pd.read_csv(f'./data/press_releases/{file[0]}')

for row in range(len(df)):

    para_list = [i.strip() for i in df.loc[row,'body'].split('\n') if i.strip() != '']

    for year in years:
        if (year in para_list[87])& (len(para_list[87])<20):
            df.loc[row,'time'] = ','.join(para_list[87].split(',')[-2:]).strip()
            break

        elif (year in para_list[86])& (len(para_list[86])<20):
            df.loc[row,'time'] = ','.join(para_list[86].split(',')[-2:]).strip()
            break
        
        elif (year in para_list[77])& (len(para_list[77])<20):
            df.loc[row,'time'] = ','.join(para_list[77].split(',')[-2:]).strip()
            break
        
        elif (year in para_list[74])& (len(para_list[74])<20):
            df.loc[row,'time'] = ','.join(para_list[74].split(',')[-2:]).strip()
            break
        
        else:
            df.loc[row,'time'] = None

            
df['full_text'] = df.title + df.body


months = {
    'Jan.':'January',
    'Feb.':'February',
    'Mar.':'March',
    'Apr.':'April',
    'Jun.':'June',
    'Jul.':'July',
    'Aug.':'August',
    'Sept.':'September',
    'Oct.':'October',
    'Nov.': 'November',
    'Dec.':'December'
}

for row in range(len(df)):
    for old,new in months.items():
        df.loc[row,'time']=df.loc[row,'time'].replace(old,new).replace('   •','')
            
for row in range(len(df)):
    try:
        df.loc[row,'time'] = dt.datetime.strptime(df.loc[row,'time'],'%B %d, %Y')
        df['year'] = df.loc[row,'time'].year
    except:
        print(row)

df

Unnamed: 0,full_link,time,title,body,html,full_text,year
0,https://corporate.exxonmobil.com/News/Newsroom...,2021-03-11 00:00:00,Darren Woods shares strategy for long term gro...,\n\nSkip to content\n\n\n\n\n\n\n\n\n\n\n\n\n\...,"<!DOCTYPE html>\n<!--[if IE 9]><html lang=""en""...",Darren Woods shares strategy for long term gro...,2019
1,https://corporate.exxonmobil.com/News/Newsroom...,2021-03-03 00:00:00,ExxonMobil outlines plans to grow long-term sh...,\n\nSkip to content\n\n\n\nImportant Additiona...,"<!DOCTYPE html>\n<!--[if IE 9]><html lang=""en""...",ExxonMobil outlines plans to grow long-term sh...,2019
2,https://corporate.exxonmobil.com/News/Newsroom...,2021-03-02 00:00:00,ExxonMobil announces Singapore workforce reduc...,\n\nSkip to content\n\n\n\n\n\n\n\n\n\n\n\n\n\...,"<!DOCTYPE html>\n<!--[if IE 9]><html lang=""en""...",ExxonMobil announces Singapore workforce reduc...,2019
3,https://corporate.exxonmobil.com/News/Newsroom...,2021-03-01 00:00:00,Neil Duffin to retire as president of ExxonMob...,\n\nSkip to content\n\n\n\nImportant Additiona...,"<!DOCTYPE html>\n<!--[if IE 9]><html lang=""en""...",Neil Duffin to retire as president of ExxonMob...,2019
4,https://corporate.exxonmobil.com/News/Newsroom...,2021-03-01 00:00:00,Michael Angelakis and Jeffrey Ubben join Exxon...,\n\nSkip to content\n\n\n\nImportant Additiona...,"<!DOCTYPE html>\n<!--[if IE 9]><html lang=""en""...",Michael Angelakis and Jeffrey Ubben join Exxon...,2019
...,...,...,...,...,...,...,...
138,https://corporate.exxonmobil.com/News/Newsroom...,2019-01-29 00:00:00,ExxonMobil to proceed with new crude unit as p...,\n\nSkip to content\n\n\n\n\n\n\n\n\n\n\n\n\n\...,"<!DOCTYPE html>\n<!--[if IE 9]><html lang=""en""...",ExxonMobil to proceed with new crude unit as p...,2019
139,https://corporate.exxonmobil.com/News/Newsroom...,2019-01-28 00:00:00,ExxonMobil to release fourth quarter and full ...,\n\nSkip to content\n\n\n\n\n\n\n\n\n\n\n\n\n\...,"<!DOCTYPE html>\n<!--[if IE 9]><html lang=""en""...",ExxonMobil to release fourth quarter and full ...,2019
140,https://corporate.exxonmobil.com/News/Newsroom...,2019-01-23 00:00:00,"ExxonMobil partners with REG, Clariant to adva...",\n\nSkip to content\n\n\n\n\n\n\n\n\n\n\n\n\n\...,"<!DOCTYPE html>\n<!--[if IE 9]><html lang=""en""...","ExxonMobil partners with REG, Clariant to adva...",2019
141,https://corporate.exxonmobil.com/News/Newsroom...,2019-01-08 00:00:00,ExxonMobil and IBM to advance energy sector ap...,\n\nSkip to content\n\n\n\n\n\n\n\n\n\n\n\n\n\...,"<!DOCTYPE html>\n<!--[if IE 9]><html lang=""en""...",ExxonMobil and IBM to advance energy sector ap...,2019


In [10]:
# assign the label

df['label'] = df.year.map(lambda x: 'after' if x >= 2020 else 'before')

['exxon_mobil_press_releases.csv']


In [11]:
# save it back

df.to_csv(f'./data/press_releases/{file[0]}', index = False)

### Walmart

In [12]:
file = [f for f in files if 'walmart' in f]
df = pd.read_csv(f'./data/press_releases/{file[0]}')

for row in range(len(df)):
    para_list = [i.strip() for i in df.loc[row,'full_link'].split('/') if i.strip() != '']
    df.loc[row,'time'] = '-'.join(para_list[3:6])


df['full_text'] = df.title + df.body

for row in range(len(df)):
    try:
        df.loc[row,'time'] = dt.datetime.strptime(df.loc[row,'time'],'%Y-%m-%d')
        df.loc[row,'year'] = df.loc[row,'time'].year
    except:
        print(row)
        
df

Unnamed: 0,full_link,time,title,body,html,full_text,year
0,https://corporate.walmart.com/newsroom/2021/03...,2021-03-09 00:00:00,"Walmart Doubles Down on TikTok Shopping, Hosts...",\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n© 2021 Wal...,"<!DOCTYPE html>\n<html class=""PressReleasePage...","Walmart Doubles Down on TikTok Shopping, Hosts...",2021.0
1,https://corporate.walmart.com/newsroom/2021/03...,2021-03-05 00:00:00,Walmart Board of Directors Adds Former AT&T Ch...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n© 2021 Wal...,"<!DOCTYPE html>\n<html class=""PressReleasePage...",Walmart Board of Directors Adds Former AT&T Ch...,2021.0
2,https://corporate.walmart.com/newsroom/2021/03...,2021-03-04 00:00:00,Walmart Maintains Relentless Focus on Growing ...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n© 2021 Wal...,"<!DOCTYPE html>\n<html class=""PressReleasePage...",Walmart Maintains Relentless Focus on Growing ...,2021.0
3,https://corporate.walmart.com/newsroom/2021/03...,2021-03-04 00:00:00,Walmart to Present at Bank of America 2021 Con...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n© 2021 Wal...,"<!DOCTYPE html>\n<html class=""PressReleasePage...",Walmart to Present at Bank of America 2021 Con...,2021.0
4,https://corporate.walmart.com/newsroom/2021/03...,2021-03-03 00:00:00,"Investing in the Future of U.S. Manufacturing,...",\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n© 2021 Wal...,"<!DOCTYPE html>\n<html class=""PressReleasePage...","Investing in the Future of U.S. Manufacturing,...",2021.0
...,...,...,...,...,...,...,...
431,https://corporate.walmart.com/newsroom/2019/01...,2019-01-17 00:00:00,Walmart Expands Its Grocery Delivery Service P...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n© 2021 Wal...,"<!DOCTYPE html>\n<html class=""PressReleasePage...",Walmart Expands Its Grocery Delivery Service P...,2019.0
432,https://corporate.walmart.com/newsroom/2019/01...,2019-01-15 00:00:00,Top Ways Walmart is Changing How Customers Shop,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n© 2021 Wal...,"<!DOCTYPE html>\n<html class=""PressReleasePage...",Top Ways Walmart is Changing How Customers Sho...,2019.0
433,https://corporate.walmart.com/newsroom/2019/01...,2019-01-14 00:00:00,High-Tech Consolidation Center Set to Open in ...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n© 2021 Wal...,"<!DOCTYPE html>\n<html class=""PressReleasePage...",High-Tech Consolidation Center Set to Open in ...,2019.0
434,https://corporate.walmart.com/newsroom/2019/01...,2019-01-08 00:00:00,Custom Autonomous Cargo Vans to Deliver Grocer...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n© 2021 Wal...,"<!DOCTYPE html>\n<html class=""PressReleasePage...",Custom Autonomous Cargo Vans to Deliver Grocer...,2019.0


In [13]:
# assign the label

df['label'] = df.year.map(lambda x: 'after' if x >= 2020 else 'before')

In [14]:
# save it back

df.to_csv(f'./data/press_releases/{file[0]}', index = False)

### Amazon

In [15]:
file = [f for f in files if 'amazon' in f]
df = pd.read_csv(f'./data/press_releases/{file[0]}')

for row in range(len(df)):
    
    if 'BUSINESS WIRE' in df.loc[row,'body']:
        para_list = [i.strip() for i in df.loc[row,'body'].split('\n') if 'BUSINESS WIRE' in i]
        df.loc[row,'time'] = para_list[0].split('--')[-2]
        df.loc[row,'time'] = df.loc[row,'time'].replace('(BUSINESS WIRE)—','').replace('(BUSINESS WIRE)','')
        if len(df.loc[row,'time']) == 0:
            df.loc[row,'time'] = None
    elif 'PRNewswire' in df.loc[row,'body']:
        para_list = [i.strip() for i in df.loc[row,'body'].split('\n') if 'PRNewswire' in i]
        df.loc[row,'time'] = ','.join(para_list[0].split('--')[0].split('/')[0].split(',')[-2:]).strip()
    
    else:
        df.loc[row,'time'] = None

df['full_text'] = df.title + df.body

df = df.dropna().reset_index(drop = True)

for row in range(len(df)):
    try:
        df.loc[row,'time'] = dt.datetime.strptime(df.loc[row,'time'],'%b. %d, %Y')
        df.loc[row,'year'] = df.loc[row,'time'].year
    except:
        df.loc[row,'time'] = dt.datetime.strptime(df.loc[row,'time'],'%B %d, %Y')
        df.loc[row,'year'] = df.loc[row,'time'].year

df

Unnamed: 0,full_link,time,title,body,html,full_text,year
0,https://press.aboutamazon.com/news-releases/ne...,2019-12-26 00:00:00,Amazon Continues Investment in Florida with De...,\n\n\n\n\n\n\n\n Skip to main navigatio...,"<!DOCTYPE html>\n<html dir=""ltr"" lang=""en"" pre...",Amazon Continues Investment in Florida with De...,2019.0
1,https://press.aboutamazon.com/news-releases/ne...,2019-12-26 00:00:00,Customers Shopped at Record Levels this Holida...,\n\n\n\n\n\n\n\n Skip to main navigatio...,"<!DOCTYPE html>\n<html dir=""ltr"" lang=""en"" pre...",Customers Shopped at Record Levels this Holida...,2019.0
2,https://press.aboutamazon.com/news-releases/ne...,2019-12-19 00:00:00,Amazon Has Enabled Hundreds of Small Businesse...,\n\n\n\n\n\n\n\n Skip to main navigatio...,"<!DOCTYPE html>\n<html dir=""ltr"" lang=""en"" pre...",Amazon Has Enabled Hundreds of Small Businesse...,2019.0
3,https://press.aboutamazon.com/news-releases/ne...,2019-12-18 00:00:00,Amazon Makes Returns Even Easier This Holiday ...,\n\n\n\n\n\n\n\n Skip to main navigatio...,"<!DOCTYPE html>\n<html dir=""ltr"" lang=""en"" pre...",Amazon Makes Returns Even Easier This Holiday ...,2019.0
4,https://press.aboutamazon.com/news-releases/ne...,2019-12-17 00:00:00,Amazon Helps Keep the Holidays ‘Spoiler Free’ ...,\n\n\n\n\n\n\n\n Skip to main navigatio...,"<!DOCTYPE html>\n<html dir=""ltr"" lang=""en"" pre...",Amazon Helps Keep the Holidays ‘Spoiler Free’ ...,2019.0
...,...,...,...,...,...,...,...
427,https://press.aboutamazon.com/news-releases/ne...,2021-01-19 00:00:00,Amazon.com to Webcast Fourth Quarter 2020 Fina...,\n\n\n\n\n\n\n\n Skip to main navigatio...,"<!DOCTYPE html>\n<html dir=""ltr"" lang=""en"" pre...",Amazon.com to Webcast Fourth Quarter 2020 Fina...,2021.0
428,https://press.aboutamazon.com/news-releases/ne...,2021-01-19 00:00:00,Amazon Teams Up with Pharrell Williams’ YELLOW...,\n\n\n\n\n\n\n\n Skip to main navigatio...,"<!DOCTYPE html>\n<html dir=""ltr"" lang=""en"" pre...",Amazon Teams Up with Pharrell Williams’ YELLOW...,2021.0
429,https://press.aboutamazon.com/news-releases/ne...,2021-01-11 00:00:00,"Amazon Expands Investment in Metro Detroit, Cr...",\n\n\n\n\n\n\n\n Skip to main navigatio...,"<!DOCTYPE html>\n<html dir=""ltr"" lang=""en"" pre...","Amazon Expands Investment in Metro Detroit, Cr...",2021.0
430,https://press.aboutamazon.com/news-releases/ne...,2021-01-06 00:00:00,Amazon Launches $2 Billion Housing Equity Fund...,\n\n\n\n\n\n\n\n Skip to main navigatio...,"<!DOCTYPE html>\n<html dir=""ltr"" lang=""en"" pre...",Amazon Launches $2 Billion Housing Equity Fund...,2021.0


In [16]:
# assign the label

df['label'] = df.year.map(lambda x: 'after' if x >= 2020 else 'before')

In [17]:
# save it back

df.to_csv(f'./data/press_releases/{file[0]}', index = False)

## Removing potential leak words

In [18]:
leak_words = [
    '2021', '2020', '2019', 'Covid-19', 'Covid', 'COVID-19', 'COVID',
    'Coronavirus', 'coronavirus', 'pandemic'
]

for file in tqdm(files):
    df = pd.read_csv(f'./data/press_releases/{file}')
    
    for word in leak_words:
        df.full_text = df.full_text.replace(word,'')
    
    df.to_csv(f'./data/press_releases/{file}', index = False)
    

apple_press_releases.csv
cvs_health_press_releases.csv
exxon_mobil_press_releases.csv
walmart_press_releases.csv
amazon_press_releases.csv
