## Imports

In [58]:
import pandas as pd
import numpy as np

from tqdm import tqdm

import datetime as dt

import os

In [2]:
# Collect the press-release CSVs. Matching on the '.csv' suffix (rather than
# the substring 'csv' anywhere in the name) keeps out files such as backups
# or notes whose names merely contain "csv".
files = [f for f in os.listdir('../data/press_releases/') if f.endswith('.csv')]
files

['apple_press_releases.csv',
 'cvs_health_press_releases.csv',
 'exxon_mobil_press_releases.csv',
 'walmart_press_releases.csv',
 'amazon_press_releases.csv']

## Finding the dates for each file and assigning the label

### Apple

In [19]:
# find the dates
file = [f for f in files if 'apple' in f]
df = pd.read_csv(f'../data/press_releases/{file[0]}')

# The date is taken from the URL: after splitting on '/' and discarding empty
# pieces, segments 3 and 4 are joined as "<year>-<month>", which is the format
# parsed below. One vectorised map replaces the row-by-row .loc assignment.
df['date'] = df['full_link'].map(
    lambda link: '-'.join([seg.strip() for seg in link.split('/') if seg.strip() != ''][3:5])
)

# title and body are concatenated directly, with no separator in between
df['full_text'] = df['title'] + df['body']

# '%Y-%m' carries no day, so every parsed date falls on the 1st of its month
df['date'] = df['date'].map(lambda x: dt.datetime.strptime(x, '%Y-%m'))
df['year'] = df['date'].map(lambda x: x.year)

df.head()

Unnamed: 0,full_link,title,body,html,date,full_text,year,label
0,https://www.apple.com/newsroom/2021/03/apple-e...,Apple earns historic Academy Award nominations...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nGlobal Nav...,"<!DOCTYPE html>\n<html class=""no-js"" dir=""ltr""...",2021-03-01,Apple earns historic Academy Award nominations...,2021,after
1,https://www.apple.com/newsroom/2021/03/apple-w...,Apple Women’s Health Study releases preliminar...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nGlobal Nav...,"<!DOCTYPE html>\n<html class=""no-js"" dir=""ltr""...",2021-03-01,Apple Women’s Health Study releases preliminar...,2021,after
2,https://www.apple.com/newsroom/2021/03/apple-t...,Apple TV+ announces programming partnership wi...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nGlobal Nav...,"<!DOCTYPE html>\n<html class=""no-js"" dir=""ltr""...",2021-03-01,Apple TV+ announces programming partnership wi...,2021,after
3,https://www.apple.com/newsroom/2021/03/apple-h...,Apple Hearing Study shares new insights on hea...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nGlobal Nav...,"<!DOCTYPE html>\n<html class=""no-js"" dir=""ltr""...",2021-03-01,Apple Hearing Study shares new insights on hea...,2021,after
4,https://www.apple.com/newsroom/2021/03/new-zea...,New Zealand students prototype their own Samoa...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nGlobal Nav...,"<!DOCTYPE html>\n<html class=""no-js"" dir=""ltr""...",2021-03-01,New Zealand students prototype their own Samoa...,2021,after


In [5]:
# label every release: 'after' when its year is 2020 or later, otherwise 'before'
df['label'] = np.where(df['year'] >= 2020, 'after', 'before')

In [6]:
# write the frame to the CSV named by file[0]; the row index is not written
df.to_csv(path_or_buf=f'../data/press_releases/{file[0]}', index=False)

### CVS Health

In [20]:
file = [f for f in files if 'cvs' in f]
df = pd.read_csv(f'../data/press_releases/{file[0]}')

# years (as strings) that a dateline is expected to mention
years = [str(i) for i in range(2019,2022)]


def _cvs_release_date(body, years, positions=(324, 334), max_len=30):
    """Return the dateline text found in one CVS page body, or None.

    The body is split on newlines into stripped, non-empty lines. For each
    entry of `years` (in order) the lines at `positions` are checked (in
    order); the first line that contains the year and is shorter than
    `max_len` characters is taken as the dateline, and only its last two
    comma-separated pieces are kept (e.g. "February 23, 2021").

    A position beyond the end of the page counts as "no match" instead of
    raising IndexError.
    """
    # NOTE(review): 324 / 334 are presumably where the dateline lands in the
    # scraped CVS page layout -- confirm against a few raw bodies.
    lines = [p.strip() for p in body.split('\n') if p.strip() != '']
    for year in years:
        for pos in positions:
            if pos >= len(lines):
                continue
            candidate = lines[pos]
            if year in candidate and len(candidate) < max_len:
                return ','.join(candidate.split(',')[-2:]).strip()
    return None


df['date'] = df['body'].map(lambda body: _cvs_release_date(body, years))

# dropna() without arguments removes every row that has a missing value in
# any column, which includes the rows where no dateline was found above
df = df.dropna().reset_index(drop = True)

# title and body are concatenated directly, with no separator in between
df['full_text'] = df.title + df.body

df['date'] = df['date'].map(lambda x: dt.datetime.strptime(x,'%B %d, %Y'))
df['year'] = df['date'].map(lambda x: x.year)

df.head()

Unnamed: 0,full_link,title,body,html,date,full_text,year
0,https://www.cvshealth.com/news-and-insights/pr...,CVS Health invests $114 million in affordable ...,\n\n\n\n\n\n\n\n Skip to main content\n...,"<!DOCTYPE html>\n<html dir=""ltr"" lang=""en"" pre...",2021-02-23,CVS Health invests $114 million in affordable ...,2021
1,https://www.cvshealth.com/news-and-insights/pr...,CVS Health now offering COVID-19 vaccines in 2...,\n\n\n\n\n\n\n\n Skip to main content\n...,"<!DOCTYPE html>\n<html dir=""ltr"" lang=""en"" pre...",2021-03-11,CVS Health now offering COVID-19 vaccines in 2...,2021
2,https://www.cvshealth.com/news-and-insights/pr...,CVS Health completes first round of COVID-19 v...,\n\n\n\n\n\n\n\n Skip to main content\n...,"<!DOCTYPE html>\n<html dir=""ltr"" lang=""en"" pre...",2021-01-25,CVS Health completes first round of COVID-19 v...,2021
3,https://www.cvshealth.com/news-and-insights/pr...,CVS Health launches Symphony™ to support senio...,\n\n\n\n\n\n\n\n Skip to main content\n...,"<!DOCTYPE html>\n<html dir=""ltr"" lang=""en"" pre...",2021-01-14,CVS Health launches Symphony™ to support senio...,2021
4,https://www.cvshealth.com/news-and-insights/pr...,"CVS Health names Kyu Rhee, M.D., MPP, as Senio...",\n\n\n\n\n\n\n\n Skip to main content\n...,"<!DOCTYPE html>\n<html dir=""ltr"" lang=""en"" pre...",2021-01-21,"CVS Health names Kyu Rhee, M.D., MPP, as Senio...",2021


In [21]:
# label every release: 'after' when its year is 2020 or later, otherwise 'before'
df['label'] = np.where(df['year'] >= 2020, 'after', 'before')

In [22]:
# write the frame to the CSV named by file[0]; the row index is not written
df.to_csv(path_or_buf=f'../data/press_releases/{file[0]}', index=False)

### Exxon Mobil

In [41]:
file = [f for f in files if 'exxon' in f]
df = pd.read_csv(f'../data/press_releases/{file[0]}')

# Defined in this cell so it runs on a fresh kernel without relying on a
# `years` variable left behind by another section.
years = [str(i) for i in range(2019,2022)]

# abbreviated month spellings -> the full names that '%B' can parse
months = {
    'Jan.':'January',
    'Feb.':'February',
    'Mar.':'March',
    'Apr.':'April',
    'Jun.':'June',
    'Jul.':'July',
    'Aug.':'August',
    'Sept.':'September',
    'Oct.':'October',
    'Nov.': 'November',
    'Dec.':'December'
}


def _exxon_release_date(body, years, positions=(78, 75), max_len=20):
    """Return the dateline text found in one Exxon page body, or None.

    The body is split on newlines into stripped, non-empty lines. For each
    entry of `years` (in order) the lines at `positions` are checked (in
    order); the first line that contains the year and is shorter than
    `max_len` characters is taken as the dateline, and only its last two
    comma-separated pieces are kept.

    A position beyond the end of the page counts as "no match" instead of
    raising IndexError.
    """
    # NOTE(review): 78 / 75 are presumably where the dateline lands in the
    # scraped Exxon page layout -- confirm against a few raw bodies.
    lines = [p.strip() for p in body.split('\n') if p.strip() != '']
    for year in years:
        for pos in positions:
            if pos >= len(lines):
                continue
            candidate = lines[pos]
            if year in candidate and len(candidate) < max_len:
                return ','.join(candidate.split(',')[-2:]).strip()
    return None


def _spell_out_month(raw):
    """Remove the trailing bullet marker and expand abbreviated month names."""
    cleaned = raw.replace('   •','')
    for old, new in months.items():
        cleaned = cleaned.replace(old, new)
    return cleaned


df['date'] = df['body'].map(lambda body: _exxon_release_date(body, years))

# Rows without a recognisable dateline hold None; drop them here so the
# string clean-up and strptime below are never handed a missing value.
df = df.dropna(subset=['date']).reset_index(drop = True)

# title and body are concatenated directly, with no separator in between
df['full_text'] = df.title + df.body

df['date'] = df['date'].map(_spell_out_month)
df['date'] = df['date'].map(lambda x: dt.datetime.strptime(x,'%B %d, %Y'))
df['year'] = df['date'].map(lambda x: x.year)

df.head()

Unnamed: 0,full_link,title,body,html,date,full_text,year
0,https://corporate.exxonmobil.com/News/Newsroom...,Darren Woods shares strategy for long term gro...,\n\nSkip to content\n\n\n\n\n\n\n\n\n\n\n\n\n\...,"<!DOCTYPE html>\n<!--[if IE 9]><html lang=""en""...",2021-03-11,Darren Woods shares strategy for long term gro...,2021
1,https://corporate.exxonmobil.com/News/Newsroom...,ExxonMobil outlines plans to grow long-term sh...,\n\nSkip to content\n\n\n\nImportant Additiona...,"<!DOCTYPE html>\n<!--[if IE 9]><html lang=""en""...",2021-03-03,ExxonMobil outlines plans to grow long-term sh...,2021
2,https://corporate.exxonmobil.com/News/Newsroom...,ExxonMobil announces Singapore workforce reduc...,\n\nSkip to content\n\n\n\n\n\n\n\n\n\n\n\n\n\...,"<!DOCTYPE html>\n<!--[if IE 9]><html lang=""en""...",2021-03-02,ExxonMobil announces Singapore workforce reduc...,2021
3,https://corporate.exxonmobil.com/News/Newsroom...,Neil Duffin to retire as president of ExxonMob...,\n\nSkip to content\n\n\n\nImportant Additiona...,"<!DOCTYPE html>\n<!--[if IE 9]><html lang=""en""...",2021-03-01,Neil Duffin to retire as president of ExxonMob...,2021
4,https://corporate.exxonmobil.com/News/Newsroom...,Michael Angelakis and Jeffrey Ubben join Exxon...,\n\nSkip to content\n\n\n\nImportant Additiona...,"<!DOCTYPE html>\n<!--[if IE 9]><html lang=""en""...",2021-03-01,Michael Angelakis and Jeffrey Ubben join Exxon...,2021


In [42]:
# label every release: 'after' when its year is 2020 or later, otherwise 'before'
df['label'] = np.where(df['year'] >= 2020, 'after', 'before')

In [43]:
# write the frame to the CSV named by file[0]; the row index is not written
df.to_csv(path_or_buf=f'../data/press_releases/{file[0]}', index=False)

### Walmart

In [46]:
file = [f for f in files if 'walmart' in f]
df = pd.read_csv(f'../data/press_releases/{file[0]}')

# The date is taken from the URL: after splitting on '/' and discarding empty
# pieces, segments 3-5 are joined as "<year>-<month>-<day>", which is the
# format parsed below. One vectorised map replaces the row-by-row .loc
# assignment.
df['date'] = df['full_link'].map(
    lambda link: '-'.join([seg.strip() for seg in link.split('/') if seg.strip() != ''][3:6])
)

# title and body are concatenated directly, with no separator in between
df['full_text'] = df.title + df.body

df['date'] = df['date'].map(lambda x: dt.datetime.strptime(x,'%Y-%m-%d'))
df['year'] = df['date'].map(lambda x: x.year)

df.head()

Unnamed: 0,full_link,title,body,html,date,full_text,year
0,https://corporate.walmart.com/newsroom/2021/03...,Walmart Investment to Accelerate Growth of Rak...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n© 2021 Wal...,"<!DOCTYPE html>\n<html class=""PressReleasePage...",2021-03-12,Walmart Investment to Accelerate Growth of Rak...,2021
1,https://corporate.walmart.com/newsroom/2021/03...,"Walmart Doubles Down on TikTok Shopping, Hosts...",\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n© 2021 Wal...,"<!DOCTYPE html>\n<html class=""PressReleasePage...",2021-03-09,"Walmart Doubles Down on TikTok Shopping, Hosts...",2021
2,https://corporate.walmart.com/newsroom/2021/03...,Walmart Board of Directors Adds Former AT&T Ch...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n© 2021 Wal...,"<!DOCTYPE html>\n<html class=""PressReleasePage...",2021-03-05,Walmart Board of Directors Adds Former AT&T Ch...,2021
3,https://corporate.walmart.com/newsroom/2021/03...,Walmart Maintains Relentless Focus on Growing ...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n© 2021 Wal...,"<!DOCTYPE html>\n<html class=""PressReleasePage...",2021-03-04,Walmart Maintains Relentless Focus on Growing ...,2021
4,https://corporate.walmart.com/newsroom/2021/03...,Walmart to Present at Bank of America 2021 Con...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n© 2021 Wal...,"<!DOCTYPE html>\n<html class=""PressReleasePage...",2021-03-04,Walmart to Present at Bank of America 2021 Con...,2021


In [47]:
# label every release: 'after' when its year is 2020 or later, otherwise 'before'
df['label'] = np.where(df['year'] >= 2020, 'after', 'before')

In [48]:
# write the frame to the CSV named by file[0]; the row index is not written
df.to_csv(path_or_buf=f'../data/press_releases/{file[0]}', index=False)

### Amazon

In [60]:
# pick out the Amazon file and load it
file = [name for name in files if 'amazon' in name]
df = pd.read_csv(f'../data/press_releases/{file[0]}')

# title and body are concatenated directly, with no separator in between
df = df.assign(full_text=df['title'] + df['body'])

df.head()

Unnamed: 0,full_link,title,body,html,year,label,full_text
0,https://press.aboutamazon.com/news-releases/ne...,Amazon Continues Investment in Florida with De...,\n\n\n\n\n\n\n\n Skip to main navigatio...,"<!DOCTYPE html>\n<html dir=""ltr"" lang=""en"" pre...",2019,before,Amazon Continues Investment in Florida with De...
1,https://press.aboutamazon.com/news-releases/ne...,Customers Shopped at Record Levels this Holida...,\n\n\n\n\n\n\n\n Skip to main navigatio...,"<!DOCTYPE html>\n<html dir=""ltr"" lang=""en"" pre...",2019,before,Customers Shopped at Record Levels this Holida...
2,https://press.aboutamazon.com/news-releases/ne...,Amazon Has Enabled Hundreds of Small Businesse...,\n\n\n\n\n\n\n\n Skip to main navigatio...,"<!DOCTYPE html>\n<html dir=""ltr"" lang=""en"" pre...",2019,before,Amazon Has Enabled Hundreds of Small Businesse...
3,https://press.aboutamazon.com/news-releases/ne...,Amazon Makes Returns Even Easier This Holiday ...,\n\n\n\n\n\n\n\n Skip to main navigatio...,"<!DOCTYPE html>\n<html dir=""ltr"" lang=""en"" pre...",2019,before,Amazon Makes Returns Even Easier This Holiday ...
4,https://press.aboutamazon.com/news-releases/ne...,Amazon Helps Keep the Holidays ‘Spoiler Free’ ...,\n\n\n\n\n\n\n\n Skip to main navigatio...,"<!DOCTYPE html>\n<html dir=""ltr"" lang=""en"" pre...",2019,before,Amazon Helps Keep the Holidays ‘Spoiler Free’ ...


In [52]:
# label every release: 'after' when its year is 2020 or later, otherwise 'before'
df['label'] = np.where(df['year'] >= 2020, 'after', 'before')

In [61]:
# write the frame to the CSV named by file[0]; the row index is not written
df.to_csv(path_or_buf=f'../data/press_releases/{file[0]}', index=False)

## Removing potential leak words

In [62]:
# Substrings stripped from full_text before it is saved. Order matters where
# one entry contains another ('Covid-19' before 'Covid', 'CVS Health' before
# 'CVS'), because the entries are removed one after the other.
leak_words = [
    '2021', '2020', '2019', 'Covid-19', 'Covid', 'COVID-19', 'COVID',
    'Coronavirus', 'coronavirus', 'pandemic', 'Amazon', 'Apple', 'Walmart', 
    'CVS Health', 'CVS', 'Exxon Mobil', 'Exxon', 'menu','newsroom',
    'amazon.com', 'blog', 'press', '\n', '\r', '\t'
]

for file in tqdm(files):
    df = pd.read_csv(f'../data/press_releases/{file}')

    for word in leak_words:
        # Newlines, carriage returns and tabs separate words, so they become a
        # single space; deleting them outright would fuse the last word of one
        # line with the first word of the next. Every other entry is deleted.
        replacement = ' ' if word.isspace() else ''
        # regex=False: entries such as 'amazon.com' are literal text, not
        # patterns. The .str accessor also leaves missing values untouched
        # instead of raising on them.
        df['full_text'] = df['full_text'].str.replace(word, replacement, regex=False)

    # each file is overwritten in place with the cleaned full_text column
    df.to_csv(f'../data/press_releases/{file}', index = False)

100%|██████████| 5/5 [00:06<00:00,  1.23s/it]
