# Gather Press Release Text



## Imports

In [1]:
import pandas as pd

import time
from tqdm import tqdm

import os

import requests
from bs4 import BeautifulSoup

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Collect the link CSVs to scrape. Match on the '.csv' extension rather than
# on the substring 'csv', so stray entries whose names merely contain 'csv'
# (backups such as 'x.csv.bak', notes such as 'csv_readme.txt') are skipped.
files = [f for f in os.listdir('../data/links/') if f.endswith('.csv')]
files

['apple_links.csv',
 'walmart_links.csv',
 'cvs_health_links.csv',
 'amazon_links.csv',
 'exxon_mobil_links.csv']

The next cell gives a rough estimate of how long the scraping loop further below will take to run, assuming roughly 3.9 seconds per link.

In [3]:
# Rough run-time estimate for the scraping loop: count the rows (one per
# link) in every links file and assume a fixed cost per link.
SECONDS_PER_LINK = 3.9


def count_links(file_name):
    """Return the number of rows in one links CSV under ../data/links/."""
    link_table = pd.read_csv(f'../data/links/{file_name}')
    return len(link_table)


lengths = [count_links(name) for name in files]
total_links = sum(lengths)

# total seconds converted to minutes
run_time = (total_links * SECONDS_PER_LINK) / 60

print('Files:', files)
print('Lengths:', lengths)
print('Time: %0.2f minutes' % run_time)

Files: ['apple_links.csv', 'walmart_links.csv', 'cvs_health_links.csv', 'amazon_links.csv', 'exxon_mobil_links.csv']
Lengths: [263, 437, 767, 449, 143]
Time: 133.83 minutes


For each file, iterate through the rows and build the URL from either the link by itself or the base + the link, then fetch the page with the `requests` library to gather the press release text. The results for each file are then saved as a CSV in the `press_releases` folder.

In [6]:
# Pause between requests (politeness delay) and the longest a single request
# may block. Without a timeout, one stalled server would hang the whole run.
REQUEST_DELAY_SECONDS = 3
REQUEST_TIMEOUT_SECONDS = 30

for file in files:
    # create new file name that can be used later
    new_file_name = file.replace('links.csv', 'press_releases.csv')

    # read in the file as a data frame
    df = pd.read_csv(f'../data/links/{file}')

    # create list that dictionaries (created in for loop) can be appended to
    press_releases = []

    try:
        # iterate through each row in the data frame
        for i in tqdm(range(len(df))):
            time.sleep(REQUEST_DELAY_SECONDS)

            # Reset per row so the error message below never reports a stale
            # url/response from the previous row, and never hits a NameError
            # when the very first row fails before they are assigned.
            url = None
            req = None

            try:
                # for those files that have a base string, prepend it to the
                # link; otherwise (NaN base or no 'base' column) the link
                # column is used as the full url
                base = df.loc[i, 'base'] if 'base' in df.columns else None
                link = df.loc[i, 'link']
                url = base + link if isinstance(base, str) else link

                req = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
                # treat 4xx/5xx responses as failures so error pages are not
                # stored as if they were press releases
                req.raise_for_status()

                soup = BeautifulSoup(req.content, 'lxml')

                # one record per successfully fetched page; the html is
                # stored as text rather than as a live parse tree, which is
                # what ends up in the CSV anyway and keeps memory bounded
                press_release = {
                    'full_link': url,
                    'title': soup.title.text,
                    'body': soup.body.text,
                    'html': str(soup),
                }

                if 'amazon' in file:
                    press_release['year'] = df.loc[i, 'year']

                press_releases.append(press_release)

            except Exception:
                # best effort: log the failing row and move on. Catching
                # Exception (not a bare except) lets KeyboardInterrupt and
                # SystemExit stop the run.
                print(f'Error: {file} | {url} | {i} | {req}')

    finally:
        # write whatever was gathered for this file, even if the run is
        # interrupted part-way through
        pr_df = pd.DataFrame(press_releases)
        pr_df.to_csv(f'../data/press_releases/{new_file_name}', index=False)

100%|██████████| 263/263 [14:46<00:00,  3.37s/it]
100%|██████████| 437/437 [26:06<00:00,  3.58s/it]
100%|██████████| 767/767 [1:01:20<00:00,  4.80s/it]
100%|██████████| 449/449 [26:18<00:00,  3.51s/it]
100%|██████████| 143/143 [08:09<00:00,  3.43s/it]
