## Imports

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt



from tqdm import tqdm
import time
import os

import requests
from bs4 import BeautifulSoup

import warnings
warnings.filterwarnings('ignore')


In [2]:
files = [f for f in os.listdir('./data/links/') if 'csv' in f]
files

['apple_links.csv',
 'walmart_links.csv',
 'cvs_health_links.csv',
 'amazon_links.csv',
 'exxon_mobil_links.csv']

The below provides a rough estimate of how long the code in the cell that follows will take to run.

In [3]:
lengths = [len(pd.read_csv(f'./data/links/{file}')) for file in files]
time = (sum(lengths)*3.1)/60
print('Files:',files)
print('Lengths:',lengths)
print('Time:',time,'minutes')

Files: ['apple_links.csv', 'walmart_links.csv', 'cvs_health_links.csv', 'amazon_links.csv', 'exxon_mobil_links.csv']
Lengths: [262, 436, 750, 449, 143]
Time: 105.4 minutes


For each file, iterate through the rows and use either the link by itself or the base + the link with the `requests` library to gather press release text. These files are then saved in the press_releases folder.

In [5]:
for file in files:
    # create new file name that can be used later
    new_file_name = file.replace('links.csv', 'press_releases.csv')

    # print out where you are in the process
    print(f'Current file: {file} | Creating file: {new_file_name}')

    # read in the file as a data frame
    df = pd.read_csv(f'./data/links/{file}')

    # create list that dictionaries (created in for loop) can be appended to
    press_releases = []

    try:
        # iterate through each row in the data frame
        for i in tqdm(range(len(df))):
            try:

                # create dictionary for each row and the results it returns
                press_release = {}

                # for those files that have a base string, get the url, otherwise
                # just use the link column
                if type(df.loc[i, 'base']) == str:
                    url = df.loc[i, 'base'] + df.loc[i, 'link']
                else:
                    url = df.loc[i, 'link']

                req = requests.get(url)

                soup = BeautifulSoup(req.content, 'lxml')

                press_release['full_link'] = url
                press_release['time'] = soup.time
                press_release['title'] = soup.title.text
                press_release['body'] = soup.body.text
                press_release['html'] = soup
                press_releases.append(press_release)
                time.sleep(3)

            except:
                print(f'Error: {file} | {url} | {i} | {req}')

    except:
        print(f'Error: {file} | {url}')

    pr_df = pd.DataFrame(press_releases)
    pr_df.to_csv(f'./data/press_releases/{new_file_name}', index=False)

Current file: apple_links.csv | Creating file: apple_press_releases.csv
0
5
10
15
20
25
30
35
40
45
50
55
60
65
70
75
80
85
90
95
100
105
110
115
120
125
130
135
140
145
150
155
160
165
170
175
180
185
190
195
200
205
210
215
220
225
230
235
240
245
250
255
260
Done - apple_press_releases.csv
Current file: walmart_links.csv | Creating file: walmart_press_releases.csv
0
5
10
15
20
25
30
35
40
45
50
55
60
65
70
75
80
85
90
95
100
105
110
115
120
125
130
135
140
145
150
155
160
165
170
175
180
185
190
195
200
205
210
215
220
225
230
235
240
245
250
255
260
265
270
275
280
285
290
295
300
305
310
315
320
325
330
335
340
345
350
355
360
365
370
375
380
385
390
395
400
405
410
415
420
425
430
435
Done - walmart_press_releases.csv
Current file: cvs_health_links.csv | Creating file: cvs_health_press_releases.csv
0
5
10
15
20
25
30
35
40
45
50
55
60
65
70
75
80
85
90
95
100
105
110
115
120
125
130
135
140
145
150
155
160
165
170
175
180
185
190
195
200
205
210
215
220
225
230
235
240
245
250
25