In [1]:
import requests
from bs4 import BeautifulSoup
import re
import sklearn
import time
import numpy as np
import json
import pandas as pd
pd.set_option("max_colwidth", 500)

# import sys   
# sys.setrecursionlimit(25000)

from pandarallel import pandarallel

In [2]:
# Build the table of paginated section-listing URLs to crawl: one row per
# listing page of the politics, nation and world sections.  The upper page
# numbers are hard-coded (presumably the page counts at scrape time — verify
# before re-running).
df1 = pd.DataFrame(
    {'page': ['https://www.pbs.org/newshour/politics/page/' + str(n) for n in range(1, 3704)]}
)

df2 = pd.DataFrame(
    {'page': ['https://www.pbs.org/newshour/nation/page/' + str(n) for n in range(1, 3253)]}
)

df3 = pd.DataFrame(
    {'page': ['https://www.pbs.org/newshour/world/page/' + str(n) for n in range(1, 2326)]}
)

# Stack the three sections into a single frame with a fresh 0..N-1 index.
df = pd.concat([df1, df2, df3]).reset_index(drop=True)
df

Unnamed: 0,page
0,https://www.pbs.org/newshour/politics/page/1
1,https://www.pbs.org/newshour/politics/page/2
2,https://www.pbs.org/newshour/politics/page/3
3,https://www.pbs.org/newshour/politics/page/4
4,https://www.pbs.org/newshour/politics/page/5
...,...
9275,https://www.pbs.org/newshour/world/page/2321
9276,https://www.pbs.org/newshour/world/page/2322
9277,https://www.pbs.org/newshour/world/page/2323
9278,https://www.pbs.org/newshour/world/page/2324


In [3]:
def extract_row(url, timeout=30):
    """Fetch one section-listing page and collect the article links on it.

    Parameters
    ----------
    url : str
        Address of the listing page to download.
    timeout : float, optional
        Seconds ``requests`` waits for the server before giving up, so a
        stalled connection cannot block a worker forever.

    Returns
    -------
    list of dict or None
        One ``{'title': ..., 'href': ...}`` dict per ``div.card-horiz__intro``
        card that contains a usable link (site-relative hrefs are made
        absolute).  ``None`` when the page could not be fetched or parsed; a
        message is printed in that case.
    """
    try:
        time.sleep(1)  # fixed pause before every request
        # The parser name belongs to BeautifulSoup, not requests.get (whose
        # second positional argument is `params`, i.e. the query string).
        res = requests.get(url, timeout=timeout)
        res.raise_for_status()  # treat HTTP error pages as failures
        soup = BeautifulSoup(res.content, 'html.parser')
    except Exception:
        # Best-effort scrape: report and skip this page.  `Exception` (not a
        # bare except) lets KeyboardInterrupt still stop the run.
        print('cant connect', url)
        return None

    outputs = []

    for row in soup.find_all('div', {'class': 'card-horiz__intro'}):
        link = row.a
        try:
            title = link.text
            href = link['href']
        except (AttributeError, KeyError, TypeError):
            # Card without an <a> or without an href: report the page and skip
            # the card instead of emitting undefined or stale values.
            print(url)
            continue

        if href.startswith('/'):
            href = 'https://www.pbs.org' + href

        outputs.append({'title': title, 'href': href})

    return outputs

In [4]:
# Start the pandarallel worker pool used by `parallel_apply` below.
pandarallel.initialize(nb_workers=16)
batch_size = 16*8  # listing pages handed to parallel_apply per batch

# Scrape the listing pages batch by batch, storing each page's result in the
# 'rows' column.  range() stops before len(df), so no empty trailing batch is
# submitted when the row count is an exact multiple of batch_size.
for i in range(0, df.shape[0], batch_size):
    print(i, end=',')  # progress: first row label of the current batch
    # .loc slices are label-based and inclusive on both ends, hence the -1;
    # this relies on df having a 0..N-1 index.
    batch = slice(i, i + batch_size - 1)
    df.loc[batch, 'rows'] = df.loc[batch, 'page'].parallel_apply(extract_row).values

INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
0,128,256,384,512,640,768,896,1024,1152,1280,1408,1536,1664,1792,1920,2048,2176,2304,2432,2560,2688,2816,2944,3072,3200,3328,3456,3584,3712,3840,3968,4096,4224,4352,4480,4608,4736,4864,4992,5120,5248,5376,5504,5632,5760,5888,6016,6144,6272,6400,6528,6656,6784,6912,7040,7168,7296,7424,7552,7680,7808,7936,8064,8192,8320,8448,8576,8704,8832,8960,9088,9216,

In [7]:
# Drop pages with no scrape result (null 'rows'), then expand each page's list
# of article dicts so that every article gets its own row.
df = df.loc[df['rows'].notnull()]
df = df.explode('rows').reset_index(drop=True)
df

Unnamed: 0,page,rows
0,https://www.pbs.org/newshour/politics/page/1,"{'title': 'Biden assails Georgia voting law as an ‘atrocity’', 'href': 'https://www.pbs.org/newshour/politics/biden-assails-georgia-voting-law-as-an-atrocity'}"
1,https://www.pbs.org/newshour/politics/page/1,"{'title': 'Arkansas governor signs bill allowing medical workers to refuse treatment to LGBTQ people', 'href': 'https://www.pbs.org/newshour/politics/arkansas-governor-signs-bill-allowing-medical-workers-to-refuse-treatment-to-lgbtq-people'}"
2,https://www.pbs.org/newshour/politics/page/1,"{'title': 'WATCH: Harris discusses child poverty and education with Secretary Miguel Cardona', 'href': 'https://www.pbs.org/newshour/education/watch-live-harris-discusses-child-poverty-and-education-with-secretary-miguel-cardona'}"
3,https://www.pbs.org/newshour/politics/page/1,"{'title': 'Biden taps Sen. Manchin’s wife to co-chair Appalachian board', 'href': 'https://www.pbs.org/newshour/politics/biden-taps-sen-manchins-wife-to-co-chair-appalachian-board'}"
4,https://www.pbs.org/newshour/politics/page/1,"{'title': 'Turkey detains students and supporters over LGBTQ flags', 'href': 'https://www.pbs.org/newshour/world/turkey-detains-students-and-supporters-over-lgbtq-flags'}"
...,...,...
92776,https://www.pbs.org/newshour/world/page/2325,"{'title': 'Storm Damage in Tennessee', 'href': 'https://www.pbs.org/newshour/world/weather-jan-june99-storms_05-07'}"
92777,https://www.pbs.org/newshour/world/page/2325,"{'title': 'Survivors’ Stories After the Tornadoes', 'href': 'https://www.pbs.org/newshour/show/1999-picking-up-the-pieces-in-oklahoma'}"
92778,https://www.pbs.org/newshour/world/page/2325,"{'title': 'Picking Up the Pieces After the Tornadoes in the Midwest', 'href': 'https://www.pbs.org/newshour/world/weather-jan-june99-tornadoes_05-05'}"
92779,https://www.pbs.org/newshour/world/page/2325,"{'title': 'Killer Tornadoes in Oklahoma and Kansas', 'href': 'https://www.pbs.org/newshour/world/weather-jan-june99-tornadoes_05-04'}"


In [11]:
# Flatten the per-article dicts held in 'rows' into a frame with one column
# per dict key ('title' and 'href').
df1 = pd.json_normalize(df['rows'])
df1

Unnamed: 0,title,href
0,Biden assails Georgia voting law as an ‘atrocity’,https://www.pbs.org/newshour/politics/biden-assails-georgia-voting-law-as-an-atrocity
1,Arkansas governor signs bill allowing medical workers to refuse treatment to LGBTQ people,https://www.pbs.org/newshour/politics/arkansas-governor-signs-bill-allowing-medical-workers-to-refuse-treatment-to-lgbtq-people
2,WATCH: Harris discusses child poverty and education with Secretary Miguel Cardona,https://www.pbs.org/newshour/education/watch-live-harris-discusses-child-poverty-and-education-with-secretary-miguel-cardona
3,Biden taps Sen. Manchin’s wife to co-chair Appalachian board,https://www.pbs.org/newshour/politics/biden-taps-sen-manchins-wife-to-co-chair-appalachian-board
4,Turkey detains students and supporters over LGBTQ flags,https://www.pbs.org/newshour/world/turkey-detains-students-and-supporters-over-lgbtq-flags
...,...,...
92776,Storm Damage in Tennessee,https://www.pbs.org/newshour/world/weather-jan-june99-storms_05-07
92777,Survivors’ Stories After the Tornadoes,https://www.pbs.org/newshour/show/1999-picking-up-the-pieces-in-oklahoma
92778,Picking Up the Pieces After the Tornadoes in the Midwest,https://www.pbs.org/newshour/world/weather-jan-june99-tornadoes_05-05
92779,Killer Tornadoes in Oklahoma and Kansas,https://www.pbs.org/newshour/world/weather-jan-june99-tornadoes_05-04


In [13]:
# The same article can be listed under several sections/pages: keep only the
# first occurrence of every href and renumber the rows.
df1 = df1.drop_duplicates(subset='href', keep='first').reset_index(drop=True)
df1

Unnamed: 0,title,href
0,Biden assails Georgia voting law as an ‘atrocity’,https://www.pbs.org/newshour/politics/biden-assails-georgia-voting-law-as-an-atrocity
1,Arkansas governor signs bill allowing medical workers to refuse treatment to LGBTQ people,https://www.pbs.org/newshour/politics/arkansas-governor-signs-bill-allowing-medical-workers-to-refuse-treatment-to-lgbtq-people
2,WATCH: Harris discusses child poverty and education with Secretary Miguel Cardona,https://www.pbs.org/newshour/education/watch-live-harris-discusses-child-poverty-and-education-with-secretary-miguel-cardona
3,Biden taps Sen. Manchin’s wife to co-chair Appalachian board,https://www.pbs.org/newshour/politics/biden-taps-sen-manchins-wife-to-co-chair-appalachian-board
4,Turkey detains students and supporters over LGBTQ flags,https://www.pbs.org/newshour/world/turkey-detains-students-and-supporters-over-lgbtq-flags
...,...,...
76694,Storm Damage in Tennessee,https://www.pbs.org/newshour/world/weather-jan-june99-storms_05-07
76695,Survivors’ Stories After the Tornadoes,https://www.pbs.org/newshour/show/1999-picking-up-the-pieces-in-oklahoma
76696,Picking Up the Pieces After the Tornadoes in the Midwest,https://www.pbs.org/newshour/world/weather-jan-june99-tornadoes_05-05
76697,Killer Tornadoes in Oklahoma and Kansas,https://www.pbs.org/newshour/world/weather-jan-june99-tornadoes_05-04


In [14]:
# Persist the de-duplicated title/href table (without the index column).
df1.to_csv(path_or_buf='pbs_link.csv', index=False)

In [5]:
# Reload the saved link table and add empty 'text' / 'tags' columns that the
# article scrape fills in later.
df1 = pd.read_csv(filepath_or_buffer='pbs_link.csv', header=[0])
df1['text'] = None
df1['tags'] = None
df1

Unnamed: 0,title,href,text,tags
0,Biden assails Georgia voting law as an ‘atrocity’,https://www.pbs.org/newshour/politics/biden-assails-georgia-voting-law-as-an-atrocity,,
1,Arkansas governor signs bill allowing medical workers to refuse treatment to LGBTQ people,https://www.pbs.org/newshour/politics/arkansas-governor-signs-bill-allowing-medical-workers-to-refuse-treatment-to-lgbtq-people,,
2,WATCH: Harris discusses child poverty and education with Secretary Miguel Cardona,https://www.pbs.org/newshour/education/watch-live-harris-discusses-child-poverty-and-education-with-secretary-miguel-cardona,,
3,Biden taps Sen. Manchin’s wife to co-chair Appalachian board,https://www.pbs.org/newshour/politics/biden-taps-sen-manchins-wife-to-co-chair-appalachian-board,,
4,Turkey detains students and supporters over LGBTQ flags,https://www.pbs.org/newshour/world/turkey-detains-students-and-supporters-over-lgbtq-flags,,
...,...,...,...,...
76694,Storm Damage in Tennessee,https://www.pbs.org/newshour/world/weather-jan-june99-storms_05-07,,
76695,Survivors’ Stories After the Tornadoes,https://www.pbs.org/newshour/show/1999-picking-up-the-pieces-in-oklahoma,,
76696,Picking Up the Pieces After the Tornadoes in the Midwest,https://www.pbs.org/newshour/world/weather-jan-june99-tornadoes_05-05,,
76697,Killer Tornadoes in Oklahoma and Kansas,https://www.pbs.org/newshour/world/weather-jan-june99-tornadoes_05-04,,


In [2]:
def extract_text(url, timeout=30):
    """Fetch one article page and return its body text and tag list.

    Parameters
    ----------
    url : str
        Address of the article page to download.
    timeout : float, optional
        Seconds ``requests`` waits for the server before giving up, so a
        stalled connection cannot block a worker forever.

    Returns
    -------
    pandas.Series
        Two elements, ``[text, tags]``.  ``text`` is the text of the first
        ``div.body-text`` element, or ``None`` if the page has none (a
        'No text' message is printed).  ``tags`` is the list of texts of the
        ``li.tag__list-item`` elements, or ``None`` if there are none.  Both
        are ``None`` when the page could not be fetched or parsed.
    """
    try:
        time.sleep(1)  # fixed pause before every request
        # The parser name belongs to BeautifulSoup, not requests.get (whose
        # second positional argument is `params`, i.e. the query string).
        res = requests.get(url, timeout=timeout)
        res.raise_for_status()  # treat HTTP error pages as failures
        soup = BeautifulSoup(res.content, 'html.parser')
    except Exception:
        # Best-effort scrape: report and skip this article.  `Exception` (not
        # a bare except) lets KeyboardInterrupt still stop the run.
        print('cant connect', url)
        return pd.Series([None] * 2)

    bodies = soup.find_all('div', {'class': 'body-text'})
    if bodies:
        text = bodies[0].text
    else:
        text = None
        print('No text', url)

    tag_items = soup.find_all('li', {'class': 'tag__list-item'})
    tags = [item.text for item in tag_items] if tag_items else None

    return pd.Series([text, tags])

In [8]:
# Start the pandarallel worker pool used by `parallel_apply` below.
pandarallel.initialize(nb_workers=16)
batch_size = 16*16  # article URLs handed to parallel_apply per batch

# Scrape article text/tags batch by batch.  range() stops before len(df1), so
# no empty trailing batch is submitted when the row count is an exact multiple
# of batch_size.
for i in range(0, df1.shape[0], batch_size):
    print(i, end=',')  # progress: first row label of the current batch
    # .loc slices are label-based and inclusive on both ends, hence the -1;
    # this relies on df1 having a 0..N-1 index.
    batch = slice(i, i + batch_size - 1)
    df1.loc[batch, ['text','tags']] = df1.loc[batch, 'href'].parallel_apply(extract_text).values

    # Checkpoint to disk after every 5th batch so an interrupted run can be resumed.
    if (i + batch_size) % (batch_size * 5) == 0:
        df1.to_csv('pbs_with_text.csv', index= False)

# Final save once every batch has been processed.
df1.to_csv('pbs_with_text.csv', index= False)

INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
0,256,512,768,No text https://www.pbs.org/newshour/podcasts/special-series/pbs-newshour-special-report-american-reckoning
1024,No text https://www.pbs.org/newshour/podcasts/special-series/whats-at-stake-in-the-georgia-senate-runoffs
1280,No text https://www.pbs.org/newshour/podcasts/special-series/how-rocky-presidential-transitions-have-shaped-american-history
No text https://www.pbs.org/newshour/podcasts/special-series/a-grandmother-a-granddaughter-and-a-deep-post-election-divide
1536,No text https://www.pbs.org/newshour/podcasts/special-series/in-an-unprecedented-election-two-key-swing-states-show-how-we-got-here
1792,No text https://www.pbs.org/newshour/podcasts/special-series/in-pennsylvania-mail-in-ballots-and-legal-battles-could-put-our-electoral-system-to-the-test
2048,2304,No text https://www.pbs.org/newshour/elections/2020/historic-deba

KeyboardInterrupt: 

cant connectcant connectcant connectcant connect

In [3]:
# Reload the most recent checkpoint written by the article scrape.
df1 = pd.read_csv(filepath_or_buffer='pbs_with_text.csv', header=[0])
df1

Unnamed: 0,title,href,text,tags
0,Biden assails Georgia voting law as an ‘atrocity’,https://www.pbs.org/newshour/politics/biden-assails-georgia-voting-law-as-an-atrocity,"\nPresident Joe Biden on Friday called a sweeping Republican-sponsored overhaul of Georgia’s elections laws “outrageous” and “an atrocity,” and urged Congress to move quickly to bolster voting rights across the nation in response.\nBiden commented after Georgia Gov. Brian Kemp signed into law new restrictions on voting by mail and greater legislative control over how elections are run.\n“It’s an atrocity,” Biden told reporters. “They passed a law saying you can’t provide water for people sta...","['election', 'georgia', 'joe biden']"
1,Arkansas governor signs bill allowing medical workers to refuse treatment to LGBTQ people,https://www.pbs.org/newshour/politics/arkansas-governor-signs-bill-allowing-medical-workers-to-refuse-treatment-to-lgbtq-people,"\nLITTLE ROCK, Ark. (AP) — Arkansas Gov. Asa Hutchinson on Friday signed into law legislation allowing doctors to refuse to treat someone because of religious or moral objections, a move opponents have said will give providers broad powers to turn away LGBTQ patients and others.\nThe measure says health care workers and institutions have the right to not participate in non-emergency treatments that violate their conscience. The new law won’t take effect until late this summer.\nREAD MORE: Fo...","['arkansas', 'asa hutchinson', 'lgbtq rights']"
2,WATCH: Harris discusses child poverty and education with Secretary Miguel Cardona,https://www.pbs.org/newshour/education/watch-live-harris-discusses-child-poverty-and-education-with-secretary-miguel-cardona,"\nWASHINGTON (AP) — Vice President Kamala Harris discussed child poverty and education with Secretary Miguel Cardona on Friday.\nThe Biden administration says it will launch a major campaign to make Americans aware of the benefits available under the $1.9 trillion relief package President Joe Biden signed into law last week.\nOfficials at the Treasury Department briefed reporters on the efforts they are planning, including highlighting a provision in the measure that expands the child tax cr...","['child poverty', 'kamala harris', 'miguel cardona']"
3,Biden taps Sen. Manchin’s wife to co-chair Appalachian board,https://www.pbs.org/newshour/politics/biden-taps-sen-manchins-wife-to-co-chair-appalachian-board,"\nWASHINGTON (AP) — President Joe Biden is nominating Gayle Manchin, the wife of West Virginia Sen. Joe Manchin, to be the co-chair of the Appalachian Regional Commission, an economic development partnership involving the federal government and 13 states.\nGayle Manchin, 73, has held multiple government positions linked to education in a state where her husband is a political force.\nA former teacher, she was on the West Virginia Board of Education from 2007 to 2015 and served a two-year ter...","['appalachian regional commission', 'biden administration', 'gayle manchin', 'joe manchin']"
4,Turkey detains students and supporters over LGBTQ flags,https://www.pbs.org/newshour/world/turkey-detains-students-and-supporters-over-lgbtq-flags,"\nISTANBUL (AP) — Police in Turkey detained dozens of people who assembled outside a courthouse Friday in a show of solidarity with 12 students who were taken into custody for unfurling rainbow flags, according to Turkish news reports. The detentions came amid growing government intolerance toward the LGBT community.\nStudents and faculty at Istanbul’s prestigious Bogazici University have been demonstrating regularly since January against President Recep Tayyip Erdogan’s appointment of a new...","['lgbtq rights', 'protests', 'turkey']"
...,...,...,...,...
76694,Storm Damage in Tennessee,https://www.pbs.org/newshour/world/weather-jan-june99-storms_05-07,,
76695,Survivors’ Stories After the Tornadoes,https://www.pbs.org/newshour/show/1999-picking-up-the-pieces-in-oklahoma,,
76696,Picking Up the Pieces After the Tornadoes in the Midwest,https://www.pbs.org/newshour/world/weather-jan-june99-tornadoes_05-05,,
76697,Killer Tornadoes in Oklahoma and Kansas,https://www.pbs.org/newshour/world/weather-jan-june99-tornadoes_05-04,,


In [9]:
# Start the pandarallel worker pool used by `parallel_apply` below.
pandarallel.initialize(nb_workers=16)
batch_size = 16*16  # article URLs handed to parallel_apply per batch

# Row label at which to pick the scrape up again.  Hard-coded by hand;
# NOTE(review): presumably the first row not covered by the reloaded
# checkpoint — confirm before re-running.
resume = 5632

# Continue scraping from `resume`.  range() stops before len(df1), so no empty
# trailing batch is submitted when the remaining row count is an exact
# multiple of batch_size.
for i in range(resume, df1.shape[0], batch_size):
    print(i, end=',')  # progress: first row label of the current batch
    # .loc slices are label-based and inclusive on both ends, hence the -1;
    # this relies on df1 having a 0..N-1 index.
    batch = slice(i, i + batch_size - 1)
    df1.loc[batch, ['text','tags']] = df1.loc[batch, 'href'].parallel_apply(extract_text).values

    # Checkpoint to disk after every 5th batch, counted from the resume point.
    if (i + batch_size - resume) % (batch_size * 5) == 0:
        df1.to_csv('pbs_with_text.csv', index= False)

# Final save once every batch has been processed.
df1.to_csv('pbs_with_text.csv', index= False)

INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
5632,5888,6144,No text https://www.pbs.org/newshour/guide/your-guide-to-the-trump-impeachment-hearings
6400,6656,6912,7168,7424,No text https://www.pbs.org/newshour/elections/2020/candidate-interviews/
7680,7936,8192,8448,8704,No text https://www.pbs.org/newshour/show/trump-delivers-2019-state-of-the-union-address
8960,9216,9472,9728,9984,10240,No text https://www.pbs.org/newshour/elections/midterm-2018-election/
10496,10752,11008,11264,11520,11776,12032,12288,12544,12800,13056,13312,13568,13824,14080,14336,14592,14848,15104,15360,15616,No text https://www.pbs.org/newshour/show/pbs-newshour-special-report-president-trumps-address-congress
No text https://www.pbs.org/newshour/show/watch-trumps-address-congress-7-minutes
15872,No text https://www.pbs.org/newshour/show/green-card-holder-voted-illegally-face-deportation
16128,No text https://www.pbs

In [10]:
# End the current interpreter/kernel session.
# NOTE(review): the cells that follow reload 'pbs_with_text.csv' from disk and
# presumably need the import cell re-run in the new session — confirm.
quit()

In [2]:
# Load the scraped articles (title, href, text, tags) from disk.
df = pd.read_csv(filepath_or_buffer='pbs_with_text.csv', header=[0])
df

Unnamed: 0,title,href,text,tags
0,Biden assails Georgia voting law as an ‘atrocity’,https://www.pbs.org/newshour/politics/biden-assails-georgia-voting-law-as-an-atrocity,"\nPresident Joe Biden on Friday called a sweeping Republican-sponsored overhaul of Georgia’s elections laws “outrageous” and “an atrocity,” and urged Congress to move quickly to bolster voting rights across the nation in response.\nBiden commented after Georgia Gov. Brian Kemp signed into law new restrictions on voting by mail and greater legislative control over how elections are run.\n“It’s an atrocity,” Biden told reporters. “They passed a law saying you can’t provide water for people sta...","['election', 'georgia', 'joe biden']"
1,Arkansas governor signs bill allowing medical workers to refuse treatment to LGBTQ people,https://www.pbs.org/newshour/politics/arkansas-governor-signs-bill-allowing-medical-workers-to-refuse-treatment-to-lgbtq-people,"\nLITTLE ROCK, Ark. (AP) — Arkansas Gov. Asa Hutchinson on Friday signed into law legislation allowing doctors to refuse to treat someone because of religious or moral objections, a move opponents have said will give providers broad powers to turn away LGBTQ patients and others.\nThe measure says health care workers and institutions have the right to not participate in non-emergency treatments that violate their conscience. The new law won’t take effect until late this summer.\nREAD MORE: Fo...","['arkansas', 'asa hutchinson', 'lgbtq rights']"
2,WATCH: Harris discusses child poverty and education with Secretary Miguel Cardona,https://www.pbs.org/newshour/education/watch-live-harris-discusses-child-poverty-and-education-with-secretary-miguel-cardona,"\nWASHINGTON (AP) — Vice President Kamala Harris discussed child poverty and education with Secretary Miguel Cardona on Friday.\nThe Biden administration says it will launch a major campaign to make Americans aware of the benefits available under the $1.9 trillion relief package President Joe Biden signed into law last week.\nOfficials at the Treasury Department briefed reporters on the efforts they are planning, including highlighting a provision in the measure that expands the child tax cr...","['child poverty', 'kamala harris', 'miguel cardona']"
3,Biden taps Sen. Manchin’s wife to co-chair Appalachian board,https://www.pbs.org/newshour/politics/biden-taps-sen-manchins-wife-to-co-chair-appalachian-board,"\nWASHINGTON (AP) — President Joe Biden is nominating Gayle Manchin, the wife of West Virginia Sen. Joe Manchin, to be the co-chair of the Appalachian Regional Commission, an economic development partnership involving the federal government and 13 states.\nGayle Manchin, 73, has held multiple government positions linked to education in a state where her husband is a political force.\nA former teacher, she was on the West Virginia Board of Education from 2007 to 2015 and served a two-year ter...","['appalachian regional commission', 'biden administration', 'gayle manchin', 'joe manchin']"
4,Turkey detains students and supporters over LGBTQ flags,https://www.pbs.org/newshour/world/turkey-detains-students-and-supporters-over-lgbtq-flags,"\nISTANBUL (AP) — Police in Turkey detained dozens of people who assembled outside a courthouse Friday in a show of solidarity with 12 students who were taken into custody for unfurling rainbow flags, according to Turkish news reports. The detentions came amid growing government intolerance toward the LGBT community.\nStudents and faculty at Istanbul’s prestigious Bogazici University have been demonstrating regularly since January against President Recep Tayyip Erdogan’s appointment of a new...","['lgbtq rights', 'protests', 'turkey']"
...,...,...,...,...
76694,Storm Damage in Tennessee,https://www.pbs.org/newshour/world/weather-jan-june99-storms_05-07,"\nThe storm swept through Tennessee late Wednesday and early Thursday, leaving 50 people injured and millions of dollars of damage. Thirty counties reported damage, and officials say it will take up to five days to restore power in some parts of the state.\nThe tiny town of Linden, 70 miles southwest of Nashville, was among the hardest hit. The storm killed three people, injured six and knocked out power to the community of 1,300.\nIn Nashville, 50,000 Nashville Electric Service customers lo...","['tennessee', 'weather']"
76695,Survivors’ Stories After the Tornadoes,https://www.pbs.org/newshour/show/1999-picking-up-the-pieces-in-oklahoma,"\nResidents of Oklahoma and Kansas are returning to the places where they once had homes following a May 3, 1999, tornado. Betty Ann Bowser follows two households as they pick through the damage and try to reconstruct their lives in this report from May 6, 1999.\n","['oklahoma', 'tornado']"
76696,Picking Up the Pieces After the Tornadoes in the Midwest,https://www.pbs.org/newshour/world/weather-jan-june99-tornadoes_05-05,"\nOfficials and residents in the tornado-ravaged Plains began to survey the damage Wednesday where a series of twisters left thousands of homes flattened.\nAccording to the National Weather Service, the multiple tornadoes developed out of a line of major storms. Up to 45 tornadoes touched down in Oklahoma, where 38 people died on Monday.\nIn addition, five people died in Kansas from 14 tornadoes. President Clinton has authorized federal disaster aid for both states.\nOne elderly woman was al...","['arkansas', 'kansas', 'oklahoma', 'texas', 'tornadoes', 'weather']"
76697,Killer Tornadoes in Oklahoma and Kansas,https://www.pbs.org/newshour/world/weather-jan-june99-tornadoes_05-04,"\nEmergency officials said local hospitals had treated some 550 people for injuries from flying debris in Oklahoma alone.\nFire departments urged local residents to be patient while efforts continued to find survivors and stabilize the situation, but many officials feared the death toll would rise as they continued their work.\n“It is worse than what you can see,” Bob Thompson, fire battalion chief in Sedgwick County, KS, told reporters. “We’ll probably find more deaths. I don’t think we’ve ...","['kansas', 'oklahoma', 'tornadoes', 'weather']"


In [4]:
# Keep only the articles for which body text is present, then renumber rows.
df = df.loc[df['text'].notnull()].reset_index(drop=True)
df

Unnamed: 0,title,href,text,tags
0,Biden assails Georgia voting law as an ‘atrocity’,https://www.pbs.org/newshour/politics/biden-assails-georgia-voting-law-as-an-atrocity,"\nPresident Joe Biden on Friday called a sweeping Republican-sponsored overhaul of Georgia’s elections laws “outrageous” and “an atrocity,” and urged Congress to move quickly to bolster voting rights across the nation in response.\nBiden commented after Georgia Gov. Brian Kemp signed into law new restrictions on voting by mail and greater legislative control over how elections are run.\n“It’s an atrocity,” Biden told reporters. “They passed a law saying you can’t provide water for people sta...","['election', 'georgia', 'joe biden']"
1,Arkansas governor signs bill allowing medical workers to refuse treatment to LGBTQ people,https://www.pbs.org/newshour/politics/arkansas-governor-signs-bill-allowing-medical-workers-to-refuse-treatment-to-lgbtq-people,"\nLITTLE ROCK, Ark. (AP) — Arkansas Gov. Asa Hutchinson on Friday signed into law legislation allowing doctors to refuse to treat someone because of religious or moral objections, a move opponents have said will give providers broad powers to turn away LGBTQ patients and others.\nThe measure says health care workers and institutions have the right to not participate in non-emergency treatments that violate their conscience. The new law won’t take effect until late this summer.\nREAD MORE: Fo...","['arkansas', 'asa hutchinson', 'lgbtq rights']"
2,WATCH: Harris discusses child poverty and education with Secretary Miguel Cardona,https://www.pbs.org/newshour/education/watch-live-harris-discusses-child-poverty-and-education-with-secretary-miguel-cardona,"\nWASHINGTON (AP) — Vice President Kamala Harris discussed child poverty and education with Secretary Miguel Cardona on Friday.\nThe Biden administration says it will launch a major campaign to make Americans aware of the benefits available under the $1.9 trillion relief package President Joe Biden signed into law last week.\nOfficials at the Treasury Department briefed reporters on the efforts they are planning, including highlighting a provision in the measure that expands the child tax cr...","['child poverty', 'kamala harris', 'miguel cardona']"
3,Biden taps Sen. Manchin’s wife to co-chair Appalachian board,https://www.pbs.org/newshour/politics/biden-taps-sen-manchins-wife-to-co-chair-appalachian-board,"\nWASHINGTON (AP) — President Joe Biden is nominating Gayle Manchin, the wife of West Virginia Sen. Joe Manchin, to be the co-chair of the Appalachian Regional Commission, an economic development partnership involving the federal government and 13 states.\nGayle Manchin, 73, has held multiple government positions linked to education in a state where her husband is a political force.\nA former teacher, she was on the West Virginia Board of Education from 2007 to 2015 and served a two-year ter...","['appalachian regional commission', 'biden administration', 'gayle manchin', 'joe manchin']"
4,Turkey detains students and supporters over LGBTQ flags,https://www.pbs.org/newshour/world/turkey-detains-students-and-supporters-over-lgbtq-flags,"\nISTANBUL (AP) — Police in Turkey detained dozens of people who assembled outside a courthouse Friday in a show of solidarity with 12 students who were taken into custody for unfurling rainbow flags, according to Turkish news reports. The detentions came amid growing government intolerance toward the LGBT community.\nStudents and faculty at Istanbul’s prestigious Bogazici University have been demonstrating regularly since January against President Recep Tayyip Erdogan’s appointment of a new...","['lgbtq rights', 'protests', 'turkey']"
...,...,...,...,...
76274,Hurricane Lenny Pounds Caribbean,https://www.pbs.org/newshour/world/weather-july-dec99-lenny_update_11-18,"\nAt least four people have died in the storm, which has ravaged coastal areas from Colombia to the Dutch island of St. Maarten.\nAs of Thursday afternoon, the storm was stalled 35 miles west-southwest of St. Maartin and is forecasted to move slowly to the east-northeast. Lenny is currently classified as a Category Four storm, capable of causing extensive damage and a 13 to 18 foot storm surge.\nEarlier Thursday morning, the storm forced a Russian freighter carrying cement to run aground nea...","['caribbean', 'hurricane lenny', 'weather']"
76275,Storm Damage in Tennessee,https://www.pbs.org/newshour/world/weather-jan-june99-storms_05-07,"\nThe storm swept through Tennessee late Wednesday and early Thursday, leaving 50 people injured and millions of dollars of damage. Thirty counties reported damage, and officials say it will take up to five days to restore power in some parts of the state.\nThe tiny town of Linden, 70 miles southwest of Nashville, was among the hardest hit. The storm killed three people, injured six and knocked out power to the community of 1,300.\nIn Nashville, 50,000 Nashville Electric Service customers lo...","['tennessee', 'weather']"
76276,Survivors’ Stories After the Tornadoes,https://www.pbs.org/newshour/show/1999-picking-up-the-pieces-in-oklahoma,"\nResidents of Oklahoma and Kansas are returning to the places where they once had homes following a May 3, 1999, tornado. Betty Ann Bowser follows two households as they pick through the damage and try to reconstruct their lives in this report from May 6, 1999.\n","['oklahoma', 'tornado']"
76277,Picking Up the Pieces After the Tornadoes in the Midwest,https://www.pbs.org/newshour/world/weather-jan-june99-tornadoes_05-05,"\nOfficials and residents in the tornado-ravaged Plains began to survey the damage Wednesday where a series of twisters left thousands of homes flattened.\nAccording to the National Weather Service, the multiple tornadoes developed out of a line of major storms. Up to 45 tornadoes touched down in Oklahoma, where 38 people died on Monday.\nIn addition, five people died in Kansas from 14 tornadoes. President Clinton has authorized federal disaster aid for both states.\nOne elderly woman was al...","['arkansas', 'kansas', 'oklahoma', 'texas', 'tornadoes', 'weather']"


In [7]:
# Trim leading and trailing whitespace from every article body.  The pattern
# is a raw string so that `\s` reaches the regex engine as written instead of
# being an invalid string escape (which Python warns about).
df.text = df.text.apply(lambda x: re.sub(r"^\s+|\s+$", "", x, flags=re.UNICODE))
df

Unnamed: 0,title,href,text,tags
0,Biden assails Georgia voting law as an ‘atrocity’,https://www.pbs.org/newshour/politics/biden-assails-georgia-voting-law-as-an-atrocity,"President Joe Biden on Friday called a sweeping Republican-sponsored overhaul of Georgia’s elections laws “outrageous” and “an atrocity,” and urged Congress to move quickly to bolster voting rights across the nation in response.\nBiden commented after Georgia Gov. Brian Kemp signed into law new restrictions on voting by mail and greater legislative control over how elections are run.\n“It’s an atrocity,” Biden told reporters. “They passed a law saying you can’t provide water for people stand...","['election', 'georgia', 'joe biden']"
1,Arkansas governor signs bill allowing medical workers to refuse treatment to LGBTQ people,https://www.pbs.org/newshour/politics/arkansas-governor-signs-bill-allowing-medical-workers-to-refuse-treatment-to-lgbtq-people,"LITTLE ROCK, Ark. (AP) — Arkansas Gov. Asa Hutchinson on Friday signed into law legislation allowing doctors to refuse to treat someone because of religious or moral objections, a move opponents have said will give providers broad powers to turn away LGBTQ patients and others.\nThe measure says health care workers and institutions have the right to not participate in non-emergency treatments that violate their conscience. The new law won’t take effect until late this summer.\nREAD MORE: For ...","['arkansas', 'asa hutchinson', 'lgbtq rights']"
2,WATCH: Harris discusses child poverty and education with Secretary Miguel Cardona,https://www.pbs.org/newshour/education/watch-live-harris-discusses-child-poverty-and-education-with-secretary-miguel-cardona,"WASHINGTON (AP) — Vice President Kamala Harris discussed child poverty and education with Secretary Miguel Cardona on Friday.\nThe Biden administration says it will launch a major campaign to make Americans aware of the benefits available under the $1.9 trillion relief package President Joe Biden signed into law last week.\nOfficials at the Treasury Department briefed reporters on the efforts they are planning, including highlighting a provision in the measure that expands the child tax cred...","['child poverty', 'kamala harris', 'miguel cardona']"
3,Biden taps Sen. Manchin’s wife to co-chair Appalachian board,https://www.pbs.org/newshour/politics/biden-taps-sen-manchins-wife-to-co-chair-appalachian-board,"WASHINGTON (AP) — President Joe Biden is nominating Gayle Manchin, the wife of West Virginia Sen. Joe Manchin, to be the co-chair of the Appalachian Regional Commission, an economic development partnership involving the federal government and 13 states.\nGayle Manchin, 73, has held multiple government positions linked to education in a state where her husband is a political force.\nA former teacher, she was on the West Virginia Board of Education from 2007 to 2015 and served a two-year term ...","['appalachian regional commission', 'biden administration', 'gayle manchin', 'joe manchin']"
4,Turkey detains students and supporters over LGBTQ flags,https://www.pbs.org/newshour/world/turkey-detains-students-and-supporters-over-lgbtq-flags,"ISTANBUL (AP) — Police in Turkey detained dozens of people who assembled outside a courthouse Friday in a show of solidarity with 12 students who were taken into custody for unfurling rainbow flags, according to Turkish news reports. The detentions came amid growing government intolerance toward the LGBT community.\nStudents and faculty at Istanbul’s prestigious Bogazici University have been demonstrating regularly since January against President Recep Tayyip Erdogan’s appointment of a new r...","['lgbtq rights', 'protests', 'turkey']"
...,...,...,...,...
76274,Hurricane Lenny Pounds Caribbean,https://www.pbs.org/newshour/world/weather-july-dec99-lenny_update_11-18,"At least four people have died in the storm, which has ravaged coastal areas from Colombia to the Dutch island of St. Maarten.\nAs of Thursday afternoon, the storm was stalled 35 miles west-southwest of St. Maartin and is forecasted to move slowly to the east-northeast. Lenny is currently classified as a Category Four storm, capable of causing extensive damage and a 13 to 18 foot storm surge.\nEarlier Thursday morning, the storm forced a Russian freighter carrying cement to run aground near ...","['caribbean', 'hurricane lenny', 'weather']"
76275,Storm Damage in Tennessee,https://www.pbs.org/newshour/world/weather-jan-june99-storms_05-07,"The storm swept through Tennessee late Wednesday and early Thursday, leaving 50 people injured and millions of dollars of damage. Thirty counties reported damage, and officials say it will take up to five days to restore power in some parts of the state.\nThe tiny town of Linden, 70 miles southwest of Nashville, was among the hardest hit. The storm killed three people, injured six and knocked out power to the community of 1,300.\nIn Nashville, 50,000 Nashville Electric Service customers lost...","['tennessee', 'weather']"
76276,Survivors’ Stories After the Tornadoes,https://www.pbs.org/newshour/show/1999-picking-up-the-pieces-in-oklahoma,"Residents of Oklahoma and Kansas are returning to the places where they once had homes following a May 3, 1999, tornado. Betty Ann Bowser follows two households as they pick through the damage and try to reconstruct their lives in this report from May 6, 1999.","['oklahoma', 'tornado']"
76277,Picking Up the Pieces After the Tornadoes in the Midwest,https://www.pbs.org/newshour/world/weather-jan-june99-tornadoes_05-05,"Officials and residents in the tornado-ravaged Plains began to survey the damage Wednesday where a series of twisters left thousands of homes flattened.\nAccording to the National Weather Service, the multiple tornadoes developed out of a line of major storms. Up to 45 tornadoes touched down in Oklahoma, where 38 people died on Monday.\nIn addition, five people died in Kansas from 14 tornadoes. President Clinton has authorized federal disaster aid for both states.\nOne elderly woman was also...","['arkansas', 'kansas', 'oklahoma', 'texas', 'tornadoes', 'weather']"


In [8]:
# Write the cleaned table (rows without text removed, text stripped) back to
# 'pbs_with_text.csv', replacing the file it was loaded from.
df.to_csv(path_or_buf='pbs_with_text.csv', index=False)