In [1]:
import urllib.request
from io import StringIO
from urllib.parse import urljoin

import bs4 as bs
import numpy as np
import pandas as pd
from tqdm import tqdm

# load story metadata

In [None]:
url_root = 'https://www.allsides.com/'
url_page = url_root + 'story/admin?page='

# Loop over the paginated story listing and collect one DataFrame per page.
# There are about 100 pages as of 12/29/2020, so the upper bound is set well
# above that; the loop stops at the first page that cannot be scraped.
dfs = []
for tab_num in tqdm(range(150)):
    try:
        # read in the source; the context manager closes the HTTP response
        with urllib.request.urlopen(url_page + str(tab_num)) as source:
            sp = bs.BeautifulSoup(source, 'lxml')
        table = sp.table
        if table is None:
            # no <table> on this page (presumably we ran past the last page)
            print(f'stopped after {tab_num} pages: no table found')
            break

        # read table with no links; the markup is wrapped in StringIO because
        # newer pandas deprecates passing a literal HTML string to read_html
        df = pd.read_html(StringIO(str(table)), encoding='utf-8',
                          header=0)[0]

        # get links: keep every href inside the table that points at a story
        links = []
        for tag in table.find_all('a'):
            if tag.has_attr('href'):
                link = tag.get('href')
                if '/story' in link:
                    links.append(link)
            else:
                # report the offending tag itself: `link` is not assigned in
                # this branch (it would be unset or left over from a
                # previous anchor)
                print(f'error! missing a link for {tag}')

        # raises ValueError if the number of story links differs from the
        # number of table rows; that is reported by the handler below
        df['url_story'] = links
        dfs.append(df)
    except Exception as err:
        # best-effort crawl: keep the pages gathered so far, but say why we
        # stopped instead of hiding the error
        print(f'stopped after {tab_num} pages: {err!r}')
        break

df = pd.concat(dfs)
df.to_pickle('../data/df_links.pkl')

# load news title / sources by following links

In [None]:
# add info from link
def get_info_from_url_story(url_story):
    '''Scrape the featured news items shown on a single story page.

    Parameters
    ----------
    url_story : str
        Full URL of the story page to download.

    Returns
    -------
    dict
        For each of the first three ``news-title`` divs on the page (the
        left, center and right stories) the keys ``<prefix>title``,
        ``<prefix>url``, ``<prefix>source``, ``<prefix>leaning`` and
        ``<prefix>text``, where ``<prefix>`` is the lower-cased bias rating
        with ``'lean '`` removed, followed by ``'_story_'``.
        An empty dict is returned when the page cannot be downloaded or when
        any of the three items cannot be parsed.
    '''
    try:
        # the context manager closes the HTTP response once it has been parsed
        with urllib.request.urlopen(url_story) as story:
            sp_story = bs.BeautifulSoup(story, 'html.parser')
    except OSError as err:
        # urllib's URLError / HTTPError derive from OSError; skip this story
        # instead of aborting the whole crawl on one failed request
        print(f'could not fetch {url_story}: {err!r}')
        return {}

    # extract info from specific story page
    story_triplet_info = {}

    # loop over left, center, and right stories
    try:
        for div in sp_story.find_all('div', {'class': 'news-title'})[:3]:
            # the sibling divs (source, bias image, body) hang off the parent
            card = div.parent

            title = div.a.contents[0]
            url = div.a.get('href')

            news_source = card.find_all(
                'div', {'class': 'news-source'})[0].contents[1]
            leaning = card.find_all('div', {'class': 'bias-image'})[0].img.get(
                'title').replace("Political News Media Bias Rating: ", '')

            # keep only element children of the body (skips bare strings
            # between tags) and take the first child of each
            news_body = card.find_all('div', {'class': 'news-body'})[0]
            news_text = '\n'.join(s.contents[0] for s in news_body.contents
                                  if isinstance(s, bs.Tag))

            prefix = leaning.lower().replace('lean ', '') + '_story_'
            story_triplet_info.update({
                f'{prefix}title': title,
                f'{prefix}url': url,
                f'{prefix}source': news_source,
                f'{prefix}leaning': leaning,
                f'{prefix}text': news_text
            })
    except Exception:
        # deliberate best-effort: a page with unexpected markup yields no
        # data at all rather than a partially filled record
        return {}
    return story_triplet_info


def get_stories(df):
    '''Download the story details for every row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``url_story`` column with the link of each story,
        either absolute or relative to ``url_root``.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``df`` (same order, fresh 0..n-1 index) built from
        the dicts returned by ``get_info_from_url_story``. Every column is
        cast to ``str``, so missing values appear as the string ``'nan'``.
    '''
    # urljoin resolves the link against the site root without producing a
    # double slash ('https://www.allsides.com/' + '/story/...') and leaves
    # absolute links untouched
    story_triplet_list = [
        get_info_from_url_story(urljoin(url_root, url_story))
        for url_story in tqdm(df['url_story'])
    ]

    df_stories = pd.DataFrame(story_triplet_list)
    return df_stories.astype(str)


# `df` is the links table left in memory by the metadata cell; follow each of
# its story links and cache the scraped result on disk
df_stories = get_stories(df)
df_stories.to_pickle('../data/df_stories.pkl')

# store merged df

In [None]:
# reload both cached tables from disk
df_links = pd.read_pickle('../data/df_links.pkl')
df_stories = pd.read_pickle('../data/df_stories.pkl')

# give the links a fresh 0..n-1 index so that the side-by-side concat below
# pairs each link row with the story row at the same position
df_links.index = np.arange(len(df_links))
df = pd.concat((df_links, df_stories), axis=1)

# drop columns for mixed / not rated
kept_columns = [name for name in df.keys()
                if 'mixed' not in name and 'not rated' not in name]
df = df[kept_columns]

# drop rows with any nans; the literal string 'nan' is turned back into a
# real NaN first so that dropna catches it
df = df.replace('nan', np.nan).dropna()

df.to_pickle('../data/df_final.pkl')

# store as csv

In [8]:
# reload the merged table written by the "store merged df" step
df = pd.read_pickle('../data/df_final.pkl')

In [9]:
# put left_story_title, center_story_title and right_story_title first;
# every other column keeps its current relative order
columns_titles = ['left_story_title', 'center_story_title',
                  'right_story_title']
remaining = [name for name in df.columns if name not in columns_titles]
columns = [*columns_titles, *remaining]
df = df[columns]

In [13]:
# export the reordered table as CSV into the same data directory as the pickles
df.to_csv('../data/df_final.csv')