In [44]:
import numpy as np
import pandas as pd

from collections import Counter

In [28]:
# Input CSV locations, relative to the notebook's working directory.
RATINGS_CSV = '../data/Interactive Media Bias Chart - Ad Fontes Media.csv'
SCRAPED_CSV = '../data/scraped_data.csv'

# Load both tables with pandas' default parsing options.
df = pd.read_csv(RATINGS_CSV)
scraped_df = pd.read_csv(SCRAPED_CSV)

In [29]:
# Preview the first five rows of the table loaded from the Ad Fontes bias-chart CSV.
df.head()

Unnamed: 0,Source,Url,Bias,Quality
0,ABC,https://abcnews.go.com/Politics/us-disrupted-a...,-5.33,52.33
1,ABC,https://abcnews.go.com/Politics/appeals-court-...,0.67,51.67
2,ABC,https://abcnews.go.com/Politics/electoral-coll...,-10.0,32.0
3,ABC,https://abcnews.go.com/Politics/facebook-agree...,-2.33,52.33
4,ABC,https://abcnews.go.com/Politics/donald-trump-t...,-4.33,52.67


In [34]:
# Preview the first five rows of the table loaded from scraped_data.csv.
scraped_df.head()

Unnamed: 0.1,Unnamed: 0,Source,Url,Bias,Quality,Header,Body
0,0,ABC,https://abcnews.go.com/Politics/us-disrupted-a...,-5.33,52.33,US disrupted alleged Russian trolls' internet ...,U.S. cyber operators disrupted internet access...
1,1,ABC,https://abcnews.go.com/Politics/appeals-court-...,0.67,51.67,Appeals court says special counsel Robert Muel...,A federal appeals court rejected the most dire...
2,2,ABC,https://abcnews.go.com/Politics/electoral-coll...,-10.0,32.0,The Electoral College limits the campaign play...,"U.S Senator Elizabeth Warren, who is competing..."
3,3,ABC,https://abcnews.go.com/Politics/facebook-agree...,-2.33,52.33,Facebook agrees to hide demographics from land...,Facebook announced Tuesday that it will block ...
4,4,ABC,https://abcnews.go.com/Politics/donald-trump-t...,-4.33,52.67,"Donald Trump and 'the Trump of the Tropics,' B...","President Donald Trump and ""the Trump of the T..."


## Checking number of successful scrapes

In [31]:
# Keep only the rows whose Header is present; a missing Header is treated as a
# failed scrape. dropna(subset=...) filters on the values themselves instead of
# dropping by index label, so it stays correct even if index labels repeat.
# .copy() yields an independent frame so later column assignments are safe.
clean_scraped = scraped_df.dropna(subset=['Header']).copy()

In [32]:
# Number of rows that still have a Header after filtering.
clean_scraped.shape[0]

1311

In [71]:
# Total number of rows in scraped_df, including rows with a missing Header.
scraped_df.shape[0]

1916

## Checking header and body lengths of the scraped data

In [87]:
# Character lengths of the headline and body text.
# Missing values are mapped to '' first: str(NaN) is 'nan', which the previous
# len(str(x)) approach counted as a 3-character text and so skewed the summary
# statistics. A missing text now has length 0.
# astype(str) keeps the earlier behaviour for any cell that is not already a string.
clean_scraped['header_len'] = clean_scraped['Header'].fillna('').astype(str).str.len()
clean_scraped['body_len'] = clean_scraped['Body'].fillna('').astype(str).str.len()

In [89]:
# Summary statistics (count, mean, std, min, quartiles, max) of the headline lengths.
clean_scraped['header_len'].describe()

count    1311.000000
mean       67.473684
std        27.776052
min         2.000000
25%        55.000000
50%        68.000000
75%        84.000000
max       188.000000
Name: header_len, dtype: float64

In [90]:
# Summary statistics (count, mean, std, min, quartiles, max) of the body lengths.
clean_scraped['body_len'].describe()

count      1311.000000
mean       7185.812357
std       17974.291546
min           3.000000
25%        2607.500000
50%        4588.000000
75%        7159.500000
max      380001.000000
Name: body_len, dtype: float64

## Checking sources where the scraper did not work

In [40]:
# Source of every row that has no Header: one entry per such row, so a source
# name appears once for each of its failed articles.
header_missing = scraped_df['Header'].isna()
failed_sources = scraped_df.loc[header_missing, 'Source']

In [41]:
# Distinct source names among the failed rows, in order of first appearance.
failed_sources.unique()

array(['Alternet', 'American Spectator, The', 'Axios',
       'Bipartisan Report', 'CBS', 'CNN', 'Conservative Tribune',
       'Counterpunch', 'Daily Beast', 'Daily Caller', 'Daily Kos',
       'Daily Signal', 'Fortune', 'Fox News', 'FreeSpeech TV',
       'Guacamoley', 'Huffington Post', 'InfoWars', 'Intercept',
       'LA Times', 'Life News', 'MSNBC', 'NewsPunch', 'Occupy Democrats',
       'One America News Network', 'Palmer Report', 'PJ Media',
       'ProPublica', 'RedState', 'Reuters', 'Second Nexus', 'Spoutable',
       'The Advocate', 'The American Conservative', 'The Economist',
       'The Federalist', 'The Gateway Pundit', 'The Skimm', 'The Week',
       'Time', 'Truthout', 'Twitchy', 'UrNews24', 'Washington Monthly',
       'Washington Times', 'Weather.com', 'World Truth TV',
       'WorldNetDaily'], dtype=object)

In [42]:
# Number of distinct sources with at least one failed scrape.
# The previous expression referenced `failed_source`, a name that is never
# defined in this notebook and raises NameError on a fresh kernel.
len(failed_sources.unique())

48

In [43]:
# Number of distinct sources in the scraped data; dropna=False counts a missing
# Source as its own value, matching len(Series.unique()).
scraped_df['Source'].nunique(dropna=False)

108

In [56]:
# Rows per source, ordered from fewest to most. sorted() is stable, so sources
# with equal counts keep the order in which they were first counted.
source_counts = Counter(scraped_df['Source'].values)
sorted_source = dict(sorted(source_counts.items(), key=lambda pair: pair[1]))
sorted_source

{'CNSNews': 1,
 'EPI': 1,
 'Spoutable': 1,
 'Forward': 4,
 'UrNews24': 5,
 'Weather.com': 5,
 'Occupy Democrats': 7,
 'American Spectator, The': 9,
 'Counterpunch': 9,
 'Conservative Review': 10,
 'Crooks and Liars': 10,
 'FreeSpeech TV': 10,
 'IJR': 10,
 'Life News': 10,
 'Newsy': 10,
 'One America News Network': 10,
 'Progressive, The': 10,
 'Patribotics': 11,
 'Daily Kos': 12,
 'CBS': 13,
 'Conservative Tribune': 13,
 'Daily Beast': 13,
 'Financial Times': 13,
 'The Advocate': 13,
 'The Nation': 13,
 'Vanity Fair': 13,
 'Wonkette': 13,
 'World Truth TV': 13,
 'Axios': 14,
 'Breitbart': 14,
 'Daily Mail': 14,
 'Forbes': 14,
 'Fortune': 14,
 'Marketwatch': 14,
 'National Review': 14,
 'New York Post': 14,
 'NewsMax': 14,
 'RedState': 14,
 'Slate': 14,
 'The American Conservative': 14,
 'The Atlantic': 14,
 'The Economist': 14,
 'The Federalist': 14,
 'The Skimm': 14,
 'Truthout': 14,
 'Vox': 14,
 'Al Jazeera': 15,
 'BBC': 15,
 'Bipartisan Report': 15,
 'Business Insider': 15,
 'BuzzFe

In [55]:
# Failed rows per source, ordered from fewest to most. sorted() is stable, so
# sources with equal counts keep the order in which they were first counted.
failed_counts = Counter(failed_sources)
sorted_failed = dict(sorted(failed_counts.items(), key=lambda pair: pair[1]))
sorted_failed

{'CBS': 1,
 'LA Times': 1,
 'Reuters': 1,
 'Second Nexus': 1,
 'Spoutable': 1,
 'Time': 3,
 'The Economist': 4,
 'The Skimm': 4,
 'UrNews24': 5,
 'Weather.com': 5,
 'Occupy Democrats': 7,
 'American Spectator, The': 9,
 'Counterpunch': 9,
 'FreeSpeech TV': 10,
 'Life News': 10,
 'One America News Network': 10,
 'Daily Kos': 12,
 'Conservative Tribune': 13,
 'Daily Beast': 13,
 'NewsPunch': 13,
 'The Advocate': 13,
 'World Truth TV': 13,
 'Axios': 14,
 'Fortune': 14,
 'RedState': 14,
 'The American Conservative': 14,
 'The Federalist': 14,
 'Truthout': 14,
 'Bipartisan Report': 15,
 'Daily Caller': 15,
 'Daily Signal': 15,
 'Guacamoley': 15,
 'Huffington Post': 15,
 'InfoWars': 15,
 'Intercept': 15,
 'PJ Media': 15,
 'ProPublica': 15,
 'Washington Monthly': 15,
 'Washington Times': 15,
 'WorldNetDaily': 15,
 'Alternet': 16,
 'Palmer Report': 16,
 'The Gateway Pundit': 16,
 'Twitchy': 16,
 'The Week': 20,
 'CNN': 25,
 'MSNBC': 33,
 'Fox News': 41}

In [63]:
# Turn each {source: count} mapping into a two-column frame, one row per source,
# preserving the ascending-count order of the dicts.
source_rows = list(sorted_source.items())
failed_rows = list(sorted_failed.items())
source_df = pd.DataFrame(source_rows, columns=['Source', 'Count'])
failed_df = pd.DataFrame(failed_rows, columns=['Source', 'Failed'])

In [64]:
# Preview the first five rows of the per-source failure counts.
failed_df.head()

Unnamed: 0,Source,Failed
0,CBS,1
1,LA Times,1
2,Reuters,1
3,Second Nexus,1
4,Spoutable,1


In [66]:
# Left join keeps every source from source_df; sources absent from failed_df
# get NaN in the Failed column.
compare_df = pd.merge(source_df, failed_df, how='left', on='Source')
compare_df.head()

Unnamed: 0,Source,Count,Failed
0,CNSNews,1,
1,EPI,1,
2,Spoutable,1,1.0
3,Forward,4,
4,UrNews24,5,5.0


In [70]:
# Show only the sources that have a failure count (non-null Failed).
has_failures = compare_df['Failed'].notna()
compare_df.loc[has_failures]

Unnamed: 0,Source,Count,Failed
2,Spoutable,1,1.0
4,UrNews24,5,5.0
5,Weather.com,5,5.0
6,Occupy Democrats,7,7.0
7,"American Spectator, The",9,9.0
8,Counterpunch,9,9.0
11,FreeSpeech TV,10,10.0
13,Life News,10,10.0
15,One America News Network,10,10.0
18,Daily Kos,12,12.0
