### Creating Lists of sources by Political Popularity

In [16]:
import pandas as pd
import mediacloud
import datetime as dt

In [17]:
# Pull in CSVs
REP_sources = pd.read_csv('Collection-231013110-Rep Voters 2018.csv')
LEANREP_sources = pd.read_csv('Collection-231013109-LeanRep Voters 2018.csv')
SPLIT_sources = pd.read_csv('Collection-231013108-Split Voters 2018.csv')
LEANDEM_sources = pd.read_csv('Collection-231013089-LeanDem Voters 2018.csv')
DEM_sources = pd.read_csv('Collection-231013063-Dem Voters 2018.csv')

REP_sources['polarity'] = -1.0
LEANREP_sources['polarity'] = -0.5
SPLIT_sources['polarity'] = 0.0
LEANDEM_sources['polarity'] = 0.5
DEM_sources['polarity'] = 1.0

# Combine DFs
COMB_sources = pd.concat([REP_sources, LEANREP_sources, SPLIT_sources, LEANDEM_sources, DEM_sources],ignore_index=True)
#COMB_sources.head()

# Simplified DF
LABELED_sources = COMB_sources[['id','label','homepage','domain','polarity','media_type','stories_per_week','primary_language']].copy()
LABELED_sources.head()


Unnamed: 0,id,label,homepage,domain,polarity,media_type,stories_per_week,primary_language
0,101,Washington Times,https://www.washingtontimes.com/,washingtontimes.com,-1.0,print_native,790.0,en
1,109,Power Line,https://www.powerlineblog.com/,powerlineblog.com,-1.0,,0.0,en
2,114,Right Wing News,http://rightwingnews.com/,rightwingnews.com,-1.0,,2.0,en
3,117,Hot Air,http://hotair.com,hotair.com,-1.0,digital_native,112.0,en
4,128,RedState,https://www.redstate.com/,redstate.com,-1.0,digital_native,237.0,en


##### Evaluate Duplicate sources

In [18]:
# Create DF of domain counts
domain_counts = LABELED_sources['domain'].value_counts().reset_index()
domain_counts.columns = ['domain', 'occurrences']
domain_counts = domain_counts.sort_values('occurrences', ascending=False)
domain_counts['is_dupe_domain'] = domain_counts['occurrences'] > 1

# Add polarities column
domain_polarities = LABELED_sources.groupby('domain')['polarity'].apply(list).reset_index()
domain_polarities.columns = ['domain', 'polarities']

domain_counts = domain_counts.merge(domain_polarities, on='domain', how='left')

# summary DF of dupes
DUPE_domains = domain_counts[domain_counts['is_dupe_domain']]
DUPE_domains = DUPE_domains[['domain','occurrences','polarities']].sort_values('occurrences', ascending=False)

display(DUPE_domains)

Unnamed: 0,domain,occurrences,polarities
0,ap.org,3,"[-0.5, 0.0, 1.0]"
2,huffingtonpost.com,3,"[0.0, 0.5, 1.0]"
1,deadspin.com,3,"[0.0, 0.5, 1.0]"
32,baltimoresun.com,2,"[0.0, 0.5]"
31,boston.com,2,"[0.5, 1.0]"
30,nbcsports.com,2,"[-0.5, 0.0]"
29,qns.com,2,"[0.5, 1.0]"
28,cincinnati.com,2,"[-0.5, 0.0]"
27,foreignpolicy.com,2,"[0.0, 0.5]"
18,suntimes.com,2,"[0.0, 0.5]"


In [19]:
# Identify dupe counts - also confirmed labels/domains all match when there are dupes
DUP_counts = LABELED_sources.groupby(['label','domain','polarity']).size().reset_index(name='count')

DUP_counts = DUP_counts.sort_values('count', ascending=False)

DUP_summary = pd.DataFrame({
    'total_rows': [len(LABELED_sources)],
    'unique_domains': [LABELED_sources['domain'].nunique()],
    'domains_with_multiple_rows': [int(domain_counts['is_dupe_domain'].sum())],
    'exact_duplicate_rows': [int(DUP_counts.loc[DUP_counts['count'] > 1, 'count'].sum())]
})

display(DUP_summary)


Unnamed: 0,total_rows,unique_domains,domains_with_multiple_rows,exact_duplicate_rows
0,1506,1469,34,0


##### Additional EDA

In [20]:
# Some additional EDA

print("Polarity Distribution:")
display(LABELED_sources['polarity'].value_counts().sort_index().reset_index())

print("Media Type Distribution:")
display(LABELED_sources['media_type'].value_counts().reset_index())

print("Language Distribution:")
display(LABELED_sources['primary_language'].value_counts().head(10).reset_index())

print("Stories per Week by Polarity:")
display(LABELED_sources.groupby('polarity')['stories_per_week'].describe())

print("Polarity vs Media Type:")
display(pd.crosstab(LABELED_sources['polarity'], LABELED_sources['media_type']))


Polarity Distribution:


Unnamed: 0,polarity,count
0,-1.0,152
1,-0.5,254
2,0.0,526
3,0.5,378
4,1.0,196


Media Type Distribution:


Unnamed: 0,media_type,count
0,print_native,305
1,video_broadcast,244
2,digital_native,106
3,other,6


Language Distribution:


Unnamed: 0,primary_language,count
0,en,1306
1,es,3
2,zh,2
3,ar,2
4,ru,1
5,lb,1
6,ja,1
7,fa,1


Stories per Week by Polarity:


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
polarity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
-1.0,122.0,94.934426,154.777512,0.0,6.0,45.0,113.75,980.0
-0.5,158.0,248.924051,389.106678,0.0,36.75,127.0,316.75,2501.0
0.0,371.0,315.350404,512.992178,0.0,40.0,158.0,377.0,5263.0
0.5,294.0,185.55102,444.615817,0.0,13.25,51.0,154.0,5263.0
1.0,163.0,55.055215,131.647127,0.0,3.0,17.0,48.5,1080.0


Polarity vs Media Type:


media_type,digital_native,other,print_native,video_broadcast
polarity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-1.0,17,0,8,7
-0.5,6,1,59,99
0.0,28,3,132,110
0.5,37,1,81,21
1.0,18,1,25,7


##### Isolate top sources to test with Common Crawl

In [21]:
# Top sources of each split to look through the top stories per week and if they populate on common crawl
TOP_sources = (LABELED_sources
               .sort_values('stories_per_week', ascending=False)
               .groupby('polarity', group_keys=False)
               .head(5)
               .sort_values('polarity')
               .reset_index(drop=True))

display(TOP_sources)

Unnamed: 0,id,label,homepage,domain,polarity,media_type,stories_per_week,primary_language
0,366487,Conservative Review,https://www.conservativereview.com/,conservativereview.com,-1.0,,491.0,en
1,19334,Breitbart,https://www.breitbart.com/,breitbart.com,-1.0,digital_native,733.0,en
2,101,Washington Times,https://www.washingtontimes.com/,washingtontimes.com,-1.0,print_native,790.0,en
3,293059,dennismichaellynch.com,http://dennismichaellynch.com/,dennismichaellynch.com,-1.0,,424.0,en
4,1092,Foxnews.com,http://www.foxnews.com/,foxnews.com,-1.0,video_broadcast,980.0,en
5,18966,reason.com,http://reason.com,reason.com,-0.5,,2501.0,en
6,25701,thesun.co.uk,http://www.thesun.co.uk/sol/homepage/,thesun.co.uk,-0.5,,2496.0,en
7,717902,daily.gazette.com,http://daily.gazette.com/,gazette.com,-0.5,,2086.0,en
8,20120,nj.com,http://www.nj.com,nj.com,-0.5,digital_native,1911.0,en
9,66979,krdo.com,http://www.krdo.com/,krdo.com,-0.5,video_broadcast,1283.0,en


In [22]:
# Export to csv 
LABELED_sources.to_csv('MediaCloudsources_Polarity.csv', index=False)
TOP_sources.to_csv('MediaCloudsources_TopsourcesToTest.csv', index=False)