In [3]:
import pandas as pd

### Loading Articles

In [87]:
# Parquet shards that together hold the raw article corpus
shard_paths = [
    '~/Documents/bert-news/data/articles1.gzip',
    '~/Documents/bert-news/data/articles2.gzip',
    '~/Documents/bert-news/data/articles3.gzip',
]

# Read every shard and stack them into one frame with a fresh 0..N-1 index;
# the per-shard frames are never bound to names, so nothing needs deleting
articles = pd.concat(
    [pd.read_parquet(shard) for shard in shard_paths],
    ignore_index=True,
)

# Summarize columns, dtypes and non-null counts
# (roughly 140K articles is considered enough for now)
articles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142570 entries, 0 to 142569
Data columns (total 4 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   title        142570 non-null  object
 1   author       142570 non-null  object
 2   publication  142570 non-null  object
 3   content      142570 non-null  object
dtypes: object(4)
memory usage: 4.4+ MB


### Only Include Top Publications

In [88]:
# Tally articles per publication to see which news sources dominate
publication_counts = articles['publication'].value_counts()
publication_counts

Breitbart              23781
New York Post          17493
NPR                    11992
CNN                    11488
Washington Post        11114
Reuters                10710
Guardian                8681
New York Times          7803
Atlantic                7179
Business Insider        6757
National Review         6203
Talking Points Memo     5214
Vox                     4947
Buzzfeed News           4854
Fox News                4354
Name: publication, dtype: int64

In [89]:
# Restrict the corpus to the six publications listed below
pubs = [
    'Breitbart',
    'New York Post',
    'NPR',
    'CNN',
    'Washington Post',
    'New York Times',
]
is_selected_pub = articles['publication'].isin(pubs)
articles = articles.loc[is_selected_pub]

# Article counts for the publications that remain
articles['publication'].value_counts()

Breitbart          23781
New York Post      17493
NPR                11992
CNN                11488
Washington Post    11114
New York Times      7803
Name: publication, dtype: int64

### Only Include Articles with Authors

In [90]:
# Count articles per author; a missing author appears in this column as the
# literal string 'nan', so it shows up as its own entry in the tally
author_counts = articles['author'].value_counts()
author_counts

nan                                                   4882
Breitbart News                                        1559
Pam Key                                               1282
Associated Press                                      1224
Charlie Spiering                                       928
                                                      ... 
Jai Bednall, News.com.au                                 1
Miles Marshall Lewis                                     1
Bassey Etim                                              1
Amanda Woods, Lorena Mongelli and Linda Massarella       1
Scott Shane and Adam Goldman                             1
Name: author, Length: 7889, dtype: int64

In [91]:
# Drop every row whose author is the placeholder string 'nan'
has_author = articles['author'].ne('nan')
articles = articles.loc[has_author]

# Author counts after the filter
articles['author'].value_counts()

Breitbart News                  1559
Pam Key                         1282
Associated Press                1224
Charlie Spiering                 928
Jerome Hudson                    806
                                ... 
Jai Bednall, News.com.au           1
Miles Marshall Lewis               1
Bassey Etim                        1
Steve Marble                       1
Scott Shane and Adam Goldman       1
Name: author, Length: 7888, dtype: int64

### Assign Publications to Political Party

In [94]:
# Publications labelled 'right' (choice attributed to a PEW survey);
# every other publication in the frame is labelled 'left'
right_pubs = ['Breitbart', 'New York Post']

# Derive the label from membership in right_pubs and set the column in one step
party_by_membership = {True: 'right', False: 'left'}
articles['party'] = articles['publication'].isin(right_pubs).map(party_by_membership)

# Number of articles per party
articles['party'].value_counts()

right    41266
left     37523
Name: party, dtype: int64

### Stratify on Publications

In [95]:
# Per-publication counts before balancing. The plan: draw, at random, the
# same number of articles from every publication, namely the count of the
# publication that has the fewest articles.
counts_before_sampling = articles['publication'].value_counts()
counts_before_sampling

Breitbart          23781
New York Post      17485
NPR                11654
Washington Post    11077
New York Times      7767
CNN                 7025
Name: publication, dtype: int64

In [96]:
# Downsample every publication to the size of the smallest one so that all
# publications are equally represented in the saved dataset.
SAMPLE_SEED = 42  # fixed seed: the same rows are drawn on every run
min_strat = articles.groupby('publication').size().min()

# Sample each group separately and concatenate the pieces. Compared with
# groupby(...).apply(...), this keeps 'publication' as a plain column only
# (apply would also add it as an index level, making the label ambiguous),
# and ignore_index=True yields a clean 0..N-1 index for the saved file.
articles = pd.concat(
    [
        group.sample(n=min_strat, random_state=SAMPLE_SEED)
        for _, group in articles.groupby('publication')
    ],
    ignore_index=True,
)

# Every publication should now contribute exactly min_strat rows
articles['publication'].value_counts()

New York Post      7025
CNN                7025
NPR                7025
New York Times     7025
Breitbart          7025
Washington Post    7025
Name: publication, dtype: int64

### Save and Serialize Data 

In [97]:
# Persist the processed frame as a gzip-compressed Parquet file
# (the '.gzip' suffix names the codec; the container format is Parquet)
output_path = '~/Downloads/proc_articles.gzip'
articles.to_parquet(output_path, compression='gzip')