In [1]:
# Import libraries for generic data preprocessing
import os
import numpy as np
import pandas as pd
from itertools import chain

# Import libraries for model selection and accuracy measures
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Import BERT transformer libraries
from sentence_transformers import SentenceTransformer

### Set Random Seed

In [2]:
# Set TOKENIZERS_PARALLELISM to 'false' before any model is loaded
# (the variable is read by the HuggingFace tokenizers library).
os.environ.update({"TOKENIZERS_PARALLELISM": 'false'})
# Seed NumPy's global random state so np.random draws are repeatable.
np.random.seed(seed=10)

### Load Preprocessed Data

In [3]:
# Read the clustered articles from parquet and renumber the rows 0..n-1,
# discarding whatever index was stored with the file.
articles = pd.read_parquet('../data/cluster_articles.gzip').reset_index(drop=True)

In [4]:
# Preview the first five rows of the loaded frame.
articles.head(n=5)

Unnamed: 0,title,author,publication,content,party,cluster
0,Breitbart Launches ’Border Wall Construction C...,Milo,Breitbart,last weekend church confessed sin personal van...,right,-1
1,IDF Airstrike Eliminates 4 Islamic State-Linke...,Breitbart Jerusalem,Breitbart,times israel reports israeli airstrike killed ...,right,4
2,Oracle Funds Anti-Google Effort that Outs Hill...,Chriss W. Street,Breitbart,oracle corporation using deep financial resour...,right,13
3,Silicon Valley Urges Giving Election Day Off t...,Chriss W. Street,Breitbart,apparently worried populist movement led donal...,right,-1
4,Illegal Migrant Abandoned in Desert Calls 911 ...,Bob Price,Breitbart,severely dehydrated illegal alien called 911 p...,right,5


### Data Preprocessing

In [5]:
# Drop articles without a definitive cluster (marked with cluster == -1),
# then renumber the remaining rows from 0.
articles = articles.loc[articles['cluster'].ne(-1)].reset_index(drop=True)

### Embed Articles

In [6]:
# Load the pre-trained DistilBERT sentence-embedding model.
# DistilBERT is a small, fast, cheap and light Transformer model trained by
# distilling BERT base. It has 40% fewer parameters than bert-base-uncased and
# runs 60% faster, while preserving over 95% of BERT's performance as measured
# on the GLUE language understanding benchmark.
# NOTE: despite its name, `tokenizer` holds the whole SentenceTransformer
# model; its .encode() is what produces the embeddings further down.
tokenizer = SentenceTransformer(model_name_or_path='distilbert-base-nli-mean-tokens')

In [7]:
# Encode each article's content into an embedding.
# encode() is documented to take a str or a list of str, so pass a plain list
# rather than the pandas Series: a Series resolves integer lookups by label,
# which only matches positions while the index is exactly 0..n-1.
embeddings = tokenizer.encode(articles['content'].tolist(), show_progress_bar=True)

Batches:   0%|          | 0/901 [00:00<?, ?it/s]

KeyboardInterrupt: 

### Standardize Embeddings

In [None]:
# Standardize the embeddings: fit a StandardScaler on them, then transform the
# same data. with_mean=True centers each feature on its mean; with its default
# settings StandardScaler also rescales each feature to unit variance.
std_embeddings = StandardScaler(with_mean=True).fit(embeddings).transform(embeddings)