# Upload dataset (contains title/tags/heading/source/text/bias rating for ~20,000 articles) and explore counts across topic, bias rating

In [1]:
#upload file

from google.colab import files

uploaded = files.upload()
print(uploaded.keys())

import io
import pandas as pd

df = pd.read_csv(io.BytesIO(uploaded['all_sides_2022.csv']))

Saving all_sides_2022.csv to all_sides_2022.csv
dict_keys(['all_sides_2022.csv'])


In [2]:
#peek at df
df

Unnamed: 0.1,Unnamed: 0,title,tags,heading,source,text,bias_rating
0,0,Gun Violence Over Fourth of July Weekend,"['Protests', 'Fourth Of July', 'Gun Control An...",Chicago Gun Violence Spikes and Increasingly F...,New York Times (News),As Yasmin Miller drove home from a laundromat ...,left
1,1,Gun Violence Over Fourth of July Weekend,"['Protests', 'Fourth Of July', 'Gun Control An...",‘Bullets just came from nowhere’: Fourth of Ju...,Chicago Tribune,As many Chicagoans were celebrating the Fourth...,center
2,2,Gun Violence Over Fourth of July Weekend,"['Protests', 'Fourth Of July', 'Gun Control An...",Dozens of shootings across US mark bloody July...,New York Post (News),The nation’s 4th of July weekend was marred by...,right
3,3,Yellen Warns Congress of 'Economic Recession' ...,"['Janet Yellen', 'Debt Ceiling', 'Economic Pol...",Federal Government Will Run Out of Cash on Oct...,The Epoch Times,Treasury Secretary Janet Yellen on Tuesday war...,right
4,4,Yellen Warns Congress of 'Economic Recession' ...,"['Janet Yellen', 'Debt Ceiling', 'Economic Pol...",Yellen tells Congress that U.S. will run out o...,Washington Post,Treasury Secretary Janet Yellen on Tuesday tol...,left
...,...,...,...,...,...,...,...
21749,21749,Biden Seeks $30 Billion For Disaster Aid And R...,"['Disaster', 'Joe Biden', 'Disaster Aid', 'Afg...",White House asks Congress to pass emergency fu...,Washington Post,The White House budget office Tuesday sent Con...,left
21750,21750,Biden Seeks $30 Billion For Disaster Aid And R...,"['Disaster', 'Joe Biden', 'Disaster Aid', 'Afg...","Biden wants $30B for disaster aid, Afghan rese...",Washington Examiner,The White House budget office on Tuesday urged...,right
21751,21751,US House Condemns QAnon Group,"['US House', 'QAnon', 'Free Speech', 'Politics']",Seventeen Republican congressmen and Justin Am...,Washington Examiner,The House passed a resolution condemning the c...,right
21752,21752,US House Condemns QAnon Group,"['US House', 'QAnon', 'Free Speech', 'Politics']",Why did some Republicans balk at a resolution ...,NBC News (Online),As many Americans came to realize in recent mo...,left


In [3]:
#gauge topic breakdown, most popular topics

from collections import Counter
import ast
import pandas as pd

tag_counts = Counter()

for tags in df['tags']:
    tag_list = ast.literal_eval(tags)
    tag_counts.update(tag_list)

tag_counts_df = pd.DataFrame(tag_counts.items(), columns=['Tag', 'Count']).sort_values(by='Count', ascending=False)

tag_counts_df.head(30)

Unnamed: 0,Tag,Count
13,Politics,4089
51,Elections,3187
47,Donald Trump,2240
98,Coronavirus,2139
60,World,1936
9,Economy And Jobs,1880
50,Presidential Elections,1698
27,Joe Biden,1597
43,White House,1580
45,Public Health,1047


# Data preprocessing, filtering

In [4]:
#filtered df, by tags. Filtering to medical topics that are specific to the late 2010s-2022

specified_tags = {'Coronavirus', 'Public Health', 'Healthcare'}  #adjust as needed

def has_specified_tags(tags):
    tag_list = ast.literal_eval(tags)
    return any(tag in specified_tags for tag in tag_list)

df_filtered = df[df['tags'].apply(has_specified_tags)]

df_filtered.head()

Unnamed: 0.1,Unnamed: 0,title,tags,heading,source,text,bias_rating
9,9,Denying Abortion Medication Could Violate Civi...,"['Abortion', 'Pharmacies', 'Abortion Pills', '...",Biden admin to pharmacies: Refusing to fill co...,Politico,Senior Biden administration officials announce...,left
10,10,Denying Abortion Medication Could Violate Civi...,"['Abortion', 'Pharmacies', 'Abortion Pills', '...",Pharmacies risk violating civil rights laws if...,CNBC,The Health and Human Services Department on We...,center
11,11,Denying Abortion Medication Could Violate Civi...,"['Abortion', 'Pharmacies', 'Abortion Pills', '...",HHS warns pharmacies denying abortion pills vi...,Washington Examiner,The Biden administration is warning retail pha...,right
33,33,Trump's Small Financial Stake in Hydroxychloro...,"['Healthcare', 'White House', 'Hydroxychloroqu...",‘Follow the money’: MSNBC contributor blows th...,Raw Story,President Donald Trump has been raising questi...,left
34,34,Trump's Small Financial Stake in Hydroxychloro...,"['Healthcare', 'White House', 'Hydroxychloroqu...","Trump Has ‘Small,’ ‘Distant Link’ To Sanofi, F...",Forbes,Topline: President Trump—increasingly a booste...,center


In [6]:
#add general is_biased column

df_filtered['is_biased'] = df_filtered['bias_rating'].apply(lambda x: 1 if x in ['left', 'right'] else 0)

df_filtered

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['is_biased'] = df_filtered['bias_rating'].apply(lambda x: 1 if x in ['left', 'right'] else 0)


Unnamed: 0.1,Unnamed: 0,title,tags,heading,source,text,bias_rating,is_biased
9,9,Denying Abortion Medication Could Violate Civi...,"['Abortion', 'Pharmacies', 'Abortion Pills', '...",Biden admin to pharmacies: Refusing to fill co...,Politico,Senior Biden administration officials announce...,left,1
10,10,Denying Abortion Medication Could Violate Civi...,"['Abortion', 'Pharmacies', 'Abortion Pills', '...",Pharmacies risk violating civil rights laws if...,CNBC,The Health and Human Services Department on We...,center,0
11,11,Denying Abortion Medication Could Violate Civi...,"['Abortion', 'Pharmacies', 'Abortion Pills', '...",HHS warns pharmacies denying abortion pills vi...,Washington Examiner,The Biden administration is warning retail pha...,right,1
33,33,Trump's Small Financial Stake in Hydroxychloro...,"['Healthcare', 'White House', 'Hydroxychloroqu...",‘Follow the money’: MSNBC contributor blows th...,Raw Story,President Donald Trump has been raising questi...,left,1
34,34,Trump's Small Financial Stake in Hydroxychloro...,"['Healthcare', 'White House', 'Hydroxychloroqu...","Trump Has ‘Small,’ ‘Distant Link’ To Sanofi, F...",Forbes,Topline: President Trump—increasingly a booste...,center,0
...,...,...,...,...,...,...,...,...
21698,21698,Uncertain Future of Health Bill,['Healthcare'],Sen. John McCain: Senate Health Care Bill ‘Is ...,HuffPost,Arizona Senator John McCain (R) appeared on CB...,left,1
21699,21699,Uncertain Future of Health Bill,['Healthcare'],Republicans debate Plan B if ObamaCare repeal ...,The Hill,Senate Republicans are starting to consider wh...,center,0
21739,21739,House Dems Introduce New $3 Trillion Stimulus ...,"['Coronavirus', 'Banking And Finance', 'Econom...",House Democrats push new round of stimulus che...,NBC News (Online),WASHINGTON — House Democratic leaders are push...,left,1
21740,21740,House Dems Introduce New $3 Trillion Stimulus ...,"['Coronavirus', 'Banking And Finance', 'Econom...",GOP senators give Democrats’ $3T relief bill a...,Associated Press,WASHINGTON (AP) — House Speaker Nancy Pelosi u...,left,1


In [31]:
#!pip install spacy
#!python -m spacy download en_core_web_sm

In [7]:
#balance the dataset (center, left, right)

min_count = df_filtered["is_biased"].value_counts().min()

#sample min_count instances from each class
df_sample = (
    df_filtered.groupby("is_biased")
    .apply(lambda x: x.sample(n=min_count, random_state=42))
    .reset_index(drop=True)
)

#check class distribution
print(df_sample["is_biased"].value_counts())

is_biased
0    664
1    664
Name: count, dtype: int64


  .apply(lambda x: x.sample(n=min_count, random_state=42))


In [8]:
import spacy

#Load spaCy English model
nlp = spacy.load("en_core_web_sm")

def preprocess_text(text):
    doc = nlp(text)
    tokens = [
        token.lemma_.lower()  #Lemmatize and lowercase
        for token in doc
        if not token.is_stop and not token.is_punct  #Remove stopwords & punctuation
    ]
    return " ".join(tokens)

df_sample["processed_heading"] = df_sample["heading"].astype(str).apply(preprocess_text)
df_sample["processed_text"] = df_sample["text"].astype(str).apply(preprocess_text)

df_sample[["heading", "processed_heading", "processed_text"]].head()

Unnamed: 0,heading,processed_heading,processed_text
0,U.S. Population Growth Falls To Record Low 0.1...,u.s. population growth fall record low 0.1 202...,population u.s. grow 0.1 year low rate nation ...
1,Biden signs executive order on abortion access,biden sign executive order abortion access,president biden sign executive order friday ai...
2,Trump Claim That Malaria Drugs Treat Coronavir...,trump claim malaria drugs treat coronavirus sp...,health official world issue warning use antima...
3,More workers are facing mandatory Covid vaccin...,worker face mandatory covid vaccination job,people return workplace follow month work home...
4,Trump’s actions on pandemic relief aren’t ille...,trump action pandemic relief illegal ineffective,hour president trump announcement saturday adm...


# TF-IDF (Bag of Words) + Logistic Regression (Basline) - Text

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

In [10]:
X_train, X_test, y_train, y_test = train_test_split(
    df_sample["processed_text"], df_sample["is_biased"], test_size=0.2, random_state=42
)

vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features=5000)  # Unigrams & bigrams
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

classifier = LogisticRegression()
classifier.fit(X_train_tfidf, y_train)

y_pred = classifier.predict(X_test_tfidf)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.575187969924812
              precision    recall  f1-score   support

           0       0.56      0.62      0.59       131
           1       0.59      0.53      0.56       135

    accuracy                           0.58       266
   macro avg       0.58      0.58      0.57       266
weighted avg       0.58      0.58      0.57       266



# TF-IDF (Bag of Words) + Logistic Regression (Baseline) - Headlines

In [11]:
X_train, X_test, y_train, y_test = train_test_split(
    df_sample["processed_heading"], df_sample["is_biased"], test_size=0.2, random_state=42
)

vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features=5000)  # Unigrams & bigrams
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

classifier = LogisticRegression()
classifier.fit(X_train_tfidf, y_train)

y_pred = classifier.predict(X_test_tfidf)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.5375939849624061
              precision    recall  f1-score   support

           0       0.53      0.53      0.53       131
           1       0.54      0.55      0.55       135

    accuracy                           0.54       266
   macro avg       0.54      0.54      0.54       266
weighted avg       0.54      0.54      0.54       266



# Random Forrest Classifier - Headlines

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

In [13]:
vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features=5000)  # Unigrams & bigrams
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [14]:
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train_tfidf, y_train)
y_pred_rf = rf_classifier.predict(X_test_tfidf)

print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))

Accuracy: 0.47368421052631576
              precision    recall  f1-score   support

           0       0.46      0.44      0.45       131
           1       0.48      0.50      0.49       135

    accuracy                           0.47       266
   macro avg       0.47      0.47      0.47       266
weighted avg       0.47      0.47      0.47       266



# SVM - Headlines

In [17]:
svm_classifier = SVC(kernel="linear", probability=True, random_state=42)  # Linear SVM
svm_classifier.fit(X_train_tfidf, y_train)
y_pred_svm = svm_classifier.predict(X_test_tfidf)

In [18]:
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print(classification_report(y_test, y_pred_svm))

Accuracy: 0.5112781954887218
              precision    recall  f1-score   support

           0       0.50      0.48      0.49       131
           1       0.52      0.54      0.53       135

    accuracy                           0.51       266
   macro avg       0.51      0.51      0.51       266
weighted avg       0.51      0.51      0.51       266



In [19]:
#Tune SVC parameters

from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

param_grid = {
    'C': [0.1, 1, 10, 100],  #Regularization strength
    'kernel': ['linear', 'rbf'],  #Linear vs. non-linear kernel
    'gamma': ['scale', 'auto']  #Only used for 'rbf' kernel
}

svm_model = SVC()

# Grid search with 5-fold cross-validation
grid_search = GridSearchCV(svm_model, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)
grid_search.fit(X_train_tfidf, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)

best_svm = SVC(**grid_search.best_params_)
best_svm.fit(X_train_tfidf, y_train)

y_pred_best_svm = best_svm.predict(X_test_tfidf)

print("\nOptimized SVM Performance:")
print("Accuracy:", accuracy_score(y_test, y_pred_best_svm))
print(classification_report(y_test, y_pred_best_svm))


Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best Parameters: {'C': 1, 'gamma': 'scale', 'kernel': 'linear'}
Best Accuracy: 0.512242005492072

Optimized SVM Performance:
Accuracy: 0.5112781954887218
              precision    recall  f1-score   support

           0       0.50      0.48      0.49       131
           1       0.52      0.54      0.53       135

    accuracy                           0.51       266
   macro avg       0.51      0.51      0.51       266
weighted avg       0.51      0.51      0.51       266



# Fine-Tuned DistilBert Model

In [32]:
#!pip install transformers torch shap

In [33]:
#!pip install datasets

In [20]:
import torch
from transformers import DistilBertTokenizerFast

In [22]:
from datasets import Dataset, ClassLabel

In [23]:
import torch
import pandas as pd
import numpy as np
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [24]:
# Convert DataFrame to Hugging Face Dataset format
df_sample["label"] = df_sample["is_biased"].astype(int)  # Rename target column

train_texts, test_texts, train_labels, test_labels = train_test_split(
    df_sample["processed_heading"].tolist(), df_sample["label"].tolist(), test_size=0.2, random_state=42
)

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

# Convert to Dataset format
train_dataset = Dataset.from_dict({"text": train_texts, "label": train_labels})
test_dataset = Dataset.from_dict({"text": test_texts, "label": test_labels})

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Map:   0%|          | 0/1062 [00:00<?, ? examples/s]

Map:   0%|          | 0/266 [00:00<?, ? examples/s]

In [25]:
#Load model with number of output classes = 2 (biased or not)
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    report_to="none",  # THIS DISABLES WANDB
)



In [27]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}


In [28]:

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6917,0.689628,0.582707,0.497891,0.730367,0.582707
2,0.6474,0.660593,0.556391,0.514884,0.593606,0.556391
3,0.6132,0.65844,0.515038,0.514509,0.514694,0.515038


TrainOutput(global_step=201, training_loss=0.6693961371236773, metrics={'train_runtime': 12011.0042, 'train_samples_per_second': 0.265, 'train_steps_per_second': 0.017, 'total_flos': 422041132118016.0, 'train_loss': 0.6693961371236773, 'epoch': 3.0})

In [39]:
trainer.save_model("bert_model_bi_med")

In [29]:
results = trainer.evaluate()
results

{'eval_loss': 0.658439576625824,
 'eval_accuracy': 0.5150375939849624,
 'eval_f1': 0.5145089310662514,
 'eval_precision': 0.5146938507103742,
 'eval_recall': 0.5150375939849624,
 'eval_runtime': 273.9704,
 'eval_samples_per_second': 0.971,
 'eval_steps_per_second': 0.062,
 'epoch': 3.0}

In [30]:
predictions = trainer.predict(test_dataset)

#convert logits to class predictions
y_pred = np.argmax(predictions.predictions, axis=1)
y_true = test_dataset["label"]  # True labels

#print Classification Report
print("Classification Report:")
print(classification_report(y_true, y_pred, target_names=["Not Biased (0)", "Biased (1)"]))

Classification Report:
                precision    recall  f1-score   support

Not Biased (0)       0.51      0.48      0.49       131
    Biased (1)       0.52      0.55      0.53       135

      accuracy                           0.52       266
     macro avg       0.51      0.51      0.51       266
  weighted avg       0.51      0.52      0.51       266

