# 1. Introduction

In this notebook we train the classification models on the YouTube Audit data.

In [None]:
# Standard Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()

# Import ML Libraries (Check if cuda is available)
import torch
torch.cuda.is_available()

# Workaround for the ANSI-encoding error hit when pip-installing transformers: force UTF-8 as the preferred encoding
import locale
def getpreferredencoding(do_setlocale=True):
    """Report UTF-8 as the preferred encoding.

    ``do_setlocale`` is accepted so the signature matches the function this
    replaces on the ``locale`` module; its value is never consulted.
    """
    forced_encoding = "UTF-8"
    return forced_encoding


# Route every later ``locale.getpreferredencoding()`` call through the override.
locale.getpreferredencoding = getpreferredencoding

# Import libraries
import transformers
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import logging

# sklearn standard helpers
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder

In [None]:
# Load the two transcript tables (placeholder paths: point them at the real CSV files).
misinformation_df, not_misinf_df = (
    pd.read_csv(csv_path)
    for csv_path in ('path/to/misinf/df', 'path/to/not_misinf/df')
)

In [None]:
import ast
def get_transcript(transcript_list_str):
    """Flatten a serialized transcript into a single string.

    Args:
        transcript_list_str: String holding a Python literal list of dicts,
            each with a ``"text"`` entry, e.g. ``"[{'text': 'hello'}]"``.

    Returns:
        Every ``"text"`` value in order, each prefixed by one space (so a
        non-empty result starts with a space). Returns ``""`` when the input
        is not a string, e.g. the NaN pandas produces for an empty CSV cell.

    Raises:
        ValueError, SyntaxError: If the string is not a valid Python literal.
    """
    # Missing cells arrive as float NaN / None; treat them as "no transcript"
    # instead of letting ast.literal_eval fail on a non-string.
    if not isinstance(transcript_list_str, str):
        return ""
    transcript_list = ast.literal_eval(transcript_list_str)
    # join() avoids rebuilding the string once per caption line.
    return "".join(" " + line["text"] for line in transcript_list)

In [None]:
# Derive the flat 'transcript_text' column from the serialized 'transcript' column.
for frame in (misinformation_df, not_misinf_df):
    frame['transcript_text'] = frame['transcript'].apply(get_transcript)

# 2. Clean Transcripts

In [None]:
import re
def is_timestamp(s):
    """Return True when ``s`` begins with a subtitle cue timing line.

    A timing line has the form ``HH:MM:SS.mmm --> HH:MM:SS.mmm``. Leading
    whitespace is skipped first: the transcript text this is applied to is
    built by prefixing every caption with a space, so a pattern anchored at
    position 0 could never match it.

    Args:
        s: Text to inspect.

    Returns:
        bool: True if, after optional leading whitespace, the text starts
        with a timing line.
    """
    return bool(re.match(r'\s*\d{2}:\d{2}:\d{2}\.\d{3} --> \d{2}:\d{2}:\d{2}\.\d{3}', s))

In [None]:
# Drop rows whose transcript text starts with a raw timing line.
# .copy() makes each result an independent frame, so the column assignments
# in later cells do not write into a slice of the original
# (SettingWithCopyWarning).
misinformation_df = misinformation_df[~misinformation_df['transcript_text'].apply(is_timestamp)].copy()
not_misinf_df = not_misinf_df[~not_misinf_df['transcript_text'].apply(is_timestamp)].copy()

In [None]:
def is_mostly_text(transcript, threshold=0.7):
    """Tell whether letters and whitespace make up enough of ``transcript``.

    Args:
        transcript: Text to inspect.
        threshold: Minimum share (0-1) of characters that must be alphabetic
            or whitespace.

    Returns:
        False for an empty/falsy transcript; otherwise True when the share of
        alphabetic-or-whitespace characters is at least ``threshold``.
    """
    total_chars = len(transcript) if transcript else 0
    if total_chars == 0:
        return False
    textual_chars = sum(1 for ch in transcript if ch.isalpha() or ch.isspace())
    return textual_chars / total_chars >= threshold

# Flag every row by whether its transcript is predominantly letters/whitespace.
for frame in (misinformation_df, not_misinf_df):
    frame['is_text'] = frame['transcript_text'].apply(is_mostly_text)

In [None]:
def has_enough_words(transcript, min_words=10):
    """Return True when ``transcript`` has at least ``min_words`` whitespace-separated tokens."""
    words = transcript.split()
    return min_words <= len(words)

# Flag every row by whether its transcript meets the minimum word count.
for frame in (misinformation_df, not_misinf_df):
    frame['has_enough_words'] = frame['transcript_text'].apply(has_enough_words)

In [None]:
from langdetect import detect

def is_english(transcript):
    """Return True when langdetect identifies ``transcript`` as English.

    Any detection failure is treated as "not English" and yields False
    instead of aborting the whole ``.apply``.

    NOTE(review): langdetect results can vary between runs unless its
    detector seed is fixed; confirm whether this filter must be reproducible.
    """
    try:
        return detect(transcript) == 'en'
    except Exception:
        # Deliberately broad, but unlike a bare ``except`` this lets
        # KeyboardInterrupt/SystemExit through, so a long-running apply
        # can still be interrupted.
        return False

In [None]:
# Flag every row by whether its transcript is detected as English.
for frame in (misinformation_df, not_misinf_df):
    frame['is_english'] = frame['transcript_text'].apply(is_english)

In [None]:
# Keep only rows that pass all three quality flags.
# .copy() makes each result an independent frame: later cells assign columns
# into these frames, which would otherwise write into a slice of the
# unfiltered data (SettingWithCopyWarning).
misinformation_df = misinformation_df[
    misinformation_df['is_text']
    & misinformation_df['has_enough_words']
    & misinformation_df['is_english']
].copy()
not_misinf_df = not_misinf_df[
    not_misinf_df['is_text']
    & not_misinf_df['has_enough_words']
    & not_misinf_df['is_english']
].copy()

In [None]:
not_misinf_df

In [None]:
# Strip every character that is neither a word character (\w, which keeps
# underscores) nor whitespace.
for frame in (misinformation_df, not_misinf_df):
    frame['transcript_text'] = frame['transcript_text'].str.replace(r'[^\w\s]', '', regex=True)


# 3. Modelling with Simple transformers
We enable the sliding-window option (`sliding_window=True`), which is suited to long-document classification.

## Data Preparations for transformer model

In [None]:
# Tag the source frames with a human-readable class name.
misinformation_df['label'] = 'fake'
not_misinf_df['label'] = 'real'

# Work on independent copies that also carry the numeric target
# (1 = fake, 0 = real); assign() returns a new frame and leaves the source untouched.
fake = misinformation_df.assign(numeric_label=1)
real = not_misinf_df.assign(numeric_label=0)

In [None]:
# Split each class separately with the same 80/20 ratio and seed.
split_options = {'test_size': 0.2, 'random_state': 42}

# Misinformation ("fake") rows
fake_train, fake_test = train_test_split(fake, **split_options)

# Non-misinformation ("real") rows
real_train, real_test = train_test_split(real, **split_options)

In [None]:
# Stack the per-class splits (real rows first, then fake) with a fresh 0..n-1 index.
train, test = (
    pd.concat(parts, ignore_index=True)
    for parts in ([real_train, fake_train], [real_test, fake_test])
)

In [None]:
# INFO for everything by default, but only warnings and above from the
# "transformers" logger.
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# Keep just the text column and the numeric target, in that order.
model_columns = ['transcript_text', 'numeric_label']
train_set = train[model_columns]
test_set = test[model_columns]

# Class balance of each split (shown as the cell output).
train_set['numeric_label'].value_counts(), test_set['numeric_label'].value_counts()

## Modelling


### Bert

In [None]:
# Training options for BERT; multiprocessing is switched off for both
# training and evaluation.
bert_options = dict(
    num_train_epochs=5,
    overwrite_output_dir=True,
    use_multiprocessing=False,
    use_multiprocessing_for_evaluation=False,
    sliding_window=True,
)
model_args = ClassificationArgs(**bert_options)

# Binary classifier on top of the pretrained bert-base-uncased checkpoint.
model = ClassificationModel(
    model_type='bert',
    model_name='bert-base-uncased',
    num_labels=2,
    args=model_args,
)

# Fine-tune on the training split ...
model.train_model(train_set)

# ... then score the held-out test split.
result, model_outputs, wrong_predictions = model.eval_model(test_set)

In [None]:
from sklearn.metrics import f1_score, accuracy_score

def f1_multiclass(labels, preds):
    """Micro-averaged F1 of ``preds`` against the true ``labels``."""
    return f1_score(y_true=labels, y_pred=preds, average='micro')


# Extra metrics reported by eval_model, keyed by the name they appear under.
bert_metrics = {'f1': f1_multiclass, 'acc': accuracy_score}
result, model_outputs, wrong_predictions = model.eval_model(test_set, **bert_metrics)

In [None]:
# Predict on the raw test texts; keep the true labels alongside for scoring.
test_texts = test_set['transcript_text'].tolist()
y_preds, raw_outputs = model.predict(test_texts)
y_true = test_set['numeric_label']

In [None]:
from sklearn.metrics import precision_score, recall_score

In [None]:
# Score the positive class (1 = misinformation). Micro-averaging a binary,
# single-label problem makes precision and recall both collapse to plain
# accuracy, so the two printed numbers were always identical.
print(f"Precision: {precision_score(y_true, y_preds, average='binary', pos_label=1)}")
print(f"Recall: {recall_score(y_true, y_preds, average='binary', pos_label=1)}")

In [None]:
# imports
from sklearn.metrics import precision_recall_curve, average_precision_score, roc_curve, auc
from sklearn.metrics import confusion_matrix
import seaborn as sns
y_test = y_true

In [None]:
def _positive_class_scores(outputs):
    """Return one class-1 probability per sample from raw model outputs.

    Each entry of ``outputs`` is treated as one or more rows of logits; each
    row is softmaxed and the class-1 probabilities are averaged.
    NOTE(review): assumes ``raw_outputs`` holds per-window logits for each
    sample when sliding_window is on; confirm for the installed
    simpletransformers version.
    """
    scores = []
    for sample_logits in outputs:
        logits = np.atleast_2d(np.asarray(sample_logits, dtype=float))
        # Subtract the row max before exponentiating for numerical stability.
        exps = np.exp(logits - logits.max(axis=1, keepdims=True))
        probs = exps / exps.sum(axis=1, keepdims=True)
        scores.append(probs[:, 1].mean())
    return np.asarray(scores)


# A ROC curve needs a continuous score per sample; the hard 0/1 labels in
# y_preds would reduce it to a single operating point.
y_scores = _positive_class_scores(raw_outputs)
fpr, tpr, thresholds = roc_curve(y_test, y_scores)
roc_auc = auc(fpr, tpr)

plt.plot(fpr, tpr, lw=2, label='ROC curve (AUC = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', label='Hypothetical Random clf: Random guess')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.show()

In [None]:
def _positive_class_scores(outputs):
    """Return one class-1 probability per sample from raw model outputs.

    Each entry of ``outputs`` is treated as one or more rows of logits; each
    row is softmaxed and the class-1 probabilities are averaged.
    NOTE(review): assumes ``raw_outputs`` holds per-window logits for each
    sample when sliding_window is on; confirm for the installed
    simpletransformers version.
    """
    scores = []
    for sample_logits in outputs:
        logits = np.atleast_2d(np.asarray(sample_logits, dtype=float))
        # Subtract the row max before exponentiating for numerical stability.
        exps = np.exp(logits - logits.max(axis=1, keepdims=True))
        probs = exps / exps.sum(axis=1, keepdims=True)
        scores.append(probs[:, 1].mean())
    return np.asarray(scores)


# Precision-recall analysis needs a continuous score per sample; hard 0/1
# labels give a degenerate curve and a misleading average precision.
y_scores = _positive_class_scores(raw_outputs)
precision, recall, thresholds = precision_recall_curve(y_test, y_scores)
avg_precision = average_precision_score(y_test, y_scores)

plt.plot(recall, precision, lw=2, label='Precision-Recall curve (Avg Precision = %0.2f)' % avg_precision)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend(loc="lower right")
plt.show()


In [None]:
# Counts of true vs. predicted classes, drawn as an annotated heatmap.
cm = confusion_matrix(y_test, y_preds)
labels = ['Real', 'Misinformation']
heatmap_ax = sns.heatmap(
    cm,
    annot=True,
    cmap='Blues',
    fmt='g',
    xticklabels=labels,
    yticklabels=labels,
)
heatmap_ax.set_title('Confusion Matrix')
heatmap_ax.set_xlabel('Predicted Labels')
heatmap_ax.set_ylabel('True Labels')
plt.show()

### Electra

In [None]:
electra_args = ClassificationArgs(
    num_train_epochs=5,
    overwrite_output_dir=True,
    use_multiprocessing=False,
    use_multiprocessing_for_evaluation=False,
    sliding_window=True,
    # Early stopping is only checked while evaluating during training, so
    # the flag on its own does nothing: in-training evaluation must be on
    # and the per-epoch evaluations must count towards the patience.
    use_early_stopping=True,
    evaluate_during_training=True,
    evaluate_each_epoch=True,
    early_stopping_consider_epochs=True,
)

# Hold out a stratified slice of the training data as the validation set for
# early stopping, so the test set never influences when training stops.
# The model is therefore fitted on 90% of train_set.
electra_fit_set, electra_val_set = train_test_split(
    train_set,
    test_size=0.1,
    random_state=42,
    stratify=train_set['numeric_label'],
)

# Create a ClassificationModel
electra = ClassificationModel(
    'electra',
    'google/electra-base-discriminator',
    num_labels=2,
    args=electra_args
)

# Train the model, monitoring the validation slice for early stopping
electra.train_model(electra_fit_set, eval_df=electra_val_set)

In [None]:
from sklearn.metrics import f1_score, accuracy_score

def f1_multiclass(labels, preds):
    """Micro-averaged F1 of ``preds`` against the true ``labels``."""
    return f1_score(y_true=labels, y_pred=preds, average='micro')


# Extra metrics reported by eval_model, keyed by the name they appear under.
electra_metrics = {'f1': f1_multiclass, 'acc': accuracy_score}
result, model_outputs, wrong_predictions = electra.eval_model(test_set, **electra_metrics)

In [None]:
result

In [None]:
# Predict on the raw test texts; keep the true labels alongside for scoring.
test_texts = test_set['transcript_text'].tolist()
y_preds, raw_outputs = electra.predict(test_texts)
y_true = test_set['numeric_label']

In [None]:
# Score the positive class (1 = misinformation). Micro-averaging a binary,
# single-label problem makes precision and recall both collapse to plain
# accuracy, so the two printed numbers were always identical.
print(f"Precision: {precision_score(y_true, y_preds, average='binary', pos_label=1)}")
print(f"Recall: {recall_score(y_true, y_preds, average='binary', pos_label=1)}")

In [None]:
# imports
from sklearn.metrics import precision_recall_curve, average_precision_score, roc_curve, auc
from sklearn.metrics import confusion_matrix
import seaborn as sns
y_test = y_true

In [None]:
def _positive_class_scores(outputs):
    """Return one class-1 probability per sample from raw model outputs.

    Each entry of ``outputs`` is treated as one or more rows of logits; each
    row is softmaxed and the class-1 probabilities are averaged.
    NOTE(review): assumes ``raw_outputs`` holds per-window logits for each
    sample when sliding_window is on; confirm for the installed
    simpletransformers version.
    """
    scores = []
    for sample_logits in outputs:
        logits = np.atleast_2d(np.asarray(sample_logits, dtype=float))
        # Subtract the row max before exponentiating for numerical stability.
        exps = np.exp(logits - logits.max(axis=1, keepdims=True))
        probs = exps / exps.sum(axis=1, keepdims=True)
        scores.append(probs[:, 1].mean())
    return np.asarray(scores)


# A ROC curve needs a continuous score per sample; the hard 0/1 labels in
# y_preds would reduce it to a single operating point.
y_scores = _positive_class_scores(raw_outputs)
fpr, tpr, thresholds = roc_curve(y_test, y_scores)
roc_auc = auc(fpr, tpr)

plt.plot(fpr, tpr, lw=2, label='ROC curve (AUC = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', label='Hypothetical Random clf: Random guess')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.show()

In [None]:
def _positive_class_scores(outputs):
    """Return one class-1 probability per sample from raw model outputs.

    Each entry of ``outputs`` is treated as one or more rows of logits; each
    row is softmaxed and the class-1 probabilities are averaged.
    NOTE(review): assumes ``raw_outputs`` holds per-window logits for each
    sample when sliding_window is on; confirm for the installed
    simpletransformers version.
    """
    scores = []
    for sample_logits in outputs:
        logits = np.atleast_2d(np.asarray(sample_logits, dtype=float))
        # Subtract the row max before exponentiating for numerical stability.
        exps = np.exp(logits - logits.max(axis=1, keepdims=True))
        probs = exps / exps.sum(axis=1, keepdims=True)
        scores.append(probs[:, 1].mean())
    return np.asarray(scores)


# Precision-recall analysis needs a continuous score per sample; hard 0/1
# labels give a degenerate curve and a misleading average precision.
y_scores = _positive_class_scores(raw_outputs)
precision, recall, thresholds = precision_recall_curve(y_test, y_scores)
avg_precision = average_precision_score(y_test, y_scores)

plt.plot(recall, precision, lw=2, label='Precision-Recall curve (Avg Precision = %0.2f)' % avg_precision)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend(loc="lower right")
plt.show()


In [None]:
# Counts of true vs. predicted classes, drawn as an annotated heatmap.
cm = confusion_matrix(y_test, y_preds)
labels = ['Real', 'Misinformation']
heatmap_ax = sns.heatmap(
    cm,
    annot=True,
    cmap='Blues',
    fmt='g',
    xticklabels=labels,
    yticklabels=labels,
)
heatmap_ax.set_title('Confusion Matrix')
heatmap_ax.set_xlabel('Predicted Labels')
heatmap_ax.set_ylabel('True Labels')
plt.show()

### Roberta

In [None]:
roberta_args = ClassificationArgs(
    num_train_epochs=5,
    overwrite_output_dir=True,
    use_multiprocessing=False,
    use_multiprocessing_for_evaluation=False,
    sliding_window=True,
    # Early stopping is only checked while evaluating during training, so
    # the flag on its own does nothing: in-training evaluation must be on
    # and the per-epoch evaluations must count towards the patience.
    use_early_stopping=True,
    evaluate_during_training=True,
    evaluate_each_epoch=True,
    early_stopping_consider_epochs=True,
)

# Hold out a stratified slice of the training data as the validation set for
# early stopping, so the test set never influences when training stops.
# The model is therefore fitted on 90% of train_set.
roberta_fit_set, roberta_val_set = train_test_split(
    train_set,
    test_size=0.1,
    random_state=42,
    stratify=train_set['numeric_label'],
)

# Create a ClassificationModel
roberta = ClassificationModel(
    "roberta",
    "roberta-base",
    num_labels=2,
    args=roberta_args
)

# Train the model, monitoring the validation slice for early stopping
roberta.train_model(roberta_fit_set, eval_df=roberta_val_set)

In [None]:
from sklearn.metrics import f1_score, accuracy_score

def f1_multiclass(labels, preds):
    """Micro-averaged F1 of ``preds`` against the true ``labels``."""
    return f1_score(y_true=labels, y_pred=preds, average='micro')


# Extra metrics reported by eval_model, keyed by the name they appear under.
roberta_metrics = {'f1': f1_multiclass, 'acc': accuracy_score}
result, model_outputs, wrong_predictions = roberta.eval_model(test_set, **roberta_metrics)

In [None]:
# Predict on the raw test texts; keep the true labels alongside for scoring.
test_texts = test_set['transcript_text'].tolist()
y_preds, raw_outputs = roberta.predict(test_texts)
y_true = test_set['numeric_label']

In [None]:
# Score the positive class (1 = misinformation). Micro-averaging a binary,
# single-label problem makes precision and recall both collapse to plain
# accuracy, so the two printed numbers were always identical.
print(f"Precision: {precision_score(y_true, y_preds, average='binary', pos_label=1)}")
print(f"Recall: {recall_score(y_true, y_preds, average='binary', pos_label=1)}")

In [None]:
# imports
from sklearn.metrics import precision_recall_curve, average_precision_score, roc_curve, auc
from sklearn.metrics import confusion_matrix
import seaborn as sns
y_test = y_true

In [None]:
def _positive_class_scores(outputs):
    """Return one class-1 probability per sample from raw model outputs.

    Each entry of ``outputs`` is treated as one or more rows of logits; each
    row is softmaxed and the class-1 probabilities are averaged.
    NOTE(review): assumes ``raw_outputs`` holds per-window logits for each
    sample when sliding_window is on; confirm for the installed
    simpletransformers version.
    """
    scores = []
    for sample_logits in outputs:
        logits = np.atleast_2d(np.asarray(sample_logits, dtype=float))
        # Subtract the row max before exponentiating for numerical stability.
        exps = np.exp(logits - logits.max(axis=1, keepdims=True))
        probs = exps / exps.sum(axis=1, keepdims=True)
        scores.append(probs[:, 1].mean())
    return np.asarray(scores)


# A ROC curve needs a continuous score per sample; the hard 0/1 labels in
# y_preds would reduce it to a single operating point.
y_scores = _positive_class_scores(raw_outputs)
fpr, tpr, thresholds = roc_curve(y_test, y_scores)
roc_auc = auc(fpr, tpr)

plt.plot(fpr, tpr, lw=2, label='ROC curve (AUC = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', label='Hypothetical Random clf: Random guess')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.show()

In [None]:
def _positive_class_scores(outputs):
    """Return one class-1 probability per sample from raw model outputs.

    Each entry of ``outputs`` is treated as one or more rows of logits; each
    row is softmaxed and the class-1 probabilities are averaged.
    NOTE(review): assumes ``raw_outputs`` holds per-window logits for each
    sample when sliding_window is on; confirm for the installed
    simpletransformers version.
    """
    scores = []
    for sample_logits in outputs:
        logits = np.atleast_2d(np.asarray(sample_logits, dtype=float))
        # Subtract the row max before exponentiating for numerical stability.
        exps = np.exp(logits - logits.max(axis=1, keepdims=True))
        probs = exps / exps.sum(axis=1, keepdims=True)
        scores.append(probs[:, 1].mean())
    return np.asarray(scores)


# Precision-recall analysis needs a continuous score per sample; hard 0/1
# labels give a degenerate curve and a misleading average precision.
y_scores = _positive_class_scores(raw_outputs)
precision, recall, thresholds = precision_recall_curve(y_test, y_scores)
avg_precision = average_precision_score(y_test, y_scores)

plt.plot(recall, precision, lw=2, label='Precision-Recall curve (Avg Precision = %0.2f)' % avg_precision)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend(loc="lower right")
plt.show()


In [None]:
# Counts of true vs. predicted classes, drawn as an annotated heatmap.
cm = confusion_matrix(y_test, y_preds)
labels = ['Real', 'Misinformation']
heatmap_ax = sns.heatmap(
    cm,
    annot=True,
    cmap='Blues',
    fmt='g',
    xticklabels=labels,
    yticklabels=labels,
)
heatmap_ax.set_title('Confusion Matrix')
heatmap_ax.set_xlabel('Predicted Labels')
heatmap_ax.set_ylabel('True Labels')
plt.show()