In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
import json

In [4]:
data = json.loads(open("Oppositional_thinking_analysis_dataset.json").read())

In [6]:
df = pd.DataFrame(data)

# Split data
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['category'], random_state=42)

# Handle class imbalance in the training set
train_df_majority = train_df[train_df.category == 'CRITICAL']
train_df_minority = train_df[train_df.category == 'CONSPIRACY']

train_df_minority_upsampled = resample(train_df_minority, 
                                       replace=True,     # sample with replacement
                                       n_samples=len(train_df_majority),    # to match majority class
                                       random_state=42) # reproducible results

train_df_balanced = pd.concat([train_df_majority, train_df_minority_upsampled])

print(train_df_balanced['category'].value_counts())

category
CRITICAL      2097
CONSPIRACY    2097
Name: count, dtype: int64


In [7]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Define vectorizers
vectorizers = {
    'CountVectorizer': CountVectorizer(),
    'TfidfVectorizer': TfidfVectorizer()
}

# Function to train and evaluate a Naïve Bayes model
def train_and_evaluate(train_df, test_df, vectorizer):
    X_train = vectorizer.fit_transform(train_df['text'])
    y_train = train_df['category']
    X_test = vectorizer.transform(test_df['text'])
    y_test = test_df['category']
    
    model = MultinomialNB() #multinomial Naive Bayes
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    print(f"Results for {vectorizer.__class__.__name__}:")
    print(classification_report(y_test, y_pred))

# Train and evaluate models with different vectorizers
for name, vectorizer in vectorizers.items():
    train_and_evaluate(train_df_balanced, test_df, vectorizer)

Results for CountVectorizer:
              precision    recall  f1-score   support

  CONSPIRACY       0.76      0.82      0.79       276
    CRITICAL       0.90      0.86      0.88       524

    accuracy                           0.85       800
   macro avg       0.83      0.84      0.84       800
weighted avg       0.85      0.85      0.85       800

Results for TfidfVectorizer:
              precision    recall  f1-score   support

  CONSPIRACY       0.69      0.87      0.77       276
    CRITICAL       0.92      0.79      0.85       524

    accuracy                           0.82       800
   macro avg       0.81      0.83      0.81       800
weighted avg       0.84      0.82      0.82       800



In [28]:
import re
from sklearn.pipeline import Pipeline
import nltk

df = pd.DataFrame(data)


# Split data
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['category'], random_state=42)

# Handle class imbalance in the training set
train_df_majority = train_df[train_df.category == 'CRITICAL']
train_df_minority = train_df[train_df.category == 'CONSPIRACY']

train_df_minority_upsampled = resample(train_df_minority, 
                                       replace=True,     # sample with replacement
                                       n_samples=len(train_df_majority),    # to match majority class
                                       random_state=42) # reproducible results

train_df_balanced = pd.concat([train_df_majority, train_df_minority_upsampled])

print(train_df_balanced['category'].value_counts())

nltk.download('stopwords')
nltk.download('punkt')
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.base import BaseEstimator, TransformerMixin


def preprocess2(text):
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'[^A-Za-z\s]', '', text)
    tokens = nltk.word_tokenize(text.lower())
    tokens = [word for word in tokens if word not in nltk.corpus.stopwords.words('english')]
    return ' '.join(tokens)

class TextPreprocessor(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return [preprocess(text) for text in X]

class StemmedCountVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.stemmer = PorterStemmer()
        self.vectorizer = CountVectorizer()

    def fit(self, X, y=None):
        self.vectorizer.fit(X)
        return self

    def transform(self, X):
        return self.vectorizer.transform([' '.join([self.stemmer.stem(word) for word in document.split()]) for document in X])

# Pre-processing pipelines
pipelines = {
    'CountVectorizer with Stop Words': Pipeline([
        ('preprocessor', TextPreprocessor()),
        ('vectorizer', CountVectorizer(stop_words='english')),
        ('classifier', MultinomialNB())
    ]),
    'TfidfVectorizer with Stop Words': Pipeline([
        ('preprocessor', TextPreprocessor()),
        ('vectorizer', TfidfVectorizer(stop_words='english')),
        ('classifier', MultinomialNB())
    ]),
    'Stemmed CountVectorizer': Pipeline([
        ('preprocessor', TextPreprocessor()),
        ('vectorizer', StemmedCountVectorizer()),
        ('classifier', MultinomialNB())
    ])
}

# Function to train and evaluate a Naïve Bayes model
def train_and_evaluate(train_df, test_df, pipeline):
    X_train = train_df['text']
    y_train = train_df['category']
    X_test = test_df['text']
    y_test = test_df['category']
    
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    
    print(f"Results for {pipeline.named_steps['vectorizer'].__class__.__name__}:")
    print(classification_report(y_test, y_pred))

# Train and evaluate models with different pipelines
for name, pipeline in pipelines.items():
    print(f"Evaluating {name}...")
    train_and_evaluate(train_df_balanced, test_df, pipeline)

category
CRITICAL      2097
CONSPIRACY    2097
Name: count, dtype: int64
Evaluating CountVectorizer with Stop Words...


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/erikrubinov/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/erikrubinov/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Results for CountVectorizer:
              precision    recall  f1-score   support

  CONSPIRACY       0.78      0.76      0.77       276
    CRITICAL       0.88      0.89      0.88       524

    accuracy                           0.84       800
   macro avg       0.83      0.82      0.82       800
weighted avg       0.84      0.84      0.84       800

Evaluating TfidfVectorizer with Stop Words...
Results for TfidfVectorizer:
              precision    recall  f1-score   support

  CONSPIRACY       0.76      0.85      0.80       276
    CRITICAL       0.92      0.86      0.89       524

    accuracy                           0.86       800
   macro avg       0.84      0.86      0.85       800
weighted avg       0.86      0.86      0.86       800

Evaluating Stemmed CountVectorizer...
Results for StemmedCountVectorizer:
              precision    recall  f1-score   support

  CONSPIRACY       0.72      0.79      0.75       276
    CRITICAL       0.88      0.84      0.86       524

    

In [24]:
def preprocess2(text):
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'[^A-Za-z\s]', '', text)
    tokens = nltk.word_tokenize(text.lower())
    tokens = [word for word in tokens if word not in nltk.corpus.stopwords.words('english')]
    return ' '.join(tokens)

preprocess2(" I ’m deeply concerned that the push to vaccinate these children is nothing more than a dystopian experiment with unknown consequences . ” — Rep. Louie Gohmert ( R - Texas ) # TheDefender https :// childrenshealthdefense . org / defender / fda - eua - covid - shots - infants - young - kids ")

'deeply concerned push vaccinate children nothing dystopian experiment unknown consequences rep louie gohmert r texas thedefender childrenshealthdefense org defender fda eua covid shots infants young kids'

In [27]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


def preprocess(text:str) -> int:

    # Lowercasing
    text = text.lower()
    text = re.sub(r'\d+', '', text) # remove decimals  
    text = re.sub(r'[\:\-\']', '', text)  # Remove specific punctuation
    text = re.sub(r'http\S+', '', text) # Remove URLs
    text = re.sub(r'\s+', ' ', text) # Remove extra whitespace
    text = re.sub(r'[^\w\s]', '', text) # Remove special characters
    text = re.sub(r'\d+\.\d+', '', text)  # Matches one or more digits followed by a dot and one or more digits
    text = re.sub(r'\bcom\b', '', text, flags=re.IGNORECASE)  # Matches "com" at word boundaries (whole word)


    # Tokenization
    tokens = word_tokenize(text)

    # Removing stop words
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
    return ' '.join(lemmatized_tokens)

    


preprocess(" I ’m deeply concerned that the push to vaccinate these children is nothing more than a dystopian experiment with unknown consequences . ” — Rep. Louie Gohmert ( R - Texas ) # TheDefender https :// childrenshealthdefense . org / defender / fda - eua - covid - shots - infants - young - kids ")

'deeply concerned push vaccinate child nothing dystopian experiment unknown consequence rep louie gohmert r texas thedefender childrenshealthdefense org defender fda eua covid shot infant young kid'