In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
import json

In [4]:
data = json.loads(open("Oppositional_thinking_analysis_dataset.json").read())

In [6]:
df = pd.DataFrame(data)

# Split data
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['category'], random_state=42)

# Handle class imbalance in the training set
train_df_majority = train_df[train_df.category == 'CRITICAL']
train_df_minority = train_df[train_df.category == 'CONSPIRACY']

train_df_minority_upsampled = resample(train_df_minority, 
                                       replace=True,     # sample with replacement
                                       n_samples=len(train_df_majority),    # to match majority class
                                       random_state=42) # reproducible results

train_df_balanced = pd.concat([train_df_majority, train_df_minority_upsampled])

print(train_df_balanced['category'].value_counts())

category
CRITICAL      2097
CONSPIRACY    2097
Name: count, dtype: int64


In [7]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Define vectorizers
vectorizers = {
    'CountVectorizer': CountVectorizer(),
    'TfidfVectorizer': TfidfVectorizer()
}

# Function to train and evaluate a Naïve Bayes model
def train_and_evaluate(train_df, test_df, vectorizer):
    X_train = vectorizer.fit_transform(train_df['text'])
    y_train = train_df['category']
    X_test = vectorizer.transform(test_df['text'])
    y_test = test_df['category']
    
    model = MultinomialNB() #multinomial Naive Bayes
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    print(f"Results for {vectorizer.__class__.__name__}:")
    print(classification_report(y_test, y_pred))

# Train and evaluate models with different vectorizers
for name, vectorizer in vectorizers.items():
    train_and_evaluate(train_df_balanced, test_df, vectorizer)

Results for CountVectorizer:
              precision    recall  f1-score   support

  CONSPIRACY       0.76      0.82      0.79       276
    CRITICAL       0.90      0.86      0.88       524

    accuracy                           0.85       800
   macro avg       0.83      0.84      0.84       800
weighted avg       0.85      0.85      0.85       800

Results for TfidfVectorizer:
              precision    recall  f1-score   support

  CONSPIRACY       0.69      0.87      0.77       276
    CRITICAL       0.92      0.79      0.85       524

    accuracy                           0.82       800
   macro avg       0.81      0.83      0.81       800
weighted avg       0.84      0.82      0.82       800



In [29]:
import re
from sklearn.pipeline import Pipeline
import nltk

df = pd.DataFrame(data)


# Split data
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['category'], random_state=42)

# Handle class imbalance in the training set
train_df_majority = train_df[train_df.category == 'CRITICAL']
train_df_minority = train_df[train_df.category == 'CONSPIRACY']

train_df_minority_upsampled = resample(train_df_minority, 
                                       replace=True,     # sample with replacement
                                       n_samples=len(train_df_majority),    # to match majority class
                                       random_state=42) # reproducible results

train_df_balanced = pd.concat([train_df_majority, train_df_minority_upsampled])

print(train_df_balanced['category'].value_counts())

nltk.download('stopwords')
nltk.download('punkt')
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.base import BaseEstimator, TransformerMixin


def preprocess2(text):
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'[^A-Za-z\s]', '', text)
    tokens = nltk.word_tokenize(text.lower())
    tokens = [word for word in tokens if word not in nltk.corpus.stopwords.words('english')]
    return ' '.join(tokens)

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


def preprocess(text:str) -> int:

    # Lowercasing
    text = text.lower()
    text = re.sub(r'\d+', '', text) # remove decimals  
    text = re.sub(r'[\:\-\']', '', text)  # Remove specific punctuation
    text = re.sub(r'http\S+', '', text) # Remove URLs
    text = re.sub(r'\s+', ' ', text) # Remove extra whitespace
    text = re.sub(r'[^\w\s]', '', text) # Remove special characters
    text = re.sub(r'\d+\.\d+', '', text)  # Matches one or more digits followed by a dot and one or more digits
    text = re.sub(r'\bcom\b', '', text, flags=re.IGNORECASE)  # Matches "com" at word boundaries (whole word)


    # Tokenization
    tokens = word_tokenize(text)

    # Removing stop words
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
    return ' '.join(lemmatized_tokens)



class TextPreprocessor(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return [preprocess(text) for text in X]
    

class StemmedCountVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.stemmer = PorterStemmer()
        self.vectorizer = CountVectorizer()

    def fit(self, X, y=None):
        self.vectorizer.fit(X)
        return self

    def transform(self, X):
        return self.vectorizer.transform([' '.join([self.stemmer.stem(word) for word in document.split()]) for document in X])

# Pre-processing pipelines
pipelines = {
    'CountVectorizer with Stop Words': Pipeline([
        ('preprocessor', TextPreprocessor()),
        ('vectorizer', CountVectorizer(stop_words='english')),
        ('classifier', MultinomialNB())
    ]),
    'TfidfVectorizer with Stop Words': Pipeline([
        ('preprocessor', TextPreprocessor()),
        ('vectorizer', TfidfVectorizer(stop_words='english')),
        ('classifier', MultinomialNB())
    ]),
    'Stemmed CountVectorizer': Pipeline([
        ('preprocessor', TextPreprocessor()),
        ('vectorizer', StemmedCountVectorizer()),
        ('classifier', MultinomialNB())
    ])
}

# Function to train and evaluate a Naïve Bayes model
def train_and_evaluate(train_df, test_df, pipeline):
    X_train = train_df['text']
    y_train = train_df['category']
    X_test = test_df['text']
    y_test = test_df['category']
    
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    
    print(f"Results for {pipeline.named_steps['vectorizer'].__class__.__name__}:")
    print(classification_report(y_test, y_pred))

# Train and evaluate models with different pipelines
for name, pipeline in pipelines.items():
    print(f"Evaluating {name}...")
    train_and_evaluate(train_df_balanced, test_df, pipeline)

category
CRITICAL      2097
CONSPIRACY    2097
Name: count, dtype: int64
Evaluating CountVectorizer with Stop Words...


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/erikrubinov/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/erikrubinov/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Results for CountVectorizer:
              precision    recall  f1-score   support

  CONSPIRACY       0.78      0.76      0.77       276
    CRITICAL       0.88      0.89      0.88       524

    accuracy                           0.84       800
   macro avg       0.83      0.82      0.82       800
weighted avg       0.84      0.84      0.84       800

Evaluating TfidfVectorizer with Stop Words...
Results for TfidfVectorizer:
              precision    recall  f1-score   support

  CONSPIRACY       0.76      0.85      0.80       276
    CRITICAL       0.92      0.86      0.89       524

    accuracy                           0.86       800
   macro avg       0.84      0.86      0.85       800
weighted avg       0.86      0.86      0.86       800

Evaluating Stemmed CountVectorizer...
Results for StemmedCountVectorizer:
              precision    recall  f1-score   support

  CONSPIRACY       0.72      0.79      0.75       276
    CRITICAL       0.88      0.84      0.86       524

    

In [31]:
## add custom features 

import json

def calculate_uppercase_percentage(text):
    uppercase_count = 0
    total_letters = 0
    
    for char in text:
        if char.isalpha():  # Check if the character is a letter
            total_letters += 1
            if char.isupper():  # Check if the letter is uppercase
                uppercase_count += 1
    
    if total_letters == 0:
        return 0
    uppercase_percentage = (uppercase_count / total_letters) * 100
    
    return uppercase_percentage

def classify_uppercase_percentage(percentage):
    if percentage < 6:
        return "low"
    elif 6 <= percentage <= 12:
        return "neutral"
    else:
        return "high"

# Update the data with the new key-value pair
updated_data = []
for comment in data:
    uppercase_percentage = calculate_uppercase_percentage(comment["text"])
    uppercase_amount = classify_uppercase_percentage(uppercase_percentage)
    updated_comment = comment.copy()
    updated_comment["uppercase_amount"] = uppercase_amount
    updated_data.append(updated_comment)

# Save the updated data to a new file
with open('Oppositional_thinking_analysis_dataset.json', 'w') as f:
    json.dump(updated_data, f, indent=4)


In [72]:
import pandas as pd
import numpy as np
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import classification_report
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils.validation import check_is_fitted

nltk.download('stopwords')
nltk.download('punkt')



# Calculate uppercase percentage and classify
def calculate_uppercase_percentage(text):
    uppercase_count = 0
    total_letters = 0
    
    for char in text:
        if char.isalpha():
            total_letters += 1
            if char.isupper():
                uppercase_count += 1
    
    if total_letters == 0:
        return 0
    uppercase_percentage = (uppercase_count / total_letters) * 100
    
    return uppercase_percentage

def classify_uppercase_percentage(percentage):
    if percentage < 6:
        return "low"
    elif 6 <= percentage <= 12:
        return "neutral"
    else:
        return "high"

# Update the data with the new key-value pair
for comment in data:
    uppercase_percentage = calculate_uppercase_percentage(comment["text"])
    comment["uppercase_amount"] = classify_uppercase_percentage(uppercase_percentage)

df = pd.DataFrame(data)

# Split data
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['category'], random_state=42)

# Handle class imbalance in the training set
train_df_majority = train_df[train_df.category == 'CRITICAL']
train_df_minority = train_df[train_df.category == 'CONSPIRACY']

train_df_minority_upsampled = resample(train_df_minority, 
                                       replace=True,     
                                       n_samples=len(train_df_majority),    
                                       random_state=42)

train_df_balanced = pd.concat([train_df_majority, train_df_minority_upsampled])

print(train_df_balanced['category'].value_counts())

def preprocess(text):
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'[^A-Za-z\s]', '', text)
    tokens = nltk.word_tokenize(text.lower())
    tokens = [word for word in tokens if word not in nltk.corpus.stopwords.words('english')]
    return ' '.join(tokens)

class TextPreprocessor(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X_train_processed = X.copy()
        X_train_processed['text'] = X_train_processed['text'].apply(preprocess)
        return X_train_processed

class StemmedCountVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.stemmer = PorterStemmer()
        self.vectorizer = CountVectorizer()

    def fit(self, X, y=None):
      
        self.vectorizer.fit(X)
        return self

    def transform(self, X):
        return self.vectorizer.transform([' '.join([self.stemmer.stem(word) for word in document.split()]) for document in X])

class CombinedFeatures(BaseEstimator, TransformerMixin):
    def __init__(self, text_transformer, additional_features):
        self.text_transformer = text_transformer
        self.additional_features = additional_features
        self.encoder = OneHotEncoder()

    def fit(self, X, y=None):

        # Fit the individual transformers
        self.text_transformer.fit(X['text'])
        self.encoder.fit(X[[self.additional_features]])  # Note double brackets for DataFrame shape
        return self

    def transform(self, X):

        # Transform the data
        text_features = self.text_transformer.transform(X['text'])
        additional_features = self.encoder.transform(X[[self.additional_features]]).toarray()  # Note double brackets for DataFrame shape
        return np.hstack((text_features.toarray(), additional_features))

# Pre-processing pipelines
pipelines = {
    'CountVectorizer with Stop Words': Pipeline([
        ('preprocessor', TextPreprocessor()),
        ('features', CombinedFeatures(CountVectorizer(stop_words='english'), 'uppercase_amount')),
        ('classifier', MultinomialNB())
    ]),
    'TfidfVectorizer with Stop Words': Pipeline([
        ('preprocessor', TextPreprocessor()),
        ('features', CombinedFeatures(TfidfVectorizer(stop_words='english'), 'uppercase_amount')),
        ('classifier', MultinomialNB())
    ]),
    'Stemmed CountVectorizer': Pipeline([
        ('preprocessor', TextPreprocessor()),
        ('features', CombinedFeatures(StemmedCountVectorizer(), 'uppercase_amount')),
        ('classifier', MultinomialNB())
    ])
}

# Function to train and evaluate a Naïve Bayes model
def train_and_evaluate(train_df, test_df, pipeline):
    X_train = train_df[['text', 'uppercase_amount']]
    #print("1:",type(X_train))
    #print("1:",X_train)
    y_train = train_df['category']
    X_test = test_df[['text', 'uppercase_amount']]
    y_test = test_df['category']
    
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    
    print(f"Results for {pipeline.named_steps['features'].text_transformer.__class__.__name__}:")
    print(classification_report(y_test, y_pred))

# Train and evaluate models with different pipelines
for name, pipeline in pipelines.items():
    print(f"Evaluating {name}...")
    train_and_evaluate(train_df_balanced, test_df, pipeline)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/erikrubinov/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/erikrubinov/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


category
CRITICAL      2097
CONSPIRACY    2097
Name: count, dtype: int64
Evaluating CountVectorizer with Stop Words...
Fit 2: Type of X is <class 'pandas.core.frame.DataFrame'>
Results for CountVectorizer:
              precision    recall  f1-score   support

  CONSPIRACY       0.78      0.75      0.77       276
    CRITICAL       0.87      0.89      0.88       524

    accuracy                           0.84       800
   macro avg       0.83      0.82      0.82       800
weighted avg       0.84      0.84      0.84       800

Evaluating TfidfVectorizer with Stop Words...
Fit 2: Type of X is <class 'pandas.core.frame.DataFrame'>
Results for TfidfVectorizer:
              precision    recall  f1-score   support

  CONSPIRACY       0.77      0.82      0.79       276
    CRITICAL       0.90      0.87      0.88       524

    accuracy                           0.85       800
   macro avg       0.83      0.84      0.84       800
weighted avg       0.85      0.85      0.85       800

Evaluat

In [70]:

def preprocess(text):
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'[^A-Za-z\s]', '', text)
    tokens = nltk.word_tokenize(text.lower())
    tokens = [word for word in tokens if word not in nltk.corpus.stopwords.words('english')]
    return ' '.join(tokens)


X_train_processed = train_df.copy()
X_train_processed['text'] = X_train_processed['text'].apply(preprocess)

# Check the result
print(X_train_processed)

         id                                               text    category  \
1281   4912  latest vaers data us deaths reported may occur...    CRITICAL   
3392    713  needs stop worldwide immediately every human r...    CRITICAL   
3501   3264  united airlines ceo kirby feel bad people gett...    CRITICAL   
1718  12399  breaking new icelandic study shows covid reinf...    CRITICAL   
2162   3601  carlson would definitely higher vaccination ra...    CRITICAL   
...     ...                                                ...         ...   
95     1440  father young son refused food nt vaccine passp...    CRITICAL   
1898   4859  severe rhabdomyolysis multiorgan failure covid...    CRITICAL   
3838   4577  year old brazil suffered facial paralysis weak...    CRITICAL   
3355  11142  dr mccullough interview joe rogan got views ru...    CRITICAL   
2544   6487  using fake c tests deadiy remdevisir drug kiii...  CONSPIRACY   

     uppercase_amount  
1281          neutral  
3392           

In [68]:
print(res[2])

united airlines ceo kirby feel bad people getting fired getting vaccinated


In [44]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Assuming the preprocess and preprocess2 functions are defined as before

class TextPreprocessor(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return [preprocess(text) for text in X]

# Combine features using ColumnTransformer
column_transformer = ColumnTransformer(
    transformers=[
        ('text_vect', TfidfVectorizer(stop_words='english'), 'text'),
        ('uppercase_ohe', OneHotEncoder(), ['uppercase_amount'])
    ],
    remainder='drop'  # This drops the columns that are not specified
)

# Define the pipeline with the ColumnTransformer
pipeline = Pipeline([
    ('features', column_transformer),
    ('classifier', MultinomialNB())
])


df = pd.DataFrame(data)
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['category'], random_state=42)

# Handle class imbalance
train_df_majority = train_df[train_df.category == 'CRITICAL']
train_df_minority = train_df[train_df.category == 'CONSPIRACY']
train_df_minority_upsampled = resample(train_df_minority, replace=True, n_samples=len(train_df_majority), random_state=42)
train_df_balanced = pd.concat([train_df_majority, train_df_minority_upsampled])

def train_and_evaluate(train_df, test_df, pipeline):
    X_train = train_df[['text', 'uppercase_amount']]
    y_train = train_df['category']
    X_test = test_df[['text', 'uppercase_amount']]
    y_test = test_df['category']
    
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    
    print("Results:")
    print(classification_report(y_test, y_pred))

# Evaluate the model
train_and_evaluate(train_df_balanced, test_df, pipeline)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/erikrubinov/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Results:
              precision    recall  f1-score   support

  CONSPIRACY       0.77      0.83      0.80       276
    CRITICAL       0.91      0.87      0.89       524

    accuracy                           0.85       800
   macro avg       0.84      0.85      0.84       800
weighted avg       0.86      0.85      0.86       800

