In [38]:
import pandas as pd
import json
import re
import numpy as np
import os

from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin


import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/stoffregen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/stoffregen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/stoffregen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Load the dataset

In [39]:
data = json.loads(open("../data/processed/Oppositional_thinking_analysis_dataset.json").read())

## Define function to handle imbalance in the training set

In [40]:
def balance_data(data):
    df = pd.DataFrame(data)
    df.pop('id')
    train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['category'], random_state=42)

    # Handle class imbalance in the training set
    train_df_majority = train_df[train_df.category == 'CRITICAL']
    train_df_minority = train_df[train_df.category == 'CONSPIRACY']

    train_df_minority_upsampled = resample(train_df_minority, 
                                        replace=True,     
                                        n_samples=len(train_df_majority),    
                                        random_state=42)

    train_df_balanced = pd.concat([train_df_majority, train_df_minority_upsampled])

    return train_df_balanced, test_df



In [41]:
train_df_balanced, test_df = balance_data(data)

# Define different complex pre-processing functions

In [42]:
def preprocess_basic(text, lem_tag = True, stem_tag = False):
    tokens = nltk.word_tokenize(text.lower())

    if lem_tag:
    # Lemmatization
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(word) for word in tokens]

    if stem_tag:
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(word) for word in tokens]

    return ' '.join(tokens)



def preprocess_advanced(text:str, lem_tag = True, stem_tag = False) -> int:

    # Lowercasing
    text = text.lower()
    text = re.sub(r'\d+', '', text) # remove decimals  
    text = re.sub(r'[\:\-\']', '', text)  # Remove specific punctuation
    text = re.sub(r'http\S+', '', text) # Remove URLs
    text = re.sub(r'\s+', ' ', text) # Remove extra whitespace
    text = re.sub(r'[^\w\s]', '', text) # Remove special characters
    text = re.sub(r'\d+\.\d+', '', text)  # Matches one or more digits followed by a dot and one or more digits
    text = re.sub(r'\bcom\b', '', text, flags=re.IGNORECASE)  # Matches "com" at word boundaries (whole word)


    # Tokenization
    tokens = word_tokenize(text)

    # Removing stop words
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # Lemmatization
    if lem_tag:
    # Lemmatization
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    if stem_tag:
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(word) for word in tokens]

    return ' '.join(tokens)

## Define train and evaluate function

In [61]:
def train_and_evaluate(train_df, test_df, pipeline):
    # put all available features into X_train, drop columns category and id if they exist
    drop_columns = ['category'] if 'category' in train_df.columns else []
    drop_columns += ['id'] if 'id' in train_df.columns else []
    X_train = train_df.drop(drop_columns, axis=1)
    # If only one column remains, convert it to a Series
    if len(X_train.columns) == 1:
        X_train = X_train.iloc[:, 0]
        
    
    # if X_train is not a series, make X_test also a dataframe
    if isinstance(X_train, pd.Series):
        X_test = test_df['text']
    else:
        X_test = test_df[['text']]
    
    y_train = train_df['category']
    y_test = test_df['category']
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    
    report = classification_report(y_test, y_pred, output_dict=True)    
    # print f1 score
    pipe_name = pipeline.named_steps['vectorizer'].__class__.__name__ if 'vectorizer' in pipeline.named_steps else pipeline.named_steps['features'].text_transformer
    print(f"f1 score {pipe_name}: {report['weighted avg']['f1-score']}")
    
    # return the report as a dataframe
    return pd.DataFrame(report).transpose()

## Define function to save the results into a csv file

In [44]:
def save_results_to_csv(report, base_path='../reports/run_without_ngrams_modified_pre_basic/', file_name='classification_report.csv'):
    # Ensure the directory exists
    if not os.path.exists(base_path):
        os.makedirs(base_path)
    
    file_path = os.path.join(base_path, file_name)
    
    # Save the report to CSV
    if not os.path.isfile(file_path):
        report.to_csv(file_path, header=True)
    else:
        report.to_csv(file_path, mode='a', header=False)

## Define the basic preprocessor class 

In [45]:
class TextPreprocessor(BaseEstimator, TransformerMixin):
    
    def __init__(self, custom_preprocess_function, use_lemmatization=True, use_stemmanization = True):
        self.use_lemmatization = use_lemmatization
        self.use_stemmanization = use_stemmanization
        self.preprocess_function = custom_preprocess_function

    # define fit function to make the transformer pipeline compatible with sklearn
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        res = [self.preprocess_function(text, self.use_lemmatization, self.use_stemmanization) for text in X]
        return res


## Define function to create pipelines with:
* custom pre-processing functions
* custom classifier (e.g. naive bayes, FNN)
* no lemmatization nor stemming


In [46]:
def create_pipelines(custom_preprocess, classifier):
    vectorizers = [
        (CountVectorizer, 'CountVectorizer'),
        (TfidfVectorizer, 'TfidfVectorizer')
    ]
    
    pipelines = {}
    
    for vectorizer, vec_name in vectorizers:
        pipeline_name = (
            f'{vec_name} with preprocessing function {custom_preprocess.__name__}, '
            f'classifier {type(classifier).__name__} no Lemmatization nor stem'
        )
        
        pipelines[pipeline_name] = Pipeline([
            ('preprocessor', TextPreprocessor(custom_preprocess_function=custom_preprocess, use_lemmatization=False, use_stemmanization=False)),
            ('vectorizer', vectorizer()),
            ('classifier', classifier)
        ])
    
    return pipelines


## Train and evaluate naive bayes and FNN models with different preprocessing
* without lemmatization nor stemming

In [47]:
report_file_name = 'classification_report_1.csv'
# remove file
if os.path.isfile("../reports/run_without_ngrams_modified_pre_basic/" + report_file_name):
    os.remove("../reports/run_without_ngrams_modified_pre_basic/" + report_file_name)


for preprocess_function in [preprocess_advanced, preprocess_basic]:
    for classifier in [MultinomialNB(), MLPClassifier(hidden_layer_sizes=(50,))]:
        pipelines = create_pipelines(preprocess_function, classifier)
        for name, pipeline in pipelines.items():
            print(f"Evaluating {name} ...")
            report_df = train_and_evaluate(train_df_balanced, test_df, pipeline)
            # enrich df with the name of the pipeline, preprocess function
            report_df['pipeline'] = name
            report_df['classifier'] = pipeline.named_steps['classifier'].__class__.__name__
            report_df['vectorizer'] = pipeline.named_steps['vectorizer'].__class__.__name__
            report_df['preprocess_function'] = preprocess_function.__name__
            save_results_to_csv(report_df, file_name=report_file_name)
                    

Evaluating CountVectorizer with preprocessing function preprocess_advanced, classifier MultinomialNB no Lemmatization nor stem ...
f1 score CountVectorizer: 0.8459123319635202
False
Evaluating TfidfVectorizer with preprocessing function preprocess_advanced, classifier MultinomialNB no Lemmatization nor stem ...
f1 score TfidfVectorizer: 0.8520913770913771
True
Evaluating CountVectorizer with preprocessing function preprocess_advanced, classifier MLPClassifier no Lemmatization nor stem ...




f1 score CountVectorizer: 0.8613506629220821
True
Evaluating TfidfVectorizer with preprocessing function preprocess_advanced, classifier MLPClassifier no Lemmatization nor stem ...
f1 score TfidfVectorizer: 0.8433776007082779
True
Evaluating CountVectorizer with preprocessing function preprocess_basic, classifier MultinomialNB no Lemmatization nor stem ...
f1 score CountVectorizer: 0.8512663773254274
True
Evaluating TfidfVectorizer with preprocessing function preprocess_basic, classifier MultinomialNB no Lemmatization nor stem ...
f1 score TfidfVectorizer: 0.8237452711223203
True
Evaluating CountVectorizer with preprocessing function preprocess_basic, classifier MLPClassifier no Lemmatization nor stem ...




f1 score CountVectorizer: 0.8597936836433994
True
Evaluating TfidfVectorizer with preprocessing function preprocess_basic, classifier MLPClassifier no Lemmatization nor stem ...


KeyboardInterrupt: 

## Enrich data json with uppercase percentage and comment length feature

In [48]:
## add feature uppercase percentage
def calculate_uppercase_percentage(text):
    uppercase_count = 0
    total_letters = 0
    
    for char in text:
        if char.isalpha():  # Check if the character is a letter
            total_letters += 1
            if char.isupper():  # Check if the letter is uppercase
                uppercase_count += 1
    
    if total_letters == 0:
        return 0
    uppercase_percentage = (uppercase_count / total_letters) * 100
    
    return uppercase_percentage

def classify_uppercase_percentage(percentage):
    if percentage < 6:
        return "low"
    elif 6 <= percentage <= 12:
        return "neutral"
    else:
        return "high"
    
# add custom feature: comment length
def classify_comment_lenght(length):
    if length < 190:
        return "short"
    elif 190 <= length <= 560:
        return "average"
    else:
        return "long"

# Update the data with the new key-value pair
updated_data = data.copy()
for comment in data:
    comment_length = classify_comment_lenght(len(comment["text"]))
    uppercase_percentage = calculate_uppercase_percentage(comment["text"])
    uppercase_amount = classify_uppercase_percentage(uppercase_percentage)
    comment["comment_length"] = comment_length
    comment["uppercase_amount"] = uppercase_amount


# Save the updated data to a new file
with open('../data/processed/Oppositional_thinking_analysis_dataset_with_features.json', 'w') as f:
    json.dump(updated_data, f, indent=4)


In [49]:
data = json.loads(open("../data/processed/Oppositional_thinking_analysis_dataset_with_features.json").read())
train_df_balanced, test_df = balance_data(data)


## Add a preprocessor to handle the new features
* it uses a OneHotEncoder to encode the custom features

In [50]:
class CombinedFeatures(BaseEstimator, TransformerMixin):
    def __init__(self, text_transformer, ngram_range=(1,1), additional_features = []):
        self.text_transformer = text_transformer
        self.additional_features = additional_features
        self.encoder = OneHotEncoder()
        self.vectorizer = CountVectorizer()


    def fit(self, X, y=None):
        # Fit the individual transformers
        self.text_transformer.fit(X['text'])
        if self.additional_features:
            self.encoder.fit(X[self.additional_features]) 

        return self

    def transform(self, X):
        # print type of X
        text_features = self.text_transformer.transform(X['text'])
     
        if self.additional_features:
            additional_features = self.encoder.transform(X[self.additional_features]).toarray()
            text_features = np.hstack((text_features.toarray(), additional_features))
       
        return text_features
    


## We add a new TextPreprocessor (TODO: WHY?)

In [71]:
class TextPreprocessorAdvanced(BaseEstimator, TransformerMixin):
    def __init__(self, custom_preprocess_function,  use_lemmatization=True, use_stemmanization=False):
        self.custom_preprocess_function = custom_preprocess_function
        self.use_lemmatization = use_lemmatization
        self.use_stemmanization = use_stemmanization

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X_train_processed = X.copy()
        X_train_processed['text'] = X_train_processed['text'].apply(
            lambda x: self.custom_preprocess_function(x, lem_tag=self.use_lemmatization, stem_tag=self.use_stemmanization))
        return X_train_processed

## We add some new pipelines 
* those use the combined features with either CountVectorizer or TfidfVectorizer
* they use both lemmatization and stemming
* they use a OneHotEncoder for the custom features

In [69]:
# Pre-processing pipelines
custom_preprocess = preprocess_advanced

def create_advanced_pipelines(custom_preprocess, classifier, custom_feautures : list ):
    pipelines = {
        f'CountVectorizer with preprocessing function {custom_preprocess.__name__} and no Lemmatization nor stem with custom features: {custom_feautures} and classifier {classifier}': Pipeline([
            ('preprocessor',  TextPreprocessorAdvanced(custom_preprocess_function=custom_preprocess, use_stemmanization=False, use_lemmatization=False)),
            ('features', CombinedFeatures(CountVectorizer(), custom_feautures)),
            ('classifier', classifier)
        ]),
        f'TfidfVectorizer with preprocessing function {custom_preprocess.__name__} and no Lemmatization nor stem with custom features: {custom_feautures} and classifier {classifier}': Pipeline([
            ('preprocessor', TextPreprocessorAdvanced(custom_preprocess_function=custom_preprocess, use_stemmanization=False, use_lemmatization=False)),
            ('features', CombinedFeatures(TfidfVectorizer(),custom_feautures)),
            ('classifier', classifier)
        ]),
        f'CountVectorizer with preprocessing function {custom_preprocess.__name__} and Lemmatization and stem with custom features: {custom_feautures} and classifier {classifier}': Pipeline([
            ('preprocessor',  TextPreprocessorAdvanced(custom_preprocess_function=custom_preprocess, use_stemmanization=True, use_lemmatization=True)),
            ('features', CombinedFeatures(CountVectorizer(), custom_feautures)),
            ('classifier', classifier)
        ]),
        f'TfidfVectorizer with preprocessing function {custom_preprocess.__name__} and Lemmatization and stem with custom features: {custom_feautures} and classifier {classifier}': Pipeline([
            ('preprocessor', TextPreprocessorAdvanced(custom_preprocess_function=custom_preprocess, use_stemmanization=True, use_lemmatization=True)),
            ('features', CombinedFeatures(TfidfVectorizer(), custom_feautures)),
            ('classifier', classifier)
        ])
    }   
    return pipelines


## Run and evaluate the advanced pipelines 
* with naive bayes as well as FNN
* use different feature_list combinations

In [17]:
report_file_name = 'classification_report_2.csv'
# remove file
if os.path.isfile("../reports/run_without_ngrams_modified_pre_basic/" + report_file_name):
    os.remove("../reports/run_without_ngrams_modified_pre_basic/" + report_file_name)

# Train and evaluate models with different pipelines
for classifier in [MultinomialNB(), MLPClassifier(hidden_layer_sizes=(50,))]:
    for feature_list in [['uppercase_amount', 'comment_length'], ['comment_length'], ['uppercase_amount']]:
        for preprocess_function in [preprocess_advanced, preprocess_basic]:
            pipelines = create_advanced_pipelines(preprocess_function, classifier, feature_list)
            for name, pipeline in pipelines.items():
                print(f"Evaluating {name}...")
                report_df = train_and_evaluate(train_df_balanced, test_df, pipeline)
                # enrich df with the name of the pipeline, and preprocess function
                report_df['pipeline'] = name
                report_df['classifier'] = pipeline.named_steps['classifier'].__class__.__name__
                report_df['vectorizer'] = pipeline.named_steps['features'].text_transformer
                report_df['features'] = pipeline.named_steps['features'].__class__.__name__
                report_df['preprocess_function'] = preprocess_function.__name__
                report_df['preprocess_function_uses_lemmatization'] = pipeline.named_steps['preprocessor'].use_lemmatization
                report_df['preprocess_function_uses_stemmanization'] = pipeline.named_steps['preprocessor'].use_stemmanization
                report_df['feature_list'] = json.dumps(feature_list)
                save_results_to_csv(report_df, file_name=report_file_name)

Evaluating CountVectorizer with preprocessing function preprocess_advanced and no Lemmatization nor stem with custom features: ['uppercase_amount', 'comment_length'] and classifier MultinomialNB()...
Index(['text', 'comment_length', 'uppercase_amount'], dtype='object')
Index(['text', 'comment_length', 'uppercase_amount'], dtype='object')
Index(['text'], dtype='object')
f1 score CountVectorizer(): 0.8459123319635202
Evaluating TfidfVectorizer with preprocessing function preprocess_advanced and no Lemmatization nor stem with custom features: ['uppercase_amount', 'comment_length'] and classifier MultinomialNB()...
Index(['text', 'comment_length', 'uppercase_amount'], dtype='object')
Index(['text', 'comment_length', 'uppercase_amount'], dtype='object')


KeyboardInterrupt: 

In [29]:
train_df_balanced[['uppercase_amount', 'comment_length']]

Unnamed: 0,uppercase_amount,comment_length
1281,neutral,average
3392,low,long
3501,high,short
1718,low,average
2162,high,short
...,...,...
1579,low,long
1066,high,average
1173,low,average
2757,neutral,long
