In [1]:
import sys
sys.path.append('..')
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import argparse
from tqdm import tqdm
import numpy as np
import torch
import torchvision.transforms.functional as TF
from PIL import Image

from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report
from rouge_score import rouge_scorer
from sklearn.preprocessing import StandardScaler
import ast


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load CSV files for train, validation, and test datasets
df = pd.read_csv('/media/manhdd5/4T/Manhdd/SOSum_summarization/data/train_dataset.csv')
df['answer_body'] = df['answer_body'].str.replace('[©¥¢]', '', regex=True)
df['truth'] = df['truth'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) and x != '' else [])

val_df = pd.read_csv('/media/manhdd5/4T/Manhdd/SOSum_summarization/data/validation_dataset.csv')
val_df['answer_body'] = val_df['answer_body'].str.replace('[©¥¢]', '', regex=True)
val_df['truth'] = val_df['truth'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) and x != '' else [])

test_df = pd.read_csv('/media/manhdd5/4T/Manhdd/SOSum_summarization/data/test_dataset.csv')
test_df['answer_body'] = test_df['answer_body'].str.replace('[©¥¢]', '', regex=True)
test_df['truth'] = test_df['truth'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) and x != '' else [])


In [3]:
# Initialize BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

In [4]:
# Function to encode sentences using BERT
def encode_sentence(sentence):
    inputs = tokenizer(sentence, return_tensors='pt', padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = bert_model(**inputs)
    return outputs.pooler_output.squeeze(0).numpy()

In [5]:
# Preprocessing step: Encode all sentences into embeddings
def preprocess_data(dataframe):
    sentences = []
    labels = []
    embeddings = []
    
    for idx, row in dataframe.iterrows():
        answer_sentences = row['answer_body'].split('.')
        truth_indices = row['truth']
        for i, sentence in enumerate(answer_sentences):
            if sentence.strip():  # Skip empty sentences
                sentences.append(sentence.strip())
                labels.append(1 if i in truth_indices else 0)  # Label sentences
                embeddings.append(encode_sentence(sentence.strip()))
                
    return np.array(embeddings), np.array(labels), sentences

In [6]:
# # Preprocess the data
# X, y = preprocess_data(df)

# # Split the dataset into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocess the data for train, validation, and test datasets
X_train, y_train, train_sentences = preprocess_data(df)
X_val, y_val, val_sentences = preprocess_data(val_df)
X_test, y_test, test_sentences = preprocess_data(test_df)

# Scaling features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

In [7]:
# Define classifiers
classifiers = {
    "Random Forest": RandomForestClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Logistic Regression": LogisticRegression(),
    "AdaBoost": AdaBoostClassifier(),
    "Naive Bayes": GaussianNB(),
    "MLP Classifier": MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42)
}

In [8]:
# Train and evaluate each classifier on train set
for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    y_pred_val = clf.predict(X_val)
    y_pred_test = clf.predict(X_test)
    
    print(f"Classifier: {name}")
    
    print("\nValidation Set Results")
    print(f"F1 Score: {f1_score(y_val, y_pred_val)}")
    print(f"Precision: {precision_score(y_val, y_pred_val)}")
    print(f"Recall: {recall_score(y_val, y_pred_val)}")
    print("Classification Report:")
    print(classification_report(y_val, y_pred_val))
    
    print("\nTest Set Results")
    print(f"F1 Score: {f1_score(y_test, y_pred_test)}")
    print(f"Precision: {precision_score(y_test, y_pred_test)}")
    print(f"Recall: {recall_score(y_test, y_pred_test)}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred_test))
    print("-" * 50)

Classifier: Random Forest

Validation Set Results
F1 Score: 0.39436619718309857
Precision: 0.6461538461538462
Recall: 0.28378378378378377
Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.94      0.84      1482
           1       0.65      0.28      0.39       592

    accuracy                           0.75      2074
   macro avg       0.71      0.61      0.62      2074
weighted avg       0.73      0.75      0.72      2074


Test Set Results
F1 Score: 0.3554006968641115
Precision: 0.6681222707423581
Recall: 0.24208860759493672
Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.95      0.84      1568
           1       0.67      0.24      0.36       632

    accuracy                           0.75      2200
   macro avg       0.71      0.60      0.60      2200
weighted avg       0.73      0.75      0.70      2200

--------------------------------------------------
Classi

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Classifier: Logistic Regression

Validation Set Results
F1 Score: 0.440713536201469
Precision: 0.5817174515235457
Recall: 0.3547297297297297
Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.90      0.83      1482
           1       0.58      0.35      0.44       592

    accuracy                           0.74      2074
   macro avg       0.68      0.63      0.64      2074
weighted avg       0.72      0.74      0.72      2074


Test Set Results
F1 Score: 0.44011684518013633
Precision: 0.5721518987341773
Recall: 0.3575949367088608
Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.89      0.83      1568
           1       0.57      0.36      0.44       632

    accuracy                           0.74      2200
   macro avg       0.67      0.62      0.63      2200
weighted avg       0.72      0.74      0.72      2200

--------------------------------------------------
Cla

In [9]:
# Function to calculate ROUGE scores
def calculate_rouge_scores(pred_summary, true_summary):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(true_summary, pred_summary)
    return scores

# Function to reconstruct the predicted summary based on sentence indices
def get_predicted_summary(sentences, y_pred):
    return ' '.join([sentence for sentence, pred in zip(sentences, y_pred) if pred == 1])

# Function to get true summary based on ground truth
def get_true_summary(sentences, truth_indices):
    return ' '.join([sentences[i] for i in truth_indices])

In [10]:
# Function to calculate and display precision, recall, F1, ROUGE scores for a dataset
def evaluate_with_rouge(dataset, classifier, dataset_name, X, y, sentences):
    print(f"Evaluating {dataset_name} set with {classifier}")
    y_pred = classifier.predict(X)
    
    # Classification metrics
    f1 = f1_score(y, y_pred)
    precision = precision_score(y, y_pred)
    recall = recall_score(y, y_pred)
    
    print(f"{dataset_name} Set Results")
    print(f"F1 Score: {f1}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    
    # ROUGE metrics
    pred_summary = get_predicted_summary(sentences, y_pred)
    true_summary = get_true_summary(sentences, dataset['truth'][0])  # Assuming 1 row per dataset for simplicity
    
    rouge_scores = calculate_rouge_scores(pred_summary, true_summary)
    
    print(f"ROUGE-1: {rouge_scores['rouge1'].fmeasure}")
    print(f"ROUGE-2: {rouge_scores['rouge2'].fmeasure}")
    print(f"ROUGE-L: {rouge_scores['rougeL'].fmeasure}")
    print("-" * 50)

In [12]:
# Evaluate ROUGE scores for train, validation, and test sets
evaluate_with_rouge(df, classifiers['Random Forest'], "Train", X_train, y_train, train_sentences)
evaluate_with_rouge(val_df, classifiers['Random Forest'], "Validation", X_val, y_val, val_sentences)
evaluate_with_rouge(test_df, classifiers['Random Forest'], "Test", X_test, y_test, test_sentences)

Evaluating Train set with RandomForestClassifier()
Train Set Results
F1 Score: 0.9623042721824859
Precision: 0.9979951884522855
Recall: 0.9290780141843972
ROUGE-1: 0.00013401532241852986
ROUGE-2: 8.934753959212848e-05
ROUGE-L: 0.00013401532241852986
--------------------------------------------------
Evaluating Validation set with RandomForestClassifier()
Validation Set Results
F1 Score: 0.39436619718309857
Precision: 0.6461538461538462
Recall: 0.28378378378378377
ROUGE-1: 0.0033064682785699526
ROUGE-2: 0.0028943560057887122
ROUGE-L: 0.0033064682785699526
--------------------------------------------------
Evaluating Test set with RandomForestClassifier()
Test Set Results
F1 Score: 0.3554006968641115
Precision: 0.6681222707423581
Recall: 0.24208860759493672
ROUGE-1: 0.025229357798165136
ROUGE-2: 0.011473152822395595
ROUGE-L: 0.02064220183486239
--------------------------------------------------
