In [2]:
import pandas as pd
import numpy as np
import os
from machine_learning_classifiers import TextClassifier

#____________________________________________________________
# Load synthetic data
#
# Reads the synthetic vaccination articles and turns the 'Changed_article'
# column into a training frame with columns (text, target, synthetic).

# The data folder is resolved relative to the working directory: it is
# expected at <cwd>/../../data, so the notebook must be started from its
# own directory for the path to resolve.
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
two_steps_back = os.path.dirname(parent_dir)
data_dir = os.path.join(two_steps_back, 'data')
path = os.path.join(data_dir, 'vaccination_synthetic_articles_combined.csv')

df_new = pd.read_csv(path)

# Changed (synthetic) articles: drop rows without text, then attach the
# constant label 0 and mark every row as synthetic.
df_changed = (
    df_new[['Changed_article']]
    .dropna()
    .assign(
        text=lambda frame: frame['Changed_article'],
        target=0,
        synthetic=True,
    )
)

# Keep only the modelling columns (no second frame is merged in here).
synthetic_columns = ['text', 'target', 'synthetic']
df_synthetic_train = df_changed[synthetic_columns]




# ____________________________________________________________
# Load the MMCoVaR News Dataset

path = os.path.join(two_steps_back, 'data', 'MMCoVaR_News_Dataset.csv')
df_MMCoVaR = pd.read_csv(path)

# Build the real-article training frame: 'body_text' becomes the text and
# the dataset's own 'reliability' column becomes the target (the label is
# taken from the data, it is not a constant).
# NOTE(review): presumably reliability is 0/1 with 1 = reliable -- confirm.
# NOTE(review): unlike the synthetic frame, rows with missing text or label
# are not dropped here -- confirm the CSV has none.
df_MMCoVaR_train = (
    df_MMCoVaR[['body_text', 'reliability']]
    .rename(columns={'body_text': 'text', 'reliability': 'target'})
    .assign(synthetic=False)
)

# Stack synthetic and real articles row-wise under a fresh 0..n-1 index.
df = pd.concat(
    [df_synthetic_train, df_MMCoVaR_train],
    axis=0,
    ignore_index=True,
)




#____________________________________________________________
# --- Running the Machine Learning Pipeline ---
#
# Two runs over the same data with identical settings; the only difference
# is the use_ngrams flag.
# NOTE(review): both runs use df_MMCoVaR_train only; the combined `df`
# (synthetic + MMCoVaR) is not passed to the classifier here -- confirm
# this is the intended baseline.

# Settings shared by both runs.
shared_settings = {
    'text_column': 'text',
    'target_column': 'target',
    'test_size': 0.2,
    'random_state': 42,
}

# Run 1: n-grams disabled.
classifier = TextClassifier(df_MMCoVaR_train, use_ngrams=False, **shared_settings)
classifier.run_analysis()

# Run 2: n-grams enabled.
print("\n\n\n Running analysis with ngrams enabled \n\n\n")
classifier_ng = TextClassifier(df_MMCoVaR_train, use_ngrams=True, **shared_settings)
classifier_ng.run_analysis()




Logistic Regression with TF-IDF
              precision    recall  f1-score   support

           0       0.91      0.83      0.87       192
           1       0.90      0.95      0.93       327

    accuracy                           0.91       519
   macro avg       0.91      0.89      0.90       519
weighted avg       0.91      0.91      0.90       519

Confusion Matrix:
 [[159  33]
 [ 16 311]]
AUC: 0.9665042048929663

Naive Bayes with TF-IDF
              precision    recall  f1-score   support

           0       1.00      0.16      0.28       192
           1       0.67      1.00      0.80       327

    accuracy                           0.69       519
   macro avg       0.84      0.58      0.54       519
weighted avg       0.79      0.69      0.61       519

Confusion Matrix:
 [[ 31 161]
 [  0 327]]
AUC: 0.9067278287461774

Logistic Regression with Word2Vec
              precision    recall  f1-score   support

           0       0.72      0.61      0.66       192
           1 