In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score

In [None]:
def print_evaluation_metrics(y_true, y_pred, y_scores=None):
    """
    Print the evaluation metrics of a binary classifier.

    Printed values: confusion-matrix counts (TP, TN, FP, FN), accuracy,
    ROC-AUC (only when ``y_scores`` is given) and the false alarm rate
    FP / (FP + TN).

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.
    y_scores : array-like, optional
        Scores passed to ``roc_auc_score``; when omitted, ROC-AUC is skipped.

    Raises
    ------
    ValueError
        If ``y_true`` / ``y_pred`` together do not contain exactly two classes,
        because TP/TN/FP/FN are only defined for a 2x2 confusion matrix.
    """

    matrix = confusion_matrix(y_true, y_pred)
    # Fail with a readable message instead of an opaque "values to unpack" error
    # when a class is missing from the data or more than two classes are present.
    if matrix.shape != (2, 2):
        raise ValueError(
            "print_evaluation_metrics expects exactly two classes, "
            f"but the confusion matrix has shape {matrix.shape}"
        )
    tn, fp, fn, tp = matrix.ravel()

    print(f"True Positives (TP): {tp}")
    print(f"True Negatives (TN): {tn}")
    print(f"False Positives (FP): {fp}")
    print(f"False Negatives (FN): {fn}")

    accuracy = accuracy_score(y_true, y_pred)
    print(f"Accuracy: {accuracy:.4f}")

    if y_scores is not None:
        roc_auc = roc_auc_score(y_true, y_scores)
        print(f"ROC-AUC: {roc_auc:.4f}")

    # With no negative samples the rate is undefined: report nan explicitly
    # rather than dividing by zero.
    negatives = fp + tn
    false_alarm_rate = fp / negatives if negatives else float("nan")
    print(f"False Alarm Rate (FAR): {false_alarm_rate:.4f}")


def select_classifier(choice):
    """
    Return a new, unfitted classifier for the given key.

    Supported keys (case-insensitive, surrounding whitespace ignored):
    'random_forest', 'gradient_boosting', 'logistic_regression', 'svm', 'knn'.
    To support another model, add a new entry to ``factories``.

    Parameters
    ----------
    choice : str
        Key of the classifier to build.

    Returns
    -------
    estimator
        A freshly constructed scikit-learn classifier.

    Raises
    ------
    ValueError
        If ``choice`` is not one of the supported keys. Previously the string
        "Invalid choice" was returned and only failed later inside the Pipeline.
    """
    # Factories (not instances) so only the requested estimator is constructed.
    factories = {
        'random_forest': lambda: RandomForestClassifier(criterion='gini', random_state=42),
        'gradient_boosting': lambda: GradientBoostingClassifier(random_state=42),
        'logistic_regression': lambda: LogisticRegression(max_iter=1000, random_state=42),
        # Other kernel options: 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'.
        # probability=True is what gives SVC a predict_proba method.
        'svm': lambda: SVC(kernel="sigmoid", probability=True, random_state=42),
        'knn': lambda: KNeighborsClassifier(),
    }

    key = choice.strip().lower()
    if key not in factories:
        valid = ", ".join(sorted(factories))
        raise ValueError(f"Invalid choice {choice!r}; expected one of: {valid}")
    return factories[key]()

In [None]:
# Load the dataset into a DataFrame.
# NOTE(review): absolute path (presumably a Colab session directory) - the CSV must
# already exist at this location before the cell is run.
data = pd.read_csv(filepath_or_buffer='/content/gemini.csv')

In [None]:
# Display the column labels of the loaded frame.
data.columns

Index(['Statement', 'ID', 'Validity', 'True_fleshes', 'True_smog_index',
       'True_flesch_kincaid_grade', 'True_coleman_liau_index',
       'True_automated_readability_index', 'True_dale_chall_readability_score',
       'True_difficult_words', 'True_linsear_write_formula',
       'True_difficult_words.1', 'True_gunning_fog', 'True_text_standard',
       'Characters', 'Words', 'Word_Length', 'Number_Count',
       'Positive_Sentiment', 'Neutral_Sentiment', 'Negative_Sentiment',
       'Positive_Stance', 'Negative_Stance', 'Neutral_Stance'],
      dtype='object')

In [None]:
# Remove 'True_text_standard' so it is not picked up when the numeric feature list
# is derived from data.columns further down.
# Reassigning with errors='ignore' (instead of inplace=True) keeps the cell safe to
# re-run: a second execution no longer raises KeyError for the missing column.
data = data.drop(columns=['True_text_standard'], errors='ignore')

In [None]:
# Encode the target column as integers: TRUE -> 1, FALSE -> 0.
# '1' / '0' are accepted too, so re-running this cell on already-encoded data
# is a no-op instead of turning the labels into the strings '1' / '0'.
validity_codes = {'TRUE': 1, 'FALSE': 0, '1': 1, '0': 0}
validity_text = data['Validity'].astype(str).str.strip().str.upper()

# Fail loudly on anything that is not a recognised label (e.g. missing values),
# rather than letting stray strings flow into the classifier as targets.
unexpected_labels = sorted(set(validity_text.unique()) - set(validity_codes))
if unexpected_labels:
    raise ValueError(f"Unexpected values in 'Validity': {unexpected_labels}")

# .map (rather than .replace with a dict) avoids pandas' downcasting FutureWarning
# and yields a proper integer dtype.
data['Validity'] = validity_text.map(validity_codes).astype('int64')

# Features are every column except the target.
X = data.drop(columns='Validity')
y = data['Validity']

  data['Validity'] = data['Validity'].str.strip().str.upper().replace({'TRUE': 1, 'FALSE': 0})


In [None]:
# Show the number of rows per class to check how balanced the target is.
y.value_counts()

Unnamed: 0_level_0,count
Validity,Unnamed: 1_level_1
1,540
0,540


In [None]:
# Column groups used by the preprocessing step.
text_features = ['Statement']

# Numeric columns to standard-scale: every column except the raw text, the target,
# the identifier and the 'True_*' readability-style columns listed below.
# Index.drop raises KeyError if any listed label is absent from the frame.
numeric_features = list(
    data.columns.drop([
        'Statement', 'Validity', 'ID',
        'True_fleshes',
        'True_smog_index',
        'True_flesch_kincaid_grade',
        'True_coleman_liau_index',
        'True_automated_readability_index',
        'True_dale_chall_readability_score',
        'True_difficult_words',
        'True_linsear_write_formula',
        'True_difficult_words.1',
        'True_gunning_fog',
    ])
)

# No categorical columns are one-hot encoded at the moment.
categorical_features = []

In [None]:
# Display the columns that will be standard-scaled.
numeric_features

['Characters',
 'Words',
 'Word_Length',
 'Number_Count',
 'Positive_Sentiment',
 'Neutral_Sentiment',
 'Negative_Sentiment',
 'Positive_Stance',
 'Negative_Stance',
 'Neutral_Stance']

In [None]:
# Column-wise preprocessing:
#   * 'Statement'           -> TF-IDF (the column is named by a plain string so the
#                              vectorizer receives 1-D text, as it requires)
#   * numeric_features      -> standardized
#   * categorical_features  -> one-hot encoded; handle_unknown='ignore' so a category
#                              unseen during fit does not raise at predict time
#   * 'ID'                  -> dropped explicitly: with remainder='passthrough' it would
#                              otherwise reach the classifier as an unscaled feature,
#                              and a row identifier must not be a model input
#   * every other column    -> passed through without any transformation
# NOTE(review): dropping 'ID' changes the model inputs, so metrics recorded with the
# previous configuration are not directly comparable.
preprocessor = ColumnTransformer(
    transformers=[
        ('text_statement', TfidfVectorizer(stop_words='english'), 'Statement'),
        # To use 'True_text_standard' as a second text column, uncomment the next line
        # and remove the cell near the beginning of the notebook that drops that column.
        # ('text_standard', TfidfVectorizer(stop_words='english'), 'True_text_standard'),
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('drop_id', 'drop', ['ID']),
    ],
    remainder='passthrough',
)

In [None]:
# Pick the estimator by key and chain it after the shared preprocessing step, so the
# same transformations are applied at fit and at predict time.
classifier = select_classifier("knn")

pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', classifier),
    ]
)



In [None]:
# Hold out 30% of the rows for evaluation; the fixed random_state keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=404,
)


In [None]:
# Fit the preprocessing steps and the classifier on the training split only.
pipeline.fit(X=X_train, y=y_train)


In [None]:
# Predict labels for the held-out rows and print per-class precision / recall / F1.
predictions = pipeline.predict(X_test)
print(classification_report(y_true=y_test, y_pred=predictions))

              precision    recall  f1-score   support

           0       0.78      0.77      0.78       162
           1       0.77      0.78      0.78       162

    accuracy                           0.78       324
   macro avg       0.78      0.78      0.78       324
weighted avg       0.78      0.78      0.78       324



In [None]:
# ROC-AUC needs scores, which are only available when the pipeline exposes
# predict_proba; without it, report the label-based metrics only.
if not hasattr(pipeline, "predict_proba"):
    print_evaluation_metrics(y_test, predictions)
else:
    # Column 1 of predict_proba is used as the score of the second class.
    y_scores = pipeline.predict_proba(X_test)[:, 1]
    print_evaluation_metrics(y_test, predictions, y_scores)

True Positives (TP): 127
True Negatives (TN): 125
False Positives (FP): 37
False Negatives (FN): 35
Accuracy: 0.7778
ROC-AUC: 0.8457
False Alarm Rate (FAR): 0.2284


GPT data:


1. KNN model:
True Positives (TP): 156
True Negatives (TN): 150
False Positives (FP): 12
False Negatives (FN): 6
Accuracy: 0.9444
ROC-AUC: 0.9804
False Alarm Rate (FAR): 0.0741

2. RF model:
True Positives (TP): 161
True Negatives (TN): 161
False Positives (FP): 1
False Negatives (FN): 1
Accuracy: 0.9938
ROC-AUC: 0.9994
False Alarm Rate (FAR): 0.0062

GEMINI data:
1. KNN model:
True Positives (TP): 155
True Negatives (TN): 156
False Positives (FP): 6
False Negatives (FN): 7
Accuracy: 0.9599
ROC-AUC: 0.9793
False Alarm Rate (FAR): 0.0370

2. RF model:

True Positives (TP): 160
True Negatives (TN): 161
False Positives (FP): 1
False Negatives (FN): 2
Accuracy: 0.9907
ROC-AUC: 0.9999
False Alarm Rate (FAR): 0.0062


Human data:
1. KNN model:
True Positives (TP): 154
True Negatives (TN): 151
False Positives (FP): 11
False Negatives (FN): 8
Accuracy: 0.9414
ROC-AUC: 0.9790
False Alarm Rate (FAR): 0.0679

2. RF model:
True Positives (TP): 158
True Negatives (TN): 162
False Positives (FP): 0
False Negatives (FN): 4
Accuracy: 0.9877
ROC-AUC: 0.9999
False Alarm Rate (FAR): 0.0000
