# In [None]:
# The model uses the Naïve Bayes (NB) algorithm, but it exhibits a bias toward Human-generated outcomes, probably due to the different dataset.
# The accuracy of the model is 0.97.
# Running `streamlit run ai_generated_text` turns the model into an application.

import streamlit as st
import numpy as np
import pandas as pd
import string
import os
import nltk
from nltk.corpus import words
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Read the training data; the relative path is resolved against the current
# working directory.
df = pd.read_csv('train_from_LLM-Detect_AI-GT_1MNB-3SGD.csv')

# A row is only usable when both the text and its label are present, so rows
# missing either one are dropped in place.
required_columns = ['text', 'RDizzl3_seven']
df.dropna(subset=required_columns, inplace=True)

# Store the label as integers (the raw values are presumably booleans --
# TODO confirm against the CSV).
labels_as_int = df['RDizzl3_seven'].astype(int)
df['RDizzl3_seven'] = labels_as_int

def shuffle_data(df):
    """Return the rows of ``df`` in a reproducible random order.

    Every row is sampled exactly once (``frac=1``) with a fixed seed, and the
    result gets a fresh 0..n-1 index; the previous index is discarded.
    """
    reordered = df.sample(frac=1, random_state=42)
    return reordered.reset_index(drop=True)

# Randomise the row order once up front; shuffle_data uses a fixed seed, so
# the resulting order is the same on every run.
df = shuffle_data(df)

# Translation table applied in a single pass: newlines become spaces and every
# ASCII punctuation character (apostrophes included) is deleted.
_CLEAN_TABLE = str.maketrans({'\n': ' ', **dict.fromkeys(string.punctuation)})


def clean_text(text) -> str:
    """Normalise raw text before it is vectorised.

    Newlines are replaced by spaces, all characters in ``string.punctuation``
    are removed, and the result is lower-cased.

    Args:
        text: The value to clean. Anything that is not a ``str`` (for example
            ``None`` or a NaN float) is treated as empty text.

    Returns:
        The cleaned, lower-cased string, or ``""`` for non-string input.
    """
    if not isinstance(text, str):
        return ""
    # str.translate does the replacement and the deletions in one pass instead
    # of chained replace() calls plus a per-character Python loop.
    return text.translate(_CLEAN_TABLE).lower()

# Clean every training text with clean_text, the same function that is applied
# to the user's input before prediction.
df['text'] = df['text'].apply(clean_text)

# Make sure the NLTK 'words' corpus is available. Streamlit re-executes this
# script on every interaction, so the downloader is only invoked when the
# corpus cannot be found locally instead of unconditionally on each run.
try:
    nltk.data.find('corpora/words')
except LookupError:
    nltk.download('words')

# Set of the corpus entries, for constant-time membership checks.
english_words = set(words.words())

def is_spelled_correctly(word):
    """Report whether ``word`` appears verbatim in the ``english_words`` set.

    The comparison is an exact, case-sensitive membership test; no
    normalisation is applied to ``word``.
    """
    found = word in english_words
    return found

# Features are the cleaned texts; the target is the integer label column.
X = df['text']
y = df['RDizzl3_seven']

# Hold out 30% of the rows for evaluation. The split is seeded for
# repeatability and stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Streamlit re-runs this whole script on every widget interaction. Wrapping the
# training step in st.cache_resource means the model is fitted once per server
# process and later reruns reuse the same fitted pipeline object.
# NOTE: the cached model is not refreshed if the training CSV changes while the
# server is running; clear the cache or restart to retrain.
@st.cache_resource
def _train_pipeline():
    """Build and fit the text classification pipeline.

    The pipeline counts tokens (English stop words removed, accents stripped),
    re-weights the counts with TF-IDF, and fits a Multinomial Naive Bayes
    classifier on the module-level ``X_train`` / ``y_train`` split.

    Returns:
        The fitted ``Pipeline``.
    """
    model = Pipeline([
        ('count_vectorizer', CountVectorizer(stop_words='english', lowercase=True, strip_accents='unicode')),
        ('tfidf_transformer', TfidfTransformer()),
        ('naive_bayes', MultinomialNB()),
    ])
    model.fit(X_train, y_train)
    return model


# Train model (or fetch the already-fitted one from Streamlit's cache)
pipeline = _train_pipeline()

# Streamlit UI: a title, a free-text input box, and a button that triggers
# classification of whatever is currently in the box.
st.title("AI vs Human Text Detector")
user_input = st.text_area("Enter a text to check if it is AI or human-generated:")

if st.button("Analyze"):
    # Apply the same cleaning that was applied to the training texts.
    user_input_cleaned = clean_text(user_input)
    if not user_input_cleaned.strip():
        # Nothing left after cleaning (empty box or punctuation only): the
        # vectorizer would produce an all-zero row and the classifier would
        # answer from its class priors alone, so ask for real input instead.
        st.warning("Please enter some text to analyze.")
    else:
        prediction = pipeline.predict([user_input_cleaned])[0]
        # Label 1 is reported as AI-generated, anything else as human.
        st.write("The text is:", "AI-generated" if prediction == 1 else "Human-generated")

# Score the held-out test split with the fitted pipeline.
y_pred = pipeline.predict(X_test)
test_report = classification_report(y_test, y_pred)

# Show the classification report as fixed-width text under its own heading.
st.subheader("Model Performance on Test Data:")
st.text(test_report)