In [None]:
import pandas as pd
import re
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
import nltk

# Fetch the NLTK corpora used later for stop-word filtering and lemmatization
for nltk_resource in ('stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Load dataset: the file has no header row, so the first line is read as data
df = pd.read_csv("dataset.csv", header=None)

# Parse the text and label using regex.
# Compiled once: optional opening quote, lazily captured text, optional closing
# quote, a colon, the literal True/False, and an optional trailing comma.
_ROW_PATTERN = re.compile(r"""["']?(.*?)["']?:\s*(True|False),?$""")


def extract_text_and_label(row):
    """Split one raw dataset row into its text and boolean label.

    Args:
        row: An indexable row whose first element holds the raw
            ``text: True|False`` entry.

    Returns:
        A dict with keys ``"text"`` (whitespace-stripped string) and
        ``"label"`` (bool), or ``None`` when the first element is not a
        string or does not match the expected pattern.
    """
    raw = row[0]
    # pandas represents blank cells as float NaN; treat any non-string cell
    # as an unparseable row instead of crashing on ``.strip()``.
    if not isinstance(raw, str):
        return None
    match = _ROW_PATTERN.match(raw.strip())
    if match is None:
        return None
    text, label = match.groups()
    return {"text": text.strip(), "label": label == "True"}

# Parse each row exactly once; filter(None, ...) drops the rows for which
# extract_text_and_label returned None (no pattern match).
parsed_rows = list(filter(None, map(extract_text_and_label, df.values)))
clean_df = pd.DataFrame(parsed_rows)

# Preprocess text
# A set gives constant-time membership checks during stop-word filtering.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    """Normalize raw text into a space-joined string of lemmatized tokens.

    The input is lowercased, every character in ``string.punctuation`` is
    deleted, and the result is split on whitespace. Tokens found in the
    module-level ``stop_words`` are discarded; the remaining tokens are
    lemmatized with the module-level ``lemmatizer``.

    Args:
        text: The string to clean.

    Returns:
        The surviving lemmatized tokens joined by single spaces.
    """
    punctuation_remover = str.maketrans("", "", string.punctuation)
    lowered = text.lower().translate(punctuation_remover)

    kept_tokens = []
    for token in lowered.split():
        if token in stop_words:
            continue
        kept_tokens.append(lemmatizer.lemmatize(token))
    return " ".join(kept_tokens)

clean_df["clean_text"] = clean_df["text"].apply(preprocess)
y = clean_df["label"]

# Train-test split on the cleaned text, BEFORE vectorizing, so that nothing
# learned from the test rows can influence the features. The classes are
# imbalanced, so stratify to keep the same label ratio in both folds.
text_train, text_test, y_train, y_test = train_test_split(
    clean_df["clean_text"], y, test_size=0.2, random_state=42, stratify=y
)

# TF-IDF vectorization: vocabulary and IDF weights are fitted on the training
# text only; the test text is merely transformed with them.
vectorizer = TfidfVectorizer(max_features=3000)
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)
# Full-corpus matrix kept under its original name, built with the
# train-fitted vectorizer.
X = vectorizer.transform(clean_df["clean_text"])

# Apply SMOTE to training data only; the test fold keeps its real distribution
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Train XGBoost model. Class imbalance is addressed by the SMOTE oversampling
# above, not by class weights. `use_label_encoder` was dropped because
# XGBoost reports it as an unused parameter.
model = XGBClassifier(eval_metric='logloss')
model.fit(X_resampled, y_resampled)

# Predict probabilities of the positive class on the test set
y_probs = model.predict_proba(X_test)[:, 1]

# Threshold tuning: choose 0.4 instead of 0.5
# NOTE(review): 0.4 is hard-coded; if it was picked by looking at test-set
# metrics, re-select it on a separate validation fold.
y_pred_thresh = (y_probs >= 0.4).astype(int)

# Classification report
print(classification_report(y_test, y_pred_thresh))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Parameters: { "use_label_encoder" } are not used.



              precision    recall  f1-score   support

       False       0.90      0.89      0.90       113
        True       0.48      0.50      0.49        22

    accuracy                           0.83       135
   macro avg       0.69      0.70      0.69       135
weighted avg       0.83      0.83      0.83       135

