<a href="https://colab.research.google.com/github/chandrajitpal/DUT_textclassification/blob/main/DUT_binarisation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# Step 1: Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score

# Step 2: Load and Prepare the Data
def prepare_dataset(raw_df, text_column='Abstract', label_column='Yes/No'):
    """Return the cleaned text/label columns plus an integer ``label`` column.

    Rows where either the text or the label is missing are dropped. Labels
    are compared with surrounding whitespace removed and case ignored, so
    'Yes', 'yes' and ' YES ' all become 1 and the 'No' variants become 0.
    Rows whose label is anything else are dropped with a printed warning
    instead of being turned into NaN, which would otherwise only fail later
    inside the train/test split or the classifier.

    Args:
        raw_df: DataFrame as loaded from the CSV file.
        text_column: Name of the column holding the abstract text.
        label_column: Name of the column holding the 'Yes'/'No' label.

    Returns:
        A new DataFrame with ``text_column``, ``label_column`` (original
        spelling preserved) and an int64 ``label`` column (1 = Yes, 0 = No).

    Raises:
        ValueError: If ``text_column`` or ``label_column`` is not in ``raw_df``.
    """
    absent = [col for col in (text_column, label_column) if col not in raw_df.columns]
    if absent:
        raise ValueError(
            f"Missing required column(s) {absent}; found {list(raw_df.columns)}"
        )

    # Explicit copy: the 'label' column added below must never write into
    # (or warn about writing into) a slice of the caller's frame.
    cleaned = raw_df[[text_column, label_column]].dropna().copy()

    # Series.map leaves NaN for keys it does not know, so normalise first.
    normalised = cleaned[label_column].astype(str).str.strip().str.casefold()
    cleaned['label'] = normalised.map({'yes': 1, 'no': 0})

    unrecognised = cleaned['label'].isna()
    if unrecognised.any():
        bad_values = sorted(cleaned.loc[unrecognised, label_column].astype(str).unique())
        print(
            f"Warning: dropping {int(unrecognised.sum())} row(s) "
            f"with unrecognised labels: {bad_values}"
        )
        cleaned = cleaned.loc[~unrecognised].copy()

    # With no NaN left the column can be a proper integer dtype.
    cleaned['label'] = cleaned['label'].astype('int64')
    return cleaned


# Defined before the try block so the FileNotFoundError handler can report
# exactly the path that was attempted.
file_path = 'Train_Binary_22July2025.xlsx - Binary classification.csv'

try:
    # Load the dataset from the CSV file
    df = pd.read_csv(file_path)

    print("Dataset loaded successfully. Here are the first 5 rows:")
    print(df.head())
    print("\nDataset Information:")
    df.info()

    # --- Data Cleaning and Preprocessing ---
    # Keep only 'Abstract' and 'Yes/No', drop incomplete rows, and encode the
    # target as 1 (Yes) / 0 (No) in a new 'label' column.
    df_clean = prepare_dataset(df)

    # Define our features (X) and target (y)
    X = df_clean['Abstract']
    y = df_clean['label']

    print(f"\nNumber of samples after cleaning: {len(df_clean)}")
    print(f"Class distribution:\n{df_clean['Yes/No'].value_counts()}")

    # Step 3: Split the data into training and testing sets
    # 80% for training, 20% for testing.
    # random_state fixes the shuffle so the split is reproducible;
    # stratify=y keeps the Yes/No ratio the same in both parts.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    print(f"\nTraining set size: {len(X_train)}")
    print(f"Testing set size: {len(X_test)}")

    # Step 4: Build the AI Model Pipeline
    # A pipeline chains together multiple steps. Here, it will:
    # 1. Convert text to TF-IDF vectors (unigrams + bigrams, English stop
    #    words removed).
    # 2. Train an SGD Classifier (hinge loss, L2 penalty) on these vectors.
    #    tol=None disables the tolerance-based stopping rule, so training
    #    is bounded by max_iter.
    model_pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(stop_words='english', ngram_range=(1, 2))),
        ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42, max_iter=50, tol=None)),
    ])

    # Step 5: Train the AI Model
    print("\nTraining the model...")
    model_pipeline.fit(X_train, y_train)
    print("Training complete!")

    # Step 6: Evaluate the Model
    print("\nEvaluating model performance on the test set...")
    y_pred = model_pipeline.predict(X_test)

    # Calculate and print the accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f"\nModel Accuracy: {accuracy:.2%}")

    # Print a detailed classification report (precision, recall, f1-score)
    print("\nClassification Report:")
    # labels=[0, 1] pins the row order to match target_names and keeps the
    # report working even if one class is absent from the test split.
    print(classification_report(y_test, y_pred, labels=[0, 1], target_names=['No', 'Yes']))

    # Step 7: Create a Prediction Function
    def predict_abstract(abstract_text):
        """
        Takes a new abstract text as input and predicts if it's 'Yes' or 'No'.

        The text is wrapped in a one-element list because the pipeline
        expects an iterable of documents; the single numeric prediction
        (1 or 0) is translated back to 'Yes' or 'No'.
        """
        # The model pipeline handles both vectorization and prediction
        prediction = model_pipeline.predict([abstract_text])

        # Convert the numerical prediction back to a human-readable label
        return 'Yes' if prediction[0] == 1 else 'No'

    # --- Example Usage of the Predictor ---
    print("\n--- Testing the Predictor with New Examples ---")

    # Example 1 (likely 'Yes' based on keywords like 'well-being' and 'climate change')
    abstract1 = "This study examines the direct impact of climate change on the socio-economic well-being of coastal communities. We analyze how rising sea levels affect health and livelihood."
    print(f"\nAbstract: '{abstract1}'")
    print(f"Prediction: {predict_abstract(abstract1)}")

    # Example 2 (likely 'No' as it's not directly about climate change and well-being)
    abstract2 = "We present a new algorithm for data compression using neural networks. The focus is on computational efficiency and lossless compression ratios for large datasets."
    print(f"\nAbstract: '{abstract2}'")
    print(f"Prediction: {predict_abstract(abstract2)}")


except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found.")
    print("Please make sure the CSV file is in the same directory as this script.")
except Exception as e:
    # Notebook-level boundary: report the problem instead of dumping a
    # traceback into the cell output.
    print(f"An error occurred: {e}")

