In [12]:
import pandas as pd
import numpy as np
import string
import re
import nltk
import torch
from tqdm import tqdm

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

from sklearn.feature_extraction.text import TfidfVectorizer

from transformers import DistilBertTokenizer, DistilBertModel

# Downloads
# Fetch the NLTK data packages this notebook uses. nltk logs each package as
# "already up-to-date" when it is present locally, so re-running is cheap.
# 'punkt' / 'punkt_tab' are presumably the tokenizer data behind word_tokenize,
# 'stopwords' backs stopwords.words('english'), and 'wordnet' is presumably
# what WordNetLemmatizer reads — TODO confirm against the installed NLTK version.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# NOTE(review): nltk is already imported at the top of this cell; this second
# import is redundant but harmless.
import nltk
nltk.download('punkt_tab')
import warnings
# Installs a blanket "ignore" filter with no category or module restriction, so
# every Python warning raised after this point is hidden for the session.
# NOTE(review): this also hides warnings from sklearn / transformers — consider
# narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dinesh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dinesh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Dinesh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Dinesh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [3]:
def preprocess_text(text):
    """Normalise a piece of text into a space-joined string of lemmas.

    Steps, in order: lowercase the input, delete every run of digits, delete
    all characters in ``string.punctuation``, tokenize with ``word_tokenize``,
    discard tokens found in NLTK's English stop-word list, lemmatize what is
    left with ``WordNetLemmatizer`` and join the lemmas with single spaces.

    Parameters
    ----------
    text : str
        Raw text to normalise.

    Returns
    -------
    str
        The surviving lemmas separated by single spaces.
    """
    # Character-level clean-up: case, digits, then ASCII punctuation.
    lowered = text.lower()
    without_digits = re.sub(r'\d+', '', lowered)
    punctuation_table = str.maketrans('', '', string.punctuation)
    cleaned = without_digits.translate(punctuation_table)

    # Token-level clean-up: stop-word removal followed by lemmatization.
    raw_tokens = word_tokenize(cleaned)
    english_stops = set(stopwords.words('english'))
    wnl = WordNetLemmatizer()
    lemmas = [
        wnl.lemmatize(tok)
        for tok in raw_tokens
        if tok not in english_stops
    ]
    return ' '.join(lemmas)

def basic_clean(text):
    """Collapse every run of whitespace in ``text`` to a single space and trim the ends.

    Parameters
    ----------
    text : str
        Text to tidy.

    Returns
    -------
    str
        ``text`` with whitespace runs replaced by one space and no leading or
        trailing whitespace.
    """
    single_spaced = re.sub(r'\s+', ' ', text)
    return single_spaced.strip()


In [4]:
def get_bert_embeddings(texts, tokenizer, model, batch_size=16, device=None):
    """Embed texts with the hidden state of each sequence's first token.

    The texts are processed in chunks of ``batch_size``. For every chunk the
    tokenizer output is moved to ``device``, the model is run without gradient
    tracking, and position 0 of ``last_hidden_state`` is kept for each sequence.

    Parameters
    ----------
    texts : sequence of str
        Texts to embed. Sliced positionally; each slice is turned into a list
        before it is handed to the tokenizer.
    tokenizer : callable
        Called as ``tokenizer(batch, padding=True, truncation=True,
        return_tensors="pt", max_length=128)``. Must return a mapping with
        ``'input_ids'`` and ``'attention_mask'`` tensors.
    model : torch.nn.Module
        Called as ``model(input_ids, attention_mask=...)``. Its result must
        expose a rank-3 ``last_hidden_state`` tensor. The model is switched to
        eval mode.
    batch_size : int, default 16
        Number of texts per forward pass. Must be at least 1.
    device : torch.device or str, optional
        Device the input tensors are moved to. When omitted, the device of the
        model's first parameter is used (CPU if the model has no parameters),
        so the function no longer depends on a notebook-level ``device``
        variable having been defined beforehand.

    Returns
    -------
    numpy.ndarray
        Array with one row per text and one column per hidden unit. For empty
        ``texts`` an array with zero rows is returned; its column count is
        ``model.config.hidden_size`` when that attribute exists, otherwise 0.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    if device is None:
        # Follow the model instead of reading a global: inputs must live on the
        # same device as the weights for the forward pass to succeed.
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()

    if len(texts) == 0:
        # np.vstack([]) raises, so hand back an explicit empty result instead.
        hidden = getattr(getattr(model, "config", None), "hidden_size", 0)
        return np.empty((0, hidden), dtype=np.float32)

    embeddings = []
    with torch.no_grad():
        for start in tqdm(range(0, len(texts), batch_size)):
            # list(...) lets callers pass tuples, arrays or Series as well as lists.
            batch = list(texts[start:start + batch_size])
            encodings = tokenizer(batch, padding=True, truncation=True, return_tensors="pt", max_length=128)
            input_ids = encodings['input_ids'].to(device)
            attention_mask = encodings['attention_mask'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            # Index 0 along the sequence axis is the first token of each input.
            cls_embed = outputs.last_hidden_state[:, 0, :]
            embeddings.append(cls_embed.cpu().numpy())

    return np.vstack(embeddings)


In [16]:
# --- Load the labelled complaints ---
input_labelled = pd.read_csv('C:/Users/Dinesh/Git_Hub/HostelFix-AI/data/data_input.csv')
df = input0 = pd.DataFrame(input_labelled)

# --- Target column: swap in 'needs_spare_parts' or 'priority' to model those instead ---
target_col = 'repair_person'

# --- Encode the target labels as integers ---
le = LabelEncoder()
y = le.fit_transform(df[target_col])

# --- DistilBERT features: whitespace-normalised complaint text -> embeddings ---
df['bert_text'] = df['complaint'].apply(basic_clean)

# Prefer the GPU when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertModel.from_pretrained("distilbert-base-uncased")
model = model.to(device)

X_bert = get_bert_embeddings(df['bert_text'].tolist(), tokenizer, model)

# --- Stratified 80/20 split with a fixed seed, then a logistic-regression head ---
X_train_bert, X_test_bert, y_train_bert, y_test = train_test_split(
    X_bert,
    y,
    test_size=0.2,
    stratify=y,
    random_state=78,
)

clf_bert = LogisticRegression(max_iter=1000).fit(X_train_bert, y_train_bert)
y_pred_bert = clf_bert.predict(X_test_bert)



100%|██████████████████████████████████████████████████████████████████████████████████| 45/45 [00:06<00:00,  7.12it/s]

In [17]:
# Report hold-out accuracy and per-class metrics for the DistilBERT classifier.
print("\n=== DistilBERT Results ===")
print("Accuracy:", accuracy_score(y_test, y_pred_bert))
# `labels` pins the report rows to every class known to the encoder, so
# `target_names` stays aligned even when a rare class is missing from both
# y_test and y_pred_bert (otherwise the name count no longer matches and the
# call fails). The names are converted to str because classification_report
# measures them with len(), which breaks for non-string classes — relevant if
# the target column is switched to one with numeric values.
print(classification_report(
    y_test,
    y_pred_bert,
    labels=np.arange(len(le.classes_)),
    target_names=[str(name) for name in le.classes_],
))


=== DistilBERT Results ===
Accuracy: 0.7430555555555556
                        precision    recall  f1-score   support

               No need       0.00      0.00      0.00         3
                Others       0.56      0.61      0.58        23
      Yes, A carpenter       0.62      0.62      0.62        16
        Yes, A plumber       0.77      0.83      0.80        36
Yes, A wifi Technician       1.00      0.87      0.93        15
   Yes, An electrician       0.82      0.78      0.80        51

              accuracy                           0.74       144
             macro avg       0.63      0.62      0.62       144
          weighted avg       0.74      0.74      0.74       144

