In [99]:
import pandas as pd 
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt
import os

In [100]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier

In [101]:
from tqdm import tqdm

In [102]:
import torch

In [103]:
from transformers import DistilBertTokenizer, DistilBertModel

In [104]:
import warnings
# NOTE(review): with no category/module arguments this silences every warning for
# the rest of the session, including any deprecation or convergence warnings that
# libraries used below may emit. Consider narrowing the filter.
warnings.filterwarnings('ignore')

In [105]:
# Pick the compute device: the GPU when CUDA is usable, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

cpu


In [106]:
# Load the raw complaints table (path is relative to the working directory).
input_data = pd.read_csv('./data_input.csv')
# Work on a copy: a plain `df = input_data` would only alias the same object, so
# later in-place edits of `df` would silently rewrite the raw frame as well.
df = input_data.copy()

In [107]:
# Display the loaded frame for a quick visual check
df

Unnamed: 0,complaint,priority,parts,repair_person
0,Mirror is broken in the second floor east side...,1,Yes,Others
1,Light is not working,4,Yes,"Yes, An electrician"
2,Leaky pipes under sinks on floor 5,3,Yes,"Yes, A plumber"
3,wall hook for clothes needs to be fixed,2,No,"Yes, A carpenter"
4,"WiFi drops out frequently, disrupting online work",3,No,"Yes, A wifi Technician"
...,...,...,...,...
557,RCCB tripping issue,4,No,"Yes, An electrician"
558,"The risk of electrocution, especially in the b...",5,Yes,"Yes, An electrician"
559,kindly solve the water issue,4,No,"Yes, A plumber"
560,Electric board of room no 83 get out of the wa...,3,Yes,"Yes, An electrician"


In [108]:
# Encode the Yes/No `parts` answers as 1/0, writing the result back into `df`
parts_codes = {'Yes': 1, 'No': 0}
df['parts'] = df['parts'].replace(parts_codes)

In [109]:
# Sanity check: confirm `df` is still a pandas DataFrame after the in-place replace
print(type(df))

<class 'pandas.core.frame.DataFrame'>


In [110]:
# Model inputs: the complaint strings, and the integer-coded target column
target_col = 'parts'
texts = df.loc[:, 'complaint'].tolist()
labels = df.loc[:, target_col].astype(int).tolist()

In [111]:
# Step 2: Load the pretrained DistilBERT tokenizer and encoder from one checkpoint
checkpoint = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(checkpoint)
model = DistilBertModel.from_pretrained(checkpoint).to(device)
# Switch to inference mode; as the last expression this also displays the module
model.eval()

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): DistilBertSdpaAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): L

In [112]:
# Step 3: Generate BERT embeddings
def get_bert_embeddings(texts, tokenizer, model, batch_size=16):
    """Embed texts as the encoder's first-token ([CLS]) hidden state.

    Args:
        texts: Sequence of strings to embed.
        tokenizer: Callable tokenizer. It is invoked with truncation, padding,
            ``max_length=128`` and ``return_tensors='pt'`` and must return
            ``input_ids`` and ``attention_mask`` tensors.
        model: Encoder whose output exposes ``last_hidden_state``.
        batch_size: Number of texts encoded per forward pass.

    Returns:
        A 2-D NumPy array with one row per input text. For empty input an
        array with zero rows is returned; its width is taken from
        ``model.config.hidden_size`` when available, else 0.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # np.vstack cannot stack an empty list, so answer the empty case directly.
    if len(texts) == 0:
        hidden_size = getattr(getattr(model, 'config', None), 'hidden_size', 0)
        return np.empty((0, hidden_size), dtype=np.float32)

    # Send inputs to wherever the model's weights live instead of relying on a
    # notebook-level `device` variable that may be missing or out of sync.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    embeddings = []
    for i in tqdm(range(0, len(texts), batch_size)):
        batch_texts = texts[i:i+batch_size]
        encodings = tokenizer(batch_texts, truncation=True, padding=True, return_tensors='pt', max_length=128)
        input_ids = encodings['input_ids'].to(model_device)
        attention_mask = encodings['attention_mask'].to(model_device)

        # Inference only: no gradients are needed for feature extraction.
        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            # Use the [CLS] token representation (first token)
            cls_embeddings = outputs.last_hidden_state[:, 0, :]  # (batch_size, hidden_size)
            embeddings.append(cls_embeddings.cpu().numpy())

    return np.vstack(embeddings)

print("Generating BERT embeddings...")
# Targets as a NumPy array, features as one embedding row per complaint
y = np.asarray(labels)
X = get_bert_embeddings(texts, tokenizer, model)

Generating BERT embeddings...



00%|██████████████████████████████████████████████████████████████████████████████████| 36/36 [00:07<00:00,  4.55it/s]

In [113]:
# Step 4: Hold out 20% of the samples as a test set, stratified on the target
split_options = dict(test_size=0.2, random_state=25, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [114]:
# Step 5: Fit a logistic-regression baseline on the embedding features
clf = LogisticRegression(max_iter=1000)
clf.fit(X=X_train, y=y_train)

In [115]:
# Step 6: Score the logistic-regression model on the held-out split
y_pred = clf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)
report_text = classification_report(y_test, y_pred)
print(report_text)

Accuracy: 0.8584070796460177
              precision    recall  f1-score   support

           0       0.86      0.89      0.87        61
           1       0.86      0.83      0.84        52

    accuracy                           0.86       113
   macro avg       0.86      0.86      0.86       113
weighted avg       0.86      0.86      0.86       113



In [116]:
# Step 7: Train a random-forest classifier on the same split.
# The split is seeded, but an unseeded forest still gives different metrics on
# every run; fixing random_state makes the reported numbers reproducible.
clf1 = RandomForestClassifier(random_state=42)
clf1.fit(X_train, y_train)

In [117]:
# Score the random-forest model on the held-out split
y_pred = clf1.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)
report_text = classification_report(y_test, y_pred)
print(report_text)

Accuracy: 0.8230088495575221
              precision    recall  f1-score   support

           0       0.81      0.89      0.84        61
           1       0.85      0.75      0.80        52

    accuracy                           0.82       113
   macro avg       0.83      0.82      0.82       113
weighted avg       0.83      0.82      0.82       113



In [118]:
from xgboost import XGBClassifier

In [119]:
# Step 8: Fit an XGBoost classifier (default hyperparameters) on the same split
clf2 = XGBClassifier()
clf2.fit(X=X_train, y=y_train)

In [120]:
# Score the XGBoost model on the held-out split
y_pred = clf2.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)
report_text = classification_report(y_test, y_pred)
print(report_text)

Accuracy: 0.7964601769911505
              precision    recall  f1-score   support

           0       0.79      0.85      0.82        61
           1       0.81      0.73      0.77        52

    accuracy                           0.80       113
   macro avg       0.80      0.79      0.79       113
weighted avg       0.80      0.80      0.80       113



In [123]:
# Re-display the frame; `parts` now shows the 1/0 codes instead of Yes/No
df

Unnamed: 0,complaint,priority,parts,repair_person
0,Mirror is broken in the second floor east side...,1,1,Others
1,Light is not working,4,1,"Yes, An electrician"
2,Leaky pipes under sinks on floor 5,3,1,"Yes, A plumber"
3,wall hook for clothes needs to be fixed,2,0,"Yes, A carpenter"
4,"WiFi drops out frequently, disrupting online work",3,0,"Yes, A wifi Technician"
...,...,...,...,...
557,RCCB tripping issue,4,0,"Yes, An electrician"
558,"The risk of electrocution, especially in the b...",5,1,"Yes, An electrician"
559,kindly solve the water issue,4,0,"Yes, A plumber"
560,Electric board of room no 83 get out of the wa...,3,1,"Yes, An electrician"


In [124]:
from sklearn.preprocessing import LabelEncoder

# Second task: predict `repair_person`. Encode its categories as integer class ids.
le = LabelEncoder()
le.fit(df['repair_person'])
labels = le.transform(df['repair_person'])
y = np.array(labels)

# Re-split the same embeddings against the new target (stratified 80/20).
# This rebinds X_train / X_test / y_train / y_test used by the earlier cells.
split_options = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [126]:
# Step 5: Train a multiclass logistic regression for the repair-person target.
# `multi_class=` has been deprecated since scikit-learn 1.5; with the lbfgs solver
# a multinomial model is already what gets fitted for multiclass targets, so
# dropping the argument keeps the same model.
clf21 = LogisticRegression(max_iter=1000, solver='lbfgs')
clf21.fit(X_train, y_train)

# Step 6: Evaluate
y_pred = clf21.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
# Pass the encoder's full label set so target_names always lines up with the
# rows, even if a rare class is absent from the test fold. zero_division=0
# reports 0.00 explicitly for classes that are never predicted.
print(classification_report(
    y_test,
    y_pred,
    labels=np.arange(len(le.classes_)),
    target_names=le.classes_,
    zero_division=0,
))


Accuracy: 0.7433628318584071

Classification Report:
                        precision    recall  f1-score   support

               No need       0.00      0.00      0.00         3
                Others       0.50      0.53      0.52        15
      Yes, A carpenter       0.71      0.62      0.67         8
        Yes, A plumber       0.74      0.84      0.79        31
Yes, A wifi Technician       0.83      0.77      0.80        13
   Yes, An electrician       0.81      0.81      0.81        43

              accuracy                           0.74       113
             macro avg       0.60      0.60      0.60       113
          weighted avg       0.73      0.74      0.73       113



In [141]:
# Example: classify an unseen complaint with the repair-person model
new_texts = [" cycle stand roof"]
new_embeddings = get_bert_embeddings(new_texts, tokenizer, model)

# Predict class ids, then map them back to the original category names
preds = clf21.predict(new_embeddings)
predicted_labels = le.inverse_transform(preds)
print(predicted_labels)


100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 44.47it/s]

['Others']



