In [3]:
print("Hello")

Hello


# Load the dataset

In [5]:
import pandas as pd
df = pd.read_csv("new_dataset.csv")
df.head()

Unnamed: 0,Complaint_text,Catagory
0,Please replace my meter; it burnt due to your ...,Meter
1,I need to upgrade my load for a new workshop.,Customer
2,My bill shows a reading too high for my apartm...,Bill
3,The meter board is damaged; I need a new one.,Meter
4,Please fix the damaged pole in my street.,Supply


# Check for missing data

In [6]:
# check for missing data
df.isnull().sum()

Complaint_text    0
Catagory          0
dtype: int64

# Normalize the data

In [12]:
import re
def clean_text(text):
    text = text.lower()  # lowercase
    text = re.sub(r"\[.*?\]", "", text)  # remove [Ref-1234]
    text = re.sub(r"[^a-z\s]", "", text)  # remove punctuation/numbers
    text = re.sub(r"\s+", " ", text).strip()  # remove extra spaces
    return text

# Apply cleaning
df["clean_text"] = df["Complaint_text"].apply(clean_text)
df[["Complaint_text", "clean_text", "Catagory"]].head()

Unnamed: 0,Complaint_text,clean_text,Catagory
0,Please replace my meter; it burnt due to your ...,please replace my meter it burnt due to your f...,Meter
1,I need to upgrade my load for a new workshop.,i need to upgrade my load for a new workshop,Customer
2,My bill shows a reading too high for my apartm...,my bill shows a reading too high for my apartment,Bill
3,The meter board is damaged; I need a new one.,the meter board is damaged i need a new one,Meter
4,Please fix the damaged pole in my street.,please fix the damaged pole in my street,Supply


# Check for duplicates

In [13]:
# Count duplicate rows (across all columns)
duplicate_rows = df.duplicated()
print("Number of duplicate rows:", duplicate_rows.sum())

Number of duplicate rows: 1703


In [14]:
df.shape

(2549, 3)

In [15]:
df.columns

Index(['Complaint_text', 'Catagory', 'clean_text'], dtype='object')

# Tokenization and Vectorization(TF-IDF)

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create the vectorizer
vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)

# Fit and transform the clean text
X = vectorizer.fit_transform(df["clean_text"])

# Target variable
y = df["Catagory"]

In [18]:
X.shape, y.shape

((2549, 423), (2549,))

# Train/Test Split

In [19]:
from sklearn.model_selection import train_test_split

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Check the shape
X_train.shape, X_test.shape

((2039, 423), (510, 423))

# Model Training

In [20]:
from sklearn.linear_model import LogisticRegression

# Create the model
model = LogisticRegression(max_iter=1000)

# Train the model
model.fit(X_train, y_train)

# Evaluate the model

In [21]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Make predictions
y_pred = model.predict(X_test)

# Print metrics
print("🔎 Accuracy:", accuracy_score(y_test, y_pred))
print("\n📄 Classification Report:\n", classification_report(y_test, y_pred))

🔎 Accuracy: 0.9725490196078431

📄 Classification Report:
                precision    recall  f1-score   support

         Bill       0.98      0.98      0.98        87
     Customer       0.97      0.98      0.98       121
  EEU Service       0.96      0.98      0.97        65
     Employee       1.00      1.00      1.00        51
        Meter       0.98      0.93      0.95        55
Miscellaneous       1.00      0.97      0.98        30
     Pre-paid       0.95      0.98      0.96        56
       Supply       0.98      0.93      0.95        45

     accuracy                           0.97       510
    macro avg       0.98      0.97      0.97       510
 weighted avg       0.97      0.97      0.97       510



# Random Forest Classifier

In [22]:
# Assuming you already have X_train and y_train
from sklearn.ensemble import RandomForestClassifier
import joblib

# Train the model
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Save the model
joblib.dump(model, 'model.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')  # Also save the fitted vectorizer


['vectorizer.pkl']

# Cross validation

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import classification_report, confusion_matrix

# Initialize the model
model = RandomForestClassifier(n_estimators=100, random_state=42)

# 5-Fold Cross-Validation (you can increase to 10)
scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
print("Cross-Validation Accuracy Scores:", scores)
print("Mean CV Accuracy:", scores.mean())

# Optional: Get predicted values for detailed evaluation
y_pred = cross_val_predict(model, X, y, cv=5)
print("\nClassification Report:\n")
print(classification_report(y, y_pred))

# Optional: View confusion matrix
import seaborn as sns
import matplotlib.pyplot as plt

cm = confusion_matrix(y, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Save and export the model

In [20]:
import joblib

# Save the model
joblib.dump(model, 'complaint_classifier_model.pkl')

# If you also want to save your vectorizer
joblib.dump(vectorizer, 'vectorizer.pkl')


['vectorizer.pkl']