# Community Complaint Classification System (Hybrid)

**Author:** Muhammad Affan Kabir  
**SAP ID:** 65139  
**Course:** Artificial Intelligence  

## 1. Project Overview
This project implements a **Hybrid AI System** to classify citizen complaints. It combines **Rule-Based Logic** (for guaranteed keyword detection) with **Machine Learning** (for complex sentence understanding).

### Methodology
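1. **Synthetic Data Generation:** Generates labelled complaints from sentence templates; the four complaint categories share a common pool of context phrases (e.g., "since yesterday").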
2. **Hybrid Classification:**
   * **Priority 1 (Rules):** Checks for specific keywords (e.g., "electricity") to guarantee accuracy.
   * **Priority 2 (AI):** Uses a Naive Bayes classifier if no keywords are found.

In [1]:
# ==========================================
# IMPORT LIBRARIES
# ==========================================
import random
import re

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

In [2]:
# ==========================================
# STEP 1: ROBUST SYNTHETIC DATA GENERATION
# ==========================================
def generate_dataset(n_rows=500, seed=42):
    """Build a synthetic, labelled complaint dataset from sentence templates.

    Every row picks one category uniformly at random and fills that
    category's template: ``<subject> <problem> <context>`` for the four
    complaint categories, and ``<subject> <action> <object>`` for
    ``Irrelevant``.

    A private ``random.Random`` instance is used, so calling this function
    does not reseed or advance the process-wide ``random`` generator.

    Args:
        n_rows: Number of rows to generate. Defaults to 500.
        seed: Seed for the private generator. Defaults to 42; the same seed
            always produces the same rows.

    Returns:
        A ``pandas.DataFrame`` with the columns ``Category`` and
        ``Complaint_Text`` (present even when ``n_rows`` is 0).
    """
    print("1. Generating Robust Synthetic Data...")

    categories = ['Cleaning', 'Water', 'Electricity', 'Inflation', 'Irrelevant']

    # --- SHARED CONTEXTS ---
    # Appended to every complaint category so the context phrase alone
    # carries no information about the label.
    common_contexts = [
        "since yesterday", "right now", "for the last 2 days", "every morning",
        "all week", "in our city", "at my house", "in street number 5",
        "near the school", "please fix it", "it is urgent", "today"
    ]

    # --- CATEGORY SPECIFIC WORDS ---
    # CLEANING
    clean_subjects = ["Garbage", "Trash", "Rubbish", "Waste", "Dustbin", "Sewage", "Dirt"]
    clean_actions = ["is piling up", "is overflowing", "has not been collected", "is smelling terrible", "is scattered"]

    # WATER
    water_subjects = ["The water supply", "My tap water", "The main pipe", "Water pressure", "Drinking water", "The tube well"]
    water_problems = ["is completely stopped", "is leaking badly", "is coming out dirty", "has a foul smell", "is extremely low", "is not coming"]

    # ELECTRICITY
    elec_subjects = ["The electricity", "My electricity meter", "The transformer", "Power supply", "The street wires", "Voltage", "The light"]
    elec_problems = ["is fluctuating", "sparked and caught fire", "is running too fast", "is disconnected", "is out", "is gone", "is damaging appliances"]

    # INFLATION
    inf_subjects = ["The price of flour", "Vegetable rates", "Electricity bills", "Petrol prices", "School fees", "Rent costs", "Food prices"]
    inf_actions = ["have doubled", "are too high", "have become unaffordable", "are increasing daily", "are out of control", "are expensive"]

    # IRRELEVANT
    irr_subjects = ["I", "He", "She", "My friend", "The player", "My brother", "The teacher", "We"]
    irr_actions = ["is playing", "is eating", "is going to", "likes", "is watching", "is buying", "loves"]
    irr_objects = ["cricket", "football", "pizza", "a movie", "school", "the gym", "burgers", "video games", "music"]

    # Complaint categories all share the "<subject> <problem> <context>" shape.
    complaint_templates = {
        'Cleaning': (clean_subjects, clean_actions),
        'Water': (water_subjects, water_problems),
        'Electricity': (elec_subjects, elec_problems),
        'Inflation': (inf_subjects, inf_actions),
    }

    # Private generator: reproducible without touching the global RNG state.
    rng = random.Random(seed)

    data = []
    for _ in range(n_rows):
        cat = rng.choice(categories)
        # Drawn for every row (even Irrelevant rows, which do not use it) so
        # the sequence of random draws stays the same as in earlier runs.
        context = rng.choice(common_contexts)

        if cat in complaint_templates:
            subjects, problems = complaint_templates[cat]
            text = f"{rng.choice(subjects)} {rng.choice(problems)} {context}"
        else:  # Irrelevant
            text = f"{rng.choice(irr_subjects)} {rng.choice(irr_actions)} {rng.choice(irr_objects)}"

        data.append({'Category': cat, 'Complaint_Text': text})

    # Explicit columns keep the schema stable when no rows were requested.
    df = pd.DataFrame(data, columns=['Category', 'Complaint_Text'])
    print(f"   - Successfully generated {len(df)} records.")
    return df

# Build the synthetic dataset once; `df` is the module-level frame that the
# training step reads its text and label columns from.
df = generate_dataset()
# Preview the first rows of the generated data (shown as the cell's output).
df.head()

1. Generating Robust Synthetic Data...
   - Successfully generated 500 records.


Unnamed: 0,Category,Complaint_Text
0,Cleaning,Sewage has not been collected since yesterday
1,Water,My tap water is not coming every morning
2,Cleaning,Sewage is scattered it is urgent
3,Cleaning,Waste is piling up please fix it
4,Cleaning,Trash is overflowing right now


In [3]:
# ==========================================
# STEP 2: MODEL TRAINING (With Visualization)
# ==========================================
print("\n2. Training AI Model...")

# Features are the raw complaint sentences; targets are their category labels.
X = df['Complaint_Text']
y = df['Category']

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- VISUALIZATION: SEEING THE NUMBERS ---
print("\n[DEBUG] Converting Text to Numbers (TF-IDF)...")

# Throwaway vectorizer, fitted on the training split only, used purely to
# inspect the TF-IDF matrix. The model below fits its own vectorizer inside
# the pipeline, so nothing from this section feeds into training.
temp_vectorizer = TfidfVectorizer()
X_train_numbers = temp_vectorizer.fit_transform(X_train)

# Densify the matrix into a DataFrame with one column per vocabulary term.
feature_names = temp_vectorizer.get_feature_names_out()
df_view = pd.DataFrame(X_train_numbers.toarray(), columns=feature_names)

print("--- Visualizing the Converted Data (First 5 Rows) ---")
print("These are the numbers the model actually learns from:")
# NOTE(review): `display` is not imported in this file; presumably the
# IPython/Jupyter builtin - confirm before running this as a plain script.
display(df_view.iloc[:5, 10:20]) # First 5 rows, vocabulary columns 10-19 (a fixed slice, kept small for readability)
print(f"Full Matrix Shape: {X_train_numbers.shape} (Rows, Unique Words)\n")

# --- ACTUAL TRAINING ---
# The pipeline chains TF-IDF vectorization with Multinomial Naive Bayes, so
# fit/predict are called directly on the raw complaint strings.
model = make_pipeline(TfidfVectorizer(), MultinomialNB())
model.fit(X_train, y_train)
print("   - Training Complete.")

print("\n3. Evaluation Results")
# Score the held-out split only; the training rows are not used here.
y_pred = model.predict(X_test)
print(f"   - Model Accuracy: {accuracy_score(y_test, y_pred):.2f}")


2. Training AI Model...

[DEBUG] Converting Text to Numbers (TF-IDF)...
--- Visualizing the Converted Data (First 5 Rows) ---
These are the numbers the model actually learns from:


Unnamed: 0,burgers,buying,caught,city,collected,coming,completely,control,costs,cricket
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.385561,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.382767,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.366562,0.0,0.0,0.0,0.0


Full Matrix Shape: (400, 130) (Rows, Unique Words)

   - Training Complete.

3. Evaluation Results
   - Model Accuracy: 1.00


## 5. Live Demonstration (Hybrid Logic)
This system uses a dictionary of keywords. If the user's input contains one of these keywords (for example, "electricity"), the rule **overrides** the model and the complaint is assigned to the matching category (here, Electricity) with 100% confidence.

In [4]:
# ==========================================
# STEP 4: HYBRID PREDICTION SYSTEM
# ==========================================
# Demo banner: blank line, rule, title, rule, then the usage hint followed by
# a blank line. Emitted with a single call; `sep` supplies the line breaks.
print(
    "\n" + "=" * 40,
    " LIVE PREDICTION DEMO (Hybrid Rule + AI)",
    "=" * 40,
    "Type a sentence below to test the model (type 'exit' to stop).\n",
    sep="\n",
)

# Keyword rules for the override step. Categories are checked in this order
# and the first category with a hit wins.
_KEYWORD_RULES = {
    'Electricity': ['electricity', 'voltage', 'transformer', 'power', 'wire', 'blackout', 'light'],
    'Water': ['water', 'pipe', 'tap', 'leak', 'supply', 'tube well'],
    'Cleaning': ['garbage', 'trash', 'rubbish', 'waste', 'dustbin', 'sewage', 'clean'],
    'Inflation': ['price', 'cost', 'rate', 'fee', 'expensive', 'inflation', 'doubled'],
}

# One compiled pattern per category. Keywords must match as whole words
# (optionally with a simple s/es/ed/ing ending) so that, for example,
# "flight" does not trigger 'light' and "feel" does not trigger 'fee'.
# Compound words such as "streetlight" are therefore NOT matched; add them to
# the keyword lists explicitly if they should trigger a rule.
_KEYWORD_PATTERNS = {
    category: re.compile(
        r"\b(?:" + "|".join(re.escape(word) for word in words) + r")(?:s|es|ed|ing)?\b"
    )
    for category, words in _KEYWORD_RULES.items()
}

# Model predictions below this probability are reported as "Others".
_MIN_CONFIDENCE = 0.45


def predict_complaint(text):
    """Classify a complaint using keyword rules first, then the trained model.

    Args:
        text: The complaint sentence typed by the user.

    Returns:
        A ``(category, confidence)`` tuple.

        * Keyword hit: ``(category, 1.0)`` - the rule overrides the model.
        * Blank input: ``("Others", 0.0)`` without consulting the model.
        * Otherwise the model's top class and its probability, except that
          an ``Irrelevant`` prediction or a probability below 0.45 is
          reported as ``"Others"``.
    """
    # Nothing to classify; avoid asking the model about an empty document.
    if not text or not text.strip():
        return "Others", 0.0

    text_lower = text.lower()

    # --- 1. RULE-BASED OVERRIDE ---
    for category, pattern in _KEYWORD_PATTERNS.items():
        if pattern.search(text_lower):
            return category, 1.0

    # --- 2. AI MODEL FALLBACK ---
    # `model` is the fitted module-level pipeline; it receives the raw text.
    probabilities = model.predict_proba([text])[0]
    max_confidence = probabilities.max()
    predicted_category = model.classes_[probabilities.argmax()]

    # The training label "Irrelevant" and low-confidence guesses are both
    # surfaced to the user as "Others".
    if predicted_category == 'Irrelevant' or max_confidence < _MIN_CONFIDENCE:
        return "Others", max_confidence

    return predicted_category, max_confidence

# --- Interactive Loop ---
# Read complaints until the user types exit/quit, presses Ctrl-C, or stdin is
# closed. Each non-empty line is classified and the result is printed.
while True:
    try:
        # Strip first so that " exit " still exits and whitespace-only input
        # is skipped.
        user_input = input("Enter Complaint: ").strip()

        if not user_input:
            continue

        if user_input.lower() in ('exit', 'quit'):
            print("\nExiting Demo. Goodbye!")
            break

        cat, conf = predict_complaint(user_input)

        # Print Result
        # A confidence of exactly 1.0 is how predict_complaint signals that a
        # keyword rule (not the model) decided the category.
        if conf == 1.0:
             print(f"   -> Result: {cat} (Guaranteed by Keyword)")
        elif cat == "Others":
            print("   -> Result: Others (Detected as Irrelevant/Low Confidence)")
        else:
            print(f"   -> Result: {cat} (AI Confidence: {conf:.1%})")
        print("-" * 40)

    except (KeyboardInterrupt, EOFError):
        # EOFError: input() hit end-of-file (Ctrl-D, or piped input ran out).
        print("\nStopped.")
        break


 LIVE PREDICTION DEMO (Hybrid Rule + AI)
Type a sentence below to test the model (type 'exit' to stop).

Enter Complaint: electricity down since afternoon
   -> Result: Electricity (Guaranteed by Keyword)
----------------------------------------

Stopped.
