<a href="https://colab.research.google.com/github/deepakjongra/MachineLearning/blob/main/data_preprocessing_and_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<h1>Complaint Classification using NLP</h1>

In [None]:
!pip install pandas numpy transformers torch indic-nlp-library googletrans==4.0.0-rc1

In [2]:
import pandas as pd
import numpy as np
from transformers import pipeline
from googletrans import Translator
from indicnlp.normalize import indic_normalize

In [None]:
data = {
    "complaint_id": [1, 2, 3, 4],
    "text": [
        "पाणी नाही आहे, मदत करा!",  # Marathi
        "बिजली कट हो गई है",       # Hindi
        "There is a pothole on the road",  # English
        "Garbage is not collected since 3 days"  # English
    ],
    "language": ["mr", "hi", "en", "en"]
}

df = pd.DataFrame(data)
df.head()


In [None]:
translator = Translator()

def translate_to_marathi(text, lang):
    if lang == "mr":  # Already in Marathi
        return text
    try:
        translated = translator.translate(text, src=lang, dest="mr")
        return translated.text
    except Exception as e:
        return f"Error: {str(e)}"

df["translated_text"] = df.apply(lambda x: translate_to_marathi(x["text"], x["language"]), axis=1)

df[["text", "translated_text"]]


In [None]:
normalizer = indic_normalize.DevanagariNormalizer()

def normalize_text(text):
    return normalizer.normalize(text)

df["normalized_text"] = df["translated_text"].apply(normalize_text)
df[["translated_text", "normalized_text"]]


In [None]:
!pip install transformers torch

In [10]:
from transformers import pipeline

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
import pandas as pd

# Define the categories from Maharashtra Govt Departments
categories = [
    "Home Department", "Urban Development Department", "Public Works Department",
    "Water Resources Department", "Energy Department", "Health Department",
    "Education Department", "Food, Civil Supplies, and Consumer Protection Department",
    "Environment and Climate Change Department", "Social Justice Department",
    "Tribal Development Department", "Agriculture Department", "Labour Department",
    "Transport Department", "Tourism Department"
]

# Load IndicBERT Model and Tokenizer
model_name = "ai4bharat/indic-bert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(categories))


In [10]:
def classify_complaint_indicbert(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    predicted_class = torch.argmax(outputs.logits).item()
    return categories[predicted_class]

# Sample Complaint
complaint_text = "बिजली नहीं है, बहुत दिक्कत हो रही है।"
predicted_department = classify_complaint_indicbert(complaint_text)
print(f"Complaint: {complaint_text}\nPredicted Department: {predicted_department}")

# Sample Complaints DataFrame
data = {
    "complaint_text": [
        "पानी की सप्लाई बंद है, कुछ कीजिए!",
        "रास्ते में बहुत गड्ढे हैं, सफर मुश्किल हो गया है।",
        "राशन की दुकान पर लूटखसोट हो रही है।"
    ]
}

df = pd.DataFrame(data)

# Apply Classification to Each Complaint
df["department"] = df["complaint_text"].apply(classify_complaint_indicbert)
print(df)


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Complaint: बिजली नहीं है, बहुत दिक्कत हो रही है।
Predicted Department: Environment and Climate Change Department
                                      complaint_text  \
0                  पानी की सप्लाई बंद है, कुछ कीजिए!   
1  रास्ते में बहुत गड्ढे हैं, सफर मुश्किल हो गया है।   
2                राशन की दुकान पर लूटखसोट हो रही है।   

                                  department  
0  Environment and Climate Change Department  
1  Environment and Climate Change Department  
2  Environment and Climate Change Department  


<h1>Fine Tunning Model</h1>

In [None]:
!pip install transformers torch datasets sentencepiece scikit-learn

<h1>Load Pretrained IndicBERT</h1>

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from datasets import Dataset
import torch
import pandas as pd
from sklearn.model_selection import train_test_split

# Define categories (Departments of Maharashtra Government) - Remove duplicate "Labour Department"
categories = [
    "School Education and Sports", "Agriculture Department", "Co-operation, Marketing and Textiles Department",
    "Dairy Development Department", "Department Of Animal Husbandry", "Department Of Minority Development",
    "Department of Industries, Energy, Labor and Mines", "Department of Sainik Welfare",
    "Environment and Climate Change Department", "Finance Department",
    "Fisheries Department", "Food, Civil Supplies and Consumer Protection Department", "Forest Department",
    "General Administration Department", "Law and Judiciary Department", "Higher and Technical Education Department",
    "Housing Department", "Labour Department", "Greater Mumbai Police"
]

# Load IndicBERT Model and Tokenizer
model_name = "ai4bharat/indic-bert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(categories))

In [13]:
df = pd.read_csv("./complaints.csv")

# 🔹 Check for NaN values in the department column
df = df.dropna(subset=["department", "complaint_text"])  # Remove missing values

# 🔹 Convert department names to numerical labels
label_map = {dept: i for i, dept in enumerate(categories)}
df["label"] = df["department"].map(label_map)

# 🔹 Remove rows where the department is not in label_map (i.e., NaN labels)
df = df.dropna(subset=["label"])
df["label"] = df["label"].astype(int)  # Ensure labels are integers

# Split into Train & Validation
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["complaint_text"].tolist(), df["label"].tolist(), test_size=0.2, random_state=42
)

# Tokenize Data
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)

# Convert to Hugging Face Dataset (Ensuring Labels are Torch LongTensors)
train_dataset = Dataset.from_dict({
    "input_ids": train_encodings["input_ids"],
    "attention_mask": train_encodings["attention_mask"],
    "labels": torch.tensor(train_labels, dtype=torch.long)  # 🔹 FIX: Ensure correct label format
})

val_dataset = Dataset.from_dict({
    "input_ids": val_encodings["input_ids"],
    "attention_mask": val_encodings["attention_mask"],
    "labels": torch.tensor(val_labels, dtype=torch.long)  # 🔹 FIX: Ensure correct label format
})



In [14]:
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=10,  # Increase training cycles
    per_device_train_batch_size=16,  # If GPU can handle more
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,  # Adjust learning rate
    weight_decay=0.01,
    logging_dir="./logs",
)




In [15]:
# Train Model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,2.659791
2,No log,2.145175
3,No log,1.538072
4,No log,1.154503
5,No log,0.855028
6,No log,0.635423
7,No log,0.481551
8,No log,0.383515
9,No log,0.314073
10,1.174100,0.288564


TrainOutput(global_step=500, training_loss=1.17413623046875, metrics={'train_runtime': 5664.2043, 'train_samples_per_second': 1.412, 'train_steps_per_second': 0.088, 'total_flos': 24686332704000.0, 'train_loss': 1.17413623046875, 'epoch': 10.0})

In [27]:
# Define save path
save_directory = "./fine_tuned_indic_bert"

# Save model and tokenizer
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

print(f"Model saved to {save_directory}")

Model saved to ./fine_tuned_indic_bert


In [16]:
metrics = trainer.evaluate()
print(metrics)


{'eval_loss': 0.2885638475418091, 'eval_runtime': 43.537, 'eval_samples_per_second': 4.594, 'eval_steps_per_second': 0.299, 'epoch': 10.0}


In [31]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

# Load the fine-tuned model
model_name = "./fine_tuned_indic_bert"  # Path where your model is saved
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Ensure model is on the correct device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()


AlbertForSequenceClassification(
  (albert): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(200000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=128, out_features=768, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertSdpaAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features

In [32]:
test_texts = [
    "माझ्या गावातील रस्त्याची अवस्था खूपच वाईट आहे. पावसाळ्यात चिखलामुळे लोकांना चालणेही कठीण होते.",
    "शेतकऱ्यांसाठी नवीन अनुदान कधी मिळणार? सरकारी मदतीचा लाभ आम्हाला मिळावा.",
    "विजेचा सतत पुरवठा खंडित होतो. आम्हाला खूप अडचण होते, कृपया काहीतरी उपाय करा.",
    "सरकारी रुग्णालयात औषधांचा तुटवडा आहे. गरजू रुग्णांसाठी उपाययोजना करा.",
]

# Tokenize test grievances
inputs = tokenizer(test_texts, padding=True, truncation=True, return_tensors="pt", max_length=512).to(device)


In [33]:
with torch.no_grad():
    outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=-1).cpu().numpy()

# Define department mapping (should match training labels)
department_map = {
    0: "School Education and Sports",
    1: "Agriculture Department",
    2: "Co-operation, Marketing and Textiles Department",
    3: "Dairy Development Department",
    4: "Department Of Animal Husbandry",
    5: "Department Of Minority Development",
    6: "Department of Industries, Energy, Labor and Mines",
    7: "Department of Sainik Welfare",
    8: "Environment and Climate Change Department",
    9: "Finance Department",
    10: "Fisheries Department",
    11: "Food, Civil Supplies and Consumer Protection Department",
    12: "Forest Department",
    13: "General Administration Department",
    14: "Goods and Services Tax Department",
    15: "Greater Mumbai Police",
    16: "Higher and Technical Education Department",
    17: "Home Department",
    18: "Housing Department",
    19: "Labour Department",
    20: "Law and Judiciary Department"
}

# Print predictions
for text, label in zip(test_texts, predictions):
    print(f"Complaint: {text}\nPredicted Department: {department_map[label]}\n{'-'*50}")


Complaint: माझ्या गावातील रस्त्याची अवस्था खूपच वाईट आहे. पावसाळ्यात चिखलामुळे लोकांना चालणेही कठीण होते.
Predicted Department: Department Of Minority Development
--------------------------------------------------
Complaint: शेतकऱ्यांसाठी नवीन अनुदान कधी मिळणार? सरकारी मदतीचा लाभ आम्हाला मिळावा.
Predicted Department: Department Of Animal Husbandry
--------------------------------------------------
Complaint: विजेचा सतत पुरवठा खंडित होतो. आम्हाला खूप अडचण होते, कृपया काहीतरी उपाय करा.
Predicted Department: Department Of Minority Development
--------------------------------------------------
Complaint: सरकारी रुग्णालयात औषधांचा तुटवडा आहे. गरजू रुग्णांसाठी उपाययोजना करा.
Predicted Department: Department Of Animal Husbandry
--------------------------------------------------


In [36]:
!zip -r fine_tuned_model.zip fine_tuned_indic_bert

  adding: fine_tuned_indic_bert/ (stored 0%)
  adding: fine_tuned_indic_bert/model.safetensors (deflated 7%)
  adding: fine_tuned_indic_bert/special_tokens_map.json (deflated 49%)
  adding: fine_tuned_indic_bert/spiece.model (deflated 60%)
  adding: fine_tuned_indic_bert/tokenizer_config.json (deflated 74%)
  adding: fine_tuned_indic_bert/tokenizer.json (deflated 77%)
  adding: fine_tuned_indic_bert/config.json (deflated 62%)
