In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv('complaints_preprocessed.csv')

# Split the dataset (80% training, 20% testing)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['Target'])

print(train_df['narrative'].isna().sum())  # Check NaN count in train set
print(test_df['narrative'].isna().sum())   # Check NaN count in test set

train_df['narrative'] = train_df['narrative'].fillna("")
test_df['narrative'] = test_df['narrative'].fillna("")

# Separate features (X) and target labels (y)
X_train, y_train = train_df['narrative'], train_df['Target']
X_test, y_test = test_df['narrative'], test_df['Target']

639
143


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Convert text to TF-IDF features
tfidf = TfidfVectorizer(max_features=5000)  # Limit to 5000 words
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [3]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# Initialize models
models = {
    "Logistic Regression": LogisticRegression(),
    "Naive Bayes": MultinomialNB(),
    "Support Vector Machine": SVC(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=5)
}

In [4]:
# Logistic Regression
models["Logistic Regression"].fit(X_train_tfidf, y_train)
y_pred = models["Logistic Regression"].predict(X_test_tfidf)

print("Logistic Regression")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression
Accuracy: 0.8762499567489014
              precision    recall  f1-score   support

           0       0.91      0.94      0.93    241055
           1       0.79      0.70      0.74     53369
           2       0.85      0.84      0.84     45720
           3       0.86      0.88      0.87     31728
           4       0.79      0.74      0.76     32742

    accuracy                           0.88    404614
   macro avg       0.84      0.82      0.83    404614
weighted avg       0.87      0.88      0.87    404614



In [5]:
# Naive Bayes
models["Naive Bayes"].fit(X_train_tfidf, y_train)
y_pred = models["Naive Bayes"].predict(X_test_tfidf)

print("Naive Bayes")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Naive Bayes
Accuracy: 0.8383545799206157
              precision    recall  f1-score   support

           0       0.90      0.91      0.90    241055
           1       0.76      0.61      0.68     53369
           2       0.74      0.86      0.80     45720
           3       0.80      0.86      0.83     31728
           4       0.70      0.65      0.67     32742

    accuracy                           0.84    404614
   macro avg       0.78      0.78      0.77    404614
weighted avg       0.84      0.84      0.84    404614



In [None]:
# Support Vector Machine
models["Support Vector Machine"].fit(X_train_tfidf, y_train)
y_pred = models["Support Vector Machine"].predict(X_test_tfidf)

print("Support Vector Machine")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
# Random Forest
models["Random Forest"].fit(X_train_tfidf, y_train)
y_pred = models["Random Forest"].predict(X_test_tfidf)

print("Random Forest")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
# K-nearest Neighbors
models["K-nearest Neighbors"].fit(X_train_tfidf, y_train)
y_pred = models["K-nearest Neighbors"].predict(X_test_tfidf)

print("K-nearest Neighbors")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
""" 

# Iterate through models, train, predict, and evaluate
for name, model in models.items():
    model.fit(X_train_tfidf, y_train)  # Train the model
    y_pred = model.predict(X_test_tfidf)  # Make predictions

    # Print evaluation metrics
    print(f"\n{name} Model Results:")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred)) """

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



LogisticRegression Model Results:
Accuracy: 0.8762499567489014
Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.94      0.93    241055
           1       0.79      0.70      0.74     53369
           2       0.85      0.84      0.84     45720
           3       0.86      0.88      0.87     31728
           4       0.79      0.74      0.76     32742

    accuracy                           0.88    404614
   macro avg       0.84      0.82      0.83    404614
weighted avg       0.87      0.88      0.87    404614


NaiveBayes Model Results:
Accuracy: 0.8383545799206157
Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.91      0.90    241055
           1       0.76      0.61      0.68     53369
           2       0.74      0.86      0.80     45720
           3       0.80      0.86      0.83     31728
           4       0.70      0.65      0.67     32742

    accuracy     