In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the preprocessed complaints table. The rest of this cell relies on two
# columns: 'narrative' (the text used as the feature) and 'Target' (the label).
df = pd.read_csv('complaints_preprocessed.csv')

# Split the dataset (80% training, 20% testing). Stratifying on 'Target' keeps
# the label proportions the same in both splits; random_state pins the shuffle.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['Target'])

# Report how many narratives are missing in each split before they are filled.
print(train_df['narrative'].isna().sum())  # NaN count in train set
print(test_df['narrative'].isna().sum())   # NaN count in test set

# Replace missing narratives with empty strings so the text vectorizer only
# ever receives str values. `.assign` builds new frames instead of writing
# into the objects returned by train_test_split: those are row selections of
# `df`, and column assignment on them can trigger pandas'
# SettingWithCopyWarning. Rebinding also keeps this cell safe to re-run.
train_df = train_df.assign(narrative=train_df['narrative'].fillna(""))
test_df = test_df.assign(narrative=test_df['narrative'].fillna(""))

# Separate features (X) and target labels (y)
X_train, y_train = train_df['narrative'], train_df['Target']
X_test, y_test = test_df['narrative'], test_df['Target']

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Turn the raw narratives into sparse TF-IDF matrices.
tfidf = TfidfVectorizer(
    max_features=5000,  # cap the vocabulary at 5000 terms
)

# The vocabulary and IDF weights are learned from the training narratives
# only; the test narratives are then encoded with that already-fitted
# vectorizer, so nothing about the test split influences the features.
X_train_tfidf, X_test_tfidf = (
    tfidf.fit_transform(X_train),
    tfidf.transform(X_test),
)

In [None]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# Registry of candidate classifiers, keyed by the display name that the
# evaluation cells use to look each estimator up. The keys are case-sensitive.
models = dict([
    ("Logistic Regression", LogisticRegression()),
    ("Naive Bayes", MultinomialNB()),
    ("Support Vector Machine", SVC()),
    # random_state is fixed so the forest is reproducible between runs.
    ("Random Forest", RandomForestClassifier(n_estimators=100, random_state=42)),
    ("K-Nearest Neighbors", KNeighborsClassifier(n_neighbors=5)),
])

In [None]:
# Logistic Regression: train on the TF-IDF training matrix, then predict the
# held-out split. fit() returns the fitted estimator, so the calls chain.
y_pred = models["Logistic Regression"].fit(X_train_tfidf, y_train).predict(X_test_tfidf)

# Report overall accuracy followed by the per-class metrics table.
print("Logistic Regression")
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# Naive Bayes: train on the TF-IDF training matrix, then predict the
# held-out split. fit() returns the fitted estimator, so the calls chain.
y_pred = models["Naive Bayes"].fit(X_train_tfidf, y_train).predict(X_test_tfidf)

# Report overall accuracy followed by the per-class metrics table.
print("Naive Bayes")
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# Support Vector Machine: train on the TF-IDF training matrix, then predict
# the held-out split. fit() returns the fitted estimator, so the calls chain.
y_pred = models["Support Vector Machine"].fit(X_train_tfidf, y_train).predict(X_test_tfidf)

# Report overall accuracy followed by the per-class metrics table.
print("Support Vector Machine")
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# Random Forest: train on the TF-IDF training matrix, then predict the
# held-out split. fit() returns the fitted estimator, so the calls chain.
y_pred = models["Random Forest"].fit(X_train_tfidf, y_train).predict(X_test_tfidf)

# Report overall accuracy followed by the per-class metrics table.
print("Random Forest")
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# K-Nearest Neighbors: train on the TF-IDF training matrix, then predict the
# held-out split.
# The lookup key must match the `models` dict exactly ("K-Nearest Neighbors",
# capital N); dict keys are case-sensitive, so any other spelling raises
# KeyError.
models["K-Nearest Neighbors"].fit(X_train_tfidf, y_train)
y_pred = models["K-Nearest Neighbors"].predict(X_test_tfidf)

# Report overall accuracy followed by the per-class metrics table.
print("K-nearest Neighbors")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
# NOTE: everything below is wrapped in a triple-quoted string, so none of it
# executes; it is a disabled alternative to the per-model cells. As the only
# (and therefore last) expression in the cell, the string itself is what the
# cell evaluates to. If re-enabled, the loop would fit every estimator in
# `models` on the TF-IDF training matrix and print accuracy plus a
# classification report for each one.
""" 

# Iterate through models, train, predict, and evaluate
for name, model in models.items():
    model.fit(X_train_tfidf, y_train)  # Train the model
    y_pred = model.predict(X_test_tfidf)  # Make predictions

    # Print evaluation metrics
    print(f"\n{name} Model Results:")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred)) """