In [None]:
# Step 1: Exploratory Data Analysis and Feature Engineering
import pandas as pd

# Read complaints.csv into a DataFrame
df = pd.read_csv(filepath_or_buffer="complaints.csv")

# Step 2: Text Pre-Processing
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

# Tokenizer data for word_tokenize: 'punkt' serves older NLTK releases, while
# newer releases (around 3.9 and later) look up 'punkt_tab' instead and raise
# LookupError without it, so fetch both.
nltk.download('punkt')
nltk.download('punkt_tab')
# Stop-word lists used by preprocess_text
nltk.download('stopwords')

# Text Pre-processing functions
# Lazily populated cache so the English stop-word set is built only once
# instead of on every call to preprocess_text().
_STOP_WORDS = None


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def preprocess_text(text):
    """Normalize a complaint narrative for vectorization.

    The text is lower-cased, tokenized with NLTK's ``word_tokenize``, and
    reduced to alphanumeric tokens that are not English stop words.

    Args:
        text: Raw complaint text. Non-string values (for example ``None`` or
            the float ``NaN`` pandas uses for empty CSV cells) are treated
            as empty text.

    Returns:
        The surviving tokens joined by single spaces; an empty string when
        there is nothing to keep.
    """
    # Missing narratives are not strings, so calling .lower() on them would
    # raise AttributeError; map them to an empty document instead.
    if not isinstance(text, str):
        return ''
    stop_words = _english_stop_words()
    tokens = word_tokenize(text.lower())
    return ' '.join(
        token for token in tokens
        if token.isalnum() and token not in stop_words
    )

# Apply text pre-processing
# Rows without a narrative load as NaN rather than a string and have no text
# to learn from, so drop them before pre-processing. The .copy() gives an
# independent frame for the column assignment below.
df = df.dropna(subset=['Consumer complaint narrative']).copy()
df['complaint_text'] = df['Consumer complaint narrative'].apply(preprocess_text)

# Step 3: Selection of a Multi-Class Classification Model
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Hold out 20% of the rows for evaluation, with a fixed seed for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    df['complaint_text'],
    df['Product'],
    test_size=0.2,
    random_state=42,
)

# Fit the TF-IDF vocabulary (capped at 5000 features) on the training text
# only, then map both splits into that feature space
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Fit a 100-tree random forest on the TF-IDF features (fit() returns the
# estimator itself, so construction and training can be chained)
model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train_tfidf, y_train)

# Step 4: Comparison of model performance (optional)
# You can try different models and evaluate their performance if needed.

# Step 5: Model Evaluation
from sklearn.metrics import classification_report

# Predict labels for the held-out split and print the classification report
y_pred = model.predict(X_test_tfidf)
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

# Step 6: Prediction
def predict_category(text):
    """Predict the product category for a single raw complaint string.

    The text is cleaned with ``preprocess_text``, vectorized with the
    already-fitted ``tfidf_vectorizer``, and classified by ``model``.

    Args:
        text: Raw complaint text.

    Returns:
        The label predicted for this one complaint.
    """
    cleaned = preprocess_text(text)
    # transform() expects an iterable of documents, hence the one-item list
    features = tfidf_vectorizer.transform([cleaned])
    return model.predict(features)[0]

# Example usage: classify one hand-written complaint and show the result
complaint_text = "I am having issues with my credit card billing."
predicted_category = predict_category(text=complaint_text)
print("Predicted Category:", predicted_category)
