# ChatGPT Prompt Classifier

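This notebook trains and evaluates a classifier that assigns ChatGPT prompts to role categories. We load and clean the prompt data, derive a category label for each prompt from its `act` role name using keyword rules, extract hand-crafted and TF-IDF features from the prompt text, and train a random forest model to predict the category.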

In [None]:
# Import required libraries
import re
from collections import Counter
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Set style for plots
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8*' and later
# releases dropped the old names, so prefer the new name when it is available and
# fall back to the legacy one on older installations.
_seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(_seaborn_style)
sns.set_palette('husl')

## 1. Data Loading and Preprocessing

In [None]:
# Read the prompt dataset; the path is relative to the working directory
DATA_FILE = 'ChatGPT Prompts.csv'
df = pd.read_csv(DATA_FILE)

# Report the (rows, columns) shape, then preview the first rows as the cell output
print("Dataset shape:", df.shape)
df.head()

In [None]:
# Clean text function
def clean_text(text):
    """Normalize raw text for feature extraction.

    Every character that is neither a word character nor whitespace is replaced
    by a space, runs of whitespace are collapsed to a single space, and the
    result is stripped and lowercased.

    Missing values (``None`` or a float NaN, e.g. an empty cell loaded by
    ``pd.read_csv``) yield an empty string instead of the literal tokens
    ``'none'`` / ``'nan'``, which would otherwise leak into the text features.

    Args:
        text: Value to clean. Non-string values are converted with ``str()``.

    Returns:
        str: The cleaned, lowercased text.
    """
    # NaN is the only float that is not equal to itself
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = re.sub(r'[^\w\s]', ' ', str(text))
    text = re.sub(r'\s+', ' ', text)
    return text.strip().lower()

# Add a cleaned copy of each raw text column: 'prompt' -> 'clean_prompt', 'act' -> 'clean_act'
for raw_col, clean_col in (('prompt', 'clean_prompt'), ('act', 'clean_act')):
    df[clean_col] = df[raw_col].apply(clean_text)

## 2. Role Categorization

In [None]:
def categorize_role(role):
    """Categorize a role name into one of the main role categories.

    Categories are tried in the order technical, creative, educational,
    professional, service; the first category with a matching keyword wins.

    Most keywords match anywhere inside the role name (so 'tech' also matches
    'technical' and 'sql' matches 'mysql'). The short keywords 'it' and 'api'
    must match a whole word: as plain substrings they occur inside unrelated
    words such as 'writer', 'critic' or 'therapist' and would misfile those
    roles as technical.

    Args:
        role (str): Role name; matching is case-insensitive.

    Returns:
        str: 'technical', 'creative', 'educational', 'professional',
        'service', or 'other' when no keyword matches.
    """
    role = role.lower()

    categories = {
        'technical': [
            'developer', 'programmer', 'engineer', 'coder', 'terminal', 'tech', 'sql', 'database',
            'python', 'javascript', 'web', 'software', 'api', 'system', 'computer', 'it', 'cyber',
            'security', 'infrastructure'
        ],
        'creative': [
            'writer', 'artist', 'designer', 'composer', 'creator', 'generator', 'storyteller',
            'poet', 'screenwriter', 'novelist', 'content', 'creative', 'music', 'visual'
        ],
        'educational': [
            'teacher', 'tutor', 'instructor', 'coach', 'trainer', 'educator', 'mentor',
            'guide', 'advisor', 'counselor', 'learning', 'teaching'
        ],
        'professional': [
            'analyst', 'researcher', 'scientist', 'consultant', 'manager', 'executive',
            'business', 'marketing', 'sales', 'financial', 'accountant', 'entrepreneur',
            'strategist', 'data', 'metrics', 'performance'
        ],
        'service': [
            'assistant', 'helper', 'support', 'service', 'translator', 'interpreter',
            'editor', 'proofreader', 'coordinator', 'planner', 'organizer'
        ]
    }

    # Keywords that are too short/ambiguous to be matched as substrings
    whole_word_only = {'it', 'api'}
    role_words = set(re.findall(r'\w+', role))

    def matches(keyword):
        """Return True when the keyword applies to the current role name."""
        if keyword in whole_word_only:
            return keyword in role_words
        return keyword in role

    for category, keywords in categories.items():
        if any(matches(keyword) for keyword in keywords):
            return category

    return 'other'

# Label every prompt with the category derived from its cleaned role name
df['role_category'] = df['clean_act'].apply(categorize_role)

# Number of prompts per category
category_counts = df['role_category'].value_counts()

# Bar chart of the category distribution
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=category_counts.index, y=category_counts.values, ax=ax)
ax.set_title('Distribution of Role Categories')
ax.set_xlabel('Category')
ax.set_ylabel('Count')
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
plt.show()

# Same numbers as text, below the chart
print("\nRole category distribution:")
print(category_counts)

## 3. Feature Extraction

In [None]:
def extract_features(text):
    """Extract hand-crafted numeric features from a prompt.

    The keyword scores count case-sensitive, lowercase substrings (e.g. 'teach'
    also counts inside 'teacher'), so ``text`` should already be lowercased.

    Args:
        text (str): Prompt text.

    Returns:
        dict: Feature name -> value with the keys
            'word_count'       - number of whitespace-separated words
            'avg_word_length'  - mean word length, 0.0 for text without words
            'analysis_score'   - occurrences of 'analyze', 'evaluate', 'assess'
            'teaching_score'   - occurrences of 'teach', 'explain', 'guide'
            'creativity_score' - occurrences of 'create', 'design', 'generate'
    """
    words = text.split()
    word_lengths = [len(word) for word in words]

    features = {
        'word_count': len(words),
        # np.mean([]) emits a RuntimeWarning and returns NaN; report 0.0 for
        # empty text instead (the notebook replaces NaN features with 0 anyway).
        'avg_word_length': np.mean(word_lengths) if word_lengths else 0.0,
        'analysis_score': text.count('analyze') + text.count('evaluate') + text.count('assess'),
        'teaching_score': text.count('teach') + text.count('explain') + text.count('guide'),
        'creativity_score': text.count('create') + text.count('design') + text.count('generate')
    }
    return features

# Hand-crafted features: one dict per cleaned prompt, collected into a frame
print("Extracting features...")
features_list = list(map(extract_features, df['clean_prompt']))
features_df = pd.DataFrame(features_list)

# TF-IDF features: top 100 terms of the cleaned prompts, English stop words removed
# NOTE(review): the vectorizer (and the scaler below) are fitted on the full
# dataset, i.e. before the train/test split done later in the notebook.
print("\nCreating TF-IDF features...")
tfidf = TfidfVectorizer(max_features=100, stop_words='english')
prompt_tfidf = tfidf.fit_transform(df['clean_prompt'])
tfidf_df = pd.DataFrame(
    prompt_tfidf.toarray(),
    columns=tfidf.get_feature_names_out(),
)

# Feature matrix: hand-crafted columns first, then TF-IDF columns; NaN -> 0
X = pd.concat([features_df, tfidf_df], axis=1).fillna(0).values

# Target: role categories encoded as integers
le = LabelEncoder()
y = le.fit_transform(df['role_category'])

# Standardize every feature column
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

## 4. Model Training and Evaluation

In [None]:
# Create the model
model = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    class_weight='balanced',
    random_state=42
)

# Perform 3-fold cross-validation on the full feature matrix
cv_scores = cross_val_score(model, X_scaled, y, cv=3)
print("Cross-validation scores:", cv_scores)
print("Average CV score:", cv_scores.mean())

# Train final model on a stratified 80/20 split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)
model.fit(X_train, y_train)

# Make predictions on the held-out 20%
y_pred = model.predict(X_test)

# Print classification report
# Pass every encoded label explicitly: without `labels`, the report infers the
# classes from y_test/y_pred and raises a ValueError when a category is absent
# there, because target_names would no longer match in length.
# zero_division=0 keeps such empty categories from triggering warnings.
report_labels = list(range(len(le.classes_)))
print("\nClassification Report:")
print(classification_report(
    y_test,
    y_pred,
    labels=report_labels,
    target_names=le.classes_,
    zero_division=0
))

## 5. Visualization and Analysis

In [None]:
# Plot confusion matrix
# labels= pins the rows/columns to every encoded category in le.classes_ order.
# Without it, a category missing from both y_test and y_pred would shrink the
# matrix and the tick labels below would be attached to the wrong rows/columns.
plt.figure(figsize=(10, 8))
cm = confusion_matrix(y_test, y_pred, labels=list(range(len(le.classes_))))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=le.classes_,
            yticklabels=le.classes_)
plt.title('Confusion Matrix')
plt.ylabel('True Category')
plt.xlabel('Predicted Category')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Feature importances, named in the column order of the feature matrix
# (hand-crafted features first, then the TF-IDF terms)
feature_names = [*features_df.columns, *tfidf_df.columns]
importances = pd.Series(model.feature_importances_, index=feature_names)

# Bar chart of the 20 largest importances
fig, ax = plt.subplots(figsize=(12, 6))
importances.nlargest(20).plot(kind='bar', ax=ax)
ax.set_title('Top 20 Most Important Features')
ax.set_xlabel('Features')
ax.set_ylabel('Importance')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()

## 6. Save Model and Preprocessors

In [None]:
# Save model and preprocessors
# joblib.dump does not create missing directories, so make sure the target
# folder exists before writing (a fresh checkout has no 'models/' directory).
Path('models').mkdir(parents=True, exist_ok=True)

joblib.dump(model, 'models/prompt_classifier.joblib')
joblib.dump(tfidf, 'models/tfidf_vectorizer.joblib')
joblib.dump(scaler, 'models/feature_scaler.joblib')
joblib.dump(le, 'models/label_encoder.joblib')

print("Model and preprocessors saved successfully!")

## 7. Example Predictions

In [None]:
def predict_category(prompt):
    """Predict the role category of a single new prompt.

    Applies the same steps used to build the training matrix: clean the text,
    compute the hand-crafted features, transform with the fitted TF-IDF
    vectorizer, concatenate (hand-crafted columns first), replace NaN with 0
    and scale with the fitted scaler.

    Relies on the notebook-level objects ``clean_text``, ``extract_features``,
    ``tfidf``, ``scaler``, ``model`` and ``le``.

    Args:
        prompt: Raw prompt text.

    Returns:
        tuple: ``(category, confidence)`` - the decoded category label and the
        model's predicted probability for that category.
    """
    # Clean text
    clean_prompt = clean_text(prompt)

    # Hand-crafted features as a one-row frame
    handcrafted_df = pd.DataFrame([extract_features(clean_prompt)])

    # TF-IDF features from the already fitted vectorizer
    prompt_tfidf = tfidf.transform([clean_prompt])
    terms_df = pd.DataFrame(prompt_tfidf.toarray(), columns=tfidf.get_feature_names_out())

    # Combine in the same column order as the training matrix
    X = pd.concat([handcrafted_df, terms_df], axis=1)
    X = X.fillna(0).values

    # Scale features
    X_scaled = scaler.transform(X)

    # Make prediction
    pred = model.predict(X_scaled)
    prob = model.predict_proba(X_scaled)

    # predict_proba columns follow model.classes_, which only equals 0..k-1 when
    # every encoded label was present in the training data; look the column up
    # instead of using the label value itself as an index.
    class_index = list(model.classes_).index(pred[0])

    return le.inverse_transform(pred)[0], prob[0][class_index]

# Sample prompts to run through the trained pipeline
test_prompts = [
    "Act as a Python developer and help me write clean code",
    "Be a creative writer and help me write a story",
    "Act as a math teacher and explain calculus",
    "Be a business consultant and analyze my startup idea",
    "Act as a helpful assistant and organize my schedule"
]

# Pair each prompt lazily with its (category, confidence) prediction and report it
print("Example Predictions:")
predictions = ((text, predict_category(text)) for text in test_prompts)
for prompt, (category, confidence) in predictions:
    print(f"\nPrompt: {prompt}")
    print(f"Predicted Category: {category}")
    print(f"Confidence: {confidence:.2f}")