# Career Recommendation System - KNN Model Development

This notebook trains and evaluates a K-Nearest Neighbors (KNN) model for career recommendations based on student data.

In [None]:
# Install required packages
!pip install pandas scikit-learn matplotlib seaborn numpy

In [None]:
# Import libraries
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from google.colab import files
from sklearn.inspection import permutation_importance
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Seed NumPy's global random generator so that code drawing from np.random
# (such as the synthetic sample dataset below) gives the same values on every run
np.random.seed(42)

## 1. Load and Prepare Data

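You can either upload your own student dataset CSV or use the sample dataset generated below. An uploaded CSV must contain a `recommended_career` target column; every other column is used as a model feature and should be numeric.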

In [None]:
# Load the student dataset.
# Option 1: a CSV uploaded through the Colab file picker.
# Option 2: a reproducible synthetic sample, used whenever no usable CSV is uploaded.

TARGET_COLUMN = 'recommended_career'
DATASET_PATH = 'student_data.csv'

# Feature columns of the synthetic dataset; each one holds a rating from 1 to 5
FEATURE_COLUMNS = [
    'interests_technical',
    'interests_creative',
    'interests_social',
    'interests_investigative',
    'skills_analytical',
    'skills_communication',
    'skills_technical',
    'skills_problem_solving',
    'academic_science',
    'academic_humanities',
    'academic_commerce',
]

# Career options
CAREERS = [
    "Software Engineer", "Data Scientist", "Graphic Designer",
    "Marketing Manager", "Systems Analyst", "Content Writer",
    "Doctor", "Lawyer", "Financial Analyst", "Civil Engineer",
    "Mechanical Engineer", "Teacher", "Research Scientist",
    "HR Manager", "Entrepreneur"
]


def build_sample_dataset(n_samples=1000, seed=42):
    """Generate a synthetic student dataset with loosely correlated career labels.

    Every feature is an integer rating drawn uniformly from 1-5. A career is
    then picked at random from a group of related careers when the row's
    ratings match one of a few simple "rated above 3" rules, and from the full
    career list otherwise. This is a simplified approach - in a real dataset
    the correlations would be more complex.

    Args:
        n_samples: Number of student rows to generate.
        seed: Seed applied to NumPy's global random generator before drawing,
            so the same arguments always produce the same dataset.

    Returns:
        A pandas DataFrame with the columns in FEATURE_COLUMNS followed by the
        'recommended_career' target column.
    """
    np.random.seed(seed)  # For reproducibility

    # Draw the features in a fixed column order so the random stream is stable
    data = {name: np.random.randint(1, 6, n_samples) for name in FEATURE_COLUMNS}

    def rated_high(row, *columns):
        # True when every listed feature of this row is rated above 3
        return all(data[column][row] > 3 for column in columns)

    career_list = []
    for i in range(n_samples):
        if rated_high(i, 'interests_technical', 'skills_analytical', 'academic_science'):
            # Technical roles
            options = ["Software Engineer", "Data Scientist", "Systems Analyst", "Mechanical Engineer"]
        elif rated_high(i, 'interests_creative', 'skills_communication'):
            # Creative roles
            options = ["Graphic Designer", "Content Writer", "Marketing Manager"]
        elif rated_high(i, 'interests_social', 'academic_humanities'):
            # Social/humanities roles
            options = ["Teacher", "HR Manager", "Lawyer"]
        elif rated_high(i, 'interests_investigative', 'academic_science'):
            # Research/scientific roles
            options = ["Research Scientist", "Doctor", "Civil Engineer"]
        elif rated_high(i, 'skills_analytical', 'academic_commerce'):
            # Business/finance roles
            options = ["Financial Analyst", "Entrepreneur"]
        else:
            # Random assignment for cases not fitting above patterns
            options = CAREERS
        career_list.append(np.random.choice(options))

    # Add the target variable to the dataset
    data[TARGET_COLUMN] = career_list
    return pd.DataFrame(data)


df = None
try:
    uploaded = files.upload()
    for fn, content in uploaded.items():
        print(f'User uploaded file "{fn}" with length {len(content)} bytes')

    if uploaded:
        # Assuming the first uploaded file is the CSV
        first_file = next(iter(uploaded))
        candidate = pd.read_csv(first_file)
        # Every later cell needs the target column, so reject files without it
        if TARGET_COLUMN not in candidate.columns:
            raise ValueError(f"'{first_file}' has no '{TARGET_COLUMN}' column")
        df = candidate
        print("Dataset loaded successfully!")
    else:
        print("No file uploaded, will create sample dataset instead.")
except Exception as e:
    # Deliberately broad: the upload is best-effort, and any failure (upload
    # widget unavailable, unreadable CSV, missing target column) should fall
    # back to the sample dataset rather than stop the notebook.
    print(f"Error loading the uploaded file: {e}")

if df is None:
    print("Creating a sample dataset instead...")
    df = build_sample_dataset()

# Save whichever dataset is in use under one fixed name so that it can be
# downloaded together with the model at the end of the notebook
df.to_csv(DATASET_PATH, index=False)
print(f"Dataset saved as '{DATASET_PATH}'")
print("\nDataset shape:", df.shape)
df.head()

## 2. Exploratory Data Analysis (EDA)

In [None]:
# Summary statistics of the dataset, shown as the cell's rich output
print("\nBasic Statistics:")
summary_stats = df.describe()
summary_stats

In [None]:
# How many students fall under each recommended career
career_counts = df['recommended_career'].value_counts()

fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x=career_counts.values, y=career_counts.index, ax=ax)
ax.set_title('Distribution of Recommended Careers')
ax.set_xlabel('Count')
fig.tight_layout()
plt.show()

In [None]:
# Pairwise correlations between the numeric columns, drawn as an annotated heatmap
numeric_cols = df.select_dtypes(include=[np.number]).columns
correlation_matrix = df[numeric_cols].corr()

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='viridis', linewidths=0.5, ax=ax)
ax.set_title('Correlation Heatmap of Numeric Features')
fig.tight_layout()
plt.show()

In [None]:
# Boxplots for each feature grouped by career
# This helps visualize how different careers relate to different interest/skill levels
feature_categories = {
    'Interests': [col for col in df.columns if col.startswith('interests_')],
    'Skills': [col for col in df.columns if col.startswith('skills_')],
    'Academics': [col for col in df.columns if col.startswith('academic_')]
}

# Select top 5 careers by frequency for clearer visualization.
# The counts are computed here so this cell only depends on `df`.
top_careers = df['recommended_career'].value_counts().index[:5].tolist()
filtered_df = df[df['recommended_career'].isin(top_careers)]

for category, features in feature_categories.items():
    if not features:
        # Nothing to plot for this prefix (e.g. a custom dataset with other column names)
        print(f"No columns found for '{category}'; skipping.")
        continue

    # One stacked subplot per feature; squeeze=False keeps `axes` two-dimensional
    # even when the category has a single feature
    fig, axes = plt.subplots(len(features), 1, figsize=(15, 10), squeeze=False)
    fig.suptitle(f'Distribution of {category} by Top 5 Careers', fontsize=16)

    for ax, feature in zip(axes[:, 0], features):
        sns.boxplot(x='recommended_career', y=feature, data=filtered_df, ax=ax)
        ax.set_title(feature.replace('_', ' ').title())
        ax.tick_params(axis='x', labelrotation=45)

    # Lay the figure out once, leaving the top 5% free for the suptitle
    fig.tight_layout(rect=[0, 0, 1, 0.95])
    plt.show()

## 3. Feature Engineering and Preprocessing

In [None]:
# Separate features and target
X = df.drop('recommended_career', axis=1)
y = df['recommended_career']

# Split the data into training and testing sets.
# Stratifying keeps each career's share the same in both sets, which matters
# with many, unevenly sized classes. Stratification needs at least two rows
# per class, so fall back to a plain random split when a career occurs once.
stratify_labels = y if y.value_counts().min() >= 2 else None
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=stratify_labels
)

print(f"Training set: {X_train.shape[0]} samples")
print(f"Testing set: {X_test.shape[0]} samples")

In [None]:
# Standardise the features; the scaler learns its statistics from the training split only
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Persist the fitted scaler so it can be reused outside this notebook
joblib.dump(scaler, 'career_scaler.pkl')

## 4. Model Development - K-Nearest Neighbors (KNN)

In [None]:
# Find the optimal value of k using cross-validation.
# The scaler is wrapped in a Pipeline with the classifier so that it is
# re-fitted on the training part of every fold. Cross-validating on data that
# was scaled once with statistics from the whole training set would let each
# validation fold influence its own preprocessing.
k_values = list(range(1, 31, 2))
cross_val_scores = []

for k in k_values:
    candidate = Pipeline([
        ('scaler', StandardScaler()),
        ('knn', KNeighborsClassifier(n_neighbors=k)),
    ])
    scores = cross_val_score(candidate, X_train, y_train, cv=5)
    cross_val_scores.append(scores.mean())

# Plot k values vs accuracy
plt.figure(figsize=(10, 6))
plt.plot(k_values, cross_val_scores, 'o-')
plt.xlabel('Number of Neighbors (k)')
plt.ylabel('Cross-Validation Accuracy')
plt.title('Optimal k Value')
plt.grid(True)
plt.show()

# Find the best k (np.argmax returns the first maximum, i.e. the smallest k on ties)
best_index = int(np.argmax(cross_val_scores))
best_k = k_values[best_index]
print(f"Best k value: {best_k} with accuracy: {cross_val_scores[best_index]:.4f}")

In [None]:
# Fit the final classifier with the selected number of neighbours
knn = KNeighborsClassifier(n_neighbors=best_k).fit(X_train_scaled, y_train)

# Predict a career for every row of the held-out test set
y_pred = knn.predict(X_test_scaled)

# Persist the fitted model to disk
joblib.dump(knn, 'career_knn_model.pkl')

## 5. Model Evaluation

In [None]:
# Overall accuracy on the held-out test set
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")

# Per-class metrics as formatted text
report_text = classification_report(y_test, y_pred)
print("\nClassification Report:", report_text, sep="\n")

In [None]:
# Confusion matrix
# Create a list of unique classes and pass it as `labels` so the matrix always
# has one row/column per class in exactly this order. Without it the matrix
# only covers classes that occur in y_test or y_pred, and the tick labels
# below could be attached to the wrong rows and columns.
classes = sorted(y.unique())
cm = confusion_matrix(y_test, y_pred, labels=classes)

# Generate the confusion matrix heatmap
plt.figure(figsize=(14, 12))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=classes,
            yticklabels=classes)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Feature importance analysis for KNN
# KNN exposes no importances of its own, so use the permutation importance
# technique: shuffle one feature at a time on the test set and record how much
# the model's score changes over the repeats.
result = permutation_importance(
    knn, X_test_scaled, y_test, n_repeats=10, random_state=42
)

# Sort features by importance (ascending, so the largest bar ends up on top)
sorted_idx = result.importances_mean.argsort()
feature_names = X.columns

# Plot feature importances; the error bars show the spread across the repeats
plt.figure(figsize=(10, 8))
plt.barh(
    range(len(sorted_idx)),
    result.importances_mean[sorted_idx],
    xerr=result.importances_std[sorted_idx],
    align='center',
)
plt.yticks(range(len(sorted_idx)), [feature_names[i] for i in sorted_idx])
plt.xlabel('Permutation Importance')
plt.title('Feature Importance for KNN Model')
plt.tight_layout()
plt.show()

## 6. Making Predictions with the Model

In [None]:
# Function to make predictions with the model
def predict_career(interests_technical, interests_creative, interests_social, interests_investigative,
                  skills_analytical, skills_communication, skills_technical, skills_problem_solving,
                  academic_science, academic_humanities, academic_commerce, top_n=5):
    """Recommend careers for a single student.

    Uses the notebook-level `scaler` and `knn` objects, so both must have been
    fitted before this function is called.

    Args:
        interests_technical, interests_creative, interests_social,
        interests_investigative: Interest ratings (the example below uses a 1-5 scale).
        skills_analytical, skills_communication, skills_technical,
        skills_problem_solving: Skill ratings on the same scale.
        academic_science, academic_humanities, academic_commerce: Academic
            ratings on the same scale.
        top_n: How many of the highest-scoring careers to return (default 5).

    Returns:
        A tuple `(prediction, top_careers)` where `prediction` is the single
        career returned by `knn.predict` and `top_careers` is a list of
        `(career, percentage)` tuples sorted from highest to lowest percentage.
    """
    # One-row DataFrame whose column names match the training features
    feature_values = {
        'interests_technical': interests_technical,
        'interests_creative': interests_creative,
        'interests_social': interests_social,
        'interests_investigative': interests_investigative,
        'skills_analytical': skills_analytical,
        'skills_communication': skills_communication,
        'skills_technical': skills_technical,
        'skills_problem_solving': skills_problem_solving,
        'academic_science': academic_science,
        'academic_humanities': academic_humanities,
        'academic_commerce': academic_commerce,
    }
    input_data = pd.DataFrame([feature_values])

    # Apply the same scaling that was used for the training data
    input_scaled = scaler.transform(input_data)

    # Get predicted career
    prediction = knn.predict(input_scaled)[0]

    # For KNN, predict_proba returns the fraction of neighbors from each class
    probabilities = knn.predict_proba(input_scaled)[0]

    # Pair every class with its probability as a percentage, then keep the best top_n
    career_probs = [(career, prob * 100) for career, prob in zip(knn.classes_, probabilities)]
    top_careers = sorted(career_probs, key=lambda item: item[1], reverse=True)[:top_n]

    return prediction, top_careers

# Example prediction for one hypothetical student
# Values are on scale of 1-5
example_student = {
    'interests_technical': 4,
    'interests_creative': 3,
    'interests_social': 2,
    'interests_investigative': 5,
    'skills_analytical': 5,
    'skills_communication': 3,
    'skills_technical': 4,
    'skills_problem_solving': 5,
    'academic_science': 5,
    'academic_humanities': 2,
    'academic_commerce': 3,
}
prediction, top_careers = predict_career(**example_student)

# Report the single best match followed by the ranked list
print(f"Top predicted career: {prediction}\n")
print("Top 5 career recommendations:")
for career, prob in top_careers:
    print(f"{career}: {prob:.1f}%")

In [None]:
# Bar chart of the example student's recommendations; the first (best) bar is highlighted
careers, scores = zip(*top_careers)
colors = ['#FF9999'] + ['#99CCFF'] * (len(top_careers) - 1)

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(careers, scores, color=colors)
ax.set_title('Top 5 Career Recommendations')
ax.set_xlabel('Career')
ax.set_ylabel('Confidence Score (%)')
plt.xticks(rotation=45, ha='right')
ax.set_ylim(0, 100)

# Write each percentage just above its bar
for position, score in enumerate(scores):
    ax.text(position, score + 1, f"{score:.1f}%", ha='center')

fig.tight_layout()
plt.show()

## 7. Download the Trained Model and Scaler

In [None]:
# Download the trained model, the scaler and the dataset for use in the Flask API.
# Files that are missing (e.g. because an earlier cell was not run) are skipped
# with a message instead of letting the first failure abort the remaining downloads.
for artifact in ('career_knn_model.pkl', 'career_scaler.pkl', 'student_data.csv'):
    if os.path.exists(artifact):
        files.download(artifact)
    else:
        print(f"Skipping '{artifact}': file not found.")

## 8. Conclusion

This notebook has demonstrated:

1. **Data Preparation**: Creating/loading and preprocessing student data
2. **Exploratory Data Analysis**: Visualizing distributions and relationships
3. **Model Training**: Finding the optimal K value and training a KNN model
4. **Model Evaluation**: Calculating accuracy, confusion matrix, and other metrics
5. **Feature Importance**: Identifying which features most influence career recommendations
6. **Making Predictions**: Demonstrating how to use the model for new students

For the web application:
1. Download the model (`career_knn_model.pkl`) and scaler (`career_scaler.pkl`)
2. Place them in your Flask app directory as described in `SETUP-GUIDE.md`
3. The Flask API will load these files to make predictions for new users

The model can be improved over time by:
1. Collecting more real student data
2. Experimenting with different algorithms (Random Forest, SVM, etc.)
3. Adding more nuanced features about student preferences