In [15]:
import pandas as pd
import seaborn as sb
import matplotlib as mp
import numpy as np

df = pd.read_csv('education_career_success.csv') # make sure dataset is in notebook folder


In [16]:
# Creating new columns that we will use for prediction
df['Risk_Score'] = (10 - df['Career_Satisfaction']) + (10 - df['Work_Life_Balance']) + (df['Years_to_Promotion'] - 1)
df['Instability_Flag'] = ((df['Job_Offers'] < 1) | (df['Starting_Salary'] < 40000) | (df['University_Ranking'] > 700)).astype(int)
df['Change_Tendency'] = ((df['Entrepreneurship'] == 'Yes') | (df['Projects_Completed'] > 5)).astype(int)

In [17]:
# Creating target variables
df['Crisis_Age'] = df['Age'] + np.random.randint(10, 20) - (df['Risk_Score'] // 3)
df['Crisis_Intensity'] = np.clip((df['Risk_Score'] // 2) + df['Instability_Flag']*2, 1, 5)

def assign_crisis_form(row):
    if row['Change_Tendency']:
        return np.random.choice(['Career change', 'Entrepreneurship', 'Extreme hobby'])
    elif row['Field_of_Study'] in ['Business', 'Finance']:
        return np.random.choice(['Extreme purchase', 'Weird hobby'])
    else:
        return np.random.choice(['Breakup', 'Travel adventure', 'Mild career shift'])

df['Crisis_Form'] = df.apply(assign_crisis_form, axis=1)

In [19]:
# Model building
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Pick features and target
X = df[['High_School_GPA', 'University_GPA', 'Internships_Completed', 'Risk_Score', 'Soft_Skills_Score', 'Networking_Score']]
y = df['Crisis_Intensity']

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a model
model = RandomForestClassifier()
model.fit(X_train, y_train)

# See how well it does
print("Model Accuracy:", model.score(X_test, y_test))

Model Accuracy: 0.825


In [20]:
# is the model good

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

y_pred = model.predict(X_test)
print("MAE:", mean_absolute_error(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))
print("R2 Score:", r2_score(y_test, y_pred))

MAE: 0.264
MSE: 0.442
R2 Score: 0.3880963983520135


In [23]:
# Random Forest Regressor

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
# Generate a synthetic crisis_intensity score
np.random.seed(42)
crisis_intensity = (
    (4 - df['Work_Life_Balance']) * 0.3 + 
    (10 - df['Career_Satisfaction']) * 0.4 +
    (4.0 - df['University_GPA']) * 0.3 +
    np.random.normal(0, 0.2, len(df))  # add a little randomness
)

# Clip the values between 0 and 10, like a rating scale
crisis_intensity = np.clip(crisis_int)

# features and target
X = df[['University_GPA', 'Soft_Skills_Score', 'Networking_Score', 'Career_Satisfaction', 'Work_Life_Balance']]
y = crisis_intensity  # the same synthetic target we generated

# split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# predictions
y_pred = rf_model.predict(X_test)

# metrics
print("MAE:", mean_absolute_error(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))
print("R2 Score:", r2_score(y_test, y_pred))


NameError: name 'crisis_int' is not defined