# Modeling
## 1. Select modeling techniques
## 2. Generate test design
## 3. Build model
## 4. Assess model

In [6]:
# StackOverflow Developer Survey - Modeling
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score, learning_curve
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib
import json
import warnings
warnings.filterwarnings('ignore')

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

In [7]:
print("Linear Regression Modeling for Developer Salary Prediction")
print("=" * 60)

# 1. Load processed data from feature engineering
print("\n1. Loading the dataset...")
X_train = pd.read_csv('../data/processed/X_train_processed.csv')
X_test = pd.read_csv('../data/processed/X_test_processed.csv')
y_train = pd.read_csv('../data/processed/y_train.csv').squeeze()
y_test = pd.read_csv('../data/processed/y_test.csv').squeeze()

# Load preprocessing objects and metadata
scaler = joblib.load('../data/processed/feature_scaler.pkl')
selector = joblib.load('../data/processed/feature_selector.pkl')

with open('../data/processed/feature_engineering_metadata.json', 'r') as f:
    metadata = json.load(f)

print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")
print(f"Target variable range: ${y_train.min():,.0f} - ${y_train.max():,.0f}")
print(f"Target variable mean: ${y_train.mean():,.0f}")

Linear Regression Modeling for Developer Salary Prediction

1. Loading the dataset...
Training set shape: (19142, 100)
Test set shape: (4786, 100)
Target variable range: $1 - $244,319
Target variable mean: $86,782


In [8]:
# Display feature importance from feature selection
print(f"\nUsing top {len(metadata['selected_features'])} features selected by F-regression")
print("Top 10 most important features:")
for i, feature_info in enumerate(metadata['feature_importance'][:10]):
    print(f"  {i+1}. {feature_info['feature']}: F-score = {feature_info['f_score']:.2f}")


Using top 100 features selected by F-regression
Top 10 most important features:
  1. Country_frequency: F-score = 7290.73
  2. YearsCode: F-score = 3247.87
  3. WorkExp: F-score = 2641.36
  4. SODuration_Between 10 and 15 years: F-score = 943.55
  5. SODuration_More than 15 years, or since Stack Overflow started in 2008: F-score = 897.61
  6. SODuration_Between 3 and 5 years: F-score = 816.72
  7. Employment_Student: F-score = 644.58
  8. PlatformHaveWorkedWith_Terraform: F-score = 639.05
  9. Age_35-44 years old: F-score = 603.43
  10. PlatformHaveWorkedWith_Homebrew: F-score = 581.18


In [9]:
# 2. Train Linear Regression Model
print("\n2. Training Linear Regression Model...")

# Initialize and train the model
model = LinearRegression()
model.fit(X_train, y_train)


2. Training Linear Regression Model...


In [10]:
coefficients = model.coef_

feature_names = [
    'WorkExp',
    'YearsCOde',
    'DevType',
    'Industry',
    'Language_Satisfaction',
    'Database_Satisfaction',
    'CommPlatform_Satisfaction'
]
for feature, coef in zip(feature_names, coefficients):
    print(f"Coefficient for {feature}: {coef}")

Coefficient for WorkExp: 6825.237018587604
Coefficient for YearsCOde: 10492.46795287326
Coefficient for DevType: -221.28719446071818
Coefficient for Industry: 1830.3848046641324
Coefficient for Language_Satisfaction: -1844.0738573424414
Coefficient for Database_Satisfaction: 434.4815858098136
Coefficient for CommPlatform_Satisfaction: -650.9818891310479


In [11]:
# Make predictions
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Calculate metrics
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
train_mae = mean_absolute_error(y_train, y_train_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

print("Linear Regression Results:")
print(f"  Training RMSE: ${train_rmse:,.0f}")
print(f"  Test RMSE: ${test_rmse:,.0f}")
print(f"  Training MAE: ${train_mae:,.0f}")
print(f"  Test MAE: ${test_mae:,.0f}")
print(f"  Training R²: {train_r2:.4f}")
print(f"  Test R²: {test_r2:.4f}")

Linear Regression Results:
  Training RMSE: $44,844
  Test RMSE: $44,651
  Training MAE: $32,982
  Test MAE: $32,963
  Training R²: 0.5186
  Test R²: 0.5252


In [None]:
# Cross-validation
cv_scores = cross_val_score(model, X_train, y_train, 
                           cv=5, scoring='neg_mean_squared_error')
cv_rmse = np.sqrt(-cv_scores)
print(f"  Cross-validation RMSE: ${cv_rmse.mean():,.0f} ± ${cv_rmse.std():,.0f}")

In [None]:
# 3. Feature Importance Analysis
print("\n3. Feature Importance Analysis...")

# Get feature importance (coefficients) from the model
feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'coefficient': model.coef_,
    'abs_coefficient': np.abs(model.coef_)
}).sort_values('abs_coefficient', ascending=False)

print("Top 15 Most Important Features:")
for i, (_, row) in enumerate(feature_importance.head(15).iterrows()):
    direction = "↑" if row['coefficient'] > 0 else "↓"
    print(f"  {i+1:2d}. {row['feature']:<35} {direction} {row['coefficient']:8.0f}")


In [None]:
# Analyze career satisfaction features specifically
satisfaction_features = [f for f in feature_importance['feature'] 
                       if any(keyword in f for keyword in ['Satisfaction', 'Difference', 'Ratio'])]

if satisfaction_features:
    print("\nCareer Satisfaction Features Impact:")
    for feature in satisfaction_features:
        coef = feature_importance[feature_importance['feature'] == feature]['coefficient'].iloc[0]
        direction = "positive" if coef > 0 else "negative"
        print(f"  {feature}: ${coef:,.0f} ({direction} impact on salary)")


In [None]:
# 4. Model Diagnostics and Visualizations
print("\n4. Creating Model Diagnostics...")

# Create comprehensive visualization
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Linear Regression Model Diagnostics', fontsize=16, fontweight='bold')
