# HR: Workforce Forecasting & Productivity Analysis


In [None]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score, classification_report
# Load the two input tables used by the rest of the notebook:
#   wf  - workforce history and drivers (read from the first CSV)
#   emp - employee productivity snapshot (read from the second CSV)
wf, emp = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/workforce_history_and_drivers.csv',
        '../data/employee_productivity_snapshot.csv',
    )
)
# Preview the first rows of the workforce table.
wf.head()

## Baseline Forecast for Next 6 Months per Department

In [None]:
# Baseline forecast: fit one linear regression per department on its own
# history, then extrapolate `future_months` months ahead. The latest observed
# driver values are carried forward, with two scripted adjustments:
#   * `projects` gets +1 on every third month ahead,
#   * `productivity_index` is scaled by a +/-10% sinusoidal seasonal factor.
wf['date'] = pd.to_datetime(wf['date'])
future_months = 6
feature_cols = ['current_headcount', 'projects', 'attrition_rate', 'productivity_index', 't']
forecasts = []
for d, g in wf.groupby('department'):
    g = g.sort_values('date').copy()
    g['t'] = np.arange(len(g))  # linear time-trend index within the department
    model = LinearRegression().fit(g[feature_cols], g['required_headcount'])

    last_row = g.iloc[-1]
    steps = np.arange(1, future_months + 1)
    # Seasonal factor keyed on the calendar month being forecast.
    season = 1 + 0.1 * np.sin(2 * np.pi * ((last_row['date'].month + steps) % 12) / 12.0)
    # Build all future rows at once as a DataFrame with the training column
    # names: predicting from a bare list after fitting on a DataFrame makes
    # scikit-learn warn about missing feature names on every call.
    future = pd.DataFrame({
        'current_headcount': last_row['current_headcount'],
        'projects': np.maximum(0, last_row['projects'] + (steps % 3 == 0).astype(int)),
        'attrition_rate': last_row['attrition_rate'],
        'productivity_index': last_row['productivity_index'] * season,
        't': last_row['t'] + steps,
    })
    preds = model.predict(future[feature_cols])
    forecasts.extend(
        {
            'department': d,
            'month_ahead': int(k),
            # Linear extrapolation can go negative; a headcount cannot.
            'forecast_required_headcount': max(0, int(round(float(yhat)))),
        }
        for k, yhat in zip(steps, preds)
    )
# Explicit columns keep the frame's schema stable even when `wf` has no rows,
# so later cells that filter on `month_ahead` do not hit a KeyError.
forecast_df = pd.DataFrame(
    forecasts,
    columns=['department', 'month_ahead', 'forecast_required_headcount'],
)
forecast_df.head(12)

## Employee Productivity Models

In [None]:
# Linear regression of kpi_score on five employee attributes, evaluated on a
# 25% hold-out split (fixed seed for reproducibility).
X = emp[['tenure_years','remote_ratio','sick_days_last_quarter','training_hours_last_quarter','overtime_hours_last_quarter']]
y = emp['kpi_score']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
reg = LinearRegression().fit(X_train, y_train)
pred = reg.predict(X_test)
# Report train and test error side by side. The first line is the TRAINING
# error, so it is labelled as such rather than as a bare "MAE".
print('MAE (train):', mean_absolute_error(y_train, reg.predict(X_train)))
print('MAE (test):', mean_absolute_error(y_test, pred))
print('R2 (test):', r2_score(y_test, pred))

# Binary label: 1 when kpi_score is at least 85, otherwise 0.
emp['is_high_performer'] = emp['kpi_score'].ge(85).astype(int)
# The classifier reuses the regression feature matrix X unchanged.
Xc = X
yc = emp['is_high_performer']
# 25% hold-out with a fixed seed, stratified on the label.
Xc_train, Xc_test, yc_train, yc_test = train_test_split(
    Xc,
    yc,
    test_size=0.25,
    random_state=42,
    stratify=yc,
)
clf = LogisticRegression(max_iter=200)
clf.fit(Xc_train, yc_train)
yc_pred = clf.predict(Xc_test)
report = classification_report(yc_test, yc_pred)
print(report)

## Hiring Plan Table (Next 3 Months)

In [None]:
# Most recent row per department (by date), reduced to the headcount columns.
latest = (
    wf.sort_values('date')
      .groupby('department')
      .tail(1)
      .loc[:, ['department', 'current_headcount']]
)
# Keep only the first three forecast months and attach the current headcount.
near_term = forecast_df.loc[forecast_df['month_ahead'] <= 3]
plan = near_term.merge(latest, on='department', how='left')
# Hiring gap = forecast requirement minus current staff, floored at zero
# (a surplus shows up as 0, not as a negative number).
shortfall = plan['forecast_required_headcount'] - plan['current_headcount']
plan['gap_to_hire'] = shortfall.clip(lower=0)
# Show the largest gaps first within each month.
plan.sort_values(by=['month_ahead', 'gap_to_hire'], ascending=[True, False]).head(20)

Next steps: upgrade this baseline to Vertex AI Forecasting or BigQuery ML, and add planning constraints (budget, hiring velocity) to the hiring plan.