Midterm Regression Model:

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    r2_score,
    mean_squared_error,
    accuracy_score,
    classification_report
)

# Seed handed to randomized steps (e.g. the train/test split) for repeatable runs.
RANDOM_STATE = 42

# Replace with your real path if different
df = pd.read_csv(".../DS3001_Midterm/social_media_vs_productivity.csv")

# Treat blank cells and the common "not available" spellings as missing values.
missing_tokens = ["", "NA", "na", "N/A"]
df = df.replace(missing_tokens, np.nan)

# Make sure boolean-like columns are real (nullable) booleans.
# Text values are trimmed and lower-cased before the lookup so spellings such
# as "true", "TRUE", " False ", "1" or "0" are recognised instead of silently
# turning into missing values. Integer 1/0 need no keys of their own: they
# compare and hash equal to True/False, so they hit the same entries.
# Anything still not found in the lookup becomes <NA>.
# NOTE(review): <NA> left in these columns is later fed to the categorical
# imputer -- confirm the installed scikit-learn accepts pandas.NA there.
bool_lookup = {
    True: True,
    False: False,
    "true": True,
    "false": False,
    "1": True,
    "0": False,
}
for c in ["uses_focus_apps", "has_digital_wellbeing_enabled"]:
    if c in df.columns:
        # Only strings are normalised; bools, numbers and NaN pass through as-is.
        normalized = df[c].map(
            lambda v: v.strip().lower() if isinstance(v, str) else v
        )
        df[c] = normalized.map(bool_lookup).astype("boolean")

# Force the regression target and the key numeric features to a numeric dtype.
# Entries that cannot be parsed as numbers are turned into NaN (errors="coerce").
numeric_candidates = (
    "actual_productivity_score",
    "perceived_productivity_score",
    "daily_social_media_time",
    "work_hours_per_day",
    "sleep_hours",
    "number_of_notifications",
    "coffee_consumption_per_day",
    "screen_time_before_sleep",
    "job_satisfaction_score",
    "weekly_offline_hours",
    "days_feeling_burnout_per_month",
    "breaks_during_work",
)
for col in numeric_candidates:
    # Columns absent from this dataset are skipped rather than raising.
    if col not in df.columns:
        continue
    df[col] = pd.to_numeric(df[col], errors="coerce")

# Light capping to reduce extreme noise: each listed column is clamped to the
# inclusive (lower, upper) range given here.
cap_rules = {
    "sleep_hours": (0, 16),
    "daily_social_media_time": (0, 16),
    "work_hours_per_day": (0, 16),
    "number_of_notifications": (0, 300),
    "coffee_consumption_per_day": (0, 15),
}
for col, bounds in cap_rules.items():
    # Skip rules for columns this dataset does not have.
    if col not in df.columns:
        continue
    lower, upper = bounds
    df[col] = df[col].clip(lower=lower, upper=upper)

# =========================
# 1. Drop rows with missing regression target
# =========================
# Remember the row count first so the number of removed rows can be reported.
n0 = df.shape[0]
has_target = df["actual_productivity_score"].notna()
# .copy() gives an independent frame so later column assignments are safe.
df = df.loc[has_target].copy()
print(f"Dropped {n0 - len(df)} rows with missing actual_productivity_score")

# =========================
# 2. Define features and targets
# =========================
# Regression: predict the actual score from every other column.
# X_reg is built BEFORE low_prod is added to df, so the derived label is
# not part of the feature matrix.
target_col = "actual_productivity_score"
y_reg = df[target_col]
X_reg = df.drop(columns=[target_col])

# Classification label: 1 when the actual score is below 5 (low productivity),
# otherwise 0. The classifier shares the regression feature set.
df["low_prod"] = y_reg.lt(5).astype(int)
y_clf = df["low_prod"]
X_clf = X_reg.copy()

# Split feature names by dtype so each group gets its own preprocessing.
numeric_part = X_reg.select_dtypes(include=[np.number])
categorical_part = X_reg.select_dtypes(include=["object", "category", "boolean"])
num_cols = numeric_part.columns.tolist()
cat_cols = categorical_part.columns.tolist()

# =========================
# 3. Preprocessing
# =========================
# Numeric features: fill gaps with the column median, then standardize.
num_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# handle scikit versions: newer releases take 'sparse_output', older ones
# only know 'sparse' and raise TypeError on the unknown keyword.
try:
    dense_ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    dense_ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)

# Categorical features: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
cat_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", dense_ohe),
])

# Route each column group through its matching pipeline.
preprocessor = ColumnTransformer(transformers=[
    ("num", num_pipe, num_cols),
    ("cat", cat_pipe, cat_cols),
])

# =========================
# 4. Linear Regression
# =========================
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable.
reg_split = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=RANDOM_STATE
)
X_train_r, X_test_r, y_train_r, y_test_r = reg_split

# Preprocessing lives inside the pipeline, so it is fitted on training rows only.
linreg = Pipeline(steps=[
    ("prep", preprocessor),
    ("model", LinearRegression()),
])

y_pred_r = linreg.fit(X_train_r, y_train_r).predict(X_test_r)

# Test-set metrics: R^2 and root-mean-squared error (square root of the MSE).
r2 = r2_score(y_test_r, y_pred_r)
mse = mean_squared_error(y_test_r, y_pred_r)
rmse = mse ** 0.5

print("\n=== Linear Regression on test set ===")
print(f"R2:   {r2:.3f}")
print(f"RMSE: {rmse:.3f}")