Midterm Regression Model:

In [None]:
#Setup and Cleaning
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import (
    r2_score,
    mean_squared_error,
)
RANDOM_STATE = 42

# The CSV is read relative to the notebook's working directory; edit the
# filename below if the file lives somewhere else.
df = pd.read_csv("social_media_vs_productivity.csv")

# Map textual missing-value markers to NaN so they are treated as missing
# by the numeric coercion below.
missing_markers = ("", "NA", "na", "N/A")
df = df.replace(dict.fromkeys(missing_markers, np.nan))

# Columns expected to hold numbers: the regression target plus key numeric features.
numeric_candidates = (
    "actual_productivity_score",
    "perceived_productivity_score",
    "daily_social_media_time",
    "work_hours_per_day",
    "sleep_hours",
    "number_of_notifications",
    "coffee_consumption_per_day",
    "screen_time_before_sleep",
    "job_satisfaction_score",
    "weekly_offline_hours",
    "days_feeling_burnout_per_month",
    "breaks_during_work",
)

# Coerce only the candidates that exist in this file; values that cannot be
# parsed as numbers become NaN (errors="coerce").
for col in [name for name in numeric_candidates if name in df.columns]:
    df[col] = pd.to_numeric(df[col], errors="coerce")

# Keep only the rows that have a value for the target column.
rows_before = len(df)
df = df[df["actual_productivity_score"].notna()].copy()
rows_dropped = rows_before - len(df)
print(f"Dropped {rows_dropped} rows with missing actual_productivity_score")

In [None]:
# Regression model: All columns
# The target is the measured productivity score; every other column is a feature.
target_name = "actual_productivity_score"
y_reg = df[target_name]
X_reg = df.drop(columns=[target_name])

# Split the feature names by dtype so each group gets its own preprocessing.
num_cols = list(X_reg.select_dtypes(include="number").columns)
cat_cols = list(
    X_reg.select_dtypes(include=["object", "category", "boolean"]).columns
)

# Dense one-hot encoder; categories unseen during fit are ignored at predict time.
try:
    encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    # older scikit versions use 'sparse' not 'sparse_output'
    encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)

# Numeric features: median imputation followed by standardisation.
num_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical features: most-frequent imputation followed by one-hot encoding.
cat_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", encoder),
])

preprocessor = ColumnTransformer(transformers=[
    ("num", num_pipe, num_cols),
    ("cat", cat_pipe, cat_cols),
])

# Hold out 20% of the rows for evaluation; the seed keeps the split reproducible.
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(
    X_reg,
    y_reg,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

# Preprocessing and the linear model are fitted together on the training rows only.
linreg = Pipeline(steps=[
    ("prep", preprocessor),
    ("model", LinearRegression()),
])
y_pred_r = linreg.fit(X_train_r, y_train_r).predict(X_test_r)

# Evaluate on the held-out rows; RMSE is the square root of the mean squared error.
r2 = r2_score(y_test_r, y_pred_r)
rmse = mean_squared_error(y_test_r, y_pred_r) ** 0.5

print("\nLinear Regression for all columns")
print(f"R2:   {r2:.3f}")
print(f"RMSE: {rmse:.3f}")

In [None]:
# Regression graph all Columns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Predicted-vs-actual scatter for the all-columns model.
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(y_test_r, y_pred_r, alpha=0.3, color="#02bf8c")

# Dashed red y = x reference line spanning the range of the actual scores;
# points on this line are perfect predictions.
score_range = [y_test_r.min(), y_test_r.max()]
ax.plot(score_range, score_range, "r--", lw=2)

ax.set_xlabel("Actual Productivity Score")
ax.set_ylabel("Predicted Productivity Score")
ax.set_title("Linear Regression: Predicted vs Actual")
plt.show()

In [None]:
#Regression model: Social Media Columns Only
from sklearn.metrics import r2_score, mean_squared_error
import matplotlib.pyplot as plt

# Feature subset describing social-media / device usage.
social_media_cols = [
    "daily_social_media_time",
    "number_of_notifications",
    "screen_time_before_sleep",
    "uses_focus_apps",
    "has_digital_wellbeing_enabled",
    "weekly_offline_hours",
]

X_social = df.loc[:, social_media_cols].copy()
y_social = df["actual_productivity_score"]

# Cast boolean flags to 0/1 integers so they are picked up as numeric features.
bool_cols_s = X_social.select_dtypes(include=["bool", "boolean"]).columns
for flag_col in bool_cols_s:
    X_social[flag_col] = X_social[flag_col].astype("int8")

# Identify dtypes after the cast
num_cols_s = list(X_social.select_dtypes(include="number").columns)
cat_cols_s = list(X_social.select_dtypes(include=["object", "category"]).columns)

# Hold out 20% of the rows for evaluation; the seed keeps the split reproducible.
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X_social,
    y_social,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

# Dense one-hot encoder; older scikit-learn releases spell the flag 'sparse'.
try:
    encoder_s = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    encoder_s = OneHotEncoder(handle_unknown="ignore", sparse=False)

# Numeric features: median imputation followed by standardisation.
num_pipe_s = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical features: most-frequent imputation followed by one-hot encoding.
cat_pipe_s = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", encoder_s),
])

preprocessor_s = ColumnTransformer(transformers=[
    ("num", num_pipe_s, num_cols_s),
    ("cat", cat_pipe_s, cat_cols_s),
])

# Preprocessing and the linear model are fitted together on the training rows only.
social_linreg = Pipeline(steps=[
    ("prep", preprocessor_s),
    ("model", LinearRegression()),
])
y_pred_s = social_linreg.fit(X_train_s, y_train_s).predict(X_test_s)

# Evaluate on the held-out rows; RMSE is the square root of the mean squared error.
r2_s = r2_score(y_test_s, y_pred_s)
rmse_s = mean_squared_error(y_test_s, y_pred_s) ** 0.5

print("\nSocial Media Only Linear Regression")
print(f"R2:   {r2_s:.3f}")
print(f"RMSE: {rmse_s:.3f}")

In [None]:
#Regression graph just social media
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Predicted-vs-actual scatter for the social-media-only model.
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(y_test_s, y_pred_s, alpha=0.3, color="#fecd6b")

# Dashed red y = x reference line spanning the range of the actual scores;
# points on this line are perfect predictions.
score_range = [y_test_s.min(), y_test_s.max()]
ax.plot(score_range, score_range, "r--", lw=2)

ax.set_xlabel("Actual Productivity Score")
ax.set_ylabel("Predicted Productivity Score")
ax.set_title("Social Media Features Only: Predicted vs Actual")
plt.show()

In [None]:
#Regression Model: Lifestyle Columns Only
from sklearn.metrics import r2_score, mean_squared_error
import matplotlib.pyplot as plt

# Feature subset describing lifestyle / work habits.
lifestyle_cols = [
    "sleep_hours",
    "stress_level",
    "coffee_consumption_per_day",
    "days_feeling_burnout_per_month",
    "breaks_during_work",
    "weekly_offline_hours",
    "job_satisfaction_score",
    "work_hours_per_day"
]

# NOTE: this cell deliberately reuses the `*_s` / `social_*` variable names of
# the social-media cell because the lifestyle plotting cell reads `y_test_s`
# and `y_pred_s`. Running it overwrites the social-media results, so re-run
# the social-media cell before re-plotting that model.
X_social = df[lifestyle_cols].copy()
y_social = df["actual_productivity_score"]

# Hold out 20% of the rows for evaluation; the seed keeps the split reproducible.
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X_social, y_social, test_size=0.2, random_state=RANDOM_STATE
)

# Identify numeric vs categorical for this subset
num_cols_s = X_social.select_dtypes(include=[np.number]).columns.tolist()
cat_cols_s = X_social.select_dtypes(include=["object", "category", "boolean"]).columns.tolist()

# Numeric features: median imputation followed by standardisation.
num_pipe_s = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Categorical features: most-frequent imputation followed by dense one-hot
# encoding. Older scikit-learn releases spell the flag 'sparse' instead of
# 'sparse_output' and raise TypeError on the new name.
try:
    cat_pipe_s = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
    ])
except TypeError:
    cat_pipe_s = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ohe", OneHotEncoder(handle_unknown="ignore", sparse=False))
    ])

preprocessor_s = ColumnTransformer([
    ("num", num_pipe_s, num_cols_s),
    ("cat", cat_pipe_s, cat_cols_s)
])

# Preprocessing and the linear model are fitted together on the training rows only.
social_linreg = Pipeline([
    ("prep", preprocessor_s),
    ("model", LinearRegression())
])

# Train & predict
social_linreg.fit(X_train_s, y_train_s)
y_pred_s = social_linreg.predict(X_test_s)

# Evaluate on the held-out rows; RMSE is the square root of the mean squared error.
r2_s = r2_score(y_test_s, y_pred_s)
rmse_s = mean_squared_error(y_test_s, y_pred_s) ** 0.5

# The banner names the lifestyle model: this cell fits on `lifestyle_cols`,
# not on the social-media columns.
print("\nLifestyle Only Linear Regression")
print(f"R2:   {r2_s:.3f}")
print(f"RMSE: {rmse_s:.3f}")

In [None]:
#Regression graph just lifestyle habits
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Predicted-vs-actual scatter for the lifestyle-only model. It reads
# `y_test_s` / `y_pred_s`, so the lifestyle regression cell must be the most
# recent cell that assigned those names.
plt.figure(figsize=(6,6))
plt.scatter(y_test_s, y_pred_s, alpha=0.3, color="#a7e6ce")
# Dashed red y = x reference line; points on it are perfect predictions.
plt.plot([y_test_s.min(), y_test_s.max()], [y_test_s.min(), y_test_s.max()], "r--", lw=2)
plt.xlabel("Actual Productivity Score")
plt.ylabel("Predicted Productivity Score")
# The title names the lifestyle subset only, matching the wording of the
# social-media plot; the model plotted here is fitted on lifestyle columns.
plt.title("Lifestyle Features Only: Predicted vs Actual")
plt.show()