Question 1


In [6]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Training data: one feature (flyers, reshaped to a single column as the
# estimator expects a 2-D input) and the observed number of cars washed.
flyers = np.array([1, 2, 3, 1, 2]).reshape(-1, 1)
cars_washed = np.array([12, 22, 29, 14, 24])
flyers_flat = flyers.flatten()

# Fit a simple (single-feature) linear regression.
model = LinearRegression()
model.fit(flyers, cars_washed)

coef = model.coef_[0]
intercept = model.intercept_

# Baseline: mean of the observed target. For least squares with an intercept
# the residuals sum to zero, so this is also the mean of the fitted predictions.
baseline = np.mean(cars_washed)

predictions = model.predict(flyers)

# SHAP value of the single feature in a linear model: coef * (x - mean(x)).
# It is computed from the coefficient and the feature itself rather than as
# `predictions - baseline`, so that the additivity check below is a genuine
# test of the decomposition instead of an identity that is always True.
shap_values = coef * (flyers_flat - flyers_flat.mean())

# Additivity check: baseline + SHAP value must reproduce the model prediction.
verification = np.isclose(predictions, baseline + shap_values)

# Classify each fitted value against the observed value.
comparison = ["Over" if p > a else "Under" if p < a else "Exact"
              for p, a in zip(predictions, cars_washed)]

df_results = pd.DataFrame({
    "Flyers (100s)": flyers_flat,
    "Actual Cars": cars_washed,
    "Predicted Cars": predictions.round(2),
    "Baseline": baseline,
    "SHAP Value": shap_values.round(2),
    "Verification": verification,
    "Prediction Type": comparison
})

print("\n Linear Regression Implementation with Coefficients ")
print(f"Coefficient: {coef:.2f}, Intercept: {intercept:.2f}")

print("\n Baseline (Mean of y) ")
print(f"Baseline Value: {baseline:.2f}")

print("\n Table of SHAP values and Predictions ")
print(df_results)

print("\n Explanation of Input Influence ")
for i, (x, shap, pred, actual) in enumerate(zip(flyers_flat, shap_values, predictions, cars_washed)):
    influence = "increased" if shap > 0 else "decreased"
    print(f"Record {i+1}: Flyers={x} {influence} prediction by {abs(shap):.2f} cars. "
          f"Predicted={pred:.2f}, Actual={actual}, Prediction Type={comparison[i]}")

# In-sample fit metrics (computed on the same records the model was fitted on).
print("\n Model Accuracy ")
r2 = r2_score(cars_washed, predictions)
mae = mean_absolute_error(cars_washed, predictions)
rmse = np.sqrt(mean_squared_error(cars_washed, predictions))
print(f"R² Score: {r2:.4f}")
print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")

print("\n Trend Analysis ")
if coef > 0:
    print("As the number of flyers increases, the number of cars washed also increases.")
else:
    print("Increasing flyers does not positively impact the number of cars washed.")

print("\n SHAP Interpretation Insights ")
print("SHAP values show how each flyer count influenced predictions compared to the baseline.")
print("Positive SHAP values indicate more flyers than average led to higher predicted washes,")
print("while negative values indicate fewer flyers led to lower predicted washes.")



 Linear Regression Implementation with Coefficients 
Coefficient: 8.29, Intercept: 5.29

 Baseline (Mean of y) 
Baseline Value: 20.20

 Table of SHAP values and Predictions 
   Flyers (100s)  Actual Cars  Predicted Cars  Baseline  SHAP Value  \
0              1           12           13.57      20.2       -6.63   
1              2           22           21.86      20.2        1.66   
2              3           29           30.14      20.2        9.94   
3              1           14           13.57      20.2       -6.63   
4              2           24           21.86      20.2        1.66   

   Verification Prediction Type  
0          True            Over  
1          True           Under  
2          True            Over  
3          True           Under  
4          True           Under  

 Explanation of Input Influence 
Record 1: Flyers=1 decreased prediction by 6.63 cars. Predicted=13.57, Actual=12, Prediction Type=Over
Record 2: Flyers=2 increased prediction by 1.66 cars. Pre

Question 2

In [23]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

# ----- Dataset: chargers available, peak-hour flag, observed sessions -----
data = pd.DataFrame({
    'Chargers': [5, 3, 4, 2, 5],
    'PeakHour': [1, 0, 1, 0, 0],
    'Sessions': [80, 40, 70, 30, 60]
})

# Predictors and target.
X = data[['Chargers', 'PeakHour']]
y = data['Sessions']

# ----- Fit the two-feature linear regression -----
model = LinearRegression()
model.fit(X, y)

# One coefficient per predictor column, in column order, plus the intercept.
coef_chargers, coef_peak = model.coef_
intercept = model.intercept_

equation_template = "Regression Equation: Sessions = {:.2f} + {:.2f}*(Chargers) + {:.2f}*(PeakHour)"
print(equation_template.format(intercept, coef_chargers, coef_peak))

# ----- Baseline: average observed target -----
baseline = y.mean()
print("Baseline Value (mean of Sessions):", baseline)

# ----- Per-feature contributions -----
# For a linear model each contribution is the feature's deviation from its
# mean, scaled by that feature's coefficient.
mean_chargers = X['Chargers'].mean()
mean_peak = X['PeakHour'].mean()

shap_chargers = X['Chargers'].sub(mean_chargers).mul(coef_chargers)
shap_peak = X['PeakHour'].sub(mean_peak).mul(coef_peak)

# ----- Rebuild each prediction from baseline + contributions -----
predictions = baseline + shap_chargers + shap_peak

# The rebuilt predictions must agree with what the fitted model returns.
model_predictions = model.predict(X)
verification = np.isclose(predictions, model_predictions)

# ----- Assemble the per-record summary table -----
results = pd.DataFrame({
    'Chargers': X['Chargers'],
    'PeakHour': X['PeakHour'],
    'Actual Sessions': y,
    'Predicted Sessions': model_predictions.round(2),
    'Baseline': baseline,
    'SHAP Chargers': shap_chargers.round(2),
    'SHAP PeakHour': shap_peak.round(2),
    'Verified': verification
})

# Signed error of the (rounded) prediction, and a label for its direction.
results['Difference'] = results['Predicted Sessions'] - results['Actual Sessions']
results['Interpretation'] = [
    'Overpredicted' if gap > 0 else ('Underpredicted' if gap < 0 else 'Exact')
    for gap in results['Difference']
]

print("\nFinal Results:")
print(results)


Regression Equation: Sessions = 10.00 + 10.00*(Chargers) + 20.00*(PeakHour)
Baseline Value (mean of Sessions): 56.0

Final Results:
   Chargers  PeakHour  Actual Sessions  Predicted Sessions  Baseline  \
0         5         1               80                80.0      56.0   
1         3         0               40                40.0      56.0   
2         4         1               70                70.0      56.0   
3         2         0               30                30.0      56.0   
4         5         0               60                60.0      56.0   

   SHAP Chargers  SHAP PeakHour  Verified  Difference Interpretation  
0           12.0           12.0      True         0.0          Exact  
1           -8.0           -8.0      True         0.0          Exact  
2            2.0           12.0      True         0.0          Exact  
3          -18.0           -8.0      True         0.0          Exact  
4           12.0           -8.0      True         0.0          Exact  


Question 3

In [24]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# --- Data: scikit-learn diabetes dataset as a labelled frame / series ---
diabetes = load_diabetes()
X = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)
y = pd.Series(diabetes.target, name="disease_progression")

# --- Hold out 25% of the records for explanation (fixed seed) ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# --- Fit the multiple linear regression on the training portion ---
model = LinearRegression()
model.fit(X_train, y_train)

# --- Baseline: average target over the training records ---
baseline = y_train.mean()

# --- Per-feature contributions for every held-out record ---
# contribution_j = coef_j * (x_j - training mean of feature j)
feature_means = X_train.mean()
coefs = pd.Series(model.coef_, index=X.columns)
contribs = X_test.sub(feature_means, axis=1).mul(coefs, axis=1)

# --- Predictions: straight from the model, and rebuilt from the parts ---
y_pred_model = pd.Series(model.predict(X_test), index=X_test.index)
y_pred_decomp = baseline + contribs.sum(axis=1)

# The two should agree up to floating-point error.
max_abs_diff = (y_pred_model - y_pred_decomp).abs().max()
print(f"Max absolute diff between model prediction and decomposition: {max_abs_diff}")

# --- One explanation record per held-out row ---
# Contributions are rounded to 3 decimals first; the top five are then ranked
# by absolute size of the rounded values.
rounded_by_record = {
    record_id: contribs.loc[record_id].round(3).to_dict()
    for record_id in X_test.index
}
explanations = [
    {
        "index": record_id,
        "actual": y_test.loc[record_id],
        "predicted": y_pred_model.loc[record_id],
        "baseline": baseline,
        "error": y_pred_model.loc[record_id] - y_test.loc[record_id],
        "contributions": per_feature,
        "top_5_features": sorted(
            per_feature.items(), key=lambda kv: abs(kv[1]), reverse=True
        )[:5],
    }
    for record_id, per_feature in rounded_by_record.items()
]

explanations_df = pd.DataFrame(explanations).set_index("index")

# Preview of the explanation table.
print("\nFirst 5 explanations:")
print(explanations_df.head())

# Walk through the first held-out record in detail.
first_idx = X_test.index[0]
print("\nExample explanation for test record:", first_idx)
print("Actual:", y_test.loc[first_idx])
print("Predicted:", y_pred_model.loc[first_idx])
print("Baseline:", baseline)
print("Sum of contributions:", contribs.loc[first_idx].sum())
print("Top 5 contributions:", explanations_df.loc[first_idx, "top_5_features"])


Max absolute diff between model prediction and decomposition: 4.263256414560601e-14

First 5 explanations:
       actual   predicted    baseline       error  \
index                                               
287     219.0  137.949089  154.344411  -81.050911   
211      70.0  182.533354  154.344411  112.533354   
72      202.0  129.852954  154.344411  -72.147046   
321     230.0  292.563092  154.344411   62.563092   
73      111.0  124.867882  154.344411   13.867882   

                                           contributions  \
index                                                      
287    {'age': 2.117, 'sex': 10.871, 'bmi': -4.538, '...   
211    {'age': 4.371, 'sex': 10.871, 'bmi': 18.396, '...   
72     {'age': 2.984, 'sex': -12.196, 'bmi': -3.392, ...   
321    {'age': 4.545, 'sex': 10.871, 'bmi': 26.423, '...   
73     {'age': 0.555, 'sex': -12.196, 'bmi': -11.992,...   

                                          top_5_features  
index                                    

Question 4

In [26]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import shap

# Student Performance dataset
data = {
    'study_time': [2, 4, 3, 2, 4, 5, 3, 2, 4, 3],
    'parental_education': [4, 4, 3, 2, 4, 4, 3, 2, 4, 3],
    'absences': [2, 1, 3, 4, 2, 1, 3, 4, 2, 1],
    'failures': [0, 1, 0, 1, 0, 1, 0, 1, 0, 1],
    'health': [4, 4, 3, 2, 4, 4, 3, 2, 4, 3],
    'final_score': [70, 80, 60, 50, 85, 90, 65, 55, 80, 75]
}

df = pd.DataFrame(data)

# Define features and target
X = df[['study_time', 'parental_education', 'absences', 'failures', 'health']]
y = df['final_score']

# Split the data into training and testing sets (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Perform Multiple Linear Regression Analysis
model = LinearRegression()
model.fit(X_train, y_train)

# Baseline: mean of the training targets. The per-record printout below shows
# the model prediction next to baseline + sum(SHAP values), which serves as a
# check that this baseline is consistent with the explainer's output.
baseline = np.mean(y_train)

# Calculate SHAP Values, using the training features as background data
explainer = shap.LinearExplainer(model, X_train)
shap_values = explainer.shap_values(X_test)

# Model predictions and signed errors (prediction - actual) for the test records.
predictions = model.predict(X_test)
errors = predictions - y_test.to_numpy()
# Treat errors within floating-point tolerance as exact, so a numerically
# perfect prediction is not reported as an over/underprediction of 0.00.
exact_match = np.isclose(errors, 0.0)

# Compute Final Prediction for Each Record
for i in range(len(X_test)):
    shap_contributions = shap_values[i]
    prediction_from_shap = baseline + sum(shap_contributions)
    print(f"Record {i}:")
    print(f"Predicted score from model: {predictions[i]}")
    print(f"Predicted score from SHAP: {prediction_from_shap}")
    print(f"Actual score: {y_test.iloc[i]}")
    print(f"Baseline value: {baseline}")
    print("Feature contributions:")
    for feature, contribution in zip(X_test.columns, shap_contributions):
        print(f"{feature}: {contribution} ({'+' if contribution > 0 else ''}{contribution:.2f})")
    # Three-way verdict: previously an exact prediction fell through to
    # 'underpredicted', disagreeing with the interpretation section below.
    if exact_match[i]:
        verdict = 'predicted exactly'
    elif errors[i] > 0:
        verdict = 'overpredicted'
    else:
        verdict = 'underpredicted'
    print(f"Model {verdict}\n")

# Interpret the Results
for i in range(len(X_test)):
    print(f"Record {i} Interpretation:")
    print(f"Actual final score: {y_test.iloc[i]}")
    print(f"Predicted final score: {predictions[i]}")
    difference = errors[i]
    if exact_match[i]:
        print("The model predicted perfectly")
    elif difference > 0:
        print(f"The model overpredicted by {difference:.2f}")
    else:
        print(f"The model underpredicted by {abs(difference):.2f}")
    print("SHAP values for each feature:")
    for feature, value in zip(X_test.columns, shap_values[i]):
        if value > 0:
            print(f"{feature}: +{value:.2f} (increased the prediction)")
        elif value < 0:
            print(f"{feature}: {value:.2f} (decreased the prediction)")
        else:
            # A zero contribution neither raises nor lowers the prediction.
            print(f"{feature}: {value:.2f} (no effect on the prediction)")
    print("\n")




Record 0:
Predicted score from model: 80.55555555555556
Predicted score from SHAP: 80.55555555555556
Actual score: 80
Baseline value: 68.75
Feature contributions:
study_time: 4.722222222222222 (+4.72)
parental_education: 3.888888888888883 (+3.89)
absences: 1.3888888888888884 (+1.39)
failures: -2.083333333333334 (-2.08)
health: 3.888888888888897 (+3.89)
Model overpredicted

Record 1:
Predicted score from model: 87.5
Predicted score from SHAP: 87.5
Actual score: 80
Baseline value: 68.75
Feature contributions:
study_time: 4.722222222222222 (+4.72)
parental_education: 3.888888888888883 (+3.89)
absences: 4.166666666666665 (+4.17)
failures: 2.083333333333334 (+2.08)
health: 3.888888888888897 (+3.89)
Model overpredicted

Record 0 Interpretation:
Actual final score: 80
Predicted final score: 80.55555555555556
The model overpredicted by 0.56
SHAP values for each feature:
study_time: +4.72 (increased the prediction)
parental_education: +3.89 (increased the prediction)
absences: +1.39 (increased 