In [7]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import statsmodels.api as sm
from statsmodels.formula.api import ols
import itertools
from sklearn.linear_model import LassoCV
import time
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np
file_path = r'C:\Users\ADMIN\Downloads\garments_worker_productivity.csv' 
df=pd.read_csv(file_path)
df['department'] = df['department'].str.strip()

# Get unique values after cleaning
unique_departments = df['department'].unique()
df['date'] = pd.to_datetime(df['date'])

# Extract month and day only (to compare mm-dd)
df['mmdd'] = df['date'].dt.strftime('%d-%m')

# Define custom quarters
def assign_custom_quarter(mmdd):
    if '01-01' <= mmdd <= '03-31':
        return 'Q1'
    elif '04-01' <= mmdd <= '06-30':
        return 'Q2'
    elif '07-01' <= mmdd <= '09-30':
        return 'Q3'
    else:
        return 'Q4'

# Apply function to every row
df['quarter'] = df['mmdd'].apply(assign_custom_quarter)

# Optional: Drop mmdd helper column
df.drop(columns=['mmdd'], inplace=True)
df.drop(['wip'],axis=1 ,inplace =True)
numeric_columns = df.select_dtypes(include=['number']).columns
# Remove categorical-encoded columns manually if needed
numeric_columns = [col for col in numeric_columns if col not in ['quarter', 'department', 'day']]
# Detect outliers
for column in numeric_columns:
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    outliers = df[(df[column] < lower_bound) | (df[column] > upper_bound)]

    median = df[column].median() 

    # Replace outliers with median
    df[column] = df[column].apply(lambda x: median if x < lower_bound or x > upper_bound else x)

df = df.drop(columns=['date'])
data_encoded = pd.get_dummies(df, columns=['quarter', 'department', 'day'], drop_first=True)

numerical_features = ['team', 'targeted_productivity', 'smv','over_time','incentive','idle_time','idle_men','no_of_style_change',
                      'no_of_workers','actual_productivity']

scaler = MinMaxScaler()
data_encoded[numerical_features] = scaler.fit_transform(data_encoded[numerical_features])

# For regression:
x = data_encoded.drop('actual_productivity', axis=1)
y = data_encoded['actual_productivity']  # no encoding!

# Then split without stratify:
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=0  # <- remove stratify!
)
def linearRegression(X_train, y_train, X_test, y_test):
    # ===== Ensure numeric =====
    X_train = X_train.astype(float)
    X_test = X_test.astype(float)

    # ===== Sklearn Linear Regression =====
    lin_reg = LinearRegression()
    lin_reg.fit(X_train, y_train)

    # Predict on test set
    y_pred = lin_reg.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    print(f"R-squared (Test set): {r2:.4f}")

    # ===== Statsmodels (for ANOVA and coefficients) =====
    df_train = X_train.copy()
    df_train['target'] = y_train.values if hasattr(y_train, 'values') else y_train

    formula = 'target ~ ' + ' + '.join(df_train.columns.drop('target'))
    model = ols(formula, data=df_train).fit()

    print(f"Adjusted R-squared: {model.rsquared_adj:.4f}")
    print(f"F-statistic: {model.fvalue:.4f}")
    print(f"F-test p-value: {model.f_pvalue:.4e}")

    # ANOVA Table
    anova_table = sm.stats.anova_lm(model, typ=2)
    print("\nANOVA Table:")
    print(anova_table)

    # SSR
    ssr = anova_table.drop('Residual', errors='ignore')['sum_sq'].sum()
    print(f"\nSSR (Sum of Squares for Regression): {ssr:.4f}")

    # Coefficients
    coef_df = pd.DataFrame({
        'Feature': model.params.index,
        'Coefficient': model.params.values
    }).reset_index(drop=True)

    coef_df['Feature'] = coef_df['Feature'].replace({'Intercept': '(Intercept)'})

    print("\nRegression Coefficients (including Intercept):")
    print(coef_df)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    
    print("R-squared:", round(r2, 4))
    print("MAE:", round(mae, 4))
    print("MSE:", round(mse, 4))
    print("RMSE:", round(rmse, 4))

    return model, y_pred, r2, anova_table, coef_df


# === 1. BEFORE TUNING: Use all features ===
print("\n--- BEFORE TUNING (Full Linear Regression) ---")
_, _, r2_before, _, coef_before = linearRegression(X_train, y_train, X_test, y_test)





# === 4. Summary of R² Scores ===
print("\n📊 R² Score Comparison:")
print(f"Before Tuning (All Features): {r2_before:.4f}")




--- BEFORE TUNING (Full Linear Regression) ---
R-squared (Test set): 0.2982
Adjusted R-squared: 0.3181
F-statistic: 26.9957
F-test p-value: 1.4094e-61

ANOVA Table:
                          sum_sq     df           F        PR(>F)
team                    0.456315    1.0   18.321261  2.086334e-05
targeted_productivity   0.468209    1.0   18.798800  1.633048e-05
smv                     0.356906    1.0   14.329942  1.645731e-04
over_time               0.044038    1.0    1.768135  1.839823e-01
incentive               4.567039    1.0  183.368817  7.452254e-38
idle_time               1.030992    1.0   41.394837  2.111580e-10
idle_men                0.471943    1.0   18.948750  1.512273e-05
no_of_style_change      0.238044    1.0    9.557565  2.058550e-03
no_of_workers           0.128912    1.0    5.175865  2.315982e-02
quarter_Q2              0.142926    1.0    5.738530  1.681972e-02
quarter_Q3              0.035488    1.0    1.424871  2.329480e-01
quarter_Q4              0.011699    1.0   

In [8]:
significant_features = [
    'team',
    'targeted_productivity',
    'smv',
    'incentive',
    'idle_time',
    'idle_men',
    'no_of_style_change',
    'no_of_workers',
    'quarter_Q2',
    'department_sweing'
]
# Call your linear regression function using only the selected features
model, y_pred, r2, anova_table, coef_df = linearRegression(
    X_train[significant_features], 
    y_train, 
    X_test[significant_features], 
    y_test
)



R-squared (Test set): 0.2945
Adjusted R-squared: 0.3175
F-statistic: 56.5693
F-test p-value: 3.2868e-66

ANOVA Table:
                          sum_sq     df           F        PR(>F)
team                    0.435108    1.0   17.456255  3.251875e-05
targeted_productivity   0.505342    1.0   20.274023  7.671546e-06
smv                     0.377200    1.0   15.133015  1.082245e-04
incentive               4.588342    1.0  184.081383  5.149945e-38
idle_time               0.018547    1.0    0.744091  3.886031e-01
idle_men                0.001649    1.0    0.066143  7.971018e-01
no_of_style_change           NaN    1.0         NaN           NaN
no_of_workers           0.094507    1.0    3.791542  5.184951e-02
quarter_Q2              0.123269    1.0    4.945474  2.642780e-02
department_sweing       0.776880    1.0   31.167935  3.207237e-08
Residual               20.663334  829.0         NaN           NaN

SSR (Sum of Squares for Regression): 6.9208

Regression Coefficients (including Intercept

  F /= J


In [9]:
significant_features = [
    'team',
    'targeted_productivity',
    'smv',
    'incentive',
    'quarter_Q2',
    'department_sweing'
]
# Call your linear regression function using only the selected features
model, y_pred, r2, anova_table, coef_df = linearRegression(
    X_train[significant_features], 
    y_train, 
    X_test[significant_features], 
    y_test
)


R-squared (Test set): 0.2852
Adjusted R-squared: 0.3152
F-statistic: 65.1465
F-test p-value: 2.5214e-66

ANOVA Table:
                          sum_sq     df           F        PR(>F)
team                    0.510325    1.0   20.405281  7.173611e-06
targeted_productivity   0.459085    1.0   18.356453  2.046530e-05
smv                     0.282874    1.0   11.310702  8.059113e-04
incentive               4.955659    1.0  198.151469  1.626867e-40
quarter_Q2              0.131124    1.0    5.242960  2.228592e-02
department_sweing       0.762153    1.0   30.474602  4.524557e-08
Residual               20.757841  830.0         NaN           NaN

SSR (Sum of Squares for Regression): 7.1012

Regression Coefficients (including Intercept):
                 Feature  Coefficient
0            (Intercept)     0.526802
1                   team    -0.082917
2  targeted_productivity     0.092845
3                    smv    -0.186339
4              incentive     0.477866
5             quarter_Q2     0.03