In [13]:
pip install imbalanced-learn





In [14]:
import pandas as pd
from imblearn.over_sampling import SMOTE
import os

# ------------------------------------------------------------------
# Balance the class distribution with SMOTE and write the result to CSV.
#
# NOTE(review): SMOTE is applied to the complete dataset in this cell. If a
# model is later evaluated on a split of the balanced file, synthetic rows
# derived from evaluation samples may leak into training - confirm intent.
# ------------------------------------------------------------------

# Load dataset
input_path = r"C:\Users\Acer\Downloads\FINAL PROJECT\reduced_features.csv"
df = pd.read_csv(input_path)

# Separate features and target
X = df.drop(columns=['Class'])
y = df['Class']

# Show original distribution
class_counts = y.value_counts()
print("Original class distribution:")
print(class_counts)

# SMOTE strategy: raise every class to the size of the largest class.
# Deriving the target from the data (instead of hard-coding 8028) keeps the
# cell working if the input file changes: SMOTE can only add samples, so a
# fixed target smaller than an existing class count would raise an error.
target_count = int(class_counts.max())
sampling_strategy = {cls: target_count for cls in y.unique()}

smote = SMOTE(
    sampling_strategy=sampling_strategy,
    random_state=42  # fixed seed so the synthetic samples are reproducible
)

# Apply SMOTE
X_resampled, y_resampled = smote.fit_resample(X, y)

# Recombine features and target into a single DataFrame
df_balanced = pd.concat(
    [
        pd.DataFrame(X_resampled, columns=X.columns),
        pd.Series(y_resampled, name='Class')
    ],
    axis=1
)

# Output path. The file name is kept unchanged because later cells load the
# balanced dataset from exactly this location.
output_path = r"C:\Users\Acer\Downloads\FINAL PROJECT\reduced_features_SMOTE_8028.csv"

# Save file
df_balanced.to_csv(output_path, index=False)

# Show new distribution
print("\nBalanced class distribution:")
print(df_balanced['Class'].value_counts())

print(f"\n‚úÖ Balanced dataset saved to:\n{output_path}")


Original class distribution:
Class
1    8028
0    3808
2    3370
Name: count, dtype: int64

Balanced class distribution:
Class
0    8028
1    8028
2    8028
Name: count, dtype: int64

‚úÖ Balanced dataset saved to:
C:\Users\Acer\Downloads\FINAL PROJECT\reduced_features_SMOTE_8028.csv


In [15]:
import pandas as pd

# Horizontal rules reused throughout the console report of this cell.
rule_heavy = "=" * 80
rule_light = "-" * 80

print(rule_heavy, "STEP 2 ‚Äî DATA CLEANING & PREPARATION", rule_heavy, sep="\n")

# ------------------------------------------------------------------
# Load the SMOTE-balanced dataset
# ------------------------------------------------------------------
file_path = r"C:\Users\Acer\Downloads\FINAL PROJECT\reduced_features_SMOTE_8028.csv"
df = pd.read_csv(file_path)

print("\nDataset loaded successfully.", f"Initial dataset shape: {df.shape}", sep="\n")

# ------------------------------------------------------------------
# 1. Missing values
# ------------------------------------------------------------------
print("\n" + rule_light, "1. MISSING VALUES CHECK", rule_light, sep="\n")

missing_before = df.isnull().sum()
print("\nMissing values BEFORE cleaning:", missing_before, sep="\n")

# Rows containing any missing value are dropped; nothing is imputed.
df_cleaned = df.dropna()

missing_after = df_cleaned.isnull().sum()
print("\nMissing values AFTER cleaning:", missing_after, sep="\n")

# ------------------------------------------------------------------
# 2. Duplicate rows
# ------------------------------------------------------------------
print("\n" + rule_light, "2. DUPLICATE RECORDS CHECK", rule_light, sep="\n")

duplicates_before = df_cleaned.duplicated().sum()
print(f"\nDuplicate rows BEFORE removal: {duplicates_before}")

# Keep the first occurrence of each fully identical row.
df_cleaned = df_cleaned.drop_duplicates()

duplicates_after = df_cleaned.duplicated().sum()
print(f"Duplicate rows AFTER removal: {duplicates_after}")

print(f"\nDataset shape after duplicate removal: {df_cleaned.shape}")

# ------------------------------------------------------------------
# 3. Variable types
# ------------------------------------------------------------------
print("\n" + rule_light, "3. VARIABLE TYPE VERIFICATION", rule_light, sep="\n")

print("\nData types of variables:", df_cleaned.dtypes, sep="\n")

# The target column holds class labels, so store it as a categorical.
df_cleaned['Class'] = df_cleaned['Class'].astype('category')

print("\nUpdated data types:", df_cleaned.dtypes, sep="\n")

# ------------------------------------------------------------------
# 4. New variables
# ------------------------------------------------------------------
print("\n" + rule_light, "4. NEW VARIABLE CREATION", rule_light, sep="\n")

print(
    "\nNo new variables were created.",
    "Existing feature set is sufficient after feature reduction.",
    sep="\n",
)

# ------------------------------------------------------------------
# Summary of the cleaned dataset
# NOTE(review): df_cleaned is not written to disk in this cell, so any cell
# that reloads the CSV from file_path still sees the uncleaned rows.
# ------------------------------------------------------------------
print("\n" + rule_heavy, "BEFORE & AFTER SUMMARY", rule_heavy, sep="\n")

print("\nClass distribution AFTER cleaning:", df_cleaned['Class'].value_counts(), sep="\n")

print("\nFinal dataset shape:", df_cleaned.shape, sep="\n")

print("\nSTEP 2 COMPLETED SUCCESSFULLY", rule_heavy, sep="\n")


STEP 2 ‚Äî DATA CLEANING & PREPARATION

Dataset loaded successfully.
Initial dataset shape: (24084, 6)

--------------------------------------------------------------------------------
1. MISSING VALUES CHECK
--------------------------------------------------------------------------------

Missing values BEFORE cleaning:
R              0
H_hsv          0
Contrast       0
Correlation    0
Energy         0
Class          0
dtype: int64

Missing values AFTER cleaning:
R              0
H_hsv          0
Contrast       0
Correlation    0
Energy         0
Class          0
dtype: int64

--------------------------------------------------------------------------------
2. DUPLICATE RECORDS CHECK
--------------------------------------------------------------------------------

Duplicate rows BEFORE removal: 5
Duplicate rows AFTER removal: 0

Dataset shape after duplicate removal: (24079, 6)

--------------------------------------------------------------------------------
3. VARIABLE TYPE VERIFICAT

In [18]:
import pandas as pd
import matplotlib.pyplot as plt
import os

# ------------------------------------------------------------------
# PATH SETUP
# ------------------------------------------------------------------
base_path = r"C:\Users\Acer\Downloads\FINAL PROJECT"
output_folder = os.path.join(base_path, "STEP_3_DESCRIPTIVE_STATISTICS")
os.makedirs(output_folder, exist_ok=True)

data_path = os.path.join(base_path, "reduced_features_SMOTE_8028.csv")

print("="*100)
print("STEP 3 ‚Äî DESCRIPTIVE STATISTICS (GROUPED BY CLASS)")
print("="*100)

# ------------------------------------------------------------------
# LOAD DATASET
# ------------------------------------------------------------------
df = pd.read_csv(data_path)

# Human-readable label for each numeric class code.
class_map = {
    0: "Underdried",
    1: "Perfectly_Dried",
    2: "Overdried"
}
df['Class_Name'] = df['Class'].map(class_map)

features = ['R', 'H_hsv', 'Contrast', 'Correlation', 'Energy']

# ------------------------------------------------------------------
# FREQUENCY TABLE
# ------------------------------------------------------------------
freq_table = df['Class_Name'].value_counts()
freq_table.to_csv(os.path.join(output_folder, "frequency_table.csv"))

print("\nClass Distribution:")
print(freq_table)

# ------------------------------------------------------------------
# DESCRIPTIVE STATISTICS PER CLASS
# One table per class (rows = features, columns = statistics), each saved
# to its own CSV and also collected for a combined summary file.
# ------------------------------------------------------------------
all_stats = []

for cls, group in df.groupby('Class_Name'):
    stats = pd.DataFrame(index=features)
    stats['Mean'] = group[features].mean()
    stats['Median'] = group[features].median()
    # mode() can return several rows when there are ties; keep the first one.
    stats['Mode'] = group[features].mode().iloc[0]
    stats['Min'] = group[features].min()
    stats['Max'] = group[features].max()
    stats['Range'] = stats['Max'] - stats['Min']
    stats['Variance'] = group[features].var()
    stats['Std_Dev'] = group[features].std()
    stats['Q1'] = group[features].quantile(0.25)
    stats['Q3'] = group[features].quantile(0.75)
    stats['IQR'] = stats['Q3'] - stats['Q1']
    stats['Class'] = cls

    stats.to_csv(os.path.join(output_folder, f"descriptive_stats_{cls}.csv"))
    all_stats.append(stats)

    print(f"\nSaved descriptive statistics for: {cls}")

# Combined summary
combined_stats = pd.concat(all_stats)
combined_stats.to_csv(os.path.join(output_folder, "descriptive_stats_ALL_CLASSES.csv"))

# ------------------------------------------------------------------
# HISTOGRAMS
# One figure per feature with the three classes overlaid.
# ------------------------------------------------------------------
hist_folder = os.path.join(output_folder, "Histograms")
os.makedirs(hist_folder, exist_ok=True)

for feature in features:
    plt.figure()
    for cls in class_map.values():
        plt.hist(
            df[df['Class_Name'] == cls][feature],
            bins=30,
            alpha=0.5,
            label=cls
        )
    plt.title(f"Histogram of {feature} by Class")
    plt.xlabel(feature)
    plt.ylabel("Frequency")
    plt.legend()
    plt.tight_layout()
    plt.savefig(os.path.join(hist_folder, f"histogram_{feature}.png"))
    plt.close()

# ------------------------------------------------------------------
# BOXPLOTS
# ------------------------------------------------------------------
box_folder = os.path.join(output_folder, "Boxplots")
os.makedirs(box_folder, exist_ok=True)

for feature in features:
    # Draw on an explicit Axes. Without `ax=`, DataFrame.boxplot(by=...) opens
    # a figure of its own, so a preceding plt.figure() would stay empty and
    # never be closed (it shows up as "<Figure ... with 0 Axes>" in the output).
    fig, ax = plt.subplots()
    df.boxplot(column=feature, by='Class_Name', ax=ax)
    ax.set_title(f"Boxplot of {feature} by Class")
    fig.suptitle("")  # remove the automatic "Boxplot grouped by ..." super-title
    ax.set_xlabel("Class")
    ax.set_ylabel(feature)
    fig.tight_layout()
    fig.savefig(os.path.join(box_folder, f"boxplot_{feature}.png"))
    plt.close(fig)

print("\nALL RESULTS SAVED SUCCESSFULLY")
print(f"üìÅ Location: {output_folder}")
print("="*100)


STEP 3 ‚Äî DESCRIPTIVE STATISTICS (GROUPED BY CLASS)

Class Distribution:
Class_Name
Underdried         8028
Perfectly_Dried    8028
Overdried          8028
Name: count, dtype: int64

Saved descriptive statistics for: Overdried

Saved descriptive statistics for: Perfectly_Dried

Saved descriptive statistics for: Underdried

ALL RESULTS SAVED SUCCESSFULLY
üìÅ Location: C:\Users\Acer\Downloads\FINAL PROJECT\STEP_3_DESCRIPTIVE_STATISTICS


<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

In [21]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import numpy as np

# ------------------------------------------------------------------
# Output locations for this step
# ------------------------------------------------------------------
base_path = r"C:\Users\Acer\Downloads\FINAL PROJECT"
output_folder = os.path.join(base_path, "STEP_4_EDA")
os.makedirs(output_folder, exist_ok=True)

data_path = os.path.join(base_path, "reduced_features_SMOTE_8028.csv")

banner = "=" * 100
print(banner, "STEP 4 ‚Äî EXPLORATORY DATA ANALYSIS (EDA)", banner, sep="\n")

# ------------------------------------------------------------------
# Read the balanced dataset and attach readable class labels
# ------------------------------------------------------------------
df = pd.read_csv(data_path)

class_map = {0: "Underdried", 1: "Perfectly_Dried", 2: "Overdried"}
df['Class_Name'] = df['Class'].map(class_map)

features = ['R', 'H_hsv', 'Contrast', 'Correlation', 'Energy']

# ------------------------------------------------------------------
# 1. Pairwise scatter plots
# Every ordered pair of distinct features gets its own figure, so each
# combination is drawn twice (once per axis orientation).
# ------------------------------------------------------------------
scatter_folder = os.path.join(output_folder, "Scatterplots")
os.makedirs(scatter_folder, exist_ok=True)

print("\nGenerating scatter plots...")

feature_pairs = [(fx, fy) for fx in features for fy in features if fx != fy]

for x_name, y_name in feature_pairs:
    fig, ax = plt.subplots()
    # Plot the classes one after another so each gets its own colour/legend entry.
    for cls in class_map.values():
        members = df.loc[df['Class_Name'] == cls, [x_name, y_name]]
        ax.scatter(members[x_name], members[y_name], alpha=0.4, label=cls)
    ax.set_xlabel(x_name)
    ax.set_ylabel(y_name)
    ax.set_title(f"{x_name} vs {y_name} by Class")
    ax.legend()
    fig.tight_layout()
    fig.savefig(os.path.join(scatter_folder, f"{x_name}_vs_{y_name}.png"))
    plt.close(fig)

# ------------------------------------------------------------------
# 2. Correlation matrix per class, stored as CSV
# ------------------------------------------------------------------
corr_folder = os.path.join(output_folder, "Correlation_Matrix")
os.makedirs(corr_folder, exist_ok=True)

print("\nComputing correlation matrices...")

for class_name, class_rows in df.groupby('Class_Name'):
    class_rows[features].corr().to_csv(
        os.path.join(corr_folder, f"correlation_matrix_{class_name}.csv")
    )

# ------------------------------------------------------------------
# 3. Heatmaps: target folder
# ------------------------------------------------------------------
heatmap_folder = os.path.join(output_folder, "Heatmaps")
os.makedirs(heatmap_folder, exist_ok=True)

print("\nGenerating clean correlation heatmaps...")

def plot_clean_heatmap(corr, title, filename):
    """Draw an annotated correlation heatmap and save it as a PNG.

    Parameters
    ----------
    corr : pandas.DataFrame
        Correlation matrix; its index and columns supply the tick labels.
    title : str
        Title shown above the heatmap.
    filename : str
        File name of the image, written inside the module-level
        ``heatmap_folder`` directory.
    """
    fig, ax = plt.subplots(figsize=(7, 6))

    # Fix the colour scale to the full correlation range [-1, 1].
    image = ax.imshow(corr, cmap='coolwarm', vmin=-1, vmax=1)
    fig.colorbar(image, ax=ax, fraction=0.046, pad=0.04)

    ax.set_xticks(range(len(corr.columns)))
    ax.set_xticklabels(corr.columns, rotation=45)
    ax.set_yticks(range(len(corr.index)))
    ax.set_yticklabels(corr.index)

    # Write each coefficient into the centre of its cell.
    for row_idx, row_values in enumerate(corr.to_numpy()):
        for col_idx, value in enumerate(row_values):
            ax.text(
                col_idx, row_idx,
                f"{value:.2f}",
                ha="center",
                va="center",
                color="black",
                fontsize=10
            )

    ax.set_title(title, fontsize=13, pad=12)
    fig.tight_layout()
    fig.savefig(os.path.join(heatmap_folder, filename), dpi=300)
    plt.close(fig)

# ------------------------------------------------------------------
# Heatmaps and numeric correlation tables
# The first entry covers the whole dataset (no class separation); the
# remaining entries cover one class each.
# ------------------------------------------------------------------
heatmap_jobs = [("ALL_DATA", "All Data", df)]
heatmap_jobs.extend(
    (class_name, class_name, class_rows)
    for class_name, class_rows in df.groupby('Class_Name')
)

for file_tag, title_label, rows in heatmap_jobs:
    corr_matrix = rows[features].corr()

    plot_clean_heatmap(
        corr_matrix,
        title=f"Reduced Feature Correlation Heatmap ({title_label})",
        filename=f"heatmap_{file_tag}.png"
    )

    # Keep the numbers behind each picture next to it.
    corr_matrix.to_csv(os.path.join(heatmap_folder, f"correlation_{file_tag}.csv"))

print("Clean heatmaps generated successfully.")

print(
    "\nSTEP 4 EDA COMPLETED SUCCESSFULLY",
    f"üìÅ All outputs saved to: {output_folder}",
    "=" * 100,
    sep="\n",
)


STEP 4 ‚Äî EXPLORATORY DATA ANALYSIS (EDA)

Generating scatter plots...

Computing correlation matrices...

Generating clean correlation heatmaps...
Clean heatmaps generated successfully.

STEP 4 EDA COMPLETED SUCCESSFULLY
üìÅ All outputs saved to: C:\Users\Acer\Downloads\FINAL PROJECT\STEP_4_EDA


In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from scipy import stats

# ------------------------------------------------------------------
# PATH SETUP
# ------------------------------------------------------------------
base_path = r"C:\Users\Acer\Downloads\FINAL PROJECT"
output_folder = os.path.join(base_path, "STEP_5_PROBABILITY_ANALYSIS")
os.makedirs(output_folder, exist_ok=True)

data_path = os.path.join(base_path, "reduced_features_SMOTE_8028.csv")

print("="*100)
print("STEP 5 ‚Äî PROBABILITY DISTRIBUTION ANALYSIS")
print("="*100)

# ------------------------------------------------------------------
# LOAD DATASET
# ------------------------------------------------------------------
df = pd.read_csv(data_path)

variable = "Contrast"
data = df[variable]

# ------------------------------------------------------------------
# 1. DISTRIBUTION SHAPE
# ------------------------------------------------------------------
mean_val = data.mean()
std_val = data.std()            # sample standard deviation (pandas default, ddof=1)
skewness = stats.skew(data)
kurtosis = stats.kurtosis(data)  # scipy default is excess (Fisher) kurtosis: normal -> 0

print("\nDISTRIBUTION SHAPE ANALYSIS")
print("-"*60)
print(f"Mean      : {mean_val:.4f}")
print(f"Std Dev   : {std_val:.4f}")
print(f"Skewness  : {skewness:.4f}")
print(f"Kurtosis  : {kurtosis:.4f}")

# Interpretation logic.
# A floating-point skewness is practically never exactly 0, so comparing
# against 0 would make the "symmetric" branch unreachable. Use a tolerance
# band instead (|skewness| < 0.5 is a common rule of thumb).
SKEW_TOLERANCE = 0.5

if skewness > SKEW_TOLERANCE:
    shape = "Right-skewed (positively skewed)"
elif skewness < -SKEW_TOLERANCE:
    shape = "Left-skewed (negatively skewed)"
else:
    shape = "Approximately symmetric"

print(f"Distribution Shape: {shape}")

# ------------------------------------------------------------------
# 2. Z-SCORES & PROBABILITY CALCULATIONS
# ------------------------------------------------------------------
z_scores = (data - mean_val) / std_val

df_z = pd.DataFrame({
    variable: data,
    "Z_Score": z_scores
})

df_z.to_csv(os.path.join(output_folder, "contrast_z_scores.csv"), index=False)

# Example probability: P(X < Mean + 1 Std)
threshold = mean_val + std_val

# Under a fitted normal model this is Phi(1) = 0.8413 for ANY data set, so it
# says nothing about the observed distribution on its own.
probability = stats.norm.cdf(threshold, mean_val, std_val)

# Observed share of samples below the same threshold, for comparison with the
# normal-model value above.
empirical_probability = (data < threshold).mean()

print("\nPROBABILITY CALCULATION")
print("-"*60)
print(f"P({variable} < Mean + 1œÉ) = {probability:.4f}")
print(f"Empirical proportion below the same threshold = {empirical_probability:.4f}")

# Percentiles
percentiles = [10, 25, 50, 75, 90]
percentile_values = np.percentile(data, percentiles)

percentile_table = pd.DataFrame({
    "Percentile": percentiles,
    "Contrast_Value": percentile_values
})

percentile_table.to_csv(
    os.path.join(output_folder, "contrast_percentiles.csv"),
    index=False
)

print("\nPERCENTILES")
print(percentile_table)

# ------------------------------------------------------------------
# 3. PROBABILITY PLOTS
# ------------------------------------------------------------------
# Histogram with Normal Curve (normal density using the sample mean and std)
plt.figure()
plt.hist(data, bins=40, density=True, alpha=0.6)
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 200)
p = stats.norm.pdf(x, mean_val, std_val)
plt.plot(x, p)
plt.title("Contrast Distribution with Normal Curve")
plt.xlabel("Contrast")
plt.ylabel("Probability Density")
plt.tight_layout()
plt.savefig(os.path.join(output_folder, "contrast_histogram_normal.png"))
plt.close()

# Q-Q Plot against the normal distribution
plt.figure()
stats.probplot(data, dist="norm", plot=plt)
plt.title("Q-Q Plot for Contrast")
plt.tight_layout()
plt.savefig(os.path.join(output_folder, "contrast_qq_plot.png"))
plt.close()

print("\nALL STEP 5 RESULTS SAVED SUCCESSFULLY")
print(f"üìÅ Location: {output_folder}")
print("="*100)


STEP 5 ‚Äî PROBABILITY DISTRIBUTION ANALYSIS

DISTRIBUTION SHAPE ANALYSIS
------------------------------------------------------------
Mean      : 77.0442
Std Dev   : 129.9912
Skewness  : 3.6266
Kurtosis  : 16.0835
Distribution Shape: Right-skewed (positively skewed)

PROBABILITY CALCULATION
------------------------------------------------------------
P(Contrast < Mean + 1œÉ) = 0.8413

PERCENTILES
   Percentile  Contrast_Value
0          10        5.313759
1          25       12.448589
2          50       29.823085
3          75       81.356904
4          90      191.472756

ALL STEP 5 RESULTS SAVED SUCCESSFULLY
üìÅ Location: C:\Users\Acer\Downloads\FINAL PROJECT\STEP_5_PROBABILITY_ANALYSIS


In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import statsmodels.api as sm

# ------------------------------------------------------------------
# PATH SETUP
# ------------------------------------------------------------------
base_path = r"C:\Users\Acer\Downloads\FINAL PROJECT"
output_folder = os.path.join(base_path, "STEP_6_REGRESSION_ANALYSIS")
os.makedirs(output_folder, exist_ok=True)

data_path = os.path.join(base_path, "reduced_features_SMOTE_8028.csv")

print("="*100)
print("STEP 6 ‚Äî REGRESSION ANALYSIS")
print("="*100)

# ------------------------------------------------------------------
# LOAD DATASET
# ------------------------------------------------------------------
df = pd.read_csv(data_path)

# ================================================================
# 1. SIMPLE LINEAR REGRESSION  (Contrast explained by Energy)
# ================================================================
print("\nSIMPLE LINEAR REGRESSION")
print("-"*100)

X_simple = df['Energy']
y = df['Contrast']

# statsmodels does not add an intercept automatically.
X_simple = sm.add_constant(X_simple)
simple_model = sm.OLS(y, X_simple).fit()

print(simple_model.summary())

# Save summary
with open(os.path.join(output_folder, "simple_regression_summary.txt"), "w") as f:
    f.write(simple_model.summary().as_text())

# Regression equation (params are ordered: intercept, then the Energy slope)
b0, b1 = simple_model.params
print(f"\nRegression Equation:")
print(f"Contrast = {b0:.4f} + {b1:.4f}(Energy)")

# Residual plot
plt.figure()
plt.scatter(simple_model.fittedvalues, simple_model.resid, alpha=0.5)
plt.axhline(0)
plt.xlabel("Fitted Values")
plt.ylabel("Residuals")
plt.title("Residual Plot ‚Äî Simple Linear Regression")
plt.tight_layout()
plt.savefig(os.path.join(output_folder, "simple_regression_residuals.png"))
plt.close()

# ================================================================
# 2. MULTIPLE LINEAR REGRESSION  (Contrast explained by the other features)
# ================================================================
print("\nMULTIPLE LINEAR REGRESSION")
print("-"*100)

X_multi = df[['R', 'H_hsv', 'Correlation', 'Energy']]
X_multi = sm.add_constant(X_multi)

multi_model = sm.OLS(y, X_multi).fit()
print(multi_model.summary())

# Save summary
with open(os.path.join(output_folder, "multiple_regression_summary.txt"), "w") as f:
    f.write(multi_model.summary().as_text())

# ================================================================
# RESIDUAL DIAGNOSTICS
# ================================================================
# Residual vs Fitted
plt.figure()
plt.scatter(multi_model.fittedvalues, multi_model.resid, alpha=0.5)
plt.axhline(0)
plt.xlabel("Fitted Values")
plt.ylabel("Residuals")
plt.title("Residual Plot ‚Äî Multiple Linear Regression")
plt.tight_layout()
plt.savefig(os.path.join(output_folder, "multiple_regression_residuals.png"))
plt.close()

# Q-Q Plot
# sm.qqplot opens a new figure unless it is given an Axes, so calling
# plt.figure() first would leave an empty, never-closed figure behind (it
# appears as "<Figure ... with 0 Axes>" in the cell output). Pass the Axes
# explicitly and close exactly the figure that was created.
fig, ax = plt.subplots()
sm.qqplot(multi_model.resid, line='45', fit=True, ax=ax)
ax.set_title("Q-Q Plot ‚Äî Multiple Linear Regression Residuals")
fig.tight_layout()
fig.savefig(os.path.join(output_folder, "multiple_regression_qq.png"))
plt.close(fig)

print("\nALL STEP 6 RESULTS SAVED SUCCESSFULLY")
print(f"üìÅ Location: {output_folder}")
print("="*100)


STEP 6 ‚Äî REGRESSION ANALYSIS

SIMPLE LINEAR REGRESSION
----------------------------------------------------------------------------------------------------
                            OLS Regression Results                            
Dep. Variable:               Contrast   R-squared:                       0.197
Model:                            OLS   Adj. R-squared:                  0.197
Method:                 Least Squares   F-statistic:                     5898.
Date:                Sun, 21 Dec 2025   Prob (F-statistic):               0.00
Time:                        22:44:07   Log-Likelihood:            -1.4876e+05
No. Observations:               24084   AIC:                         2.975e+05
Df Residuals:                   24082   BIC:                         2.975e+05
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|

<Figure size 640x480 with 0 Axes>

In [24]:
import pandas as pd
import os
from scipy.stats import f_oneway
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# ------------------------------------------------------------------
# PATH SETUP
# ------------------------------------------------------------------
base_path = r"C:\Users\Acer\Downloads\FINAL PROJECT"
output_folder = os.path.join(base_path, "STEP_8_ANOVA")
os.makedirs(output_folder, exist_ok=True)

data_path = os.path.join(base_path, "reduced_features_SMOTE_8028.csv")

print("="*100)
print("STEP 8 ‚Äî ONE-WAY ANOVA")
print("="*100)

# ------------------------------------------------------------------
# LOAD DATASET
# ------------------------------------------------------------------
df = pd.read_csv(data_path)

class_map = {
    0: "Underdried",
    1: "Perfectly_Dried",
    2: "Overdried"
}
df['Class_Name'] = df['Class'].map(class_map)

features = ['R', 'H_hsv', 'Contrast', 'Correlation', 'Energy']

# Significance level shared by the ANOVA decision and the Tukey test.
ALPHA = 0.05

anova_results = []

# ------------------------------------------------------------------
# ONE-WAY ANOVA PER FEATURE
# ------------------------------------------------------------------
for feature in features:
    # One sample per class, taken from whatever class codes are present in
    # the data rather than from three hard-coded codes.
    groups = [class_rows[feature] for _, class_rows in df.groupby('Class')]

    F_stat, p_value = f_oneway(*groups)

    anova_results.append({
        "Feature": feature,
        "F_statistic": F_stat,
        "p_value": p_value
    })

    print(f"\nANOVA ‚Äî {feature}")
    print(f"F-statistic = {F_stat:.4f}")
    # Scientific notation: fixed-point with 6 decimals shows every small
    # p-value as 0.000000.
    print(f"p-value     = {p_value:.6e}")

    # ------------------------------------------------------------------
    # POST-HOC TEST (Tukey HSD) IF SIGNIFICANT
    # ------------------------------------------------------------------
    if p_value < ALPHA:
        print("‚Üí Significant result: running Tukey HSD post-hoc test")

        tukey = pairwise_tukeyhsd(
            endog=df[feature],
            groups=df['Class_Name'],
            alpha=ALPHA
        )

        # Build the summary table once and reuse it for the CSV and the printout.
        tukey_summary = tukey.summary()

        # First row of the summary data is the header, the rest are the comparisons.
        tukey_df = pd.DataFrame(
            data=tukey_summary.data[1:],
            columns=tukey_summary.data[0]
        )

        tukey_df.to_csv(
            os.path.join(output_folder, f"tukey_{feature}.csv"),
            index=False
        )

        print(tukey_summary)
    else:
        print("‚Üí Not significant: post-hoc test not required")

# ------------------------------------------------------------------
# SAVE ANOVA SUMMARY
# ------------------------------------------------------------------
anova_df = pd.DataFrame(anova_results)
anova_df.to_csv(os.path.join(output_folder, "anova_summary.csv"), index=False)

print("\nALL STEP 8 RESULTS SAVED SUCCESSFULLY")
print(f"üìÅ Location: {output_folder}")
print("="*100)


STEP 8 ‚Äî ONE-WAY ANOVA

ANOVA ‚Äî R
F-statistic = 29983.3792
p-value     = 0.000000
‚Üí Significant result: running Tukey HSD post-hoc test
           Multiple Comparison of Means - Tukey HSD, FWER=0.05            
     group1          group2      meandiff p-adj   lower     upper   reject
--------------------------------------------------------------------------
      Overdried Perfectly_Dried  -93.4961   0.0  -94.5884  -92.4038   True
      Overdried      Underdried -103.4146   0.0 -104.5068 -102.3223   True
Perfectly_Dried      Underdried   -9.9185   0.0  -11.0107   -8.8262   True
--------------------------------------------------------------------------

ANOVA ‚Äî H_hsv
F-statistic = 7137.3116
p-value     = 0.000000
‚Üí Significant result: running Tukey HSD post-hoc test
          Multiple Comparison of Means - Tukey HSD, FWER=0.05          
     group1          group2     meandiff p-adj  lower    upper   reject
---------------------------------------------------------------------

In [25]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
import os

# --------------------------------------------------
# Locations
# --------------------------------------------------
base_path = r"C:\Users\Acer\Downloads\FINAL PROJECT"
data_path = os.path.join(base_path, "reduced_features_SMOTE_8028.csv")
output_folder = os.path.join(base_path, "STEP_4_EDA", "PCA_Plots")
os.makedirs(output_folder, exist_ok=True)

# --------------------------------------------------
# Data: five reduced features plus the class label
# --------------------------------------------------
df = pd.read_csv(data_path)

features = ["R", "H_hsv", "Contrast", "Correlation", "Energy"]
X = df[features]
y = df["Class"]

# --------------------------------------------------
# Standardise, then project onto three principal components
# --------------------------------------------------
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

pca = PCA(n_components=3)
X_pca = pca.fit_transform(X_scaled)

# Share of total variance captured by each component, in percent.
explained_var = pca.explained_variance_ratio_ * 100

# Axis captions reused by both figures.
axis_captions = [
    f"PC{number} ({share:.1f}%)"
    for number, share in enumerate(explained_var, start=1)
]

# Scatter styling shared by the 2D and 3D plots; points are coloured by class code.
point_style = dict(c=y, cmap="viridis", alpha=0.6, s=15)

# --------------------------------------------------
# 2D projection (PC1 vs PC2)
# --------------------------------------------------
fig_2d, ax_2d = plt.subplots(figsize=(7, 6))
points_2d = ax_2d.scatter(X_pca[:, 0], X_pca[:, 1], **point_style)

ax_2d.set_xlabel(axis_captions[0])
ax_2d.set_ylabel(axis_captions[1])
ax_2d.set_title("PCA: 2D Projection")
fig_2d.colorbar(points_2d, ax=ax_2d, label="Class")
fig_2d.tight_layout()
fig_2d.savefig(os.path.join(output_folder, "PCA_2D.png"), dpi=300)
plt.close(fig_2d)

# --------------------------------------------------
# 3D projection (PC1, PC2, PC3)
# --------------------------------------------------
fig_3d = plt.figure(figsize=(8, 6))
ax_3d = fig_3d.add_subplot(111, projection="3d")

points_3d = ax_3d.scatter(X_pca[:, 0], X_pca[:, 1], X_pca[:, 2], **point_style)

ax_3d.set_xlabel(axis_captions[0])
ax_3d.set_ylabel(axis_captions[1])
ax_3d.set_zlabel(axis_captions[2])
ax_3d.set_title("PCA: 3D Projection")

fig_3d.colorbar(points_3d, label="Class")
fig_3d.tight_layout()
fig_3d.savefig(os.path.join(output_folder, "PCA_3D.png"), dpi=300)
plt.close(fig_3d)

# --------------------------------------------------
# Console summary
# --------------------------------------------------
print("PCA completed successfully.")
print("Explained variance ratios:")
for number, share in enumerate(explained_var, start=1):
    print(f"PC{number}: {share:.2f}%")
print(f"Plots saved in: {output_folder}")


PCA completed successfully.
Explained variance ratios:
PC1: 36.42%
PC2: 29.13%
PC3: 17.12%
Plots saved in: C:\Users\Acer\Downloads\FINAL PROJECT\STEP_4_EDA\PCA_Plots


In [27]:
# STEP 6 ‚Äî REGRESSION ANALYSIS
# Simple Linear Regression & Multiple Linear Regression
# Uses SMOTED dataset: reduced_features_SMOTE_8028.csv

import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from scipy import stats

# ============================
# FILE PATHS
# ============================
DATA_PATH = r"C:\Users\Acer\Downloads\FINAL PROJECT\reduced_features_SMOTE_8028.csv"
OUTPUT_DIR = r"C:\Users\Acer\Downloads\FINAL PROJECT\STEP_6_Regression_Results"

os.makedirs(OUTPUT_DIR, exist_ok=True)


def _save_regression_diagnostics(fitted, residuals, label, file_stem):
    """Save the residual-vs-fitted plot and the normal Q-Q plot of one model.

    fitted / residuals : fitted values and residuals of the model
    label              : word placed in the plot titles ("Simple", "Multiple")
    file_stem          : prefix of the two PNG files written to OUTPUT_DIR
    """
    # Residuals against fitted values, with a zero reference line.
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.scatterplot(x=fitted, y=residuals, alpha=0.5, ax=ax)
    ax.axhline(0, color='red')
    ax.set_xlabel("Fitted Values")
    ax.set_ylabel("Residuals")
    ax.set_title(f"Residual Plot ‚Äî {label} Linear Regression")
    fig.tight_layout()
    fig.savefig(os.path.join(OUTPUT_DIR, f"{file_stem}_regression_residuals.png"))
    plt.close(fig)

    # Normal probability (Q-Q) plot of the residuals.
    fig, ax = plt.subplots(figsize=(6, 4))
    stats.probplot(residuals, plot=ax)
    ax.set_title(f"Q-Q Plot ‚Äî {label} Linear Regression Residuals")
    fig.tight_layout()
    fig.savefig(os.path.join(OUTPUT_DIR, f"{file_stem}_regression_qq.png"))
    plt.close(fig)


# ============================
# LOAD DATA
# ============================
df = pd.read_csv(DATA_PATH)

print("Dataset loaded successfully")
print("Shape:", df.shape)
print("Columns:", df.columns.tolist())

section_rule = "=" * 60

# ============================
# 1. SIMPLE LINEAR REGRESSION
# Y = Contrast | X = Energy
# ============================
print(
    "\n" + section_rule,
    "SIMPLE LINEAR REGRESSION",
    "Y = Contrast | X = Energy",
    section_rule,
    sep="\n",
)

X_simple = df["Energy"]
Y_simple = df["Contrast"]

# An intercept column has to be added explicitly for statsmodels OLS.
X_simple_const = sm.add_constant(X_simple)
simple_model = sm.OLS(Y_simple, X_simple_const).fit()

print(simple_model.summary())

# Regression equation (parameters are ordered: intercept, slope)
intercept, slope = simple_model.params
print("\nRegression Equation:")
print(f"Contrast = {intercept:.4f} + {slope:.4f}(Energy)")
print(f"R¬≤ = {simple_model.rsquared:.4f}")

# Diagnostics for the simple model
residuals_simple = simple_model.resid
fitted_simple = simple_model.fittedvalues
_save_regression_diagnostics(fitted_simple, residuals_simple, "Simple", "simple")

# ============================
# 2. MULTIPLE LINEAR REGRESSION
# Y = Contrast
# X = R, H_hsv, Correlation, Energy
# ============================
print(
    "\n" + section_rule,
    "MULTIPLE LINEAR REGRESSION",
    "Y = Contrast",
    "X = R, H_hsv, Correlation, Energy",
    section_rule,
    sep="\n",
)

predictors = ["R", "H_hsv", "Correlation", "Energy"]
X_multi = df[predictors]
Y_multi = df["Contrast"]

X_multi_const = sm.add_constant(X_multi)
multi_model = sm.OLS(Y_multi, X_multi_const).fit()

print(multi_model.summary())

# Regression equation: intercept followed by one "coef(name)" term per predictor
params = multi_model.params
equation_terms = [f"{params['const']:.4f}"]
equation_terms += [f"{params[name]:.4f}({name})" for name in predictors]
print("\nRegression Equation:")
print("Contrast = " + " + ".join(equation_terms))

print(f"R¬≤ = {multi_model.rsquared:.4f}")
print(f"Adjusted R¬≤ = {multi_model.rsquared_adj:.4f}")

# Diagnostics for the multiple model
residuals_multi = multi_model.resid
fitted_multi = multi_model.fittedvalues
_save_regression_diagnostics(fitted_multi, residuals_multi, "Multiple", "multiple")

print("\nSTEP 6 completed successfully.")
print(f"All outputs saved in:\n{OUTPUT_DIR}")


Dataset loaded successfully
Shape: (24084, 6)
Columns: ['R', 'H_hsv', 'Contrast', 'Correlation', 'Energy', 'Class']

SIMPLE LINEAR REGRESSION
Y = Contrast | X = Energy
                            OLS Regression Results                            
Dep. Variable:               Contrast   R-squared:                       0.197
Model:                            OLS   Adj. R-squared:                  0.197
Method:                 Least Squares   F-statistic:                     5898.
Date:                Wed, 24 Dec 2025   Prob (F-statistic):               0.00
Time:                        15:16:28   Log-Likelihood:            -1.4876e+05
No. Observations:               24084   AIC:                         2.975e+05
Df Residuals:                   24082   BIC:                         2.975e+05
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          

In [29]:
# ============================================
# STEP 8 ‚Äî ONE-WAY ANOVA + TUKEY + VISUALIZATION
# ============================================

import pandas as pd
import scipy.stats as stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import matplotlib.pyplot as plt
import seaborn as sns
import os

# ============================
# CONFIGURATION
# ============================
DATA_PATH = r"C:\Users\Acer\Downloads\FINAL PROJECT\reduced_features_SMOTE_8028.csv"
OUTPUT_DIR = r"C:\Users\Acer\Downloads\FINAL PROJECT\STEP_8_ANOVA_RESULTS"

FEATURES = ["R", "H_hsv", "Contrast", "Correlation", "Energy"]
CLASS_COL = "Class"

# Create the results folder up front so every later save call can rely on it.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ============================
# LOAD DATA
# ============================
df = pd.read_csv(DATA_PATH)

# Numeric class code -> human-readable drying condition.
class_map = {
    0: "Underdried",
    1: "Perfectly Dried",
    2: "Overdried"
}
df["Class_Name"] = df[CLASS_COL].map(class_map)

# Series.map() leaves NaN for every code that is not a key of class_map.
# The ANOVA in this cell groups on CLASS_COL while the Tukey test and the
# boxplots group on Class_Name, so an unmapped code would make the analyses
# run on different rows. Fail fast with a clear message instead.
unmapped_codes = df.loc[df["Class_Name"].isna(), CLASS_COL].unique().tolist()
if unmapped_codes:
    raise ValueError(
        f"Column '{CLASS_COL}' contains value(s) with no entry in class_map: "
        f"{unmapped_codes}"
    )

print("Dataset loaded successfully")
print("Shape:", df.shape)

# ============================
# ONE-WAY ANOVA
# ============================
print("\n" + "="*60)
print("STEP 8 ‚Äî ONE-WAY ANOVA RESULTS")
print("="*60)

# The set of class labels is the same for every feature: resolve it once.
class_levels = sorted(df[CLASS_COL].unique())

anova_results = []
for feature in FEATURES:
    # One sample of the current feature per class, in ascending label order.
    class_samples = [
        df.loc[df[CLASS_COL] == level, feature] for level in class_levels
    ]
    F_stat, p_value = stats.f_oneway(*class_samples)

    anova_results.append(
        dict(Feature=feature, F_statistic=F_stat, p_value=p_value)
    )

    print(f"\nFeature: {feature}")
    print(f"F-statistic = {F_stat:.4f}")
    print(f"p-value     = {p_value:.6e}")

# Persist the unrounded numeric results, one row per feature.
anova_df = pd.DataFrame(anova_results)
anova_csv_path = os.path.join(OUTPUT_DIR, "anova_summary.csv")
anova_df.to_csv(anova_csv_path, index=False)

# ============================
# TUKEY POST-HOC TEST
# ============================
print("\n" + "="*60)
print("TUKEY HSD POST-HOC TEST")
print("="*60)

for feature in FEATURES:
    # Pairwise comparison of class means for this feature.
    tukey = pairwise_tukeyhsd(
        endog=df[feature], groups=df["Class_Name"], alpha=0.05
    )
    summary_table = tukey.summary()

    print(f"\nTukey results for {feature}")
    print(summary_table)

    # First row of the summary data is the header, the rest are the pairs.
    header_row, *pair_rows = summary_table.data
    tukey_df = pd.DataFrame(data=pair_rows, columns=header_row)

    csv_path = os.path.join(OUTPUT_DIR, f"tukey_{feature}.csv")
    tukey_df.to_csv(csv_path, index=False)

# ============================
# ANOVA VISUALIZATION (BOXPLOTS)
# ============================
print("\nGenerating ANOVA visualization images...")

sns.set(style="whitegrid")

# Plot the classes in the order declared in class_map rather than in the
# order they first appear in the CSV, so the x-axis layout and the colour of
# each class do not depend on how the rows happen to be sorted.
present_names = set(df["Class_Name"].dropna().unique())
class_order = [name for name in class_map.values() if name in present_names]

for feature in FEATURES:
    fig, ax = plt.subplots(figsize=(7, 5))
    # seaborn >= 0.13 deprecates `palette` without `hue`; assigning the x
    # variable to `hue` gives the same per-class colouring without the warning.
    # dodge=False keeps each box centred on its tick.
    sns.boxplot(
        x="Class_Name",
        y=feature,
        hue="Class_Name",
        data=df,
        order=class_order,
        hue_order=class_order,
        dodge=False,
        palette="Set2",
        ax=ax
    )
    # Older seaborn versions draw a legend for `hue`; it duplicates the
    # x-axis labels, so remove it when present.
    legend = ax.get_legend()
    if legend is not None:
        legend.remove()

    ax.set_title(f"ANOVA Result ‚Äî {feature}")
    ax.set_xlabel("Drying Condition")
    ax.set_ylabel(feature)
    fig.tight_layout()
    fig.savefig(
        os.path.join(OUTPUT_DIR, f"anova_boxplot_{feature}.png"),
        dpi=300
    )
    # Close explicitly so figures do not accumulate across the loop.
    plt.close(fig)

print("\nSTEP 8 completed successfully.")
print("All results and images saved in:")
print(OUTPUT_DIR)


Dataset loaded successfully
Shape: (24084, 7)

STEP 8 ‚Äî ONE-WAY ANOVA RESULTS

Feature: R
F-statistic = 29983.3792
p-value     = 0.000000e+00

Feature: H_hsv
F-statistic = 7137.3116
p-value     = 0.000000e+00

Feature: Contrast
F-statistic = 1045.6462
p-value     = 0.000000e+00

Feature: Correlation
F-statistic = 665.7887
p-value     = 3.655790e-282

Feature: Energy
F-statistic = 692.7998
p-value     = 2.875354e-293

TUKEY HSD POST-HOC TEST

Tukey results for R
           Multiple Comparison of Means - Tukey HSD, FWER=0.05            
     group1          group2      meandiff p-adj   lower     upper   reject
--------------------------------------------------------------------------
      Overdried Perfectly Dried  -93.4961   0.0  -94.5884  -92.4038   True
      Overdried      Underdried -103.4146   0.0 -104.5068 -102.3223   True
Perfectly Dried      Underdried   -9.9185   0.0  -11.0107   -8.8262   True
--------------------------------------------------------------------------

Tukey 

In [30]:
# =========================================================
# STEP 8 — ONE-WAY ANOVA + TUKEY HSD + IMAGE OUTPUTS
# =========================================================

import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import os

# ============================
# CONFIGURATION
# ============================
DATA_PATH = r"C:\Users\Acer\Downloads\FINAL PROJECT\reduced_features_SMOTE_8028.csv"
OUTPUT_DIR = r"C:\Users\Acer\Downloads\FINAL PROJECT\STEP_8_ANOVA_RESULTS"

FEATURES = ["R", "H_hsv", "Contrast", "Correlation", "Energy"]
CLASS_COL = "Class"

# Numeric class code -> human-readable drying condition.
class_map = {
    0: "Underdried",
    1: "Perfectly Dried",
    2: "Overdried"
}

# Create the results folder up front so every later save call can rely on it.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ============================
# LOAD DATA
# ============================
df = pd.read_csv(DATA_PATH)
df["Class_Name"] = df[CLASS_COL].map(class_map)

# Series.map() leaves NaN for every code that is not a key of class_map.
# The ANOVA in this cell groups on CLASS_COL while the Tukey test and the
# boxplots group on Class_Name, so an unmapped code would make the analyses
# run on different rows. Fail fast with a clear message instead.
unmapped_codes = df.loc[df["Class_Name"].isna(), CLASS_COL].unique().tolist()
if unmapped_codes:
    raise ValueError(
        f"Column '{CLASS_COL}' contains value(s) with no entry in class_map: "
        f"{unmapped_codes}"
    )

print("Dataset loaded successfully")
print("Shape:", df.shape)

# ============================
# ONE-WAY ANOVA
# ============================
# Significance level for the "Decision" column; same value as the alpha=0.05
# passed to the Tukey HSD test in this cell.
ANOVA_ALPHA = 0.05

anova_results = []

print("\n" + "="*60)
print("STEP 8 ‚Äî ONE-WAY ANOVA RESULTS")
print("="*60)

# The set of class labels is the same for every feature: resolve it once.
class_levels = sorted(df[CLASS_COL].unique())

for feature in FEATURES:
    # One sample of the current feature per class, in ascending label order.
    groups = [df.loc[df[CLASS_COL] == cls, feature] for cls in class_levels]

    F_stat, p_value = stats.f_oneway(*groups)

    # Derive the decision from the test result. It used to be hard-coded to
    # "reject", which would mislabel any feature whose p-value is not below
    # the significance level (or whose test could not be computed).
    if pd.isna(p_value):
        decision = "N/A"
    elif p_value < ANOVA_ALPHA:
        decision = "Reject H‚ÇÄ"
    else:
        decision = "Fail to reject H‚ÇÄ"

    # Values are pre-formatted as strings because this table is rendered
    # as an image, not used for further computation.
    anova_results.append([
        feature,
        f"{F_stat:.2f}",
        f"{p_value:.2e}",
        decision
    ])

    print(f"{feature}: F = {F_stat:.4f}, p = {p_value:.6e}")

anova_df = pd.DataFrame(
    anova_results,
    columns=["Feature", "F-statistic", "p-value", "Decision"]
)

# ============================
# SAVE ANOVA TABLE AS IMAGE
# ============================
# Render anova_df as a matplotlib table on an otherwise empty axes.
fig, ax = plt.subplots(figsize=(9, 3))
ax.axis('off')

anova_table = ax.table(
    cellText=anova_df.values,
    colLabels=anova_df.columns,
    loc='center',
    cellLoc='center',
)

# Fixed font size and taller rows for readability.
anova_table.auto_set_font_size(False)
anova_table.set_fontsize(11)
anova_table.scale(1, 1.6)

ax.set_title("One-Way ANOVA Summary Results", pad=20)
fig.tight_layout()

anova_image_path = os.path.join(OUTPUT_DIR, "anova_summary_table.png")
fig.savefig(anova_image_path, dpi=300, bbox_inches="tight")
plt.close(fig)

# ============================
# TUKEY HSD + IMAGE TABLES
# ============================
print("\n" + "="*60)
print("TUKEY HSD POST-HOC RESULTS")
print("="*60)

for feature in FEATURES:
    # Pairwise comparison of class means for this feature.
    tukey = pairwise_tukeyhsd(
        endog=df[feature], groups=df["Class_Name"], alpha=0.05
    )
    summary_table = tukey.summary()

    print(f"\nTukey results for {feature}")
    print(summary_table)

    # First row of the summary data is the header, the rest are the pairs.
    header_row, *pair_rows = summary_table.data
    tukey_df = pd.DataFrame(data=pair_rows, columns=header_row)

    # Render the pairwise table on an otherwise empty axes and save it as PNG.
    fig, ax = plt.subplots(figsize=(10, 3))
    ax.axis('off')

    table = ax.table(
        cellText=tukey_df.values,
        colLabels=tukey_df.columns,
        loc='center',
        cellLoc='center',
    )
    table.auto_set_font_size(False)
    table.set_fontsize(10)
    table.scale(1, 1.6)

    ax.set_title(f"Tukey HSD Post-Hoc Results ‚Äî {feature}", pad=20)
    fig.tight_layout()

    image_path = os.path.join(OUTPUT_DIR, f"tukey_{feature}.png")
    fig.savefig(image_path, dpi=300, bbox_inches="tight")
    plt.close(fig)

# ============================
# ANOVA VISUALIZATION (BOXPLOTS)
# ============================
sns.set(style="whitegrid")

# Plot the classes in the order declared in class_map rather than in the
# order they first appear in the CSV, so the x-axis layout and the colour of
# each class do not depend on how the rows happen to be sorted.
present_names = set(df["Class_Name"].dropna().unique())
class_order = [name for name in class_map.values() if name in present_names]

for feature in FEATURES:
    fig, ax = plt.subplots(figsize=(7, 5))
    # seaborn >= 0.13 deprecates `palette` without `hue`; assigning the x
    # variable to `hue` gives the same per-class colouring without the warning.
    # dodge=False keeps each box centred on its tick.
    sns.boxplot(
        x="Class_Name",
        y=feature,
        hue="Class_Name",
        data=df,
        order=class_order,
        hue_order=class_order,
        dodge=False,
        palette="Set2",
        ax=ax
    )
    # Older seaborn versions draw a legend for `hue`; it duplicates the
    # x-axis labels, so remove it when present.
    legend = ax.get_legend()
    if legend is not None:
        legend.remove()

    ax.set_title(f"ANOVA Result ‚Äî {feature}")
    ax.set_xlabel("Drying Condition")
    ax.set_ylabel(feature)
    fig.tight_layout()
    fig.savefig(
        os.path.join(OUTPUT_DIR, f"anova_boxplot_{feature}.png"),
        dpi=300
    )
    # Close explicitly so figures do not accumulate across the loop.
    plt.close(fig)

print("\nSTEP 8 completed successfully.")
print("All results and images saved in:")
print(OUTPUT_DIR)


Dataset loaded successfully
Shape: (24084, 7)

STEP 8 ‚Äî ONE-WAY ANOVA RESULTS
R: F = 29983.3792, p = 0.000000e+00
H_hsv: F = 7137.3116, p = 0.000000e+00
Contrast: F = 1045.6462, p = 0.000000e+00
Correlation: F = 665.7887, p = 3.655790e-282
Energy: F = 692.7998, p = 2.875354e-293

TUKEY HSD POST-HOC RESULTS

Tukey results for R
           Multiple Comparison of Means - Tukey HSD, FWER=0.05            
     group1          group2      meandiff p-adj   lower     upper   reject
--------------------------------------------------------------------------
      Overdried Perfectly Dried  -93.4961   0.0  -94.5884  -92.4038   True
      Overdried      Underdried -103.4146   0.0 -104.5068 -102.3223   True
Perfectly Dried      Underdried   -9.9185   0.0  -11.0107   -8.8262   True
--------------------------------------------------------------------------

Tukey results for H_hsv
          Multiple Comparison of Means - Tukey HSD, FWER=0.05          
     group1          group2     meandiff p-adj 

In [31]:
# =========================================================
# STEP 3 — DESCRIPTIVE STATISTICS (IMPROVED BOXPLOTS)
# =========================================================

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

# ============================
# CONFIGURATION
# ============================
DATA_PATH = r"C:\Users\Acer\Downloads\FINAL PROJECT\reduced_features_SMOTE_8028.csv"
OUTPUT_DIR = r"C:\Users\Acer\Downloads\FINAL PROJECT\STEP_3_Descriptive_Statistics"

FEATURES = ["R", "H_hsv", "Contrast", "Correlation", "Energy"]
CLASS_COL = "Class"

# Numeric class code -> human-readable drying condition.
class_map = {
    0: "Underdried",
    1: "Perfectly Dried",
    2: "Overdried"
}

# Create the results folder up front so every later save call can rely on it.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ============================
# LOAD DATA
# ============================
df = pd.read_csv(DATA_PATH)
df["Class_Name"] = df[CLASS_COL].map(class_map)

# Series.map() leaves NaN for every code that is not a key of class_map, and
# groupby() drops NaN keys by default, so rows with an unmapped code would
# silently vanish from the statistics below. Fail fast with a clear message.
unmapped_codes = df.loc[df["Class_Name"].isna(), CLASS_COL].unique().tolist()
if unmapped_codes:
    raise ValueError(
        f"Column '{CLASS_COL}' contains value(s) with no entry in class_map: "
        f"{unmapped_codes}"
    )

# ============================
# DESCRIPTIVE STATISTICS
# ============================
# describe() per class for every feature; the result has one row per class
# and (feature, statistic) MultiIndex columns.
desc_stats = df.groupby("Class_Name")[FEATURES].describe()
desc_stats.to_csv(os.path.join(OUTPUT_DIR, "descriptive_statistics_by_class.csv"))

# ============================
# CLEAR BOXPLOTS
# ============================
sns.set(style="whitegrid")

# Plot the classes in the order declared in class_map rather than in the
# order they first appear in the CSV, so the x-axis layout and the colour of
# each class do not depend on how the rows happen to be sorted.
present_names = set(df["Class_Name"].dropna().unique())
class_order = [name for name in class_map.values() if name in present_names]

for feature in FEATURES:
    fig, ax = plt.subplots(figsize=(8, 6))

    # seaborn >= 0.13 deprecates `palette` without `hue`; assigning the x
    # variable to `hue` gives the same per-class colouring without the warning.
    # dodge=False keeps each box centred on its tick.
    sns.boxplot(
        x="Class_Name",
        y=feature,
        hue="Class_Name",
        data=df,
        order=class_order,
        hue_order=class_order,
        dodge=False,
        width=0.5,
        showfliers=False,          # hide outlier points so the box itself stays readable
        linewidth=2,               # thicker box lines
        palette="Set2",
        ax=ax
    )
    # Older seaborn versions draw a legend for `hue`; it duplicates the
    # x-axis labels, so remove it when present.
    legend = ax.get_legend()
    if legend is not None:
        legend.remove()

    ax.set_title(f"Boxplot of {feature} by Drying Class", fontsize=14)
    ax.set_xlabel("Drying Class", fontsize=12)
    ax.set_ylabel(feature, fontsize=12)

    fig.tight_layout()
    fig.savefig(
        os.path.join(OUTPUT_DIR, f"boxplot_{feature}.png"),
        dpi=300
    )
    # Close explicitly so figures do not accumulate across the loop.
    plt.close(fig)

print("STEP 3 completed successfully.")
print("Descriptive statistics and clear boxplots saved in:")
print(OUTPUT_DIR)


STEP 3 completed successfully.
Descriptive statistics and clear boxplots saved in:
C:\Users\Acer\Downloads\FINAL PROJECT\STEP_3_Descriptive_Statistics
