In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns

# Detect and Handle Outliers
def remove_outliers(df, columns, threshold=3):
    """Drop rows whose absolute z-score in any listed column reaches ``threshold``.

    Columns are processed one after another, so the mean and standard
    deviation of each column are computed on the rows that survived the
    filtering of the previous columns.

    Args:
        df: DataFrame to filter. It is not modified in place.
        columns: Iterable of numeric column labels to check.
        threshold: Rows with ``|z| >= threshold`` in a column are removed.

    Returns:
        The filtered DataFrame.
    """
    for col in columns:
        std = df[col].std()
        # A constant (or single-row/empty) column has a zero or NaN standard
        # deviation. Its z-scores would all be NaN, and since NaN < threshold
        # is False the filter would silently discard every row. Such a column
        # cannot contain outliers, so it is skipped. `not std > 0` is True for
        # both 0 and NaN.
        if not std > 0:
            continue
        z_scores = np.abs((df[col] - df[col].mean()) / std)
        df = df[z_scores < threshold]
    return df

# Linear Regression
def linear_regression_model(X_train, y_train, X_test, y_test):
    """Fit a LinearRegression model and print its test-set metrics.

    Args:
        X_train: Training feature matrix.
        y_train: Training target values.
        X_test: Test feature matrix.
        y_test: Test target values used to score the predictions.

    Returns:
        Tuple ``(model, y_pred)``: the fitted estimator and its predictions
        for ``X_test``.
    """
    # fit() returns the estimator itself, so construction and fitting chain.
    model = LinearRegression().fit(X_train, y_train)
    y_pred = model.predict(X_test)

    test_mse = mean_squared_error(y_test, y_pred)
    test_r2 = r2_score(y_test, y_pred)
    test_mae = mean_absolute_error(y_test, y_pred)
    print(f"Linear Regression - MSE: {test_mse:.4f}, R2: {test_r2:.4f}, MAE: {test_mae:.4f}")

    return model, y_pred

# Random Forest with Hyperparameter Tuning
def random_forest_model(X_train, y_train, X_test, y_test):
    """Tune a RandomForestRegressor by grid search and print its test metrics.

    The grid below has 3 * 4 * 3 * 3 = 108 candidates; with 5-fold
    cross-validation that is 540 forest fits. The search therefore runs on all
    available CPU cores (``n_jobs=-1``). The forest's ``random_state`` is
    fixed, so parallel execution does not change the selected model.

    Args:
        X_train: Training feature matrix.
        y_train: Training target values.
        X_test: Test feature matrix.
        y_test: Test target values used to score the predictions.

    Returns:
        Tuple ``(model, y_pred)``: the best estimator found by the search and
        its predictions for ``X_test``.
    """
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }
    grid_search = GridSearchCV(
        RandomForestRegressor(random_state=42),
        param_grid,
        cv=5,
        scoring='r2',
        n_jobs=-1,  # candidate/fold fits are independent; run them in parallel
    )
    grid_search.fit(X_train, y_train)

    model = grid_search.best_estimator_
    y_pred = model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)

    print(f"Random Forest - MSE: {mse:.4f}, R2: {r2:.4f}, MAE: {mae:.4f}")
    return model, y_pred

# Support Vector Regression with Hyperparameter Tuning
def svr_model(X_train, y_train, X_test, y_test):
    """Tune an SVR by grid search and print its test metrics.

    The grid is split per kernel because ``gamma`` has no effect on the linear
    kernel; crossing it with ``kernel='linear'`` would fit every linear
    candidate twice for identical scores. Linear candidates keep SVR's default
    ``gamma``. The search runs on all available CPU cores.

    Args:
        X_train: Training feature matrix.
        y_train: Training target values.
        X_test: Test feature matrix.
        y_test: Test target values used to score the predictions.

    Returns:
        Tuple ``(model, y_pred)``: the best estimator found by the search and
        its predictions for ``X_test``.
    """
    c_values = [0.1, 1, 10]
    param_grid = [
        {'kernel': ['rbf'], 'C': c_values, 'gamma': ['scale', 'auto']},
        {'kernel': ['linear'], 'C': c_values},
    ]
    grid_search = GridSearchCV(SVR(), param_grid, cv=5, scoring='r2', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    model = grid_search.best_estimator_
    y_pred = model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)

    print(f"SVR - MSE: {mse:.4f}, R2: {r2:.4f}, MAE: {mae:.4f}")
    return model, y_pred

# Plot Actual vs Predicted
def plot_actual_vs_predicted(y_test, y_pred, title):
    """Show a scatter plot of predicted against actual target values.

    A dashed red diagonal spanning the range of ``y_test`` marks where
    perfect predictions would fall.

    Args:
        y_test: Actual target values (x-axis); must provide ``min()``/``max()``.
        y_pred: Predicted target values (y-axis).
        title: Text appended to the "Actual vs Predicted - " figure title.
    """
    plt.figure(figsize=(10, 6))
    plt.scatter(y_test, y_pred, alpha=0.5)

    # Reference line y = x across the observed range of actual values.
    diagonal = [y_test.min(), y_test.max()]
    plt.plot(diagonal, diagonal, 'r--', lw=2)

    plt.xlabel("Actual")
    plt.ylabel("Predicted")
    plt.title(f"Actual vs Predicted - {title}")
    plt.show()

# Plot Feature Importance for Random Forest
def plot_feature_importance(model, features, title):
    """Show a bar chart of a model's feature importances, largest first.

    If ``model`` has no ``feature_importances_`` attribute, a message is
    printed and nothing is plotted.

    Args:
        model: Fitted estimator, ideally exposing ``feature_importances_``.
        features: Feature names, indexable by the positions of the importances.
        title: Text appended to the "Feature Importance - " figure title.
    """
    if not hasattr(model, 'feature_importances_'):
        print(f"Feature importance not available for {title}")
        return

    importances = model.feature_importances_
    # argsort is ascending; reverse it so the most important feature is first.
    ranking = np.argsort(importances)[::-1]

    plt.figure(figsize=(10, 6))
    plt.title(f"Feature Importance - {title}")
    plt.bar(range(len(features)), importances[ranking])
    ranked_names = [features[pos] for pos in ranking]
    plt.xticks(range(len(features)), ranked_names, rotation=90)
    plt.show()

# Cross-Validation Function
def cross_validate_model(model, X, y, cv=5):
    """Score ``model`` with R² cross-validation and print mean ± std.

    Args:
        model: Estimator passed to ``cross_val_score``.
        X: Feature matrix.
        y: Target values.
        cv: Cross-validation setting forwarded to ``cross_val_score``.

    Returns:
        The array of per-fold R² scores.
    """
    fold_scores = cross_val_score(model, X, y, cv=cv, scoring='r2')
    average, spread = fold_scores.mean(), fold_scores.std()
    print(f"Cross-validated R2: {average:.4f} ± {spread:.4f}")
    return fold_scores

# Load and preprocess the data
df = pd.read_csv("apple_quality.csv")
df = df.drop("A_id", axis=1)
# Drop the row labelled 4000.
# NOTE(review): presumably a non-data trailer row in this CSV — confirm.
# errors="ignore" avoids a KeyError when the file has no such row.
df = df.drop(index=4000, errors="ignore")

# Convert numeric columns to float, replacing non-numeric values with NaN
numeric_columns = ['Size', 'Weight', 'Sweetness', 'Crunchiness', 'Juiciness', 'Ripeness', 'Acidity']
for col in numeric_columns:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Remove rows with NaN values (including values coerced to NaN above)
df = df.dropna()
# Remove rows that are z-score outliers in any numeric column.
# No skew transformation is applied at this step.
df = remove_outliers(df, numeric_columns)

# Choose the predictor columns and the regression target (Sweetness)
features = ['Size', 'Weight', 'Crunchiness', 'Juiciness', 'Ripeness', 'Acidity']
y_sweetness = df['Sweetness']
X = df[features]

# Hold out 20% of the rows for testing; the seed keeps the split reproducible
X_train, X_test, y_train_sweetness, y_test_sweetness = train_test_split(
    X,
    y_sweetness,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training split only, then apply it to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit each regressor on the scaled training split and print its test metrics
print("### Sweetness Prediction ###")
lr_model_sweetness, lr_pred_sweetness = linear_regression_model(
    X_train=X_train_scaled,
    y_train=y_train_sweetness,
    X_test=X_test_scaled,
    y_test=y_test_sweetness,
)
rf_model_sweetness, rf_pred_sweetness = random_forest_model(
    X_train=X_train_scaled,
    y_train=y_train_sweetness,
    X_test=X_test_scaled,
    y_test=y_test_sweetness,
)
svr_model_sweetness, svr_pred_sweetness = svr_model(
    X_train=X_train_scaled,
    y_train=y_train_sweetness,
    X_test=X_test_scaled,
    y_test=y_test_sweetness,
)

# Actual-vs-predicted scatter plot for every model
plot_actual_vs_predicted(
    y_test=y_test_sweetness, y_pred=lr_pred_sweetness, title="Linear Regression - Sweetness"
)
plot_actual_vs_predicted(
    y_test=y_test_sweetness, y_pred=rf_pred_sweetness, title="Random Forest - Sweetness"
)
plot_actual_vs_predicted(
    y_test=y_test_sweetness, y_pred=svr_pred_sweetness, title="SVR - Sweetness"
)

# Feature ranking according to the tuned random forest
plot_feature_importance(
    model=rf_model_sweetness, features=features, title="Random Forest - Sweetness"
)

# R2 cross-validation of each fitted model on the scaled training split
cross_validate_model(model=lr_model_sweetness, X=X_train_scaled, y=y_train_sweetness)
cross_validate_model(model=rf_model_sweetness, X=X_train_scaled, y=y_train_sweetness)
cross_validate_model(model=svr_model_sweetness, X=X_train_scaled, y=y_train_sweetness)

### Sweetness Prediction ###
Linear Regression - MSE: 2.6971, R2: 0.3150, MAE: 1.3103


KeyboardInterrupt: 

In [None]:
from sklearn.decomposition import PCA

# Project the scaled features onto the principal components that together
# explain 95% of the variance; the projection is fitted on the training split
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Re-run the three regressors on the PCA-transformed features
lr_model_pca, lr_pred_pca = linear_regression_model(
    X_train=X_train_pca,
    y_train=y_train_sweetness,
    X_test=X_test_pca,
    y_test=y_test_sweetness,
)
rf_model_pca, rf_pred_pca = random_forest_model(
    X_train=X_train_pca,
    y_train=y_train_sweetness,
    X_test=X_test_pca,
    y_test=y_test_sweetness,
)
svr_model_pca, svr_pred_pca = svr_model(
    X_train=X_train_pca,
    y_train=y_train_sweetness,
    X_test=X_test_pca,
    y_test=y_test_sweetness,
)

# Actual-vs-predicted plot for the linear model trained on PCA features
plot_actual_vs_predicted(
    y_test=y_test_sweetness,
    y_pred=lr_pred_pca,
    title="Linear Regression with PCA - Sweetness",
)

