# In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm
from statsmodels.stats.diagnostic import linear_rainbow
import warnings
# Silence every warning for the rest of the session: the 'ignore' action is
# installed with no category or module restriction, so nothing is exempt.
warnings.filterwarnings('ignore')

# Read the tips dataset from its CSV export
df = pd.read_csv('/mnt/data/tips (data for regression problem).csv')

# 1. Data Analysis and Visualization

# Tip amount plotted against the bill it was paid on
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x='total_bill', y='tip')
plt.title('Total Bill vs Tip Amount')
plt.show()

# Annotated heatmap of the pairwise correlations between the numeric columns
numeric_cols = ['total_bill', 'tip', 'size']
plt.figure(figsize=(10, 8))
correlation_matrix = df[numeric_cols].corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

# Scatter/histogram grid over the same numeric columns
sns.pairplot(df[numeric_cols])
plt.show()

# One tip box plot per categorical column, filling the 2x2 grid row by row
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
for box_ax, category in zip(axes.flat, ['sex', 'smoker', 'day', 'time']):
    sns.boxplot(data=df, x=category, y='tip', ax=box_ax)
plt.tight_layout()
plt.show()

# 2. Data Preprocessing
# The tip is the regression target; every other column is a predictor
y = df['tip']
X = df.drop(columns='tip')

# Columns grouped by the kind of preprocessing they receive
numeric_features = ['total_bill', 'size']
categorical_features = ['sex', 'smoker', 'day', 'time']

# Numeric columns are standardised; categorical columns are one-hot encoded
# into a dense array with the first level of each feature dropped
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(sparse_output=False, drop='first')

# Route each column group to its transformer. The 'num' and 'cat' keys are
# looked up again later through named_transformers_.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Hold out 20% of the rows for testing, with a fixed seed for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 3. Linearity Check - Rainbow Test
# Fit an OLS of tip on the numeric predictors (plus an explicit constant
# column) over the full dataset, then run the rainbow test on that fit.
X_const = sm.add_constant(X[numeric_features])
model = sm.OLS(y, X_const).fit()
rainbow_stat, rainbow_p_val = linear_rainbow(model)

print("Rainbow Test:")
print("Statistic:", rainbow_stat)
print("p-value:", rainbow_p_val)

# A p-value above 0.05 is reported as no significant departure from linearity
print(
    "No significant departure from linearity."
    if rainbow_p_val > 0.05
    else "Significant non-linearity detected."
)
# 4. Model Building and Evaluation with Hyperparameter Tuning
def evaluate_model(model, name, param_grid=None):
    """Fit one regressor behind the shared preprocessor and summarise its performance.

    The function reads the module-level ``preprocessor``, ``X_train``,
    ``X_test``, ``y_train``, ``y_test``, ``X`` and ``y``.

    Args:
        model: Unfitted scikit-learn regressor, used as the ``'regressor'``
            step of the pipeline.
        name: Label stored under ``'Model'`` in the returned summary.
        param_grid: Optional ``GridSearchCV`` parameter grid, with keys
            addressed through the pipeline (e.g. ``'regressor__alpha'``).
            When falsy, the pipeline is fitted once with the model's
            current parameters.

    Returns:
        dict with keys ``'Model'``, ``'MSE'`` and ``'R2 Score'`` (both on the
        held-out test split), ``'CV Score Mean'`` and ``'CV Score Std'``
        (5-fold R2 over the full ``X``/``y``), and ``'Best Params'``
        (``None`` when no grid search was run).
    """
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])

    if param_grid:
        # Tune on the training split only; best_estimator_ is the pipeline
        # refitted on all of X_train with the winning parameters.
        grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='r2')
        grid_search.fit(X_train, y_train)
        final_pipeline = grid_search.best_estimator_
        best_params = grid_search.best_params_
    else:
        # No grid supplied: a single fit with the model's current parameters
        final_pipeline = pipeline.fit(X_train, y_train)
        best_params = None

    # Hold-out metrics for the estimator that will be reported
    y_pred = final_pipeline.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Cross-validate the same (tuned) configuration that produced the metrics
    # above. GridSearchCV works on clones, so `pipeline` itself still carries
    # the default hyperparameters; scoring it would make the CV columns
    # describe a different model than 'Best Params' does. cross_val_score
    # clones its estimator, so passing the fitted pipeline is safe.
    # NOTE: this CV runs over the full X/y, which includes the test rows.
    cv_scores = cross_val_score(final_pipeline, X, y, cv=5, scoring='r2')

    return {
        'Model': name,
        'MSE': mse,
        'R2 Score': r2,
        'CV Score Mean': cv_scores.mean(),
        'CV Score Std': cv_scores.std(),
        'Best Params': best_params
    }

# Candidate regressors, each paired with the grid to search over
# (None means the model is fitted once without tuning). Grid keys are
# prefixed with 'regressor__' to reach the pipeline's regressor step.
models = {
    'Linear Regression': (LinearRegression(), None),
    'Ridge Regression': (
        Ridge(),
        {'regressor__alpha': [0.1, 1.0, 10.0]},
    ),
    'Lasso Regression': (
        Lasso(),
        {'regressor__alpha': [0.01, 0.1, 1.0]},
    ),
    'Decision Tree': (
        DecisionTreeRegressor(random_state=42),
        {'regressor__max_depth': [3, 5, 10, None]},
    ),
    'Random Forest': (
        RandomForestRegressor(random_state=42),
        {
            'regressor__n_estimators': [50, 100, 150],
            'regressor__max_depth': [3, 5, 10, None],
        },
    ),
    'SVR': (
        SVR(),
        {'regressor__C': [0.1, 1, 10], 'regressor__gamma': ['scale', 'auto']},
    ),
    'KNN': (
        KNeighborsRegressor(),
        {'regressor__n_neighbors': [3, 5, 7, 10]},
    ),
}

# Run every candidate through evaluate_model, collecting one summary dict each
results = []
for name, (model, param_grid) in models.items():
    results.append(evaluate_model(model, name, param_grid))

# Tabulate the summaries, one row per model
results_df = pd.DataFrame(results)
print("\nModel Performance Comparison:")
print(results_df)

# 5. Feature Importance Analysis (using Random Forest)
# Fit a fresh random forest behind the shared preprocessor on the training split
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

rf_pipeline.fit(X_train, y_train)
feature_importances = rf_pipeline.named_steps['regressor'].feature_importances_

# Get feature names after preprocessing. The ColumnTransformer emits the
# 'num' columns first and the one-hot 'cat' columns second. Asking the fitted
# encoder for its output names (rather than slicing categories_ by hand)
# keeps the labels aligned with the columns it actually produced, including
# whichever level its `drop` setting removed.
fitted_encoder = (rf_pipeline.named_steps['preprocessor']
                  .named_transformers_['cat'])
feature_names = numeric_features + list(
    fitted_encoder.get_feature_names_out(categorical_features)
)

# Plot feature importance, most important feature at the top
plt.figure(figsize=(12, 6))
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
importance_df = importance_df.sort_values('Importance', ascending=False)
sns.barplot(data=importance_df, x='Importance', y='Feature')
plt.title('Feature Importance Analysis')
plt.show()

# 6. Residual Analysis for Best Model (Random Forest)
# The random forest pipeline fitted above is used directly here; it is not
# selected by looking at results_df.
best_model = rf_pipeline
y_pred = best_model.predict(X_test)
residuals = y_test - y_pred

# Residuals against predictions, with a dashed red zero line for reference
plt.figure(figsize=(10, 6))
plt.scatter(x=y_pred, y=residuals)
plt.axhline(0, linestyle='--', color='r')
plt.title('Residual Plot')
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
plt.show()
