# Modeling


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, make_scorer
import numpy as np

# Read the training and test tables from the working directory
train_data = pd.read_csv(filepath_or_buffer='train.csv')
test_data = pd.read_csv(filepath_or_buffer='test.csv')

# Keep the test-set Ids aside before the frame is preprocessed
test_ids = test_data['Id']

# Preprocess data function
def preprocess_data(df, reference=None):
    """Impute missing values and integer-encode the text columns of a frame.

    Numeric columns get their NaNs replaced by a column median. Object-dtype
    columns get their NaNs replaced by a column mode and are then converted
    to integer category codes.

    The medians, modes and category vocabularies are taken from
    ``reference`` when it is given, otherwise from ``df`` itself. Encoding
    two frames independently gives each frame its own code table (codes
    follow the categories present in that frame), so a held-out frame should
    be processed with the raw training frame as ``reference``.

    Args:
        df: Frame to preprocess. It is left untouched; a processed copy is
            returned.
        reference: Optional *raw* (not yet encoded) frame supplying the
            statistics and vocabularies. It must contain every numeric and
            object column of ``df``; extra columns are ignored.

    Returns:
        A new DataFrame with the same columns as ``df``. Text values that are
        still missing after imputation, or that do not occur in the
        vocabulary, are encoded as -1.

    Raises:
        KeyError: If ``reference`` lacks a column that has to be processed.
    """
    # Work on a copy so the caller's frame (and any reference to it) stays raw
    df = df.copy()
    source = df if reference is None else reference

    # Handling missing values for numeric columns
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) > 0:
        df[numeric_cols] = df[numeric_cols].fillna(source[numeric_cols].median())

    # Handling missing values for categorical columns, then encoding them.
    # Looping per column also makes a frame without text columns a no-op
    # instead of failing on an empty mode table.
    categorical_cols = df.select_dtypes(include=['object']).columns
    for col in categorical_cols:
        source_col = source[col]
        filled = df[col]

        modes = source_col.mode()
        if not modes.empty:  # empty when the source column is entirely NaN
            fill_value = modes.iloc[0]
            filled = filled.fillna(fill_value)
            source_col = source_col.fillna(fill_value)

        # Fix the code table from the source column so that the same label
        # always maps to the same integer
        vocabulary = source_col.astype('category').cat.categories
        df[col] = pd.Categorical(filled, categories=vocabulary).codes

    return df

# Preprocess the data. The test frame goes first, while train_data is still
# raw, so that it is imputed and encoded with the training medians, modes and
# category codes rather than with its own.
test_data = preprocess_data(test_data, reference=train_data)
train_data = preprocess_data(train_data)

# Separate features and target
X = train_data.drop(['SalePrice', 'Id'], axis=1)
y = train_data['SalePrice']
X_test = test_data.drop(['Id'], axis=1)

# Split the data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Candidate regressors, keyed by the label shown in the results table
models = dict(
    [
        ("Linear Regression", LinearRegression()),
        ("Random Forest", RandomForestRegressor(random_state=42)),
        ("Gradient Boosting", GradientBoostingRegressor(random_state=42)),
    ]
)

# Function to evaluate models using cross-validation with RMSE
def evaluate_model(model, X, y, cv=5):
    """Cross-validate ``model`` and score every fold by RMSE.

    Args:
        model: Estimator handed to ``cross_validate``.
        X: Feature matrix.
        y: Target values.
        cv: Cross-validation setting forwarded to ``cross_validate``
            (default 5).

    Returns:
        The dict returned by ``cross_validate``; its ``'test_score'`` entry
        holds one RMSE per fold (positive, lower is better).
    """
    def rmse(y_true, y_pred):
        # Take the root explicitly instead of passing ``squared=False``:
        # that keyword was deprecated in scikit-learn 1.4 and is slated for
        # removal in 1.6, where it would raise a TypeError.
        return mean_squared_error(y_true, y_pred) ** 0.5

    scoring = make_scorer(rmse)
    scores = cross_validate(model, X, y, cv=cv, scoring=scoring, return_train_score=False)
    return scores

# Evaluate each model using cross-validation on the training split
results = {}
for model_name, model in models.items():
    scores = evaluate_model(model, X_train, y_train)
    results[model_name] = {
        'mean_rmse': np.mean(scores['test_score']),
        'std_rmse': np.std(scores['test_score']),
    }

# Display the cross-validation results (one row per model)
results_df = pd.DataFrame(results).T
print(results_df)

# Choose the best model based on cross-validation results (lowest mean RMSE)
best_model_name = results_df['mean_rmse'].idxmin()
best_model = models[best_model_name]

# Retrain the best model on the entire training set. Fitting on X_train
# alone would discard the 20% of labelled rows held out in X_val/y_val,
# so the final fit uses the full X, y.
best_model.fit(X, y)

# Predict on the test dataset using the best model
test_predictions = best_model.predict(X_test)

# Create a DataFrame to store the predictions
test_predictions_df = pd.DataFrame({'Id': test_ids, 'SalePrice': test_predictions})

# Save the predictions to a CSV file
test_predictions_df.to_csv('test_predictions.csv', index=False)

print("Predictions saved to test_predictions.csv")




                      mean_rmse      std_rmse
Linear Regression  41574.502372  10913.318398
Random Forest      30410.103464   4977.277022
Gradient Boosting  28766.667361   4344.554704
Predictions saved to test_predictions.csv


### Alternative approach: Using Grid Search for Hyperparameter Tuning

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, make_scorer
import numpy as np

# Read the training and test tables from the working directory
train_data = pd.read_csv(filepath_or_buffer='train.csv')
test_data = pd.read_csv(filepath_or_buffer='test.csv')

# Keep the test-set Ids aside before the frame is preprocessed
test_ids = test_data['Id']

# Preprocess data function
def preprocess_data(df, reference=None):
    """Impute missing values and integer-encode the text columns of a frame.

    Numeric columns get their NaNs replaced by a column median. Object-dtype
    columns get their NaNs replaced by a column mode and are then converted
    to integer category codes.

    The medians, modes and category vocabularies are taken from
    ``reference`` when it is given, otherwise from ``df`` itself. Encoding
    two frames independently gives each frame its own code table (codes
    follow the categories present in that frame), so a held-out frame should
    be processed with the raw training frame as ``reference``.

    Args:
        df: Frame to preprocess. It is left untouched; a processed copy is
            returned.
        reference: Optional *raw* (not yet encoded) frame supplying the
            statistics and vocabularies. It must contain every numeric and
            object column of ``df``; extra columns are ignored.

    Returns:
        A new DataFrame with the same columns as ``df``. Text values that are
        still missing after imputation, or that do not occur in the
        vocabulary, are encoded as -1.

    Raises:
        KeyError: If ``reference`` lacks a column that has to be processed.
    """
    # Work on a copy so the caller's frame (and any reference to it) stays raw
    df = df.copy()
    source = df if reference is None else reference

    # Handling missing values for numeric columns
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) > 0:
        df[numeric_cols] = df[numeric_cols].fillna(source[numeric_cols].median())

    # Handling missing values for categorical columns, then encoding them.
    # Looping per column also makes a frame without text columns a no-op
    # instead of failing on an empty mode table.
    categorical_cols = df.select_dtypes(include=['object']).columns
    for col in categorical_cols:
        source_col = source[col]
        filled = df[col]

        modes = source_col.mode()
        if not modes.empty:  # empty when the source column is entirely NaN
            fill_value = modes.iloc[0]
            filled = filled.fillna(fill_value)
            source_col = source_col.fillna(fill_value)

        # Fix the code table from the source column so that the same label
        # always maps to the same integer
        vocabulary = source_col.astype('category').cat.categories
        df[col] = pd.Categorical(filled, categories=vocabulary).codes

    return df

# Preprocess the data. The test frame goes first, while train_data is still
# raw, so that it is imputed and encoded with the training medians, modes and
# category codes rather than with its own.
test_data = preprocess_data(test_data, reference=train_data)
train_data = preprocess_data(train_data)

# Separate features and target
X = train_data.drop(['SalePrice', 'Id'], axis=1)
y = train_data['SalePrice']
X_test = test_data.drop(['Id'], axis=1)

# Split the data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Candidate regressors, keyed by the label shown in the results table
models = dict(
    [
        ("Linear Regression", LinearRegression()),
        ("Random Forest", RandomForestRegressor(random_state=42)),
        ("Gradient Boosting", GradientBoostingRegressor(random_state=42)),
    ]
)

# Search space for the Random Forest
param_grid_rf = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Search space for Gradient Boosting
param_grid_gb = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 4, 5],
)

# NOTE(review): both searches are scored by negated mean squared log error,
# while the models are later ranked by RMSE -- confirm the mismatch is intended.

# 5-fold grid search over the Random Forest space
grid_search_rf = GridSearchCV(
    estimator=models["Random Forest"],
    param_grid=param_grid_rf,
    cv=5,
    scoring='neg_mean_squared_log_error',
    n_jobs=-1,
)
best_params_rf = grid_search_rf.fit(X_train, y_train).best_params_
print("Best parameters for Random Forest: ", best_params_rf)

# 5-fold grid search over the Gradient Boosting space
grid_search_gb = GridSearchCV(
    estimator=models["Gradient Boosting"],
    param_grid=param_grid_gb,
    cv=5,
    scoring='neg_mean_squared_log_error',
    n_jobs=-1,
)
best_params_gb = grid_search_gb.fit(X_train, y_train).best_params_
print("Best parameters for Gradient Boosting: ", best_params_gb)

# Swap in fresh, unfitted estimators configured with the winning parameters
models.update(
    {
        "Random Forest": RandomForestRegressor(random_state=42, **best_params_rf),
        "Gradient Boosting": GradientBoostingRegressor(random_state=42, **best_params_gb),
    }
)

# Function to evaluate models using cross-validation with RMSE
def evaluate_model(model, X, y, cv=5):
    """Cross-validate ``model`` and score every fold by RMSE.

    Args:
        model: Estimator handed to ``cross_validate``.
        X: Feature matrix.
        y: Target values.
        cv: Cross-validation setting forwarded to ``cross_validate``
            (default 5).

    Returns:
        The dict returned by ``cross_validate``; its ``'test_score'`` entry
        holds one RMSE per fold (positive, lower is better).
    """
    def rmse(y_true, y_pred):
        # Take the root explicitly instead of passing ``squared=False``:
        # that keyword was deprecated in scikit-learn 1.4 and is slated for
        # removal in 1.6, where it would raise a TypeError.
        return mean_squared_error(y_true, y_pred) ** 0.5

    scoring = make_scorer(rmse)
    scores = cross_validate(model, X, y, cv=cv, scoring=scoring, return_train_score=False)
    return scores

# Evaluate each model using cross-validation on the training split
results = {}
for model_name, model in models.items():
    scores = evaluate_model(model, X_train, y_train)
    results[model_name] = {
        'mean_rmse': np.mean(scores['test_score']),
        'std_rmse': np.std(scores['test_score']),
    }

# Display the cross-validation results (one row per model)
results_df = pd.DataFrame(results).T
print(results_df)

# Choose the best model based on cross-validation results (lowest mean RMSE)
best_model_name = results_df['mean_rmse'].idxmin()
best_model = models[best_model_name]

# Retrain the best model on the entire training set. Fitting on X_train
# alone would discard the 20% of labelled rows held out in X_val/y_val,
# so the final fit uses the full X, y.
best_model.fit(X, y)

# Predict on the test dataset using the best model
test_predictions = best_model.predict(X_test)

# Create a DataFrame to store the predictions
test_predictions_df = pd.DataFrame({'Id': test_ids, 'SalePrice': test_predictions})

# Save the predictions to a CSV file
test_predictions_df.to_csv('test_predictions.csv', index=False)

print("Predictions saved to test_predictions.csv")