<a href="https://colab.research.google.com/github/daisysong76/AI--Machine--learning/blob/main/KaggleX_Skill_Assessment_Challenge_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Step 1: Setup and Data Loading
from google.colab import drive
import pandas as pd
import numpy as np

# Make the Drive contents visible under /content/drive
drive.mount('/content/drive')

# Read the three competition CSVs in one pass; the order of the paths
# matches the order of the names on the left-hand side.
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/My Drive/path/to/train.csv',
        '/content/drive/My Drive/path/to/test.csv',
        '/content/drive/My Drive/path/to/sample_submission.csv',
    )
)

# Peek at the first rows of the training data
train.head()

# Step 2: Data Preprocessing
# Check for missing values
train.isnull().sum()

# Handle missing values: impute numeric columns with the TRAIN medians.
# numeric_only=True is required because the object (categorical) columns are
# still present at this point and pandas >= 2.0 raises TypeError when asked
# for their median. The same train-derived medians are applied to test so
# that missing values there do not reach the models as NaN; median keys that
# are absent from test (e.g. 'price') are simply skipped by fillna.
numeric_medians = train.median(numeric_only=True)
train.fillna(numeric_medians, inplace=True)
test = test.fillna(numeric_medians)

# Encode categorical features
categorical_features = train.select_dtypes(include=['object']).columns
train = pd.get_dummies(train, columns=categorical_features, drop_first=True)
# Encode test WITHOUT drop_first: which level counts as "first" depends on
# the categories present in each frame, so dropping independently could
# remove a different level than in train. Keeping every level here and
# aligning to train's columns below makes train the single source of truth.
test = pd.get_dummies(test, columns=categorical_features)

# Ensure the test set has exactly the train feature columns, in train order:
# levels unseen in test become all-zero columns, levels unseen in train (and
# train's dropped baseline level) are discarded.
feature_columns = train.columns.drop('price')
test = test.reindex(columns=feature_columns, fill_value=0)

# Normalize/Standardize numerical features
from sklearn.preprocessing import StandardScaler

# Columns that are numeric but must NOT be standardized:
#   - 'price' is the target;
#   - 'id' is an identifier that is written verbatim to the submission file,
#     so scaling it would replace the real ids with z-scores;
#   - 'year' is used in raw calendar units further down (2024 - year) to
#     derive the car age, which is meaningless on a standardized value.
# errors='ignore' keeps this working if one of them is absent or non-numeric.
unscaled_columns = ['price', 'id', 'year']
numerical_features = (
    train.select_dtypes(include=[np.number])
    .columns
    .drop(unscaled_columns, errors='ignore')
)

# Fit on train only and reuse the fitted statistics for test.
scaler = StandardScaler()
train[numerical_features] = scaler.fit_transform(train[numerical_features])
test[numerical_features] = scaler.transform(test[numerical_features])

# Step 3: Feature Engineering
# Derive the car age from the 'year' column, then discard 'year' since the
# new column carries the same information. Both frames get the same treatment.
REFERENCE_YEAR = 2024
for frame in (train, test):
    frame['car_age'] = REFERENCE_YEAR - frame['year']
for frame in (train, test):
    frame.drop(columns=['year'], inplace=True)

# Step 4: Model Development
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

# Separate the target from the predictors ('id' is not used as a feature),
# then hold out 20% of the rows for validation with a fixed seed.
y = train['price']
X = train.drop(columns=['price', 'id'])
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Train baseline models and evaluate using RMSE
def _fit_and_score(model):
    """Fit *model* on the training split and score it on the validation split.

    Reads the module-level X_train / y_train / X_val / y_val.

    Returns:
        A ``(predictions, rmse)`` pair for the validation split.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_val)
    return predictions, np.sqrt(mean_squared_error(y_val, predictions))


# The models are fitted in the same order as they are reported below.
# Linear Regression
lr = LinearRegression()
y_pred_lr, rmse_lr = _fit_and_score(lr)

# Decision Tree
dt = DecisionTreeRegressor()
y_pred_dt, rmse_dt = _fit_and_score(dt)

# Random Forest
rf = RandomForestRegressor()
y_pred_rf, rmse_rf = _fit_and_score(rf)

# Gradient Boosting
gb = GradientBoostingRegressor()
y_pred_gb, rmse_gb = _fit_and_score(gb)

# Report the validation RMSE of every baseline
print(f"Linear Regression RMSE: {rmse_lr}")
print(f"Decision Tree RMSE: {rmse_dt}")
print(f"Random Forest RMSE: {rmse_rf}")
print(f"Gradient Boosting RMSE: {rmse_gb}")

# Step 5: Hyperparameter Tuning (example: Random Forest)
from sklearn.model_selection import GridSearchCV

# Search every combination of forest size and tree depth with 3-fold
# cross-validation on the training split, ranked by negative MSE.
param_grid = {'n_estimators': [100, 200], 'max_depth': [3, 5, 7]}
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
)
best_rf = grid_search.fit(X_train, y_train).best_estimator_

# Score the selected forest on the held-out validation split
y_pred_best_rf = best_rf.predict(X_val)
rmse_best_rf = np.sqrt(mean_squared_error(y_val, y_pred_best_rf))
print(f"Best Random Forest RMSE: {rmse_best_rf}")

# Step 6: Advanced Models (example: XGBoost)
import xgboost as xgb

# Default-configured XGBoost regressor, fitted on the training split and
# scored on the validation split with the same RMSE metric as the baselines.
xgb_model = xgb.XGBRegressor()
xgb_model.fit(X_train, y_train)

y_pred_xgb = xgb_model.predict(X_val)
mse_xgb = mean_squared_error(y_val, y_pred_xgb)
rmse_xgb = np.sqrt(mse_xgb)
print(f"XGBoost RMSE: {rmse_xgb}")

# Step 7: Model Ensemble (example: Stacking)
from sklearn.ensemble import StackingRegressor

# Base learners: the tuned random forest and the XGBoost model. Their
# predictions are combined by a linear-regression meta-learner.
estimators = [('rf', best_rf), ('xgb', xgb_model)]
stacking_model = StackingRegressor(
    estimators=estimators,
    final_estimator=LinearRegression(),
)
stacking_model.fit(X_train, y_train)

# Validation RMSE of the ensemble
y_pred_stacking = stacking_model.predict(X_val)
mse_stacking = mean_squared_error(y_val, y_pred_stacking)
rmse_stacking = np.sqrt(mse_stacking)
print(f"Stacking Model RMSE: {rmse_stacking}")

# Step 8: Final Model Selection
# Assuming stacking_model is the best
final_model = stacking_model

# Step 9: Prediction and Submission
# Generate predictions for the test set.
# Select exactly the columns the model was trained on (X.columns), in the
# same order, instead of "everything except id". On a first run the two are
# identical, but once test['price'] exists (e.g. when this notebook cell is
# re-run) dropping only 'id' would feed the previous predictions back in as
# an extra feature and break the feature-name check at predict time.
X_test = test[X.columns]
test['price'] = final_model.predict(X_test)

# Prepare the submission file: one row per test id with its predicted price
submission = test[['id', 'price']]
submission.to_csv('/content/drive/My Drive/path/to/submission.csv', index=False)

# Step 10: Documentation and Reporting
# Create a report of the process and results (example: in a separate markdown file or notebook cell)
