<a href="https://colab.research.google.com/github/daisysong76/AI--Machine--learning/blob/main/kagglex_skill_assessment_challenge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Step 1: Setup and Data Loading
from google.colab import drive
import pandas as pd
import numpy as np

In [None]:

# Attach Google Drive at /content/drive; the dataset CSVs are read from
# underneath this mount point in the following cell.
drive.mount(mountpoint='/content/drive')


Mounted at /content/drive


In [None]:
# Read the three competition CSVs from Drive in one pass:
# training data, test data, and the sample submission file.
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/My Drive/KaggleX/train.csv',
        '/content/drive/My Drive/KaggleX/test.csv',
        '/content/drive/My Drive/KaggleX/sample_submission.csv',
    )
)

In [None]:
# Preview the top five rows of the training frame.
train.head(n=5)

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,0,Ford,F-150 Lariat,2018,74349,Gasoline,375.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,10-Speed A/T,Blue,Gray,None reported,Yes,11000
1,1,BMW,335 i,2007,80000,Gasoline,300.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,6-Speed M/T,Black,Black,None reported,Yes,8250
2,2,Jaguar,XF Luxury,2009,91491,Gasoline,300.0HP 4.2L 8 Cylinder Engine Gasoline Fuel,6-Speed A/T,Purple,Beige,None reported,Yes,15000
3,3,BMW,X7 xDrive40i,2022,2437,Hybrid,335.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,Transmission w/Dual Shift Mode,Gray,Brown,None reported,Yes,63500
4,4,Pontiac,Firebird Base,2001,111000,Gasoline,200.0HP 3.8L V6 Cylinder Engine Gasoline Fuel,A/T,White,Black,None reported,Yes,7850


In [None]:
# Step 2: Data Preprocessing
# Count the missing entries in every column of the training frame.
train.isna().sum()

id              0
brand           0
model           0
model_year      0
milage          0
fuel_type       0
engine          0
transmission    0
ext_col         0
int_col         0
accident        0
clean_title     0
price           0
dtype: int64

In [None]:
# Handle missing values: fill numeric columns with their column median.
# `numeric_only=True` is required because the frame still contains string
# columns (brand, model, ...) at this point; without it pandas tries to take
# the median of those too and raises
# "TypeError: could not convert string to float".
# The result is reassigned instead of using `inplace=True` so the cell can be
# re-run safely.
train = train.fillna(train.median(numeric_only=True))

TypeError: could not convert string to float: 'Ford'

In [None]:
# Encode categorical features as one-hot (dummy) columns.
categorical_features = train.select_dtypes(include=['object']).columns

# Pin each categorical column in BOTH frames to the sorted set of levels seen
# in train. Encoding the two frames independently with `drop_first=True` lets
# each frame drop its *own* first level, so the reference category (and the
# resulting column set) can differ between train and test. With a shared
# category list, get_dummies emits the same columns for both frames and drops
# the same reference level. Test values never seen in train become NaN and
# therefore encode as all-zero rows.
for col in categorical_features:
    levels = pd.Categorical(train[col]).categories
    train[col] = pd.Categorical(train[col], categories=levels)
    test[col] = pd.Categorical(test[col], categories=levels)

train = pd.get_dummies(train, columns=categorical_features, drop_first=True)
test = pd.get_dummies(test, columns=categorical_features, drop_first=True)

In [None]:
# Ensure the test set has the same columns as the train set (minus the target).
# `missing_cols` is kept for inspection: columns present in train but not test.
missing_cols = set(train.columns) - set(test.columns)

# A single reindex adds every missing column (filled with 0), drops columns
# that exist only in test, and puts the columns in train's order. This avoids
# inserting columns one at a time in a loop, which fragments the DataFrame and
# is slow when the one-hot encoding produces many columns.
test = test.reindex(columns=train.columns.drop('price'), fill_value=0)

In [None]:
# Normalize/Standardize numerical features
from sklearn.preprocessing import StandardScaler

# Scale every numeric column except the target ('price') and the row
# identifier ('id'). 'id' must stay untouched: it is written verbatim into the
# submission file later, and standardizing it would replace the real ids with
# z-scores.
numerical_features = train.select_dtypes(include=[np.number]).columns.drop(['price', 'id'])

# Fit the scaler on train only, then apply the same transform to test.
scaler = StandardScaler()
train[numerical_features] = scaler.fit_transform(train[numerical_features])
test[numerical_features] = scaler.transform(test[numerical_features])


In [None]:
# Step 3: Feature Engineering
# Create new features (example: car age)
# The year column in this dataset is named 'model_year'; there is no 'year'
# column, so indexing or dropping 'year' raises KeyError.
# NOTE(review): the scaling step earlier in the notebook standardizes
# 'model_year', so the value computed here is an affine transform of the
# standardized year rather than an age in calendar years. Consider deriving
# car_age before scaling.
REFERENCE_YEAR = 2024
train['car_age'] = REFERENCE_YEAR - train['model_year']
test['car_age'] = REFERENCE_YEAR - test['model_year']

# Drop 'model_year' now that car_age carries the same information.
train = train.drop(columns=['model_year'])
test = test.drop(columns=['model_year'])

In [None]:
# Step 4: Model Development
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

In [None]:
# Build the target vector and the predictor matrix (everything except
# 'price' and 'id'), then hold out 20% of the rows for validation using a
# fixed seed.
y = train['price']
X = train.drop(['price', 'id'], axis=1)
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Baseline models, each scored by RMSE on the validation split.
# Baseline 1: linear regression.
lr = LinearRegression().fit(X_train, y_train)
y_pred_lr = lr.predict(X_val)
rmse_lr = np.sqrt(mean_squared_error(y_true=y_val, y_pred=y_pred_lr))

In [None]:
# Decision Tree
# A fixed random_state (same seed as the train/validation split) makes the
# fitted tree, and therefore the reported RMSE, reproducible across runs.
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_val)
rmse_dt = np.sqrt(mean_squared_error(y_val, y_pred_dt))

In [None]:
# Random Forest
# A fixed random_state (same seed as the train/validation split) makes the
# bootstrap samples and feature subsampling, and therefore the reported RMSE,
# reproducible across runs.
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_val)
rmse_rf = np.sqrt(mean_squared_error(y_val, y_pred_rf))

In [None]:
# Gradient Boosting
# A fixed random_state (same seed as the train/validation split) keeps the
# fitted ensemble, and therefore the reported RMSE, reproducible across runs.
gb = GradientBoostingRegressor(random_state=42)
gb.fit(X_train, y_train)
y_pred_gb = gb.predict(X_val)
rmse_gb = np.sqrt(mean_squared_error(y_val, y_pred_gb))

In [None]:
# Report the validation RMSE of the four baseline models, one per line.
print(
    f"Linear Regression RMSE: {rmse_lr}",
    f"Decision Tree RMSE: {rmse_dt}",
    f"Random Forest RMSE: {rmse_rf}",
    f"Gradient Boosting RMSE: {rmse_gb}",
    sep="\n",
)

In [None]:
# Step 5: Hyperparameter Tuning (example: Random Forest)
from sklearn.model_selection import GridSearchCV

# Search over forest size and tree depth with 3-fold cross-validation,
# scoring each candidate by negative mean squared error.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[3, 5, 7],
)
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
)
grid_search.fit(X_train, y_train)

# Keep the estimator that scored best in the search.
best_rf = grid_search.best_estimator_

In [None]:
# Score the tuned random forest on the validation split and report its RMSE.
y_pred_best_rf = best_rf.predict(X_val)
rmse_best_rf = np.sqrt(mean_squared_error(y_true=y_val, y_pred=y_pred_best_rf))
print(f"Best Random Forest RMSE: {rmse_best_rf}")

In [None]:
# Step 6: Advanced Models (example: XGBoost)
import xgboost as xgb

# Fit an XGBoost regressor with default settings, then score it on the
# validation split.
xgb_model = xgb.XGBRegressor()
xgb_model.fit(X=X_train, y=y_train)
y_pred_xgb = xgb_model.predict(X_val)
rmse_xgb = np.sqrt(mean_squared_error(y_true=y_val, y_pred=y_pred_xgb))
print(f"XGBoost RMSE: {rmse_xgb}")

In [None]:
# Step 7: Model Ensemble (example: Stacking)
from sklearn.ensemble import StackingRegressor

# Base learners: the tuned random forest and the XGBoost model.
# A linear regression combines their predictions.
estimators = [('rf', best_rf), ('xgb', xgb_model)]
stacking_model = StackingRegressor(
    final_estimator=LinearRegression(),
    estimators=estimators,
)
stacking_model.fit(X_train, y_train)

# Score the ensemble on the validation split.
y_pred_stacking = stacking_model.predict(X_val)
rmse_stacking = np.sqrt(mean_squared_error(y_true=y_val, y_pred=y_pred_stacking))
print(f"Stacking Model RMSE: {rmse_stacking}")

In [None]:
# Step 8: Final Model Selection
# Choose the fitted model with the lowest validation RMSE instead of assuming
# the stacking ensemble is the best. Each entry pairs a fitted estimator with
# the RMSE computed for it in the cells above.
candidate_models = {
    'Linear Regression': (lr, rmse_lr),
    'Decision Tree': (dt, rmse_dt),
    'Random Forest': (rf, rmse_rf),
    'Gradient Boosting': (gb, rmse_gb),
    'Best Random Forest': (best_rf, rmse_best_rf),
    'XGBoost': (xgb_model, rmse_xgb),
    'Stacking Model': (stacking_model, rmse_stacking),
}
best_model_name = min(candidate_models, key=lambda name: candidate_models[name][1])
final_model, final_rmse = candidate_models[best_model_name]
print(f"Selected final model: {best_model_name} (validation RMSE: {final_rmse})")

In [None]:

# Step 9: Prediction and Submission
# Generate predictions for the test set.
# 'price' is dropped as well as 'id' (ignored if absent) so the cell can be
# re-run: after the first run `test` already holds a 'price' column, which
# would otherwise be passed to the model as an extra feature.
X_test = test.drop(columns=['id', 'price'], errors='ignore')
test['price'] = final_model.predict(X_test)

In [None]:
# Prepare the submission file: one row per test id with its predicted price.
submission = test[['id', 'price']]

# Write next to the input CSVs. The previous target contained the placeholder
# directory 'path/to', which to_csv cannot write into unless it already exists.
SUBMISSION_PATH = '/content/drive/My Drive/KaggleX/submission.csv'
submission.to_csv(SUBMISSION_PATH, index=False)

# Step 10: Documentation and Reporting
# Create a report of the process and results (example: in a separate markdown file or notebook cell)
