# Regression of Used Car Prices

### Overview
Welcome to the 2024 Kaggle Playground Series! We plan to continue in the spirit of previous playgrounds, providing interesting an approachable datasets for our community to practice their machine learning skills, and anticipate a competition each month.

### Your Goal: 
The goal of this competition is to predict the price of used cars based on various attributes.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

%matplotlib inline
warnings.filterwarnings('ignore')

## Data
#### Loading Data

In [3]:
# training data
train = pd.read_csv('train.csv')
print(f"Shape: {train.shape}")
train.head()

Shape: (188533, 13)


Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,0,MINI,Cooper S Base,2007,213000,Gasoline,172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel,A/T,Yellow,Gray,None reported,Yes,4200
1,1,Lincoln,LS V8,2002,143250,Gasoline,252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel,A/T,Silver,Beige,At least 1 accident or damage reported,Yes,4999
2,2,Chevrolet,Silverado 2500 LT,2002,136731,E85 Flex Fuel,320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...,A/T,Blue,Gray,None reported,Yes,13900
3,3,Genesis,G90 5.0 Ultimate,2017,19500,Gasoline,420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,Black,Black,None reported,Yes,45000
4,4,Mercedes-Benz,Metris Base,2021,7388,Gasoline,208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,7-Speed A/T,Black,Beige,None reported,Yes,97500


In [4]:
# check the availabe columns
train.columns

Index(['id', 'brand', 'model', 'model_year', 'milage', 'fuel_type', 'engine',
       'transmission', 'ext_col', 'int_col', 'accident', 'clean_title',
       'price'],
      dtype='object')

In [5]:
# testing data
test = pd.read_csv('test.csv')
print(f"Shape: {test.shape}")
test.head()

Shape: (125690, 12)


Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title
0,188533,Land,Rover LR2 Base,2015,98000,Gasoline,240.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,6-Speed A/T,White,Beige,None reported,Yes
1,188534,Land,Rover Defender SE,2020,9142,Hybrid,395.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,8-Speed A/T,Silver,Black,None reported,Yes
2,188535,Ford,Expedition Limited,2022,28121,Gasoline,3.5L V6 24V PDI DOHC Twin Turbo,10-Speed Automatic,White,Ebony,None reported,
3,188536,Audi,A6 2.0T Sport,2016,61258,Gasoline,2.0 Liter TFSI,Automatic,Silician Yellow,Black,None reported,
4,188537,Audi,A6 2.0T Premium Plus,2018,59000,Gasoline,252.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,A/T,Gray,Black,None reported,Yes


### About data
#### 1. train data

In [11]:
# train information
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188533 entries, 0 to 188532
Data columns (total 13 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   id            188533 non-null  int64 
 1   brand         188533 non-null  object
 2   model         188533 non-null  object
 3   model_year    188533 non-null  int64 
 4   milage        188533 non-null  int64 
 5   fuel_type     183450 non-null  object
 6   engine        188533 non-null  object
 7   transmission  188533 non-null  object
 8   ext_col       188533 non-null  object
 9   int_col       188533 non-null  object
 10  accident      186081 non-null  object
 11  clean_title   167114 non-null  object
 12  price         188533 non-null  int64 
dtypes: int64(4), object(9)
memory usage: 18.7+ MB


In [13]:
# statistical information
train.describe()

Unnamed: 0,id,model_year,milage,price
count,188533.0,188533.0,188533.0,188533.0
mean,94266.0,2015.829998,65705.295174,43878.02
std,54424.933488,5.660967,49798.158076,78819.52
min,0.0,1974.0,100.0,2000.0
25%,47133.0,2013.0,24115.0,17000.0
50%,94266.0,2017.0,57785.0,30825.0
75%,141399.0,2020.0,95400.0,49900.0
max,188532.0,2024.0,405000.0,2954083.0


#### 2. test data

In [16]:
# test information
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125690 entries, 0 to 125689
Data columns (total 12 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   id            125690 non-null  int64 
 1   brand         125690 non-null  object
 2   model         125690 non-null  object
 3   model_year    125690 non-null  int64 
 4   milage        125690 non-null  int64 
 5   fuel_type     122307 non-null  object
 6   engine        125690 non-null  object
 7   transmission  125690 non-null  object
 8   ext_col       125690 non-null  object
 9   int_col       125690 non-null  object
 10  accident      124058 non-null  object
 11  clean_title   111451 non-null  object
dtypes: int64(3), object(9)
memory usage: 11.5+ MB


In [18]:
# statistical information
test.describe()

Unnamed: 0,id,model_year,milage
count,125690.0,125690.0,125690.0
mean,251377.5,2015.797526,66042.58151
std,36283.722005,5.673797,50223.858435
min,188533.0,1974.0,100.0
25%,219955.25,2013.0,24500.0
50%,251377.5,2017.0,57500.0
75%,282799.75,2020.0,95798.0
max,314222.0,2024.0,405000.0


### Data Cleaning
#### Missing Values

In [21]:
# function to handle missing values
def missing_values(df):
    # filling in missing values for numerical columns
    for column in df.select_dtypes(include=['number']).columns:
        if df[column].isnull().any():
            median_value = df[column].median()
            df[column].fillna(median_value, inplace=True)

    # filling in missing values for categorical columns
    for column in df.select_dtypes(include=['object']).columns:
        if df[column].isnull().any():
            mode_value = df[column].mode()[0]
            df[column].fillna(mode_value, inplace=True)

    return df

##### Train

In [24]:
# check for missing values
train.isnull().sum()

id                  0
brand               0
model               0
model_year          0
milage              0
fuel_type        5083
engine              0
transmission        0
ext_col             0
int_col             0
accident         2452
clean_title     21419
price               0
dtype: int64

In [26]:
# fill in missing values
train = missing_values(train)

In [27]:
# re-check for missing values
train.isnull().sum()

id              0
brand           0
model           0
model_year      0
milage          0
fuel_type       0
engine          0
transmission    0
ext_col         0
int_col         0
accident        0
clean_title     0
price           0
dtype: int64

##### Test

In [31]:
# check for missing values in test data
test.isnull().sum()

id                  0
brand               0
model               0
model_year          0
milage              0
fuel_type        3383
engine              0
transmission        0
ext_col             0
int_col             0
accident         1632
clean_title     14239
dtype: int64

In [33]:
# handle missing values
test = missing_values(test)

In [35]:
test.isnull().sum()

id              0
brand           0
model           0
model_year      0
milage          0
fuel_type       0
engine          0
transmission    0
ext_col         0
int_col         0
accident        0
clean_title     0
dtype: int64

#### Duplicates

In [38]:
## Train
# check for duplicates
duplicates = train.duplicated()

# view duplicate info
print(f"Number of duplicate rows: {duplicates.sum()}")

Number of duplicate rows: 0


In [40]:
# remove duplicates
train_cleaned = train.drop_duplicates(inplace=False)

In [41]:
## Test
# check for duplicates in test data
duplicates = test.duplicated()

# view duplicate info
print(f"Number of duplicate rows: {duplicates.sum()}")

Number of duplicate rows: 0


In [44]:
# remove duplicates
test_cleaned = test.drop_duplicates(inplace=False)

#### Log Transformations

In [None]:
from scipy.stats import skew

In [None]:
# check skewness of price and milage
print(f"Skewness of price: {skew(train_cleaned['price'])}")
print(f"Skewness of milage: {skew(train_cleaned['milage'])}")

In [None]:
# apply log transformation to price and milage
train_cleaned['price_log'] = np.log1p(train_cleaned['price'])  # log1p handles zero and small values
train_cleaned['milage_log'] = np.log1p(train_cleaned['milage'])

# apply log transformation to test milage
test_cleaned['milage_log'] = np.log1p(test_cleaned['milage'])

In [None]:
train_cleaned.head()

### Correlation Analysis

In [None]:
# correlation threshold
correlation_threshold = 0.5

##### Encode categorical columns

In [None]:
# copy train data
train_copy = train.copy()

# initialize label encoder
le = LabelEncoder()

# loop through columns and apply label encoding to object
for column in train_copy.select_dtypes(include=['object']).columns:
    train_copy[column] = le.fit_transform(train_copy[column].astype(str))

In [None]:
# correlation matrix for all columns
correlation_matrix = train_copy.corr()

# upper triangular matrix
upper_tri = correlation_matrix.where(np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool))

In [None]:
# iterate over the columns and rows to find correlations greater than or equal to threshold
for column in upper_tri.columns:
    for row in upper_tri.index:
        correlation_value = upper_tri.loc[row, column]
        if abs(correlation_value) >= correlation_threshold:
            print(f"Correlation between {row} and {column}: {correlation_value:.4f}")

### Feature Engineering

In [46]:
# import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

# import encoding library
from sklearn.preprocessing import LabelEncoder

In [48]:
# function to perform feature analysis
def feature_engineering_and_importance(df, is_test=False):
    ## Feature Engineering
    df_encoded = df.copy()

    # engine Power (Horsepower)
    # df_encoded['horsepower'] = df_encoded['engine'].str.extract(r'(\d+\.?\d*)HP').astype(float)

    # label encoding
    label_encoders = {}
    for col in df_encoded.select_dtypes(include=['object']).columns:
        le = LabelEncoder()
        df_encoded[col] = le.fit_transform(df_encoded[col].fillna('Unknown'))
        label_encoders[col] = le

    # creating new features
    # car age
    df_encoded['car_age'] = 2024 - df_encoded['model_year']

    # mileage per Year
    df_encoded['mileage_per_year'] = df_encoded['milage'] / df_encoded['car_age']
    df_encoded['mileage_per_year'].replace([np.inf, -np.inf], 0, inplace=True)  # handle division by zero

    # drop irrelevant columns
    df_encoded.drop(['id', 'model_year'], axis=1, inplace=True)

    # return test dataframe and performing the above
    if is_test:
        return df_encoded    
    
    # split dataset into features and target
    X = df_encoded.drop(columns=['price'])
    y = df_encoded['price']

    # split dataset into train and test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # train a model to assess feature importance
    model = RandomForestRegressor(random_state=42)
    model.fit(X_train, y_train)


    
    ## Feature Importance
    importance = model.feature_importances_
    feature_importance_df = pd.DataFrame({
        'Feature': X_train.columns,
        'Importance': importance
    }).sort_values(by='Importance', ascending=False)

    # prediction and evaluation
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)

    # display results
    print(f"Model RMSE: {rmse}\n")
    print("Feature Importance: ")
    print(feature_importance_df)
    print()    

    return feature_importance_df, df_encoded, X_train, X_test, y_train, y_test

In [50]:
# using feature analysis function
feature_importance, train_cleaned, X_train, X_test, y_train, y_test = feature_engineering_and_importance(train_cleaned)

Model RMSE: 73499.72816268813

Feature Importance: 
             Feature  Importance
2             milage    0.248320
11  mileage_per_year    0.204314
1              model    0.109004
4             engine    0.101177
6            ext_col    0.093748
7            int_col    0.065107
10           car_age    0.063416
5       transmission    0.060187
0              brand    0.036831
3          fuel_type    0.009149
8           accident    0.008747
9        clean_title    0.000000



In [51]:
# train_cleaned = train_cleaned.drop(columns=['horsepower'])

In [52]:
train_cleaned.isnull().sum()

brand               0
model               0
milage              0
fuel_type           0
engine              0
transmission        0
ext_col             0
int_col             0
accident            0
clean_title         0
price               0
car_age             0
mileage_per_year    0
dtype: int64

### Model Experimentation

In [54]:
# importing some necessary libraries for tunning
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
import optuna
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

In [55]:
# function for baseline models
def baseline_models(df):
    X = df.drop(columns=['price'])
    y = df['price']

    # train-test split
    # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # feature scaling
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    results = {}
    models = {}

    # model 1: Linear Regression
    try:
        lr = LinearRegression()
        # lr.fit(X_train, y_train)
        # lr_pred = lr.predict(X_test)
        # results['Linear Regression'] = np.sqrt(mean_squared_error(y_test, lr_pred))
        # models['Linear Regression'] = lr

        lr_cv_scores = cross_val_score(lr, X_scaled, y, cv=5, scoring='neg_root_mean_squared_error')
        results['Linear Regression'] = -np.mean(lr_cv_scores)  # Convert to positive RMSE
        models['Linear Regression'] = lr
    except Exception as error:
        print(f"An error occurred with Linear Regression: {error}")

    # model 2: Random Forest
    try:
        rf = RandomForestRegressor()
        # rf.fit(X_train, y_train)
        # rf_pred = rf.predict(X_test)
        # results['Random Forest'] = np.sqrt(mean_squared_error(y_test, rf_pred))
        # models['Random Forest'] = rf

        rf_cv_scores = cross_val_score(rf, X_scaled, y, cv=5, scoring='neg_root_mean_squared_error')
        results['Random Forest'] = -np.mean(rf_cv_scores)  # Convert to positive RMSE
        models['Random Forest'] = rf
    except Exception as error:
        print(f"An error occurred with Random Forest: {error}")

    # model 3: Elastic Net
    try:
        en = ElasticNet()
        # en.fit(X_train, y_train)
        # en_pred = en.predict(X_test)
        # results['Elastic Net'] = np.sqrt(mean_squared_error(y_test, en_pred))
        # models['Elastic Net'] = en

        en_cv_scores = cross_val_score(en, X_scaled, y, cv=5, scoring='neg_root_mean_squared_error')
        results['Elastic Net'] = -np.mean(en_cv_scores)  # Convert to positive RMSE
        models['Elastic Net'] = en
    except Exception as error:
        print(f"An error occurred with Elastic Net: {error}")

    # results
    for model, rmse in results.items():
        print(f"{model} RMSE: {rmse:.2f}")

    # return the best model based on rmse
    best_model_name = min(results, key=results.get)
    best_model_instance = models[best_model_name]
    return best_model_name, best_model_instance, results

In [56]:
# function for baseline models
def complex_models(df):
    X = df.drop(columns=['price'])
    y = df['price']

    # train-test split
    # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # feature Scaling
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    results = {}
    models = {}

    # model 4: Gradient Boosting
    try:
        gb = GradientBoostingRegressor()
        # gb.fit(X_train, y_train)
        # gb_pred = gb.predict(X_test)
        # results['Gradient Boosting'] = np.sqrt(mean_squared_error(y_test, gb_pred))
        # models['Gradient Boosting'] = gb

        gb_cv_scores = cross_val_score(gb, X_scaled, y, cv=5, scoring='neg_root_mean_squared_error')
        results['Gradient Boosting'] = -np.mean(gb_cv_scores)  # Convert to positive RMSE
        models['Gradient Boosting'] = gb
    except Exception as error:
        print(f"An error occurred with Gradient Boosting: {error}")

    # model 5: XGBoost
    try:
        xgb = XGBRegressor()
        # xgb.fit(X_train, y_train)
        # xgb_pred = xgb.predict(X_test)
        # results['XGBoost'] = np.sqrt(mean_squared_error(y_test, xgb_pred))
        # models['XGBoost'] = xgb

        xgb_cv_scores = cross_val_score(xgb, X_scaled, y, cv=5, scoring='neg_root_mean_squared_error')
        results['XGBoost'] = -np.mean(xgb_cv_scores)  # Convert to positive RMSE
        models['XGBoost'] = xgb
    except Exception as error:
        print(f"An error occurred with XGB: {error}")

     # model 6: LightGBM
    try:
        lgb = LGBMRegressor(learning_rate=0.1, n_estimators=100)
        # lgb.fit(X_train, y_train)
        # lgb_pred = lgb.predict(X_test)
        # results['LightGBM'] = np.sqrt(mean_squared_error(y_test, lgb_pred))
        # models['LightGBM'] = lgb

        lgb_cv_scores = cross_val_score(lgb, X_scaled, y, cv=5, scoring='neg_root_mean_squared_error')
        results['LightGBM'] = -np.mean(lgb_cv_scores)  # Convert to positive RMSE
        models['LightGBM'] = lgb
    except Exception as error:
        print(f"An error occurred with LightGBM: {error}")

    # print results
    for model, rmse in results.items():
        print(f"{model} RMSE: {rmse:.2f}")

    # return best model based on RMSE
    best_model_name = min(results, key=results.get)
    best_model_instance = models[best_model_name]
    return best_model_name, best_model_instance, results

In [None]:
# function to compare baseline and complex models
def baseline_vs_complex(baseline_results, complex_results):
    print("\nComparison of Baseline and Complex Models:")
    combined_results = {**baseline_results[1], **complex_results[1]}
    
    for model, rmse in combined_results.items():
        print(f"{model} RMSE: {rmse:.2f}")

    best_model = min(combined_results, key=combined_results.get)
    print(f"\nBest Model: {best_model} with RMSE: {combined_results[best_model]:.2f}")
    return best_model

In [57]:
# baseline model results
baseline_model, baseline_model_instance, baseline_result = baseline_models(train_cleaned)

Linear Regression RMSE: 74699.55
Random Forest RMSE: 77916.09
Elastic Net RMSE: 74879.00


In [58]:
print(f"Baseline model: {baseline_model}\n")
print(f"Baseline model instance: {baseline_model_instance}\n")
print(f"Baseline model result: {baseline_result}\n")

Baseline model: Linear Regression

Baseline model instance: LinearRegression()

Baseline model result: {'Linear Regression': 74699.55302524555, 'Random Forest': 77916.09088362011, 'Elastic Net': 74879.00182217477}



In [59]:
# complex model results
complex_model, complex_model_instance, complex_result = complex_models(train_cleaned)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007255 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1535
[LightGBM] [Info] Number of data points in the train set: 150826, number of used features: 11
[LightGBM] [Info] Start training from score 43859.547492
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005503 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1537
[LightGBM] [Info] Number of data points in the train set: 150826, number of used features: 11
[LightGBM] [Info] Start training from score 43829.197671
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005700 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is

In [60]:
print(f"Complex model: {complex_model}\n")
print(f"Complex model instance: {complex_model_instance}\n")
print(f"Complex model result: {complex_result}\n")

Complex model: Gradient Boosting

Complex model instance: GradientBoostingRegressor()

Complex model result: {'Gradient Boosting': 72982.47272820731, 'XGBoost': 75714.83029075306, 'LightGBM': 72986.03045340911}



In [None]:
# find the best model
best_model = baseline_vs_complex(baseline_result, complex_result)

### Hyperparameter Tunning

In [None]:
# Function for hyperparameter tuning
def tuning_function(df, best_model):
    X = df.drop(columns=['price'])
    y = df['price']
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    if best_model == 'Linear Regression':
        model = LinearRegression()
        param_grid = {}
    elif best_model == 'Random Forest':
        model = RandomForestRegressor()
        param_grid = {'n_estimators': [50, 100], 'max_depth': [None, 10, 20]}
    elif best_model == 'Gradient Boosting':
        model = GradientBoostingRegressor()
        param_grid = {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1]}
    elif best_model == 'XGBoost':
        model = XGBRegressor()
        param_grid = {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1]}
    elif best_model == 'LightGBM':
        model = LGBMRegressor()
        param_grid = {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1]}
    else:
        print("Model not recognized.")
        return

    # Grid Search
    grid_search = GridSearchCV(model, param_grid, cv=5)
    grid_search.fit(X_train, y_train)
    print(f"Grid Search Best Parameters: {grid_search.best_params_} with RMSE: {np.sqrt(-cross_val_score(grid_search.best_estimator_, X_test, y_test, cv=5, scoring='neg_mean_squared_error')).mean()}\n\n\n\n")

    # Randomized Search
    random_search = RandomizedSearchCV(model, param_grid, n_iter=10, cv=5)
    random_search.fit(X_train, y_train)
    print(f"Randomized Search Best Parameters: {random_search.best_params_} with RMSE: {np.sqrt(-cross_val_score(random_search.best_estimator_, X_test, y_test, cv=5, scoring='neg_mean_squared_error')).mean()}\n\n\n\n")

    # Optuna
    def objective(trial):
        param = {
            'n_estimators': trial.suggest_int('n_estimators', 50, 200),
            'learning_rate': trial.suggest_uniform('learning_rate', 0.05, 0.2),  # Restricted range
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'num_leaves': trial.suggest_int('num_leaves', 20, 40)
        }
        model.set_params(**param)
        model.fit(X_train, y_train)
        return mean_squared_error(y_test, model.predict(X_test))

    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=50)
    print(f"Optuna Best Parameters: {study.best_params} with RMSE: {study.best_value}")

In [None]:
best_model = 'LightGBM'

In [None]:
# hyperparameter tunning
tuning_function(train_cleaned, best_model)

### Submission

In [71]:
# using feature analysis function on test data
test_cleaned = feature_engineering_and_importance(test_cleaned, is_test=True)

In [73]:
print(f"Train shape: {train_cleaned.shape}")
print(f"Test shape: {test_cleaned.shape}")

Train shape: (188533, 13)
Test shape: (125690, 12)


In [75]:
# Add missing columns from train to test and fill with 0
missing_cols = set(train_cleaned.columns) - set(test_cleaned.columns)
for col in missing_cols:
    test_cleaned[col] = 0

In [77]:
# Ensure the test data has the same features as the train data used by the best model
test_cleaned = test_cleaned.reindex(columns=train_cleaned.columns, fill_value=0)

# remove price column from test data
test_cleaned_features = test_cleaned.drop('price', axis=1)

In [79]:
# make predictions using the best model
predictions = complex_model_instance.predict(test_cleaned_features)

NotFittedError: This GradientBoostingRegressor instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [None]:
sub = pd.read_csv('sample_submission.csv')

In [None]:
sub['price'] = predictions

In [None]:
sub.to_csv('final_submission.csv', index=False)

## Improving Score

### Stacking model

In [None]:
from sklearn.model_selection import KFold
from sklearn.base import clone
import lightgbm as lgb

In [None]:
# function to create stacking features from base models
def get_stacking_data(X, y, base_models, n_folds=5):
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
    out_of_fold_predictions = np.zeros((X.shape[0], len(base_models)))
    
    for i, model in enumerate(base_models):
        for train_index, val_index in kf.split(X, y):
            X_train, X_val = X.iloc[train_index], X.iloc[val_index]
            y_train, y_val = y.iloc[train_index], y.iloc[val_index]
            
            # Clone the model to avoid affecting the original instance
            cloned_model = clone(model)
            cloned_model.fit(X_train, y_train)
            out_of_fold_predictions[val_index, i] = cloned_model.predict(X_val)
    
    return out_of_fold_predictions

In [None]:
# define base models
base_models = [
    LinearRegression(),
    RandomForestRegressor(n_estimators=100, random_state=42),
    lgb.LGBMRegressor(n_estimators=100, learning_rate=0.1)
]

In [None]:
# Generate out-of-fold predictions (meta-features)
X_stack_train = get_stacking_data(X_train, y_train, base_models)

In [None]:
# Train meta-model (you can use any regressor; Ridge or Lasso can be good choices)
meta_model = LinearRegression()
meta_model.fit(X_stack_train, y_train)

In [None]:
# Generate stacking features for test data
X_stack_test = np.column_stack([model.fit(X_train, y_train).predict(X_test) for model in base_models])

In [None]:
# Make final predictions
final_predictions = meta_model.predict(X_stack_test)

In [None]:
predictions = meta_model.predict(test_cleaned)

### Catboost

In [None]:
from catboost import CatBoostRegressor

In [None]:
# identify categorical columns
categorical_cols = ['brand', 'model', 'fuel_type', 'transmission', 'ext_col', 'int_col', 'accident']

In [None]:
# initialize and train CatBoost
catboost_model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    cat_features=categorical_cols,  # Pass the categorical columns directly
    random_seed=42
)

In [None]:
# Train the model (ensure the data has no missing values or duplicates)
catboost_model.fit(X_train, y_train, eval_set=(X_val, y_val), verbose=100)

In [None]:
# Make predictions on the test set
catboost_predictions = catboost_model.predict(X_test)

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.base import clone
import numpy as np
import pandas as pd

# Function to create stacking features from base models
def get_stacking_data(X, y, base_models, n_folds=5):
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
    out_of_fold_predictions = np.zeros((X.shape[0], len(base_models)))
    
    for i, model in enumerate(base_models):
        for train_index, val_index in kf.split(X, y):
            X_train, X_val = X.iloc[train_index], X.iloc[val_index]
            y_train, y_val = y.iloc[train_index], y.iloc[val_index]
            
            # Clone the model to avoid affecting the original instance
            cloned_model = clone(model)
            cloned_model.fit(X_train, y_train)
            out_of_fold_predictions[val_index, i] = cloned_model.predict(X_val)
    
    return out_of_fold_predictions

# Define base models
base_models = [
    LinearRegression(),
    RandomForestRegressor(n_estimators=100, random_state=42),
    lgb.LGBMRegressor(n_estimators=100, learning_rate=0.1)
]

# Generate out-of-fold predictions (meta-features)
X_stack_train = get_stacking_data(X_train, y_train, base_models)

# Train meta-model (you can use any regressor; Ridge or Lasso can be good choices)
meta_model = LinearRegression()
meta_model.fit(X_stack_train, y_train)

# Generate stacking features for test data
X_stack_test = np.column_stack([model.fit(X_train, y_train).predict(X_test) for model in base_models])

# Make final predictions
final_predictions = meta_model.predict(X_stack_test)

# Calculate RMSE on validation set (if you have one)
# Assuming X_val and y_val are your validation data
X_val_stack = get_stacking_data(X_val, y_val, base_models)
final_val_predictions = meta_model.predict(X_val_stack)
rmse = mean_squared_error(y_val, final_val_predictions, squared=False)
print(f'Validation RMSE: {rmse:.2f}')

# Create submission DataFrame
submission_df = pd.DataFrame({
    'id': test_cleaned['id'],  # Replace with the correct identifier column
    'price': final_predictions
})

# Save submission file
submission_df.to_csv('submission.csv', index=False)
