In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Load the raw training data
data = pd.read_csv('./data/train.csv')

# Fill missing values for 'fuel_type', 'accident', and 'clean_title'.
# The result is assigned back rather than calling fillna(inplace=True) on a
# column selection: that form is chained assignment, which pandas 2.x warns
# about and which no longer updates the parent frame under Copy-on-Write.
missing_value_labels = {
    'fuel_type': 'Unknown',
    'accident': 'No information',
    'clean_title': 'Unknown',
}
data = data.fillna(value=missing_value_labels)

# Extract horsepower from 'engine' column.
# expand=False returns a Series for the single capture group; rows whose
# 'engine' text does not match the pattern are left as NaN.
data['horsepower'] = data['engine'].str.extract(r'(\d+)\.0HP', expand=False).astype(float)

# Drop the original 'engine' column
data = data.drop(columns=['engine'])

# One-hot encoding for categorical variables.
# drop_first=True omits the first level of each column, which then acts as
# the baseline category.
categorical_columns = ['brand', 'model', 'fuel_type', 'transmission', 'ext_col', 'int_col', 'accident', 'clean_title']
data_encoded = pd.get_dummies(data, columns=categorical_columns, drop_first=True)



In [2]:
# Preview the first five rows of the cleaned (not yet encoded) training frame
data.head(n=5)

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,transmission,ext_col,int_col,accident,clean_title,price,horsepower
0,0,MINI,Cooper S Base,2007,213000,Gasoline,A/T,Yellow,Gray,None reported,Yes,4200,172.0
1,1,Lincoln,LS V8,2002,143250,Gasoline,A/T,Silver,Beige,At least 1 accident or damage reported,Yes,4999,252.0
2,2,Chevrolet,Silverado 2500 LT,2002,136731,E85 Flex Fuel,A/T,Blue,Gray,None reported,Yes,13900,320.0
3,3,Genesis,G90 5.0 Ultimate,2017,19500,Gasoline,Transmission w/Dual Shift Mode,Black,Black,None reported,Yes,45000,420.0
4,4,Mercedes-Benz,Metris Base,2021,7388,Gasoline,7-Speed A/T,Black,Beige,None reported,Yes,97500,208.0


In [3]:
# Drop the 'id' column so it is not used as a feature.
# errors='ignore' keeps this cell re-runnable: on a second run 'id' is already
# gone and a plain drop would raise KeyError.
data_encoded = data_encoded.drop(columns=['id'], errors='ignore')
data_encoded.head()


Unnamed: 0,model_year,milage,price,horsepower,brand_Alfa,brand_Aston,brand_Audi,brand_BMW,brand_Bentley,brand_Bugatti,...,int_col_WHITE,int_col_Walnut,int_col_Whisper Beige,int_col_White,int_col_White / Brown,int_col_Yellow,int_col_–,accident_No information,accident_None reported,clean_title_Yes
0,2007,213000,4200,172.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
1,2002,143250,4999,252.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,2002,136731,13900,320.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
3,2017,19500,45000,420.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
4,2021,7388,97500,208.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1


In [4]:
# Replace any remaining NaNs with 0 (this fills values, it does not just check
# for them), e.g. horsepower entries the regex could not extract
data_encoded = data_encoded.fillna(0)

# Summary statistics of price, to eyeball outliers
print(data['price'].describe())

# Define features (X) and target (y)
X = data_encoded.drop(columns=['price'])
y = data_encoded['price']

# Split data into training and validation sets (fixed seed for reproducibility)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features; the scaler is fit on the training split only and then
# reused unchanged for the validation split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Train the Linear Regression model
model = LinearRegression()
model.fit(X_train_scaled, y_train)

# Make predictions and calculate RMSE.
# RMSE is computed as sqrt(MSE) instead of mean_squared_error(..., squared=False):
# the `squared` argument was deprecated in scikit-learn 1.4 and removed in 1.6.
y_pred = model.predict(X_val_scaled)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))

# NOTE(review): the recorded run reports an RMSE of ~6.8e15 while the price
# std is ~7.9e4. Presumably the unregularized fit is unstable on the very wide
# one-hot matrix -- consider a regularized model (e.g. Ridge) or fewer dummy
# columns; confirm before relying on this baseline.
print(f"RMSE: {rmse}")
print("Predictions:", y_pred[:5])

count    1.885330e+05
mean     4.387802e+04
std      7.881952e+04
min      2.000000e+03
25%      1.700000e+04
50%      3.082500e+04
75%      4.990000e+04
max      2.954083e+06
Name: price, dtype: float64
RMSE: 6839919148626233.0
Predictions: [52849.55363408 60516.57519244 55227.81827795 25450.02986584
 33195.54463764]


- Validation RMSE without log-transforming `price`: 6839919148626233.0

In [5]:
# Load the test data
test_data = pd.read_csv('./data/test.csv')

# Apply the same preprocessing steps to the test data.
# The filled frame is assigned back instead of calling fillna(inplace=True) on
# a column selection (chained assignment; it does not update the parent frame
# under pandas Copy-on-Write).
test_data = test_data.fillna(value={
    'fuel_type': 'Unknown',
    'accident': 'No information',
    'clean_title': 'Unknown',
})

# Extract horsepower from 'engine' column in test set (NaN where no match)
test_data['horsepower'] = test_data['engine'].str.extract(r'(\d+)\.0HP', expand=False).astype(float)
test_data = test_data.drop(columns=['engine'])

# One-hot encode categorical variables in the test set.
# drop_first is deliberately NOT used here: it would drop the test set's own
# first level of each column, which need not be the level that was dropped
# from the training data. Every level is encoded and the frame is then aligned
# to the training feature columns below.
test_encoded = pd.get_dummies(test_data, columns=categorical_columns)

# Align to the training feature columns in one step: dummy columns missing
# from the test set are added as 0, columns the model never saw (including
# 'id') are discarded, and the column order matches the training matrix.
feature_columns = data_encoded.columns.drop('price')
test_encoded = test_encoded.reindex(columns=feature_columns, fill_value=0)

# Replace remaining NaNs with 0, as was done for the training features
test_encoded = test_encoded.fillna(0)

# Scale the test features with the scaler fitted on the training split
X_test_scaled = scaler.transform(test_encoded)




Submission file created: submission.csv


In [12]:
# Preview the first five rows of the encoded, column-aligned test features
test_encoded.head(n=5)

Unnamed: 0,model_year,milage,horsepower,brand_Alfa,brand_Aston,brand_Audi,brand_BMW,brand_Bentley,brand_Bugatti,brand_Buick,...,int_col_WHITE,int_col_Walnut,int_col_Whisper Beige,int_col_White,int_col_White / Brown,int_col_Yellow,int_col_–,accident_No information,accident_None reported,clean_title_Yes
0,2015,98000,240.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
1,2020,9142,395.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
2,2022,28121,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,2016,61258,0.0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,2018,59000,252.0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1


In [None]:
# Predict a price for every row of the scaled test matrix
test_predictions = model.predict(X_test_scaled)

# Build the submission table: the test 'id' column plus the predicted 'price'
submission = test_data[['id']].assign(price=test_predictions)

# Write the table to disk without the index column
submission.to_csv('submission.csv', index=False)

print("Submission file created: submission.csv")

In [6]:
# Preview the first five rows of the cleaned (not encoded) test frame
test_data.head(n=5)

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,transmission,ext_col,int_col,accident,clean_title,horsepower
0,188533,Land,Rover LR2 Base,2015,98000,Gasoline,6-Speed A/T,White,Beige,None reported,Yes,240.0
1,188534,Land,Rover Defender SE,2020,9142,Hybrid,8-Speed A/T,Silver,Black,None reported,Yes,395.0
2,188535,Ford,Expedition Limited,2022,28121,Gasoline,10-Speed Automatic,White,Ebony,None reported,Unknown,
3,188536,Audi,A6 2.0T Sport,2016,61258,Gasoline,Automatic,Silician Yellow,Black,None reported,Unknown,
4,188537,Audi,A6 2.0T Premium Plus,2018,59000,Gasoline,A/T,Gray,Black,None reported,Yes,252.0
