Let's cleanse data and construct the model base on the results of EDA.

First, I will create a baseline model. It puts data right into the model, which is LinearRegression.

In [44]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
# from sklearn.preprocessing import TargetEncoder
from sklearn.metrics import mean_squared_error

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [61]:
def cleanse_data(train_data, test_data, test_has_label=True):

    # 1. delete 'brand'
    train_data = train_data.drop(['brand'], axis=1)
    test_data = test_data.drop(['brand'], axis=1)

    # 3. drop ext_col, int_col
    train_data = train_data.drop(['ext_col', 'int_col'], axis=1)
    test_data = test_data.drop(['ext_col', 'int_col'], axis=1)

    # 4. drop id
    train_data = train_data.drop(['id'], axis=1)
    test_data = test_data.drop(['id'], axis=1)

    ''' only 8 features ('model_year', 'milage', 'fuel_type', 'transmission', 'accident', 'clean_title', 'model', 'engine') left '''

    # 5. for the same model value group, delete all rows that has high price (above 90% quantile).
    quantiles = train_data.groupby('model')['price'].quantile(0.8)
    train_data = train_data[train_data.apply(lambda row: row['price'] <= quantiles[row['model']], axis=1)]
    
    # 6. Filter out rows that has price higher than 500K
    train_data = train_data[train_data.apply(lambda row: row['price'] < 500000, axis=1)]

    # 7. convert 'year' to int, and calculate year passed from min(year)
    base = min(train_data['model_year'])
    train_data['model_year'] = (train_data['model_year'].astype(int) - base)**2
    test_data['model_year']  = (test_data['model_year'] .astype(int) - base)**2
    
    # 9. Aggregate categorical features in transmission
    col_names = [
    'A/T',
    'Transmission w/Dual Shift Mode',
    '7-Speed A/T',
    '8-Speed A/T',
    '10-Speed Automatic',
    '1-Speed A/T',
    '6-Speed A/T',
    '10-Speed A/T',
    '9-Speed A/T',
    '8-Speed Automatic',
    '9-Speed Automatic',
    '5-Speed A/T',
    'Automatic',
    '7-Speed Automatic with Auto-Shift',
    'CVT Transmission',
    '5-Speed M/T',
    'M/T',
    '6-Speed M/T',
    '6-Speed Automatic',
    '4-Speed Automatic',
    '7-Speed M/T',
    '2-Speed A/T',
    '1-Speed Automatic',
    'Automatic CVT',
    '4-Speed A/T',
    '6-Speed Manual',
    'Transmission Overdrive Switch',
    '8-Speed Automatic with Auto-Shift',
    '7-Speed Manual',
    '7-Speed Automatic',
    '9-Speed Automatic with Auto-Shift',
    '6-Speed Automatic with Auto-Shift',
    '6-Speed Electronically Controlled Automatic with O',
    'F',
    'CVT-F',
    '8-Speed Manual',
    'Manual',
    '-',
    '2',
    '6 Speed At/Mt',
    '5-Speed Automatic',
    '2-Speed Automatic',
    '8-SPEED A/T',
    '7-Speed',
    'Variable',
    'Single-Speed Fixed Gear',
    '8-SPEED AT',
    '10-Speed Automatic with Overdrive',
    '7-Speed DCT Automatic',
    'SCHEDULED FOR OR IN PRODUCTION',
    '6-Speed',
    '6 Speed Mt'
]
    
    col_names_override = [
    'A/T',
    'Transmission w/Dual Shift Mode',
    '7-Speed A/T',
    '8-Speed A/T',
    '10-Speed A/T',
    '1-Speed A/T',
    '6-Speed A/T',
    '10-Speed A/T',
    '9-Speed A/T',
    '8-Speed A/T',
    '9-Speed A/T',
    '5-Speed A/T',
    'A/T',
    '7-Speed A/T with Auto-Shift',
    'CVT Transmission',
    '5-Speed M/T',
    'M/T',
    '6-Speed M/T',
    '6-Speed A/T',
    '4-Speed A/T',
    '7-Speed M/T',
    '2-Speed A/T',
    '1-Speed A/T',
    'A/T CVT',
    '4-Speed A/T',
    '6-SpeedM/T',
    'Transmission Overdrive Switch',
    '8-Speed A/T with Auto-Shift',
    '7-Speed M/T',
    '7-Speed A/T',
    '9-Speed A/T with Auto-Shift',
    '6-Speed A/T with Auto-Shift',
    '6-Speed Electronically Controlled A/T with O',
    '-',
    'CVT-F',
    '8-Speed M/T',
    'M/T',
    '-',
    '-',
    '6 Speed At/Mt',
    '5-Speed A/T',
    '2-Speed A/T',
    '8-Speed A/T',
    '7-Speed',
    'Variable',
    'Single-Speed Fixed Gear',
    '8-Speed AT',
    '10-Speed A/T with Overdrive',
    '7-Speed DCT A/T',
    '-',
    '6-Speed',
    '6 Speed M/T'
]
    
    trans_dict = dict(zip(col_names, col_names_override))
    train_data['transmission'] = train_data['transmission'].replace(trans_dict)
    test_data['transmission']  = test_data['transmission'].replace(trans_dict)

    # Get all numeric values
    # I will add categoricals later
    train_numeric = train_data.loc[:, ('milage', 'model_year', 'price')]
    if test_has_label:
        test_numeric = test_data.loc[:, ('milage', 'model_year', 'price')]
    else:
        test_numeric = test_data.loc[:, ('milage', 'model_year')]

    categoricals = ['model', 'accident', 'clean_title']#['transmission', 'fuel_type', 'accident', 'clean_title', 'model', 'engine']


    for i in categoricals:

        train_data_part = train_data[i]
        test_data_part = test_data[i]

        train_data_part[i] = train_data[i].fillna('blank')
        test_data_part[i]  = test_data[i].fillna('blank')

        # set values of train_data[i] to 'blank', 1% of them
        train_data_part.loc[train_data.sample(frac=0.01).index] = 'blank'

        # Get all unique values in train_data
        train_value_set = set(train_data_part[i])

        # get_dummies for train's model
        train_data_encoded = pd.get_dummies(train_data_part[i], prefix=i)

        # Remove categories that are not in train, switch them to 'blank'
        test_data[i] = test_data[i].apply(lambda x: 'blank' if x not in train_value_set else x)
        test_data_encoded = pd.get_dummies(test_data[i], prefix=i)

        # Add columns for train set
        for j in train_data_encoded.columns:
            if j not in test_data_encoded.columns:
                test_data_encoded[j] = 0
        
        # Add blank columns if it was not created
        if i+'_blank' not in train_data_encoded.columns:
            train_data_encoded[i+'_blank'] = 0
        if i+'_blank' not in test_data_encoded.columns:
            test_data_encoded[i+'_blank'] = 0
        
        train_numeric = pd.concat((train_numeric, train_data_encoded), axis=1)
        test_numeric  = pd.concat((test_numeric,  test_data_encoded), axis=1)

    # sort columns
    train_numeric = train_numeric[train_numeric.columns.sort_values()]
    test_numeric  = test_numeric[test_numeric.columns.sort_values()]
    

    return train_numeric, test_numeric

In [None]:
train_ = pd.DataFrame({'brand':['a', 'b'], 'id':['a', 'b'], 'ext_col':['a', 'b'], 'int_col':['a', 'b'], 'model_year': [2000, 2001], 'milage': [100, 200], 'model': ['A', 'B'], 'transmission': ['A/T', 'M/T'], 'fuel_type': ['A', 'B'], 'accident': ['None', 'reported'], 'clean_title': ['No', 'Yes'], 'engine': ['Turbo', 'Normal'], 'price': [100, 200]})
test_  = pd.DataFrame({'brand':['a', 'b'], 'id':['a', 'b'], 'ext_col':['a', 'b'], 'int_col':['a', 'b'], 'model_year': [2000, 2001], 'milage': [100, 200], 'model': ['A', 'C'], 'transmission': ['A/T', '8-M/T'], 'fuel_type': ['A', 'C'], 'accident': ['None', 'reported'], 'clean_title': ['No', 'Yes'], 'engine': ['Turbo', 'Normal'], 'price': [100, 200]})

print(train_.to_string())
print(test_.to_string())
train_data, test_data = cleanse_data(train_, test_)
print(train_data.to_string())
print(test_data.to_string())

In [None]:
import unittest

class TestOneHotEncoding:

    def test_basic_functionality(self):
        train_df = pd.DataFrame({'model_engine': ['A', 'B'], 'other_feature': [1, 2]})
        test_df = pd.DataFrame({'model_engine': ['A', 'C'], 'other_feature': [3, 4]})
        _, test_result = process_dataframes(train_df, test_df)
        expected_columns = ['model_engine', 'other_feature', 'engine_A', 'engine_B', 'model_engine_blank']
        print(list(test_result.columns), expected_columns)
        print(test_result['model_engine_blank'].iloc[1], 1)

    def test_no_missing_engines(self):
        train_df = pd.DataFrame({'model_engine': ['A', 'B'], 'other_feature': [1, 2]})
        test_df = pd.DataFrame({'model_engine': ['A', 'B'], 'other_feature': [3, 4]})
        _, test_result = process_dataframes(train_df, test_df)
        print((test_result['model_engine_blank'] == 0).all())

    def test_all_missing_engines(self):
        train_df = pd.DataFrame({'model_engine': ['A', 'B'], 'other_feature': [1, 2]})
        test_df = pd.DataFrame({'model_engine': ['C', 'D'], 'other_feature': [3, 4]})
        _, test_result = process_dataframes(train_df, test_df)
        print((test_result['model_engine_blank'] == 1).all())

    def test_empty_dataframes(self):
        train_df = pd.DataFrame(columns=['model_engine', 'other_feature'])
        test_df = pd.DataFrame(columns=['model_engine', 'other_feature'])
        train_result, test_result = process_dataframes(train_df, test_df)
        print(train_result.empty)
        print(test_result.empty)

    def test_identical_train_test(self):
        train_df = pd.DataFrame({'model_engine': ['A', 'B'], 'other_feature': [1, 2]})
        test_df = train_df.copy()
        _, test_result = process_dataframes(train_df, test_df)
        print((test_result['model_engine_blank'] == 0).all())

if __name__ == '__main__':
    module = TestOneHotEncoding()
    module.test_basic_functionality()
    module.test_no_missing_engines()
    module.test_identical_train_test()
    module.test_all_missing_engines()
    module.test_empty_dataframes()

In [62]:
# sns.boxplot(train['price'])
# plt.show()

# train_test_split, test is 0.2
test_set = train[int(len(train)*0.8):]
train_set = train[:int(len(train)*0.8)]
train_data, test_data = cleanse_data(train_set, test_set)
# summarize dataframe

print(train_data.shape)
print(test_data.shape)
# print(data.head())

# sns.boxplot(data['price'])
# plt.show()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data_part[i] = train_data[i].fillna('blank')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data_part[i]  = test_data[i].fillna('blank')


(120619, 1905)
(37707, 1905)


In [59]:

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import Ridge

# add sklearn Ridge
model = Ridge()

model.fit(train_data.drop('price', axis=1), train_data['price'])

print('R^2:', model.score(train_data.drop('price', axis=1), train_data['price']))

print('RMSE:', np.sqrt(mean_squared_error(test_data['price'], model.predict(test_data.drop('price', axis=1)))))

R^2: 0.6250189379723649
RMSE: 76311.76777769299


In [50]:

print('RMSE:', np.sqrt(mean_squared_error(test_data['price'], model.predict(test_data.drop('price', axis=1)))))

RMSE: 76053.46282248195


In [56]:
import pandas as pd

# Sample DataFrames
train_df = pd.DataFrame({
    'model_engine': ['A', 'B', 'C', 'A', 'B'],
    'other_feature': [1, 2, 3, 4, 5]
})

test_df = pd.DataFrame({
    'model_engine': ['A', 'C', 'D', 'E'],
    'other_feature': [6, 7, 8, 9]
})

# Get unique model_engine values from train set
train_engines = set(train_df['model_engine'])

# One-hot encode the 'model_engine' feature in the training set
train_encoded = pd.get_dummies(train_df['model_engine'], prefix='engine')

# Add the encoded columns to the train_df
train_df = pd.concat([train_df, train_encoded], axis=1)

# One-hot encode the 'model_engine' feature in the test set
test_encoded = pd.get_dummies(test_df['model_engine'], prefix='engine')

# Add the encoded columns to the test_df
test_df = pd.concat([test_df, test_encoded], axis=1)

# Add 'model_engine_blank' column to test_df
test_df['model_engine_blank'] = test_df['model_engine'].apply(lambda x: 1 if x not in train_engines else 0)

# Ensure test_df has the same columns as train_df (excluding 'model_engine_blank')
for column in train_encoded.columns:
    if column not in test_df.columns:
        test_df[column] = 0

# Add 'model_engine_blank' column to train_df (all zeros)
train_df['model_engine_blank'] = 0

# Ensure the order of columns in test_df matches train_df
test_df = test_df[train_df.columns]

# Fill missing columns in test_df with zeros
test_df.fillna(0, inplace=True)

# Display the final DataFrames
print("Train DataFrame:")
print(train_df)
print("\nTest DataFrame:")
print(test_df)

Train DataFrame:
  model_engine  other_feature  engine_A  engine_B  engine_C  \
0            A              1         1         0         0   
1            B              2         0         1         0   
2            C              3         0         0         1   
3            A              4         1         0         0   
4            B              5         0         1         0   

   model_engine_blank  
0                   0  
1                   0  
2                   0  
3                   0  
4                   0  

Test DataFrame:
  model_engine  other_feature  engine_A  engine_B  engine_C  \
0            A              6         1         0         0   
1            C              7         0         0         1   
2            D              8         0         0         0   
3            E              9         0         0         0   

   model_engine_blank  
0                   0  
1                   0  
2                   1  
3                   1  
