In [2]:
import pandas as pd
import numpy as np
import seaborn as sns

from data_cleaning import update_fuel_type

test = "../data/raw/test.csv"
train = "../data/raw/train.csv"
sample_submission = "../data/raw/sample_submission.csv"


test = pd.read_csv(test)
train = pd.read_csv(train)
sample_submission = pd.read_csv(sample_submission)

# Clean and Wrangle
## Test dataset

In [4]:
test.head()

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title
0,188533,Land,Rover LR2 Base,2015,98000,Gasoline,240.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,6-Speed A/T,White,Beige,None reported,Yes
1,188534,Land,Rover Defender SE,2020,9142,Hybrid,395.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,8-Speed A/T,Silver,Black,None reported,Yes
2,188535,Ford,Expedition Limited,2022,28121,Gasoline,3.5L V6 24V PDI DOHC Twin Turbo,10-Speed Automatic,White,Ebony,None reported,
3,188536,Audi,A6 2.0T Sport,2016,61258,Gasoline,2.0 Liter TFSI,Automatic,Silician Yellow,Black,None reported,
4,188537,Audi,A6 2.0T Premium Plus,2018,59000,Gasoline,252.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,A/T,Gray,Black,None reported,Yes


In [5]:
test.shape

(125690, 12)

In [6]:
test.dtypes

id               int64
brand           object
model           object
model_year       int64
milage           int64
fuel_type       object
engine          object
transmission    object
ext_col         object
int_col         object
accident        object
clean_title     object
dtype: object

In [7]:
test.isnull().sum()

id                  0
brand               0
model               0
model_year          0
milage              0
fuel_type        3383
engine              0
transmission        0
ext_col             0
int_col             0
accident         1632
clean_title     14239
dtype: int64

### fuel_type

In [9]:
# Call the function: 'update_fuel_type' to fill in the null values in 'fuel_type' 
test = update_fuel_type(test)

In [10]:
test.isnull().sum()

id                  0
brand               0
model               0
model_year          0
milage              0
fuel_type           0
engine              0
transmission        0
ext_col             0
int_col             0
accident         1632
clean_title     14239
dtype: int64

In [11]:
test['fuel_type'].unique()

array(['Gasoline', 'Hybrid', 'Diesel', 'E85 Flex Fuel', 'Electric',
       'Plug-In Hybrid', '–', 'not supported'], dtype=object)

In [12]:
# Replace '-' and 'not supported' entries with NaN
test['fuel_type'] = test['fuel_type'].replace(['–', 'not supported'], np.nan)

# Group 'Plug-In Hybrid' with 'Hybrid' and 'E85 Flex Fuel' with 'Gasoline'
test['fuel_type'] = test['fuel_type'].replace({
    'Plug-In Hybrid': 'Hybrid',
    'E85 Flex Fuel': 'Gasoline'
})

In [13]:
test.isnull().sum()

id                  0
brand               0
model               0
model_year          0
milage              0
fuel_type         552
engine              0
transmission        0
ext_col             0
int_col             0
accident         1632
clean_title     14239
dtype: int64

In [14]:
# Call the function: 'update_fuel_type' to fill in the null values in 'fuel_type' 
test = update_fuel_type(test)

In [15]:
test.isnull().sum()

id                  0
brand               0
model               0
model_year          0
milage              0
fuel_type         524
engine              0
transmission        0
ext_col             0
int_col             0
accident         1632
clean_title     14239
dtype: int64

In [16]:
test[test['fuel_type'].isnull()].head()

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title
704,189237,Dodge,Challenger SRT8 392,2010,33700,,–,A/T,White,Gray,None reported,Yes
892,189425,Dodge,Challenger R/T Scat Pack,2018,38000,,–,8-Speed A/T,White,Black,None reported,Yes
991,189524,Porsche,Boxster S,1993,97496,,–,A/T,Silver,Black,None reported,
1008,189541,Toyota,Land Cruiser Base,1994,97000,,–,A/T,White,Gray,At least 1 accident or damage reported,Yes
1011,189544,Chevrolet,Sonic LT,2018,8940,,–,6-Speed A/T,White,–,None reported,Yes


In [17]:
# Drop rows where 'fuel_type' is NaN because the 'engine' is not defined
test.dropna(subset=['fuel_type'], inplace=True)

In [18]:
test.isnull().sum()

id                  0
brand               0
model               0
model_year          0
milage              0
fuel_type           0
engine              0
transmission        0
ext_col             0
int_col             0
accident         1627
clean_title     14227
dtype: int64

In [19]:
test.shape

(125166, 12)

In [20]:
test['fuel_type'].unique()

array(['Gasoline', 'Hybrid', 'Diesel', 'Electric'], dtype=object)

#### One-Hot Encoding 

In [22]:
# One-Hot Encoding for 'fuel_type'
test = pd.get_dummies(test, columns=['fuel_type'], drop_first=False)

In [23]:
test.head()

Unnamed: 0,id,brand,model,model_year,milage,engine,transmission,ext_col,int_col,accident,clean_title,fuel_type_Diesel,fuel_type_Electric,fuel_type_Gasoline,fuel_type_Hybrid
0,188533,Land,Rover LR2 Base,2015,98000,240.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,6-Speed A/T,White,Beige,None reported,Yes,False,False,True,False
1,188534,Land,Rover Defender SE,2020,9142,395.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,8-Speed A/T,Silver,Black,None reported,Yes,False,False,False,True
2,188535,Ford,Expedition Limited,2022,28121,3.5L V6 24V PDI DOHC Twin Turbo,10-Speed Automatic,White,Ebony,None reported,,False,False,True,False
3,188536,Audi,A6 2.0T Sport,2016,61258,2.0 Liter TFSI,Automatic,Silician Yellow,Black,None reported,,False,False,True,False
4,188537,Audi,A6 2.0T Premium Plus,2018,59000,252.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,A/T,Gray,Black,None reported,Yes,False,False,True,False


### transmission

In [25]:
test['transmission'].unique()

array(['6-Speed A/T', '8-Speed A/T', '10-Speed Automatic', 'Automatic',
       'A/T', 'Transmission w/Dual Shift Mode', '6-Speed Automatic',
       '7-Speed A/T', '5-Speed M/T', '8-Speed Automatic', '9-Speed A/T',
       '10-Speed A/T', '9-Speed Automatic', '5-Speed Automatic',
       'Automatic CVT', '6-Speed M/T', 'CVT Transmission',
       'Transmission Overdrive Switch', 'M/T', '7-Speed Automatic',
       '5-Speed A/T', '1-Speed A/T', '7-Speed Automatic with Auto-Shift',
       '4-Speed A/T', '9-Speed Automatic with Auto-Shift',
       '8-Speed Automatic with Auto-Shift', 'Variable',
       '10-Speed Automatic with Overdrive', '1-Speed Automatic',
       '7-Speed M/T', '6-Speed Manual', '2-Speed Automatic', 'F',
       '2-Speed A/T', '4-Speed Automatic',
       '6-Speed Automatic with Auto-Shift', '2', '6 Speed Mt',
       '7-Speed Manual', '–', '8-Speed Manual', 'Manual',
       '6-Speed Electronically Controlled Automatic with O',
       '8-SPEED A/T', '7-Speed', '7-Speed DCT Aut

In [26]:
# Simplify the transmission types: map the different values to a smaller set of categories (manual, automatic, CVT, and, Other)

# Create a dictionary for mapping transmission types
transmission_mapping = {
    'Automatic': ['A/T', 'Automatic', '6-Speed A/T', '8-Speed A/T', '10-Speed Automatic', 
                  'Transmission w/Dual Shift Mode', '6-Speed Automatic', '7-Speed A/T', 
                  '5-Speed Automatic', '8-Speed Automatic', '9-Speed A/T', '10-Speed A/T', 
                  '9-Speed Automatic', '5-Speed A/T', '1-Speed A/T', '4-Speed A/T',
                  '9-Speed Automatic with Auto-Shift', '8-Speed Automatic with Auto-Shift', 
                  '10-Speed Automatic with Overdrive', '1-Speed Automatic', '2-Speed Automatic', 
                  '7-Speed Automatic', '7-Speed Automatic with Auto-Shift', '6-Speed Automatic with Auto-Shift',
                  'Single-Speed Fixed Gear', '8-SPEED A/T', '7-Speed DCT Automatic', '2-Speed A/T', '4-Speed Automatic'],
    
    'Manual': ['M/T', 'Manual', '5-Speed M/T', '6-Speed M/T', '7-Speed M/T', 
               '6-Speed Manual', '7-Speed Manual', '8-Speed Manual', '6 Speed Mt', '7-Speed'],
    
    'CVT': ['Automatic CVT', 'CVT Transmission', 'Variable', 'CVT-F'],
    
    'Other': ['F', '2', '–', 'SCHEDULED FOR OR IN PRODUCTION', 'Transmission Overdrive Switch']
}

# Function to map transmission types based on the dictionary
def map_transmission(trans):
    for key, values in transmission_mapping.items():
        if trans in values:
            return key
    return 'Other'

# Apply the mapping function to the 'transmission' column
test['transmission'] = test['transmission'].apply(map_transmission)

In [27]:
test.head()

Unnamed: 0,id,brand,model,model_year,milage,engine,transmission,ext_col,int_col,accident,clean_title,fuel_type_Diesel,fuel_type_Electric,fuel_type_Gasoline,fuel_type_Hybrid
0,188533,Land,Rover LR2 Base,2015,98000,240.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,Automatic,White,Beige,None reported,Yes,False,False,True,False
1,188534,Land,Rover Defender SE,2020,9142,395.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,Automatic,Silver,Black,None reported,Yes,False,False,False,True
2,188535,Ford,Expedition Limited,2022,28121,3.5L V6 24V PDI DOHC Twin Turbo,Automatic,White,Ebony,None reported,,False,False,True,False
3,188536,Audi,A6 2.0T Sport,2016,61258,2.0 Liter TFSI,Automatic,Silician Yellow,Black,None reported,,False,False,True,False
4,188537,Audi,A6 2.0T Premium Plus,2018,59000,252.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,Automatic,Gray,Black,None reported,Yes,False,False,True,False


#### One-Hot Encoding 

In [29]:
# One-Hot Encoding for 'transmission'
test = pd.get_dummies(test, columns=['transmission'], drop_first=False)


In [30]:
test.head()

Unnamed: 0,id,brand,model,model_year,milage,engine,ext_col,int_col,accident,clean_title,fuel_type_Diesel,fuel_type_Electric,fuel_type_Gasoline,fuel_type_Hybrid,transmission_Automatic,transmission_CVT,transmission_Manual,transmission_Other
0,188533,Land,Rover LR2 Base,2015,98000,240.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,White,Beige,None reported,Yes,False,False,True,False,True,False,False,False
1,188534,Land,Rover Defender SE,2020,9142,395.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,Silver,Black,None reported,Yes,False,False,False,True,True,False,False,False
2,188535,Ford,Expedition Limited,2022,28121,3.5L V6 24V PDI DOHC Twin Turbo,White,Ebony,None reported,,False,False,True,False,True,False,False,False
3,188536,Audi,A6 2.0T Sport,2016,61258,2.0 Liter TFSI,Silician Yellow,Black,None reported,,False,False,True,False,True,False,False,False
4,188537,Audi,A6 2.0T Premium Plus,2018,59000,252.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,Gray,Black,None reported,Yes,False,False,True,False,True,False,False,False


In [31]:
# Convert boolean columns (True/False) to integers (0/1)
boolean_columns = test.select_dtypes(include='bool').columns

# Convert True/False to 0/1 in these boolean columns
test[boolean_columns] = test[boolean_columns].astype(int)

In [32]:
test.head()

Unnamed: 0,id,brand,model,model_year,milage,engine,ext_col,int_col,accident,clean_title,fuel_type_Diesel,fuel_type_Electric,fuel_type_Gasoline,fuel_type_Hybrid,transmission_Automatic,transmission_CVT,transmission_Manual,transmission_Other
0,188533,Land,Rover LR2 Base,2015,98000,240.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,White,Beige,None reported,Yes,0,0,1,0,1,0,0,0
1,188534,Land,Rover Defender SE,2020,9142,395.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,Silver,Black,None reported,Yes,0,0,0,1,1,0,0,0
2,188535,Ford,Expedition Limited,2022,28121,3.5L V6 24V PDI DOHC Twin Turbo,White,Ebony,None reported,,0,0,1,0,1,0,0,0
3,188536,Audi,A6 2.0T Sport,2016,61258,2.0 Liter TFSI,Silician Yellow,Black,None reported,,0,0,1,0,1,0,0,0
4,188537,Audi,A6 2.0T Premium Plus,2018,59000,252.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,Gray,Black,None reported,Yes,0,0,1,0,1,0,0,0


## Train dataset

In [34]:
train.head()

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,0,MINI,Cooper S Base,2007,213000,Gasoline,172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel,A/T,Yellow,Gray,None reported,Yes,4200
1,1,Lincoln,LS V8,2002,143250,Gasoline,252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel,A/T,Silver,Beige,At least 1 accident or damage reported,Yes,4999
2,2,Chevrolet,Silverado 2500 LT,2002,136731,E85 Flex Fuel,320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...,A/T,Blue,Gray,None reported,Yes,13900
3,3,Genesis,G90 5.0 Ultimate,2017,19500,Gasoline,420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,Black,Black,None reported,Yes,45000
4,4,Mercedes-Benz,Metris Base,2021,7388,Gasoline,208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,7-Speed A/T,Black,Beige,None reported,Yes,97500


In [35]:
train.shape

(188533, 13)

In [36]:
train.dtypes

id               int64
brand           object
model           object
model_year       int64
milage           int64
fuel_type       object
engine          object
transmission    object
ext_col         object
int_col         object
accident        object
clean_title     object
price            int64
dtype: object

In [37]:
train.isnull().sum()

id                  0
brand               0
model               0
model_year          0
milage              0
fuel_type        5083
engine              0
transmission        0
ext_col             0
int_col             0
accident         2452
clean_title     21419
price               0
dtype: int64

### fuel_type

In [39]:
# Call the function: 'update_fuel_type' to fill in the null values in 'fuel_type'
train = update_fuel_type(train)

In [40]:
train.isnull().sum()

id                  0
brand               0
model               0
model_year          0
milage              0
fuel_type           0
engine              0
transmission        0
ext_col             0
int_col             0
accident         2452
clean_title     21419
price               0
dtype: int64

In [41]:
train['fuel_type'].unique()

array(['Gasoline', 'E85 Flex Fuel', 'Electric', 'Hybrid', 'Diesel',
       'Plug-In Hybrid', '–', 'not supported'], dtype=object)

In [42]:
# Replace '-' and 'not supported' entries with NaN
train['fuel_type'] = train['fuel_type'].replace(['–', 'not supported'], np.nan)

# Group 'Plug-In Hybrid' with 'Hybrid' and 'E85 Flex Fuel' with 'Gasoline'
train['fuel_type'] = train['fuel_type'].replace({
    'Plug-In Hybrid': 'Hybrid',
    'E85 Flex Fuel': 'Gasoline'
})

In [43]:
train.isnull().sum()

id                  0
brand               0
model               0
model_year          0
milage              0
fuel_type         796
engine              0
transmission        0
ext_col             0
int_col             0
accident         2452
clean_title     21419
price               0
dtype: int64

In [44]:
train[train['fuel_type'].isnull()].head()

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
532,532,Dodge,Challenger R/T Scat Pack,2021,35000,,–,8-Speed A/T,Blue,Gray,None reported,Yes,48000
1155,1155,Ford,Mustang GT Premium,2017,143600,,–,6-Speed M/T,White,Black,At least 1 accident or damage reported,Yes,31000
1307,1307,Toyota,Land Cruiser Base,1994,138033,,–,A/T,Green,Beige,None reported,Yes,11999
1362,1362,Ford,Mustang EcoBoost Premium,2016,43000,,–,A/T,Blue,Black,None reported,Yes,15500
1741,1741,Chevrolet,Protege DX,1993,94200,,–,A/T,Red,Gray,None reported,Yes,7599


In [45]:
# Drop rows where 'fuel_type' is NaN because the 'engine' is not defined
train.dropna(subset=['fuel_type'], inplace=True)

In [46]:
train.isnull().sum()

id                  0
brand               0
model               0
model_year          0
milage              0
fuel_type           0
engine              0
transmission        0
ext_col             0
int_col             0
accident         2435
clean_title     21393
price               0
dtype: int64

In [47]:
train.shape

(187737, 13)

In [48]:
train['fuel_type'].unique()

array(['Gasoline', 'Electric', 'Hybrid', 'Diesel'], dtype=object)

#### One-Hot Encoding 

In [50]:
# One-Hot Encoding for 'fuel_type'
train = pd.get_dummies(train, columns=['fuel_type'], drop_first=False)

In [51]:
train.head()

Unnamed: 0,id,brand,model,model_year,milage,engine,transmission,ext_col,int_col,accident,clean_title,price,fuel_type_Diesel,fuel_type_Electric,fuel_type_Gasoline,fuel_type_Hybrid
0,0,MINI,Cooper S Base,2007,213000,172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel,A/T,Yellow,Gray,None reported,Yes,4200,False,False,True,False
1,1,Lincoln,LS V8,2002,143250,252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel,A/T,Silver,Beige,At least 1 accident or damage reported,Yes,4999,False,False,True,False
2,2,Chevrolet,Silverado 2500 LT,2002,136731,320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...,A/T,Blue,Gray,None reported,Yes,13900,False,False,True,False
3,3,Genesis,G90 5.0 Ultimate,2017,19500,420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,Black,Black,None reported,Yes,45000,False,False,True,False
4,4,Mercedes-Benz,Metris Base,2021,7388,208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,7-Speed A/T,Black,Beige,None reported,Yes,97500,False,False,True,False


### transmission

In [103]:
train['transmission'].unique()

array(['A/T', 'Transmission w/Dual Shift Mode', '7-Speed A/T',
       '8-Speed A/T', '10-Speed Automatic', '1-Speed A/T', '6-Speed A/T',
       '10-Speed A/T', '9-Speed A/T', '8-Speed Automatic',
       '9-Speed Automatic', '5-Speed A/T', 'Automatic',
       '7-Speed Automatic with Auto-Shift', 'CVT Transmission',
       '5-Speed M/T', 'M/T', '6-Speed M/T', '6-Speed Automatic',
       '4-Speed Automatic', '7-Speed M/T', '2-Speed A/T',
       '1-Speed Automatic', 'Automatic CVT', '4-Speed A/T',
       '6-Speed Manual', 'Transmission Overdrive Switch',
       '8-Speed Automatic with Auto-Shift', '7-Speed Manual',
       '7-Speed Automatic', '9-Speed Automatic with Auto-Shift',
       '6-Speed Automatic with Auto-Shift',
       '6-Speed Electronically Controlled Automatic with O', 'F', 'CVT-F',
       '8-Speed Manual', 'Manual', '–', '2', '6 Speed At/Mt',
       '5-Speed Automatic', '2-Speed Automatic', '8-SPEED A/T', '7-Speed',
       'Variable', 'Single-Speed Fixed Gear', '8-SPEED AT',


In [105]:
# Create simplified dictionary for mapping transmission types
transmission_mapping = {
    'Automatic': ['A/T', 'Automatic', '6-Speed A/T', '8-Speed A/T', '10-Speed Automatic', 
                  'Transmission w/Dual Shift Mode', '6-Speed Automatic', '7-Speed A/T', 
                  '5-Speed Automatic', '8-Speed Automatic', '9-Speed A/T', '10-Speed A/T', 
                  '9-Speed Automatic', '5-Speed A/T', '1-Speed A/T', '4-Speed A/T',
                  '9-Speed Automatic with Auto-Shift', '8-Speed Automatic with Auto-Shift', 
                  '10-Speed Automatic with Overdrive', '1-Speed Automatic', '2-Speed Automatic', 
                  '7-Speed Automatic', '7-Speed Automatic with Auto-Shift', '6-Speed Automatic with Auto-Shift',
                  'Single-Speed Fixed Gear', '8-SPEED A/T', '7-Speed DCT Automatic', '2-Speed A/T', '4-Speed Automatic'],
    
    'Manual': ['M/T', 'Manual', '5-Speed M/T', '6-Speed M/T', '7-Speed M/T', 
               '6-Speed Manual', '7-Speed Manual', '8-Speed Manual', '6 Speed Mt', '7-Speed'],
    
    'CVT': ['Automatic CVT', 'CVT Transmission', 'Variable', 'CVT-F'],
    
    'Other': ['F', '2', '–', 'SCHEDULED FOR OR IN PRODUCTION', 'Transmission Overdrive Switch']
}

# Function to map transmission types based on the dictionary
def map_transmission(trans):
    for key, values in transmission_mapping.items():
        if trans in values:
            return key
    return 'Other'

# Apply the mapping function to the 'transmission' column in the train dataset
train['transmission'] = train['transmission'].apply(map_transmission)

In [107]:
train.head()

Unnamed: 0,id,brand,model,model_year,milage,engine,transmission,ext_col,int_col,accident,clean_title,price,fuel_type_Diesel,fuel_type_Electric,fuel_type_Gasoline,fuel_type_Hybrid
0,0,MINI,Cooper S Base,2007,213000,172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel,Automatic,Yellow,Gray,None reported,Yes,4200,0,0,1,0
1,1,Lincoln,LS V8,2002,143250,252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel,Automatic,Silver,Beige,At least 1 accident or damage reported,Yes,4999,0,0,1,0
2,2,Chevrolet,Silverado 2500 LT,2002,136731,320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...,Automatic,Blue,Gray,None reported,Yes,13900,0,0,1,0
3,3,Genesis,G90 5.0 Ultimate,2017,19500,420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,Automatic,Black,Black,None reported,Yes,45000,0,0,1,0
4,4,Mercedes-Benz,Metris Base,2021,7388,208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,Automatic,Black,Beige,None reported,Yes,97500,0,0,1,0


#### One-Hot Encoding 

In [111]:
# One-Hot Encoding for 'transmission' in the train dataset
train = pd.get_dummies(train, columns=['transmission'], drop_first=False)

In [113]:
train.head()

Unnamed: 0,id,brand,model,model_year,milage,engine,ext_col,int_col,accident,clean_title,price,fuel_type_Diesel,fuel_type_Electric,fuel_type_Gasoline,fuel_type_Hybrid,transmission_Automatic,transmission_CVT,transmission_Manual,transmission_Other
0,0,MINI,Cooper S Base,2007,213000,172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel,Yellow,Gray,None reported,Yes,4200,0,0,1,0,True,False,False,False
1,1,Lincoln,LS V8,2002,143250,252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel,Silver,Beige,At least 1 accident or damage reported,Yes,4999,0,0,1,0,True,False,False,False
2,2,Chevrolet,Silverado 2500 LT,2002,136731,320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...,Blue,Gray,None reported,Yes,13900,0,0,1,0,True,False,False,False
3,3,Genesis,G90 5.0 Ultimate,2017,19500,420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,Black,Black,None reported,Yes,45000,0,0,1,0,True,False,False,False
4,4,Mercedes-Benz,Metris Base,2021,7388,208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,Black,Beige,None reported,Yes,97500,0,0,1,0,True,False,False,False


In [119]:
# Convert boolean columns (True/False) to integers (0/1)
boolean_columns = train.select_dtypes(include='bool').columns

# Convert True/False to 0/1 in these boolean columns
train[boolean_columns] = train[boolean_columns].astype(int)

In [121]:
train.head()

Unnamed: 0,id,brand,model,model_year,milage,engine,ext_col,int_col,accident,clean_title,price,fuel_type_Diesel,fuel_type_Electric,fuel_type_Gasoline,fuel_type_Hybrid,transmission_Automatic,transmission_CVT,transmission_Manual,transmission_Other
0,0,MINI,Cooper S Base,2007,213000,172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel,Yellow,Gray,None reported,Yes,4200,0,0,1,0,1,0,0,0
1,1,Lincoln,LS V8,2002,143250,252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel,Silver,Beige,At least 1 accident or damage reported,Yes,4999,0,0,1,0,1,0,0,0
2,2,Chevrolet,Silverado 2500 LT,2002,136731,320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...,Blue,Gray,None reported,Yes,13900,0,0,1,0,1,0,0,0
3,3,Genesis,G90 5.0 Ultimate,2017,19500,420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,Black,Black,None reported,Yes,45000,0,0,1,0,1,0,0,0
4,4,Mercedes-Benz,Metris Base,2021,7388,208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,Black,Beige,None reported,Yes,97500,0,0,1,0,1,0,0,0
