## Import libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Load dataset

In [None]:
# Location of the sports-car price CSV file.
file_path = '/content/Sport car price.csv'
# Read the CSV into the raw DataFrame used by the following cells.
df = pd.read_csv(file_path)

# Display the first few rows of the dataset

In [None]:
# Preview the first rows of the raw data to check column names and value formats.
print(df.head())

      Car Make Car Model  Year Engine Size (L) Horsepower Torque (lb-ft)  \
0      Porsche       911  2022               3        379            331   
1  Lamborghini   Huracan  2021             5.2        630            443   
2      Ferrari   488 GTB  2022             3.9        661            561   
3         Audi        R8  2022             5.2        562            406   
4      McLaren      720S  2021               4        710            568   

  0-60 MPH Time (seconds) Price (in USD)  
0                       4        101,200  
1                     2.8        274,390  
2                       3        333,750  
3                     3.2        142,700  
4                     2.7        298,000  


# Exploratory Data Analysis (EDA)

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would emit an extra "None" line.
df.info()

# Summary statistics for the columns pandas currently treats as numeric.
print(df.describe())


<class 'pandas.core.frame.DataFrame'>
Index: 1004 entries, 0 to 1005
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Car Make                 1004 non-null   object
 1   Car Model                1004 non-null   object
 2   Year                     1004 non-null   int64 
 3   Engine Size (L)          994 non-null    object
 4   Horsepower               1004 non-null   object
 5   Torque (lb-ft)           1001 non-null   object
 6   0-60 MPH Time (seconds)  1004 non-null   object
 7   Price (in USD)           1004 non-null   object
dtypes: int64(1), object(7)
memory usage: 70.6+ KB
None
              Year
count  1004.000000
mean   2021.201195
std       2.022643
min    1965.000000
25%    2021.000000
50%    2021.000000
75%    2022.000000
max    2023.000000


## Missing values

In [None]:
# Count the missing (null) entries in each column of the raw data.
print(df.isnull().sum())


Car Make                    0
Car Model                   0
Year                        0
Engine Size (L)            10
Horsepower                  0
Torque (lb-ft)              3
0-60 MPH Time (seconds)     0
Price (in USD)              0
dtype: int64


# Preprocessing

In [None]:
# Work on a copy so the cleaning steps do not modify the raw DataFrame `df`.
df_clean = df.copy()

# convert values to numeric
def clean_numeric(series):
    """Parse a Series of number-like values into a numeric Series.

    Each value is converted to text, thousands separators (commas) are
    removed and surrounding whitespace is trimmed. Anything that still
    cannot be parsed as a number becomes NaN instead of raising.
    """
    as_text = series.astype(str)
    without_commas = as_text.str.replace(',', '')
    trimmed = without_commas.str.strip()
    return pd.to_numeric(trimmed, errors='coerce')

# Columns that arrive as text (e.g. "101,200") and must be parsed into numbers.
numeric_cols = ["Engine Size (L)", "Horsepower", "Torque (lb-ft)", "0-60 MPH Time (seconds)", "Price (in USD)"]
for col in numeric_cols:
    df_clean[col] = clean_numeric(df_clean[col])

# Count missing values after parsing: entries that could not be parsed were
# coerced to NaN, so these counts can exceed the ones in the raw data.
missing_values = df_clean.isnull().sum()

# Fill missing numeric values with the median of each column.
# The result is assigned back instead of calling fillna(..., inplace=True) on
# df_clean[col]: the chained in-place form acts on an intermediate object,
# triggers a pandas warning, and will no longer update df_clean in pandas 3.0.
for col in numeric_cols:
    df_clean[col] = df_clean[col].fillna(df_clean[col].median())

# Display the final result.
# Note: DataFrame.info() prints its report and returns None, so
# df_clean_info is None; it is kept only so the displayed tuple is unchanged.
df_clean_info = df_clean.info()
df_clean_head = df_clean.head()

missing_values, df_clean_info, df_clean_head


<class 'pandas.core.frame.DataFrame'>
Index: 1004 entries, 0 to 1005
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Car Make                 1004 non-null   object 
 1   Car Model                1004 non-null   object 
 2   Year                     1004 non-null   int64  
 3   Engine Size (L)          1004 non-null   float64
 4   Horsepower               1004 non-null   float64
 5   Torque (lb-ft)           1004 non-null   float64
 6   0-60 MPH Time (seconds)  1004 non-null   float64
 7   Price (in USD)           1004 non-null   int64  
dtypes: float64(4), int64(2), object(2)
memory usage: 70.6+ KB


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_clean[col].fillna(df_clean[col].median(), inplace=True)


(Car Make                    0
 Car Model                   0
 Year                        0
 Engine Size (L)            56
 Horsepower                  6
 Torque (lb-ft)              5
 0-60 MPH Time (seconds)     1
 Price (in USD)              0
 dtype: int64,
 None,
       Car Make Car Model  Year  Engine Size (L)  Horsepower  Torque (lb-ft)  \
 0      Porsche       911  2022              3.0       379.0           331.0   
 1  Lamborghini   Huracan  2021              5.2       630.0           443.0   
 2      Ferrari   488 GTB  2022              3.9       661.0           561.0   
 3         Audi        R8  2022              5.2       562.0           406.0   
 4      McLaren      720S  2021              4.0       710.0           568.0   
 
    0-60 MPH Time (seconds)  Price (in USD)  
 0                      4.0          101200  
 1                      2.8          274390  
 2                      3.0          333750  
 3                      3.2          142700  
 4                

In [None]:
# Standardize the numeric columns in place (zero mean, unit variance).
# NOTE(review): the target column 'Price (in USD)' is standardized here as
# well, so any error metric computed on it later is in standardized units,
# not dollars. The scaler is also fit on all rows, before any train/test split.
scaled_cols = ['Engine Size (L)', 'Horsepower', 'Torque (lb-ft)', '0-60 MPH Time (seconds)', 'Price (in USD)']
scaler = StandardScaler()
df_clean[scaled_cols] = scaler.fit_transform(df_clean[scaled_cols])


In [None]:
# Show the cleaned DataFrame after scaling to inspect the standardized values.
print(df_clean)

          Car Make Car Model  Year  Engine Size (L)  Horsepower  \
0          Porsche       911  2022        -1.008930   -0.613557   
1      Lamborghini   Huracan  2021         0.606879   -0.012391   
2          Ferrari   488 GTB  2022        -0.347917    0.061857   
3             Audi        R8  2022         0.606879   -0.175256   
4          McLaren      720S  2021        -0.274471    0.179216   
...            ...       ...   ...              ...         ...   
1000  Aston Martin   Vantage  2021        -0.274471   -0.316566   
1001       Bugatti    Chiron  2021         2.663363    2.021038   
1002    Koenigsegg     Jesko  2022         0.459988    1.544416   
1004       McLaren     Senna  2021        -0.274471    0.368428   
1005        Pagani    Huayra  2021         1.194446    0.308551   

      Torque (lb-ft)  0-60 MPH Time (seconds)  Price (in USD)  
0          -0.674164                 0.623354       -0.377809  
1          -0.326485                -0.928174       -0.142008  
2  

## Prepare Data for Machine Learning

In [None]:
# List the column names of the cleaned dataset before selecting features.
print(df_clean.columns)

Index(['Car Make', 'Car Model', 'Year', 'Engine Size (L)', 'Horsepower',
       'Torque (lb-ft)', '0-60 MPH Time (seconds)', 'Price (in USD)'],
      dtype='object')


In [None]:
# One-hot encode the two categorical columns; drop_first=True omits the first
# category of each column from the generated indicator columns.
categorical_cols = ['Car Make', 'Car Model']
df_encoded = pd.get_dummies(df_clean, columns=categorical_cols, drop_first=True)

# Preview the first rows of the encoded dataset.
encoded_preview = df_encoded.head()
print(encoded_preview)


   Year  Engine Size (L)  Horsepower  Torque (lb-ft)  0-60 MPH Time (seconds)  \
0  2022        -1.008930   -0.613557       -0.674164                 0.623354   
1  2021         0.606879   -0.012391       -0.326485                -0.928174   
2  2022        -0.347917    0.061857        0.039821                -0.669586   
3  2022         0.606879   -0.175256       -0.441343                -0.410998   
4  2021        -0.274471    0.179216        0.061551                -1.057468   

   Price (in USD)  Car Make_Alfa Romeo  Car Make_Alpine  Car Make_Ariel  \
0       -0.377809                False            False           False   
1       -0.142008                False            False           False   
2       -0.061188                False            False           False   
3       -0.321306                False            False           False   
4       -0.109863                False            False           False   

   Car Make_Aston Martin  ...  Car Model_Taycan Turbo S  Car M

# Train model

In [None]:
# Train a random-forest regressor on the cleaned data and evaluate it on a
# held-out test set.

# ---------- Step 1: Select X, y ----------
target_col = "Price (in USD)"
feature_cols = ["Car Make", "Car Model", "Year", "Engine Size (L)", "Horsepower", "Torque (lb-ft)", "0-60 MPH Time (seconds)"]

# df_clean is already numeric and median-imputed, so no second cleaning pass
# is needed here. `df` is still rebound to a copy of df_clean, as this cell
# did before, which replaces the raw DataFrame loaded from the CSV.
df = df_clean.copy()

# Extract X and y.
X = df[feature_cols].copy()
y = df[target_col].astype(float).copy()

# Show dimensions.
print("X shape:", X.shape)
print("y shape:", y.shape)

# ---------- Step 2: Manual One-Hot Encoding (no Pipeline) ----------
X_encoded = pd.get_dummies(X, columns=["Car Make", "Car Model"], drop_first=False)

# Keep columns list for inference later.
feature_order = X_encoded.columns.tolist()

# ---------- Step 3: Train/Test split ----------
# Split once, on the encoded features the model is actually trained on.
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Dimensions of the split used for training and evaluation.
split_info = {
    "Train X shape": X_train.shape,
    "Train y shape": y_train.shape,
    "Test X shape": X_test.shape,
    "Test y shape": y_test.shape
}

# ---------- Step 4: Train model ----------
rf = RandomForestRegressor(
    n_estimators=400,
    max_depth=None,
    random_state=42,
    n_jobs=-1
)
rf.fit(X_train, y_train)

# ---------- Step 5: Evaluate ----------
# Predict on the held-out set; `preds` was previously used without ever being
# assigned, which raises NameError when the notebook is run top to bottom.
preds = rf.predict(X_test)
mse = mean_squared_error(y_test, preds)  # called without the `squared` keyword
rmse = np.sqrt(mse)                      # take the square root ourselves
r2 = r2_score(y_test, preds)

print("Shapes -> X_train:", X_train.shape, "| X_test:", X_test.shape)
# The target column was standardized by the scaling cell, so this RMSE is in
# standardized units rather than dollars; the label states that explicitly.
print(f"RMSE (standardized price units): {rmse:.2f}")
print(f"R^2: {r2:.4f}")

# ---------- Step 6: Feature importances ----------
# Pair each encoded column with its importance, most important first.
feat_imp = pd.DataFrame({
    "feature": feature_order,
    "importance": rf.feature_importances_
}).sort_values("importance", ascending=False).reset_index(drop=True)

# Training the Model (fit)
# Prediction (predict)
# Model Evaluation

X shape: (1004, 7)
y shape: (1004,)
Shapes -> X_train: (803, 219) | X_test: (201, 219)
RMSE (USD): 0.18
R^2: 0.9506
