In [486]:
import re
from itertools import count
import numpy as np
# add imports here
import pandas as pd
from scipy.constants import horsepower

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
import xgboost as xgb

# from Training import y_pred_gb
# from datashader import count_cat



In [487]:
# add from here
from sklearn.preprocessing import LabelEncoder

In [488]:
# Read the training set and discard the 'id' column in a single expression.
train_data = pd.read_csv('data/train.csv').drop(columns=['id'])
train_data.head(4)


Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,MINI,Cooper S Base,2007,213000,Gasoline,172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel,A/T,Yellow,Gray,None reported,Yes,4200
1,Lincoln,LS V8,2002,143250,Gasoline,252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel,A/T,Silver,Beige,At least 1 accident or damage reported,Yes,4999
2,Chevrolet,Silverado 2500 LT,2002,136731,E85 Flex Fuel,320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...,A/T,Blue,Gray,None reported,Yes,13900
3,Genesis,G90 5.0 Ultimate,2017,19500,Gasoline,420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,Black,Black,None reported,Yes,45000


In [489]:
# Keep only rows priced below 100000 whose model year is later than 2000.
train_data = (
    train_data
    .loc[lambda frame: frame.price < 100000]
    .loc[lambda frame: frame.model_year > 2000]
)

In [490]:
# Fuel type
# Canonical fuel label -> lowercase keywords that identify it.
# The groups are tried in insertion order and the first hit wins, so 'Hybrid'
# takes precedence over the others (author's note: because of the dual engines
# one could assume hybrids cost more).
fuel_mapping = {
    'Hybrid': ['hybrid'],
    'Electric': ['electric'],
    'Diesel': ['diesel'],
    'E85': ['e85'],
    'Gasoline': ['gasoline']
}


def map_fuel_type(value):
    """Return the canonical fuel label for a free-text fuel description.

    The match is a case-insensitive substring test against the keywords in
    ``fuel_mapping``; the first group (in dict order) with a matching keyword
    is returned. Text that matches no keyword yields ``'Other'``.

    Parameters
    ----------
    value : str
        Raw fuel description.

    Returns
    -------
    str
        One of the ``fuel_mapping`` keys, or ``'Other'``.
    """
    lowered = value.lower()
    return next(
        (
            label
            for label, needles in fuel_mapping.items()
            if any(needle in lowered for needle in needles)
        ),
        'Other',
    )

# Normalise every fuel_type entry; values are stringified first so that
# map_fuel_type can always call .lower() on them.
train_data['fuel_type'] = train_data['fuel_type'].astype(str).map(map_fuel_type)

train_data.head(5)


Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,MINI,Cooper S Base,2007,213000,Gasoline,172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel,A/T,Yellow,Gray,None reported,Yes,4200
1,Lincoln,LS V8,2002,143250,Gasoline,252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel,A/T,Silver,Beige,At least 1 accident or damage reported,Yes,4999
2,Chevrolet,Silverado 2500 LT,2002,136731,E85,320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...,A/T,Blue,Gray,None reported,Yes,13900
3,Genesis,G90 5.0 Ultimate,2017,19500,Gasoline,420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,Black,Black,None reported,Yes,45000
4,Mercedes-Benz,Metris Base,2021,7388,Gasoline,208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,7-Speed A/T,Black,Beige,None reported,Yes,97500


In [491]:
# TODO: find a more principled way to tame the free-text colour names
# (an external library might help; for now the grouping is maintained by hand).
#
# Colour group -> lowercase keywords that identify it.
# map_color returns the FIRST group (in dict order) with a matching keyword, so
# every keyword must appear in exactly one group. The previous version repeated
# the 'Yellow' keywords under 'Gold' and the 'Brown' keywords under 'Bronze' and
# 'Orange', which made 'Gold' and 'Bronze' unreachable. Each keyword now lives
# in a single group; 'Bronze' and 'Gold' sit directly after 'Brown' and
# 'Yellow' so match precedence relative to the other groups is unchanged.
color_mapping = {
    'Blue': ['blue', 'navy', 'sea', 'glacier', 'sapphire', 'midnight'],
    'Black': ['black', 'onyx', 'obsidian', 'jet', 'ebony'],
    'White': ['white', 'pearl', 'ivory', 'alabaster', 'cream', 'frost'],
    'Silver': ['silver', 'platinum', 'titanium', 'steel', 'graphite', 'ash'],
    'Gray': ['gray', 'grey', 'charcoal', 'slate', 'gunmetal'],
    'Red': ['red', 'crimson', 'ruby', 'cherry', 'garnet'],
    'Green': ['green', 'emerald', 'olive', 'lime', 'forest'],
    'Brown': ['brown', 'copper', 'mahogany', 'rust'],
    'Bronze': ['bronze'],
    'Yellow': ['yellow', 'amber', 'honey', 'sun'],
    'Gold': ['gold'],
    'Orange': ['orange'],
    'Purple': ['purple', 'violet', 'lavender', 'plum', 'lilac'],
    'Pink': ['pink', 'rose', 'fuchsia', 'blush', 'coral'],
    'Beige': ['beige', 'tan', 'sand', 'camel'],
}


def map_color(value):
    """Map a free-text exterior colour name onto one of the ``color_mapping`` groups.

    Matching is a case-insensitive substring test; groups are tried in dict
    order and the first one with a matching keyword wins.

    Parameters
    ----------
    value : str
        Raw colour description.

    Returns
    -------
    str
        The matching group name, or ``value`` unchanged when no keyword
        matches (unmatched colours are deliberately kept as their own
        category rather than being collapsed).
    """
    value_lower = value.lower()
    for color, keywords in color_mapping.items():
        if any(keyword in value_lower for keyword in keywords):
            return color
    return value

# Collapse the free-text exterior colours into the groups known to map_color;
# values are stringified first so that .lower() is always available.
train_data['ext_col'] = train_data['ext_col'].astype(str).map(map_color)

# print(train_data)


In [492]:
# Accident and clean title -> binary 0/1 flags

# accident: 1 when the text contains 'At least 1', otherwise 0
train_data.accident = train_data.accident.astype(str).map(lambda text: int('At least 1' in text))

# clean_title: 1 when the text contains 'Yes', otherwise 0
train_data.clean_title = train_data.clean_title.astype(str).map(lambda text: int('Yes' in text))


In [493]:
# Transmission
# Collapse the free-text transmission description into 'Manual', 'Automatic' or 'Other'.


def simplify_transmission(text):
    """Classify a transmission description as 'Manual', 'Automatic' or 'Other'.

    The text is lower-cased before matching. The previous implementation
    compared lowercase keywords against the raw text, so values such as
    'A/T' or '6-Speed M/T' never matched and fell through to 'Other', while
    values containing 'Automatic'/'Manual' kept their full original string.

    'Manual' is checked first, mirroring the order of the original steps.
    Some automatic variants are still not recognised and end up as 'Other'.

    Parameters
    ----------
    text : object
        Raw transmission description (converted with ``str``).

    Returns
    -------
    str
        ``'Manual'``, ``'Automatic'`` or ``'Other'``.
    """
    lowered = str(text).lower()
    if 'manual' in lowered or 'm/t' in lowered:
        return 'Manual'
    if 'automatic' in lowered or 'a/t' in lowered:
        return 'Automatic'
    return 'Other'


train_data.transmission = train_data.transmission.astype(str).apply(simplify_transmission)

In [494]:
# Snapshot of the frame as it stands at this point, i.e. before engine parsing,
# interior-colour grouping and label encoding. It is not referenced again in the
# visible cells; the name suggests it is meant for the Gradio UI — TODO confirm.
gr_data = train_data.copy()

In [495]:
# Extract engine information


# Extract horsepower (HP), engine size (L) and cylinder count from the 'engine' column
def extract_engine(df):
    """Parse horsepower, engine size and cylinder count from an engine description.

    Example input: ``'172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel'``.

    Parameters
    ----------
    df : str or missing value
        A single engine description (despite the parameter name, this is one
        cell value, not a DataFrame).

    Returns
    -------
    tuple of float
        ``(horsepower, engine_size, cylinders)``. Any component that cannot be
        found is ``numpy.nan``. The result is ALWAYS a 3-tuple — including for
        missing or non-string input — so callers can index or unpack it
        safely. (Previously a null input returned a bare scalar NaN and
        unmatched components were ``None``.)

    Notes
    -----
    Author's note: cylinders are assumed to matter less for the price than
    horsepower and engine size, but they are extracted as well.
    """
    if not isinstance(df, str):
        # NaN / None / any other non-text value: nothing to parse.
        return np.nan, np.nan, np.nan

    def _first_number(pattern):
        """Return the first captured number for `pattern` as float, or NaN."""
        match = re.search(pattern, df)
        return float(match.group(1)) if match else np.nan

    # Local names deliberately avoid `horsepower`, which is also imported from
    # scipy.constants at the top of the notebook.
    hp_value = _first_number(r'(\d+\.?\d*)HP')                     # e.g. '172.0HP'
    engine_size = _first_number(r'(\d+\.?\d*)\s*[lL]')             # e.g. '1.6L'
    cylinders = _first_number(r'(\d+\.?\d*)\s*[cC][yY][lL]')       # e.g. '4 Cylinder'

    return hp_value, engine_size, cylinders

# Split every engine description into three numeric feature columns.
engine_features = ['horsepower', 'engine_size', 'cylynder']
train_data[engine_features] = train_data['engine'].apply(
    lambda text: pd.Series(extract_engine(text))
)

# The raw text column is no longer needed once it has been parsed.
train_data = train_data.drop(columns=['engine'])

# Components that could not be parsed are stored as 0.
train_data[engine_features] = train_data[engine_features].fillna(0)

In [496]:
#int_col
# Interior colour group -> lowercase keywords that identify it.
# Groups are tried in insertion order; the first one with a matching keyword wins.
interior_color_mapping = {
    'Black': ['black', 'onyx', 'ebony', 'charcoal'],
    'Gray': ['gray', 'grey', 'ash', 'slate', 'pewter'],
    'Beige': ['beige', 'tan', 'cream', 'sand', 'camel'],
    'Brown': ['brown', 'chocolate', 'espresso', 'cocoa', 'saddle'],
    'White': ['white', 'ivory', 'alabaster', 'frost', 'pearl'],
    'Red': ['red', 'crimson', 'burgundy', 'wine', 'maroon'],
    'Blue': ['blue', 'navy'],
    'Green': ['green', 'olive'],
    # Add other interior-specific colors or materials as needed
}


def map_interior_color(value):
    """Return the interior colour group for a free-text colour description.

    Uses a case-insensitive substring match against ``interior_color_mapping``.

    Parameters
    ----------
    value : str
        Raw interior colour description.

    Returns
    -------
    str
        The first matching group name (dict order), or ``'Other'`` when no
        keyword is contained in ``value``.
    """
    lowered = value.lower()
    for group in interior_color_mapping:
        for needle in interior_color_mapping[group]:
            if needle in lowered:
                return group
    return 'Other'

# Group the 'int_col' (interior colour) values; they are stringified first so
# that map_interior_color can always call .lower() on them.
train_data['int_col'] = train_data['int_col'].astype(str).map(map_interior_color)



In [497]:
# Label encoding
# Each categorical column gets its own fitted LabelEncoder, kept in
# `label_encoders` (keyed by column name) so the same mapping can be reused later.

label_columns = ['brand', 'model', 'fuel_type', 'ext_col', 'int_col', 'transmission']
label_encoders = {}

for col in label_columns:
    encoder = LabelEncoder()
    train_data[col] = encoder.fit_transform(train_data[col])
    label_encoders[col] = encoder

In [498]:
# Target vector and feature matrix (every column except the target).
y = train_data['price']
X = train_data.drop(columns='price')

In [499]:
from xgboost import XGBClassifier

# XGBRegressor: the target (price) is continuous. Author's note: this already
# worked in a previous notebook, apart from problems with pipelining the data.
xgb_model = xgb.XGBRegressor()
# xgb_model = XGBClassifier()

# Earlier single-point configuration, kept for reference.
# parameters = { # Heads up only works in PyCharmPro ide not the notebook web version
#     'n_estimators':[10000],
#     'random_state':[42],
#     'subsample':[0.9],
#     'min_child_weight':[10],
#     'max_depth':[6],
#     'learning_rate':[0.01],
#     'gamma':[0.5],
#     'colsample_bytree':[0.9],
#     'lambda':[0],
#     'alpha':[0.1],
#     # 'scale_pos_weight':[len(y[y == 0]) ],
#     # 'objective':['binary:logistic']
# }

# TODO: review these values — they were generated automatically.
# 3 x 2 x 2 x 2 = 24 candidate combinations.
parameters = {
    'n_estimators':[100], #, 500, 1000
    'max_depth':[3],
    'learning_rate':[0.3],
    'subsample':[0.9],
    'colsample_bytree':[0.5, 0.7, 0.9],
    'gamma':[0, 0.1],
    'min_child_weight':[5],
    'lambda':[0, 0.1],
    'alpha':[0.1, 0.2]
}

# Grid search: 3-fold CV, scored by negative MSE (higher is better).
grid_search = GridSearchCV(estimator=xgb_model, param_grid=parameters, cv=3, verbose=2, scoring='neg_mean_squared_error')

# Fit the model
grid_search.fit(X, y)

print(f"Best parameters: {grid_search.best_params_}")
# best_score_ is the negated MSE, so flip the sign to report the MSE itself.
print(f"Best score: {-grid_search.best_score_}")

# GridSearchCV uses refit=True by default, so best_estimator_ has already been
# retrained on the full (X, y) with the best parameters. The former extra
# `best_xgb_model.fit(X, y)` call only repeated that training and was removed.
best_xgb_model = grid_search.best_estimator_

# Predict the target on the training data
y_predict_train = best_xgb_model.predict(X)

# Calculate the RMSE (on the training data, so this is an optimistic figure)
rmse_train = np.sqrt(mean_squared_error(y, y_predict_train))

print(f"RMSE on training data: {rmse_train}")

Fitting 3 folds for each of 24 candidates, totalling 72 fits
[CV] END alpha=0.1, colsample_bytree=0.5, gamma=0, lambda=0, learning_rate=0.3, max_depth=3, min_child_weight=5, n_estimators=100, subsample=0.9; total time=   0.1s
[CV] END alpha=0.1, colsample_bytree=0.5, gamma=0, lambda=0, learning_rate=0.3, max_depth=3, min_child_weight=5, n_estimators=100, subsample=0.9; total time=   0.1s
[CV] END alpha=0.1, colsample_bytree=0.5, gamma=0, lambda=0, learning_rate=0.3, max_depth=3, min_child_weight=5, n_estimators=100, subsample=0.9; total time=   0.1s
[CV] END alpha=0.1, colsample_bytree=0.5, gamma=0, lambda=0.1, learning_rate=0.3, max_depth=3, min_child_weight=5, n_estimators=100, subsample=0.9; total time=   0.2s
[CV] END alpha=0.1, colsample_bytree=0.5, gamma=0, lambda=0.1, learning_rate=0.3, max_depth=3, min_child_weight=5, n_estimators=100, subsample=0.9; total time=   0.1s
[CV] END alpha=0.1, colsample_bytree=0.5, gamma=0, lambda=0.1, learning_rate=0.3, max_depth=3, min_child_weigh

In [500]:
# Sort
def cat_sort(cat):
    """Return the original labels of categorical column `cat`, most frequent first.

    By the time this runs, ``train_data[cat]`` holds the integer codes written
    by the label-encoding step, so ``value_counts()`` yields codes rather than
    names. The codes are translated back through ``label_encoders[cat]`` so
    the result contains the human-readable labels — the values that
    ``label_encoders[cat].transform`` accepts.

    Parameters
    ----------
    cat : str
        Name of a column that has an entry in ``label_encoders``.

    Returns
    -------
    list
        Original labels ordered by descending frequency in ``train_data``.
    """
    codes_by_frequency = train_data[cat].value_counts().index.to_numpy()
    return label_encoders[cat].inverse_transform(codes_by_frequency).tolist()

sorted_brand = cat_sort('brand')
sorted_model = cat_sort('model')
sorted_fuel_type = cat_sort('fuel_type')
sorted_ext_col = cat_sort('ext_col')
sorted_int_col = cat_sort('int_col')
sorted_transmission = cat_sort('transmission')



In [501]:
# Requires the third-party `gradio` package. The recorded output of this cell is
# "ModuleNotFoundError: No module named 'gradio'", i.e. the package was not
# installed in the environment where the notebook was last executed.
import gradio as gr
print(gr.__version__)

def car_builder(brand, model, model_year, milage, engine, fuel_type, ext_col, int_col, transmission, accident, clean_title):
    """Predict a price for one car described by raw (human-readable) inputs.

    The categorical inputs are encoded with the fitted ``label_encoders``, the
    engine text is parsed with ``extract_engine``, and the resulting one-row
    frame is passed to ``best_xgb_model``.

    Parameters
    ----------
    brand, model, fuel_type, ext_col, int_col, transmission : str
        Labels exactly as seen by the corresponding ``LabelEncoder`` during
        fitting; an unseen label makes ``transform`` raise ``ValueError``.
    model_year, milage : number
        Passed through unchanged.
    engine : str
        Free-text engine description, e.g. ``'172.0HP 1.6L 4 Cylinder Engine'``.
    accident, clean_title : bool or int
        Converted to 0/1 with ``int``, matching the training encoding.

    Returns
    -------
    str
        ``"Predicted price: <int> USD"``.
    """
    # Training stores unparsed engine components as 0; do the same here.
    # extract_engine may hand back a bare NaN instead of a tuple for missing
    # input, so normalise that case before unpacking.
    parsed = extract_engine(engine)
    if not isinstance(parsed, tuple):
        parsed = (None, None, None)
    hp_value, engine_size, cylinders = (
        0 if part is None or pd.isnull(part) else part for part in parsed
    )

    features = {
        'brand': [label_encoders['brand'].transform([brand])[0]],
        'model': [label_encoders['model'].transform([model])[0]],
        'model_year': [model_year],
        'milage': [milage],
        'fuel_type': [label_encoders['fuel_type'].transform([fuel_type])[0]],
        'ext_col': [label_encoders['ext_col'].transform([ext_col])[0]],
        'int_col': [label_encoders['int_col'].transform([int_col])[0]],
        'transmission': [label_encoders['transmission'].transform([transmission])[0]],
        'accident': [int(accident)],
        'clean_title': [int(clean_title)],
        # Same engine-derived column names as in the training frame
        # (the former single 'hp' column did not exist at training time).
        'horsepower': [hp_value],
        'engine_size': [engine_size],
        'cylynder': [cylinders],
    }
    # Present the columns in exactly the order the model was trained on; this
    # raises KeyError if X contains a column that is not built above.
    df = pd.DataFrame(features)[list(X.columns)]

    # `best_xgb_model` is the estimator selected by the grid search
    # (the previous name `best_XGB_Pred` was never defined).
    pricePrediction = int(best_xgb_model.predict(df)[0])

    # Return the predicted price
    return f"Predicted price: {pricePrediction} USD"

# Gradio UI around car_builder. The inputs are listed in the same order as
# car_builder's parameters, because Gradio passes them positionally.
# Dropdown choices are handed to car_builder unchanged, which feeds them to
# label_encoders[...].transform — so the sorted_* lists must contain the
# labels those encoders were fitted on.
# (The previous `car_builder[...]` subscripted the function, which raises
# TypeError, and declared only one of the eleven inputs.)
demo = gr.Interface(
    fn=car_builder,
    inputs=[
        gr.Dropdown(choices=sorted_brand, value=sorted_brand[0], label='Brand'),
        gr.Dropdown(choices=sorted_model, value=sorted_model[0], label='Model'),
        gr.Number(label='Model year', value=2015, precision=0),
        gr.Number(label='Milage', value=50000, precision=0),
        gr.Textbox(label='Engine', value='172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel'),
        gr.Dropdown(choices=sorted_fuel_type, value=sorted_fuel_type[0], label='Fuel type'),
        gr.Dropdown(choices=sorted_ext_col, value=sorted_ext_col[0], label='Exterior color'),
        gr.Dropdown(choices=sorted_int_col, value=sorted_int_col[0], label='Interior color'),
        gr.Dropdown(choices=sorted_transmission, value=sorted_transmission[0], label='Transmission'),
        gr.Checkbox(label='Accident or damage reported', value=False),
        gr.Checkbox(label='Clean title', value=True),
    ],
    outputs='text',
)

#main
if __name__ == '__main__':
    demo.launch()


ModuleNotFoundError: No module named 'gradio'