#  Used Car Price Prediction
*Author: Diane Konan | GitHub: https://dmkonan.github.io/Diane-Konan-Portfolio/ | Email: dimikonan@gmail.com*

##  Problem Statement
Manual car pricing is often inconsistent and subjective, leading to missed revenue opportunities. This project builds a machine learning system to predict fair used car prices based on specifications.

##  Data Exploration and Cleaning
I cleaned and preprocessed ~189K car records containing corrupted characters and missing values, using domain-specific logic.

## Feature Engineering
Meaningful features like `car_age`, `mileage_per_year`, and `brand_tier` were created. The target variable `price` was log-transformed.

##  Model Building and Evaluation
We trained and tuned several regression models including Linear, Ridge, Lasso, Random Forest, Gradient Boosting, and XGBoost. A stacked ensemble achieved the best results.

##  Model Performance
The final stacked model achieved an R² of 0.866. Top predictors included car age, mileage, and brand tier.

##  Deployment Plan
The model is deployable via a REST API with real-time prediction capabilities on AWS/Google Cloud.

##  Key Takeaways
- Final R²: **0.866** using Stacked GB + XGBoost
- Most impactful features: Car age, mileage, model tier
- Real-time deployment supports better pricing decisions and inventory turnover

USED CAR PRICE PREDICTION PROJECT CODE

****************************************************
ESTIMATED CODE RUN TIME: ~ 10-15 minutes
INTERACTIVE USER INPUT CODE TOWARDS END OF CODE FOR DEPLOYMENT
****************************************************

In [None]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error, r2_score

In [None]:
# Load initial dataset
# Directory and file name of the raw training CSV. The two strings are joined with
# plain concatenation when the file is read, so input_path must end with a separator.
input_path = "/Users/dianemireillekonan/Python files/" # change to run on local device
file_name = "Regression of Used Car Prices- train.csv"

In [None]:
# Read the raw training data into the DataFrame used by the exploration cells
df = pd.read_csv(input_path + file_name)

In [None]:
# Output for cleaned CSV file
output_file ="df_cleaned_output_12APR25.csv"
output_path = "/Users/dianemireillekonan/Python files/" # change to run on local device
# output_path is rebound here: from this line on it holds the full file path
# (directory + file name), not just the directory.
output_path = (output_path + output_file)

DATA UNDERSTANDING PHASE

DESCRIPTIVE STATISTICS OF THE VARIABLES

In [None]:
# Information on column variables (non_null count, data types, column names)
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() emitted a stray "None" line after the report.
df.info()

In [None]:
# Range of the ID values (first column): smallest value, then largest
id_values = df.iloc[:, 0]
print(id_values.min())
print(id_values.max())

In [None]:
# Descriptive stats of numerical variables, leaving out the ID (first) column
columns_after_id = df.iloc[:, 1:]
print(columns_after_id.describe())

In [None]:
# Value counts of every categorical (object/category) column, keyed by column name
categorical_columns = df.select_dtypes(include=['object', 'category']).columns
cat_count_summary = {name: df[name].value_counts() for name in categorical_columns}
# Report the counts one column at a time
for col_name in cat_count_summary:
    count = cat_count_summary[col_name]
    print(f"Count For Each '{col_name}':\n{count}\n")

In [None]:
# Mode(s) of every column except ID (the first column), stored as plain Python lists
# keyed by column name; a column with tied modes yields a list with several entries.
columns_without_id = df.iloc[:, 1:]
variable_modes = {
    column: columns_without_id[column].mode().tolist()
    for column in columns_without_id.columns
}

In [None]:
# Print the mode list collected for each column
print("Modes:")
for col_name in variable_modes:
    print(f"{col_name}: {variable_modes[col_name]}")

In [None]:
# Number of distinct values per column (ID included), keyed by column name
unique_entries = {column: df[column].nunique() for column in df.columns}

In [None]:
# Print the number of unique entries found for each variable
print("Number of Unique Entries For Each Variable:")
for col_name in unique_entries:
    print(f"{col_name}: {unique_entries[col_name]}")

GRAPHS AND PLOTS

In [None]:
# Histograms of integer variables (model_year, milage, price) - excluding ID
# Each entry: (column, number of bins, x-axis label, plot title)
histogram_specs = [
    ('model_year', 25, 'model_year', "Histogram of Model Year"),
    ('milage', 25, 'Milage', "Histogram of Mileage"),
    ('price', 40, 'Price $', "Histogram of Used Car Price"),
]
plt.figure(figsize=(12, 8))
# Fill the first three slots of a 2x2 grid, one histogram per slot
for slot, (hist_column, hist_bins, hist_xlabel, hist_title) in enumerate(histogram_specs, start=1):
    plt.subplot(2, 2, slot)
    plt.hist(df[hist_column], bins=hist_bins)
    plt.xlabel(hist_xlabel)
    plt.ylabel('Frequency')
    plt.title(hist_title)
plt.tight_layout()
plt.show()

In [None]:
# Box plots of integer variables (model_year, milage, price) - excluding ID
# Each entry: (column, x-axis label, plot title)
boxplot_specs = [
    ('model_year', 'Model Year', "Boxplot of Used Car Model Year"),
    ('milage', 'Milage', "Boxplot of Mileage"),
    ('price', 'Price $ ', "Boxplot of Used Car Price"),
]
plt.figure(figsize=(12, 8))
# Fill the first three slots of a 2x2 grid, one box plot per slot
for slot, (box_column, box_xlabel, box_title) in enumerate(boxplot_specs, start=1):
    plt.subplot(2, 2, slot)
    plt.boxplot(df[box_column])
    plt.xlabel(box_xlabel)
    plt.ylabel('Value')
    plt.title(box_title)
plt.tight_layout()
plt.show()

CATEGORICAL VARIABLES
Visuals were not produced for model, ext_col, and int_col because each has too many distinct values to plot legibly.

In [None]:
# Bar graph: number of listings per brand
brand_counts = df['brand'].value_counts()
brand_counts.plot.bar()
plt.title("Brands")
plt.xlabel("Brand Name")
plt.ylabel("Frequency")
plt.tight_layout()
plt.show()

In [None]:
# Bar graph: number of listings per transmission description
transmission_counts = df['transmission'].value_counts()
transmission_counts.plot.bar()
plt.title("Transmission Types")
plt.xlabel("Transmission")
plt.ylabel("Frequency")
plt.tight_layout()
plt.show()

In [None]:
# Bar graph: number of listings per accident-report value
accident_counts = df['accident'].value_counts()
accident_counts.plot.bar()
plt.title("Accident?")
plt.xlabel("Accident?")
plt.ylabel("Frequency")
plt.tight_layout()
plt.show()

In [None]:
# Bar graph: number of listings per clean-title value
clean_title_counts = df['clean_title'].value_counts()
clean_title_counts.plot.bar()
plt.title("Clean Title?")
plt.xlabel("Clean Title?")
plt.ylabel("Frequency")
plt.tight_layout()
plt.show()

In [None]:
DATA PREPARATION PHASE

In [None]:
# Work on an independent (deep) copy so the raw DataFrame `df` is left untouched
df_cleaned = df.copy(deep=True)

In [None]:
# Number of missing values in each column before any cleaning
print(df_cleaned.isna().sum())

In [None]:
# --- BRAND PRICE TIER 1-4 ---
# Before the tiers are built, add the natural log of price as its own column;
# price_log is what the modeling cells later use as the target.
# NOTE(review): a price of 0 would give -inf here — confirm the data contains none.
df_cleaned['price_log'] = np.log(df_cleaned['price'])

In [None]:
# Mean price per brand, highest first; reset_index turns the result into a
# two-column frame with columns 'brand' and 'price' (the mean).
brand_avg_price = df_cleaned.groupby('brand')['price'].mean().sort_values(ascending=False).reset_index()

In [None]:
# Bin the brands into four tiers by quantile of their mean price.
# NOTE(review): the tiers are derived from price (the prediction target) over the
# whole dataset — confirm this target-based encoding is acceptable for validation.
brand_avg_price['brand_tier'] = pd.qcut(
    brand_avg_price['price'],
    q=[0, 0.35, 0.85, 0.95, 1.0],  # 35%, next 50%, next 10%, top 5%
    labels=[4, 3, 2, 1]  # 1 = Top 5%, 4 = Bottom 35%
)

In [None]:
# Lookup brand -> tier, then attach the tier to every row by its brand
brand_tier_map = dict(zip(brand_avg_price['brand'], brand_avg_price['brand_tier']))
df_cleaned['brand_tier'] = df_cleaned['brand'].map(brand_tier_map)

In [None]:
# --- MODEL PRICE TIER 1-3 ---
# Mean price per model; the 25th and 75th percentiles of those means are the
# cut-offs that encode_model_tier compares against.
model_avg_price = df_cleaned.groupby('model')['price'].mean()
low_threshold = model_avg_price.quantile(0.25)
high_threshold = model_avg_price.quantile(0.75)

In [None]:
def encode_model_tier(model):
    """Return the price tier of a car model: 1 = high-end, 2 = mid-range, 3 = low-end.

    The model's mean price is looked up in ``model_avg_price`` and compared with
    ``low_threshold`` and ``high_threshold``. A model that is not in
    ``model_avg_price`` is treated as having a mean price of 0.
    """
    mean_price = model_avg_price.get(model, 0)
    if mean_price <= low_threshold:
        return 3  # Low-end
    if mean_price >= high_threshold:
        return 1  # High-end
    return 2  # Mid-range

In [None]:
# Attach the model tier (1-3) to every row from its model name
df_cleaned['model_tier'] = df_cleaned['model'].apply(encode_model_tier)

In [None]:
# --- IMPUTE MISSING FUEL TYPE ---
electric_brands = ['Tesla', 'Rivian']


def _fill_electric_fuel(row):
    """Return 'Electric' when the row has no fuel type and its brand is in
    electric_brands; otherwise return the row's existing fuel type unchanged."""
    if pd.isna(row['fuel_type']) and row['brand'] in electric_brands:
        return 'Electric'
    return row['fuel_type']


df_cleaned['fuel_type'] = df_cleaned.apply(_fill_electric_fuel, axis=1)

In [None]:
# Lower-case the engine text so the keyword and regex matching below is
# case-insensitive; astype(str) turns a missing engine into the text 'nan'.
df_cleaned['engine'] = df_cleaned['engine'].astype(str).str.lower()

In [None]:
# Engine-text keyword -> fuel type. infer_fuel_type walks this dict in insertion
# order and stops at the first hit, so 'plug-in' must stay ahead of 'hybrid'.
engine_keyword_map = {
    'plug-in': 'Plug-In Hybrid',
    'hybrid': 'Hybrid',
    'flex fuel': 'Flex Fuel',
    'e85': 'Flex Fuel',
    'diesel': 'Diesel',
    'gasoline': 'Gasoline',
    'electric': 'Electric'
}

In [None]:
def infer_fuel_type(row):
    """Return the fuel type for a row, inferring it from the engine text if missing.

    A row that already has a fuel type keeps it. Otherwise the first keyword of
    ``engine_keyword_map`` (in dict order) found in ``row['engine']`` decides the
    fuel type; when no keyword matches, the missing value is returned as is.
    """
    recorded = row['fuel_type']
    if not pd.isna(recorded):
        return recorded
    engine_text = row['engine']
    for keyword, fuel in engine_keyword_map.items():
        if keyword in engine_text:
            return fuel
    return recorded

In [None]:
# Fill remaining missing fuel types from keywords in the engine description
df_cleaned['fuel_type'] = df_cleaned.apply(infer_fuel_type, axis=1)

In [None]:
# --- ACCIDENT & TITLE BINARY FLAGS ---
def _reports_accident(entry):
    """Return 1 when entry is text containing 'accident' (any case), else 0."""
    if not isinstance(entry, str):
        return 0
    return 1 if 'accident' in entry.lower() else 0


df_cleaned['accident_flag'] = df_cleaned['accident'].apply(_reports_accident)
# 1 when the accident field holds any value, 0 when it is missing
df_cleaned['accident_info_provided'] = df_cleaned['accident'].notna().astype(int)

In [None]:
# clean_title_flag: 1 when the listing states a clean title ('yes', any case or
# padding), 0 otherwise (including missing values). The previous encoding was
# inverted (0 for 'yes'), which contradicted both the column name and the
# deployment form, where the user enters '1' when the title is clean.
df_cleaned['clean_title_flag'] = df_cleaned['clean_title'].apply(
    lambda x: 1 if isinstance(x, str) and x.strip().lower() == 'yes' else 0
)
# title_info_provided: 1 when any title value is present, 0 when it is missing
df_cleaned['title_info_provided'] = df_cleaned['clean_title'].notna().astype(int)

In [None]:
# The raw text columns are now represented by the flag columns, so remove them
df_cleaned.drop(columns=['accident', 'clean_title'], inplace=True)

In [None]:
# Step 1: Normalize transmission column (convert to lowercase for safety)
# astype(str) also turns a missing transmission into the text 'nan'.
df_cleaned['transmission'] = df_cleaned['transmission'].astype(str).str.lower()

In [None]:
# Step 2: Standardize to 'manual' or 'automatic'
# NOTE(review): only the exact strings 'manual' and 'm/t' count as manual. Longer
# descriptions (e.g. '6-speed m/t', if present in the data) and missing values
# ('nan') all fall through to 'automatic' — confirm this is intended.
df_cleaned['transmission_type'] = df_cleaned['transmission'].apply(
    lambda x: 'manual' if x in ['manual', 'm/t'] else 'automatic'
)

In [None]:
# Step 3: Encode to binary: 1 = manual, 2 = automatic
# The same 1/2 coding is what the deployment form asks the user to type in.
df_cleaned['transmission_binary'] = df_cleaned['transmission_type'].map({
    'manual': 1,
    'automatic': 2
})

In [None]:
# Only the numeric transmission_binary column is kept; drop both text versions
df_cleaned.drop(columns=['transmission_type', 'transmission'], inplace=True)

In [None]:
import re
from datetime import datetime

In [None]:
# Step 1: Car Age (Assuming current year is 2025)
# current_year is a fixed constant so results are reproducible; the deployment
# form reuses the same variable to turn a model year into an age.
current_year = 2025
df_cleaned['car_age'] = current_year - df_cleaned['model_year']

In [None]:
# Step 2: Extract Horsepower from engine (e.g., "208.0HP")
# The engine text was lower-cased earlier, hence the lowercase 'hp' in the
# pattern. Rows whose engine text has no match get NaN.
df_cleaned['horsepower'] = df_cleaned['engine'].str.extract(r'(\d+\.?\d*)\s*hp', expand=False).astype(float)

In [None]:
# Step 3: Extract Engine Size in Liters (e.g., "2.0L")
# Captures the first number that is directly followed (optional spaces allowed)
# by a lowercase 'l'; rows with no such number get NaN.
df_cleaned['engine_size'] = df_cleaned['engine'].str.extract(r'(\d+\.?\d*)\s*l', expand=False).astype(float)

In [None]:
# Step 4: Cylinders (including 'V6', 'V8', etc.)
def extract_cylinders(engine_text):
    """Return the cylinder count parsed from an engine description, or NaN.

    Patterns are tried in this order and the first hit wins:

    1. an explicit count such as "6 cylinder";
    2. a layout code such as "v8", "i4", "h6" or "w12" (optional hyphen);
    3. a spelled-out layout such as "straight 6", "inline-4" or "flat 6".

    Layout codes must be standalone tokens, so valve counts ("24v") and
    labels that merely contain a 'v' plus digits ("lev3") are not mistaken
    for cylinder counts.

    Args:
        engine_text: Engine description. Any value is accepted; it is
            converted with ``str()`` and lower-cased before matching.

    Returns:
        The cylinder count as a float, or ``np.nan`` when nothing matches.
    """
    engine_text = str(engine_text).lower()

    patterns = (
        r'(\d+)\s*cylinder',                             # "6 cylinder"
        r'\b[vihw]-?(\d{1,2})\b',                        # "v8", "i4", "h6", "w12"
        r'\b(?:straight|inline|flat)[\s-]*(\d{1,2})\b',  # "inline-6", "flat 6"
    )
    for pattern in patterns:
        match = re.search(pattern, engine_text)
        if match:
            return float(match.group(1))

    return np.nan  # return NaN if no match found

In [None]:
# Cylinder count per row, parsed from the engine description (NaN when not found)
df_cleaned['cylinders'] = df_cleaned['engine'].apply(extract_cylinders)

In [None]:
# engine and model_year have been turned into derived columns and id carries no
# signal, so all three are removed in one call
df_cleaned.drop(columns=['engine', 'id', 'model_year'], inplace=True)

In [None]:
# Define the list of allowed colors
# Order matters: recode_color returns the first entry of this list that occurs
# in the color text, regardless of where it appears in that text.
allowed_colors = ["red", "black", "white", "silver", "green", "yellow", "blue", "gray", "beige"]

In [None]:
def recode_color(val, colors=None):
    """Collapse a free-text color description into one basic color name.

    Args:
        val: Color description. Any value is accepted; it is converted with
            ``str()`` and lower-cased, so matching is case-insensitive and a
            missing value (NaN) simply matches nothing.
        colors: Basic color names to look for, in priority order. Defaults to
            the module-level ``allowed_colors`` list, which keeps existing
            one-argument calls (e.g. ``Series.apply(recode_color)``) unchanged.

    Returns:
        The first entry of ``colors`` that is a substring of the description
        (priority follows the order of ``colors``, not the position in the
        text), or "exotic" if none is present.
    """
    if colors is None:
        colors = allowed_colors
    val_str = str(val).lower()
    return next((color for color in colors if color in val_str), "exotic")

In [None]:
# Apply the function to both ext_col and int_col
# The raw color text is overwritten in place with the recoded value.
df_cleaned['ext_col'] = df_cleaned['ext_col'].apply(recode_color)
df_cleaned['int_col'] = df_cleaned['int_col'].apply(recode_color)

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# List of columns to scale
# These are the continuous numeric inputs; the tier and flag columns are left as is.
columns_to_scale = ["milage", "car_age", "horsepower", "engine_size", "cylinders"]

In [None]:
# Create a StandardScaler instance
# Kept in the module-level name `scaler` so the fitted object remains available.
scaler = StandardScaler()

In [None]:
# Fit the scaler to the selected columns and transform them
# The raw values are overwritten with their standardized versions.
# NOTE(review): the scaler is fitted on every row before any train/validation
# split — confirm that is acceptable for the cross-validation done later.
df_cleaned[columns_to_scale] = scaler.fit_transform(df_cleaned[columns_to_scale])

In [None]:
# Impute all remaining missing values with the string "unknown"
# This applies to every column, numeric ones included, so columns such as
# horsepower can end up holding a mix of numbers and the text "unknown".
df_cleaned = df_cleaned.fillna("unknown")

In [None]:
# --- EXPORT CSV FILE TO OUTPUT PATH ---
# output_path already holds the full file path; index=False keeps the row index
# out of the file.
df_cleaned.to_csv(output_path, index=False)

In [None]:
MODELING PHASE

In [None]:
# Load the cleaned dataset
# Read back from the CSV written by the data preparation phase.
car_df = pd.read_csv(output_path)

In [None]:
# Additional Numeric cleaning
# Entries that are not numbers (e.g. the "unknown" placeholder) become NaN, then
# every row lacking any of the three engine specs is removed.
engine_spec_columns = ['horsepower', 'engine_size', 'cylinders']
car_df[engine_spec_columns] = car_df[engine_spec_columns].apply(pd.to_numeric, errors='coerce')
car_df.dropna(subset=engine_spec_columns, inplace=True)

In [None]:
# Feature Engineering: add some interaction features to improve predictive power
# milage, car_age, engine_size and cylinders in car_df are standardized (zero
# mean), so "car_age + 1" or "cylinders + 1" could be zero or negative and the
# ratios would blow up or flip sign. The ratios are therefore computed on the
# original units, recovered with the scaler fitted during data preparation
# (`scaler` and `columns_to_scale` must still be in memory from that phase).
raw_values = pd.DataFrame(
    scaler.inverse_transform(car_df[columns_to_scale]),
    columns=columns_to_scale,
    index=car_df.index,
)
# "+ 1" keeps the denominator positive for a car of age 0
car_df['mileage_per_year'] = raw_values['milage'] / (raw_values['car_age'] + 1)
car_df['engine_per_cylinder'] = raw_values['engine_size'] / (raw_values['cylinders'] + 1)

In [None]:
# Price Binning by Segment
# Quartile label (0-3) of the log price.
# NOTE(review): this column is computed from the target, so it must not be used
# as a model input — confirm it is excluded wherever the features are selected.
car_df['price_segment'] = pd.qcut(car_df['price_log'], q=4, labels=[0, 1, 2, 3]).astype(int)

In [None]:
# Define features and target
# Excluded from the features:
#   - price / price_log: the target itself;
#   - price_segment: a quartile label computed from price_log, i.e. the target in
#     disguise. Leaving it in X leaks the answer into every model and inflates
#     the scores, and it cannot be known for a car whose price is being predicted;
#   - brand, model, fuel_type, ext_col, int_col: text columns (brand and model are
#     represented by their numeric tier columns).
# Any code that builds a feature row for prediction must follow the column order
# of `available_features`.
drop_cols = ['price', 'price_log', 'price_segment', 'brand', 'model', 'fuel_type', 'ext_col', 'int_col']
available_features = [col for col in car_df.columns if col not in drop_cols]
X = car_df[available_features]
y_log = car_df['price_log']

In [None]:
# K-Fold Cross Validation, K=5
# Shuffling with a fixed random_state yields the same folds on every kf.split
# call, so the separate evaluation loops in this file see identical splits.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
# Define evaluation metrics
def rmse_log(y_true, y_pred):
    """Return the root mean squared error between y_true and y_pred.

    In this file both arguments are log prices, so the result is an error on
    the log scale.
    """
    squared_error = mean_squared_error(y_true, y_pred)
    return np.sqrt(squared_error)

In [None]:
# Scorer objects for cross-validation. greater_is_better=False makes a scorer
# report the negated error, which is why the evaluation loop flips the sign of
# the RMSE and MAE scores before averaging.
rmse_scorer = make_scorer(rmse_log, greater_is_better=False)
mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)
r2_scorer = make_scorer(r2_score)

In [None]:
# All models to be tested
# Keyed by the display name used in the results table; the estimators that accept
# a seed use random_state=42 so repeated runs give the same scores.
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(alpha=1.0),
    "Lasso Regression": Lasso(alpha=0.01, max_iter=10000),
    "KNN Regressor": KNeighborsRegressor(n_neighbors=10),
    "Random Forest": RandomForestRegressor(
        n_estimators=100, max_depth=8, min_samples_split=5, max_features='sqrt', random_state=42
    ),
    "Gradient Boosting": GradientBoostingRegressor(
        n_estimators=100, learning_rate=0.03, max_depth=3, subsample=0.8, min_samples_split=5, random_state=42
    ),
    "XGBoost": XGBRegressor(
        n_estimators=100, learning_rate=0.03, max_depth=3, subsample=0.8, colsample_bytree=0.8,
        objective='reg:squarederror', random_state=42
    )
}

In [None]:
# Add Stacked Model (GB + XGB stacked together) to reduce RMSE
# Linear Regression used as meta-learner
# The base learners are fresh instances with the same hyperparameters as the
# standalone "Gradient Boosting" and "XGBoost" entries of `models`.
base_learners = [
    ('gb', GradientBoostingRegressor(
        n_estimators=100, learning_rate=0.03, max_depth=3, subsample=0.8, min_samples_split=5, random_state=42)),
    ('xgb', XGBRegressor(
        n_estimators=100, learning_rate=0.03, max_depth=3, subsample=0.8, colsample_bytree=0.8,
        objective='reg:squarederror', random_state=42))
]

In [None]:
# Stacking ensemble: the base learners' predictions feed a linear regression
# meta-learner; n_jobs=-1 asks scikit-learn to use all available cores.
stacked_model = StackingRegressor(
    estimators=base_learners,
    final_estimator=LinearRegression(),
    n_jobs=-1
)

In [None]:
# Register the ensemble so it is evaluated alongside the individual models
models["Stacked GB+XGB"] = stacked_model

In [None]:
# Models Evaluation
# cross_validate computes all three metrics from one fit per fold. The previous
# version called cross_val_score three times per model, refitting every model on
# every fold three times to obtain the same averages.
from sklearn.model_selection import cross_validate

scoring = {"rmse": rmse_scorer, "mae": mae_scorer, "r2": r2_scorer}
results_log = []
for name, model in models.items():
    cv_scores = cross_validate(model, X, y_log, cv=kf, scoring=scoring)

    results_log.append({
        "Model": name,
        # The RMSE and MAE scorers were built with greater_is_better=False, so
        # their values come back negated; flip the sign to report positive errors.
        "Avg Log RMSE": round(np.mean(-cv_scores["test_rmse"]), 4),
        "Avg Log MAE": round(np.mean(-cv_scores["test_mae"]), 4),
        "Avg R² (Log Target)": round(np.mean(cv_scores["test_r2"]), 4)
    })

In [None]:
# Progress marker printed once the cross-validation loop has finished
print("After: Models Evaluation")

In [None]:
# Display results
# One row per model with its average log-scale RMSE, MAE and R² across the folds
results_df_log = pd.DataFrame(results_log)
print(results_df_log)

In [None]:
# Final fit on every available row (no hold-out), on the log-price target
stacked_model.fit(X, y_log) # fitting model that had the best results

EVALUATION PHASE

In [None]:
# Visualization - Feature Importance
# One importance vector per cross-validation fold is collected for each model
gb_importances_folds = []
xgb_importances_folds = []

In [None]:
# Refit both boosted models on the training part of every fold and record the
# feature importances each fit produces
gb_estimator = models["Gradient Boosting"]
xgb_estimator = models["XGBoost"]
for train_idx, val_idx in kf.split(X):
    X_train = X.iloc[train_idx]
    y_train = y_log.iloc[train_idx]
    X_val = X.iloc[val_idx]
    y_val = y_log.iloc[val_idx]

    gb_estimator.fit(X_train, y_train)
    gb_importances_folds.append(gb_estimator.feature_importances_)

    xgb_estimator.fit(X_train, y_train)
    xgb_importances_folds.append(xgb_estimator.feature_importances_)

In [None]:
# Average importances: stack the per-fold vectors (one row per fold) and take
# the mean of every feature's column
avg_gb_importances = np.asarray(gb_importances_folds).mean(axis=0)
avg_xgb_importances = np.asarray(xgb_importances_folds).mean(axis=0)

In [None]:
# Combine into a DataFrame
# One row per feature (indexed by the feature name), one column per model
importance_df = pd.DataFrame({
    'Gradient Boosting': avg_gb_importances,
    'XGBoost': avg_xgb_importances
}, index=X.columns)

In [None]:
# Sort ascending by the Gradient Boosting importance only (the XGBoost column
# just follows that order), so the most important feature is drawn at the top
# of the horizontal bar chart
importance_df = importance_df.sort_values(by="Gradient Boosting")

In [None]:
# Horizontal grouped bars: one pair of bars (Gradient Boosting, XGBoost) per feature
importance_df.plot(kind='barh', figsize=(12, 10), alpha=0.7)
plt.title("Feature Importances: Gradient Boosting vs XGBoost (CV Averaged)")
plt.xlabel("Importance")
plt.tight_layout()
plt.show()

In [None]:
# Models whose predictions and residuals are plotted below. Both values are the
# same estimator objects referenced elsewhere in the file (no copies are made).
models_to_compare = {
"Random Forest": models["Random Forest"],
"Stacked Model": stacked_model
}

In [None]:
# Out-of-fold diagnostics for each compared model: predicted-vs-actual and
# residual plots, both on the log-price scale.
from sklearn.base import clone

for model_name, model in models_to_compare.items():
    all_preds_log = []
    all_actuals_log = []

    for train_idx, val_idx in kf.split(X):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y_log.iloc[train_idx], y_log.iloc[val_idx]

        # Fit an unfitted copy with the same hyperparameters. Refitting `model`
        # itself would overwrite stacked_model, which was fitted on the full
        # dataset earlier, with a model trained on only the last fold's split.
        fold_model = clone(model)
        fold_model.fit(X_train, y_train)
        preds_log = fold_model.predict(X_val)

        all_preds_log.extend(preds_log)
        all_actuals_log.extend(y_val)

    # Convert to arrays
    all_preds_log = np.array(all_preds_log)
    all_actuals_log = np.array(all_actuals_log)
    residuals = all_actuals_log - all_preds_log
    r2 = r2_score(all_actuals_log, all_preds_log)

    # End points of the "ideal fit" diagonal (predicted == actual)
    lowest_actual = all_actuals_log.min()
    highest_actual = all_actuals_log.max()

    # Visualization - Predicted vs Actual Plot
    plt.figure(figsize=(8, 6))
    plt.scatter(all_actuals_log, all_preds_log, alpha=0.3)
    plt.plot([lowest_actual, highest_actual],
             [lowest_actual, highest_actual],
             color='red', linestyle='--', label='Ideal fit')
    plt.xlabel("Log(Actual Price)")
    plt.ylabel("Log(Predicted Price)")
    # Title previously showed the mis-encoded text "RÂ²"
    plt.title(f"{model_name} - Predicted vs Actual Prices\nR² = {r2:.4f}")
    plt.grid(True)
    plt.legend()
    plt.tight_layout()
    plt.show()

    # Visualization - Residual Plot
    plt.figure(figsize=(8, 6))
    plt.scatter(all_preds_log, residuals, alpha=0.3)
    plt.axhline(0, color='red', linestyle='--')
    plt.xlabel("Log(Predicted Price)")
    plt.ylabel("Residual (Log Actual - Log Predicted)")
    plt.title(f"{model_name} - Residual Plot\nR² = {r2:.4f}")
    plt.grid(True)
    plt.tight_layout()
    plt.show()

DEPLOYMENT PHASE

In [None]:
# Banner for the interactive prediction form that follows
print("DEPLOYMENT/ NEW DATA INPUT EXAMPLE \n")
# Answers are appended here in the order the questions are asked; the form
# resets the list whenever an answer is rejected.
new_car_data = []  # empty list for new user data input to be added to

""****" used to help separate questions

In [None]:
while True:  # using while loop so multiple cars can be predicted/gives option to reset loop/questions
    print("Used Car Prediction Model. Please answer the prompts below to predict the cost of the used car. \n"
          "To quit please type 'quit' when prompted to input an answer. \n"
          "**********************************************************************************************************")
    print("\n")
    # user input mileage
    milage = input("Please enter mileage of the car (NUMBERS ONLY): ")

    if milage.lower() == "quit":  # to end loop
        print("*****************************************************************************************")
        print("Thank you for predicting your car price")  # goodbye message
        break  # stops loop
    elif milage.isdigit():
        new_car_data.append(milage)  # adding to new_car_data
        print(f" mileage input: {milage}")
    else:
        print("**********************************************************************************")
        print("Please restart and only input a NUMBER or 'quit' to end form")
        print("**********************************************************************************")
        new_car_data = []  # resetting new_car_data
        continue

    print("**********************************************************************************")
    print("\n")

    # user input brand tier
    brand = input("Please input brand of car (example: Jeep or Ford):")

    # creating subsets for each brand and their tier (1-4)

    # brand_tier 1 subset
    tier_1b = car_df[car_df['brand_tier'] == 1]  # only getting instances where brand_tier = 1
    tier_1b_unique = tier_1b.drop_duplicates(subset=['brand'])  # dropping duplicate brands in tier 1
    brand_tier_1 = tier_1b_unique["brand"]  # subset to only get brand column
    brand_tier_1 = brand_tier_1.str.lower()  # making all values lower case

    # brand_tier 2 subset
    tier_2b = car_df[car_df['brand_tier'] == 2]  # only getting instances where brand_tier = 2
    tier_2b_unique = tier_2b.drop_duplicates(subset=['brand'])  # dropping duplicate brands in tier 2
    brand_tier_2 = tier_2b_unique["brand"]  # subset to only get brand column
    brand_tier_2 = brand_tier_2.str.lower()  # making all values lower case

    # brand_tier 3 subset
    tier_3b = car_df[car_df['brand_tier'] == 3]  # only getting instances where brand_tier = 4
    tier_3b_unique = tier_3b.drop_duplicates(subset=['brand'])  # dropping duplicate brands in tier 4
    brand_tier_3 = tier_3b_unique["brand"]  # subset to only get brand column
    brand_tier_3 = brand_tier_3.str.lower()  # making all values lower case

    # brand_tier 4 subset
    tier_4b = car_df[car_df['brand_tier'] == 3]  # only getting instances where brand_tier = 4
    tier_4b_unique = tier_4b.drop_duplicates(subset=['brand'])  # dropping duplicate brands in tier 4
    brand_tier_4 = tier_4b_unique["brand"]  # subset to only get brand column
    brand_tier_4 = brand_tier_4.str.lower()  # making all values lower case

    # matching user input of model to their brand tier values if in brand_tier subset
    if brand.lower() in brand_tier_1.values:
        brand_tier = 1
        new_car_data.append(brand_tier)  # appending brand_tier value to new_car_data[]
        print(f" brand tier input: {brand_tier}")

    elif brand.lower() in brand_tier_2.values:
        brand_tier = 2
        new_car_data.append(brand_tier)
        print(f" brand tier input: {brand_tier}")

    elif brand.lower() in brand_tier_3.values:
        brand_tier = 3
        new_car_data.append(brand_tier)
        print(f" brand tier input: {brand_tier}")

    elif brand.lower() in brand_tier_4.values:
        brand_tier = 4
        new_car_data.append(brand_tier)
        print(f" brand tier input: {brand_tier}")

    elif brand.lower() == "quit":
        print("*****************************************************************************************")
        print("Thank you for predicting your car price")  # goodbye message
        break  # stops loop

    else:
        print("**********************************************************************************")
        print(
            "Please restart and add in correct brand name. Please contact support email if brand of car is not allowed. \n"
            "Updates/retraining of model occur bi-annually if new brands emerge in market")
        print("**********************************************************************************")
        # resetting new_car_data
        new_car_data = []
        continue

    print("**********************************************************************************")
    print("\n")

    # model user input
    model = input("Please input model of car (Example: Liberty Sport or E350 Super Duty XLT):")

    # creating subsets for each tier and their associated model
    # model_tier 1 subset
    tier_1m = car_df[car_df['model_tier'] == 1] # only getting instances where model_tier = 1
    tier_1_unique = tier_1m.drop_duplicates(subset=['model']) # dropping duplicate models in tier 1
    model_tier_1 = tier_1_unique["model"] # subset to only get model column
    model_tier_1 = model_tier_1.str.lower() # making all values lower case

    # model_tier 2 subset
    tier_2m = car_df[car_df['model_tier'] == 2] # dropping duplicate models in tier 2
    tier_2_unique = tier_2m.drop_duplicates(subset=['model'])
    model_tier_2 = tier_2_unique["model"] # subset to only get model column
    model_tier_2 = model_tier_2.str.lower() # making all values lower case

    # model_tier 3 subset
    tier_3m = car_df[car_df['model_tier'] == 3]
    tier_3_unique = tier_3m.drop_duplicates(subset=['model'])
    model_tier_3 = tier_3_unique["model"]
    model_tier_3 = model_tier_3.str.lower()

In [None]:
# matching user input of model to their model tier values if in model_tier subset
    if model.lower() in model_tier_1.values:
        model_tier = 1
        new_car_data.append(model_tier) # appending model_tier value to new_car_data[]
        print(f" model tier input: {model_tier}")

    elif model.lower() in model_tier_2.values:
        model_tier = 2
        new_car_data.append(model_tier)
        print(f" model tier input: {model_tier}")

    elif model.lower() in model_tier_3.values:
        model_tier = 3
        new_car_data.append(model_tier)
        print(f" model tier input: {model_tier}")

    elif model.lower() == "quit":
        print("*****************************************************************************************")
        print("Thank you for predicting your car price")  # goodbye message
        break  # stops loop

    else:
        print("**********************************************************************************")
        print ("Please restart and add in correct model name. Please contact support email if model of car is not allowed. \n"
               "Updates/retraining of model occur bi-annually if new models emerge in market")
        print("**********************************************************************************")
        # resetting new_car_data
        new_car_data = []
        continue

    print("**********************************************************************************")
    print("\n")
    accident_flag = input("Was the used car flagged for an accident? \n"
                          "Please input '1' if Yes and '0' if No")
    num_flag = int(accident_flag)
    if accident_flag in ['0', '1']:
        new_car_data.append(num_flag)
        print(f" accident flag input: {accident_flag}")

    elif accident_flag.lower() == "quit":
        print("*****************************************************************************************")
        print("Thank you for predicting your car price")  # goodbye message
        break  # stops loop

    else:
        print("**********************************************************************************")
        print("Please restart and enter '1' if Yes and '0' if No only")
        print("**********************************************************************************")
        new_car_data = [] # resetting new_car_data
        continue # restarting loop from beginning


    print("**********************************************************************************")
    print("\n")
    accident_info_provided = input("Was accident information provided - this includes if reported 'no accidents'? \n"
                                   "Please input '1' if Yes and '0' if No/Unsure")
    accident_info_num = int(accident_info_provided)
    if accident_info_provided in ['0', '1']:
        new_car_data.append(accident_info_num)
        print(f" accident info provided input: {accident_info_provided}")
    elif accident_info_provided.lower() == "quit":
        print("*****************************************************************************************")
        print("Thank you for predicting your car price")  # goodbye message
        break  # stops loop
    else:
        print("**********************************************************************************")
        print("Please restart and only input 1 if yes or 0 if no/unsure")
        print("**********************************************************************************")
        # resetting new_car_data
        new_car_data = []
        continue
    print("**********************************************************************************")
    print("\n")
    clean_title_flag = input("Was title flagged as clean'? \n"
                             "Please input '1' if Yes and '0' if No")
    clean_title_flag_num = int(clean_title_flag)
    if clean_title_flag in ['0', '1']:
        new_car_data.append(clean_title_flag_num)
        print(f" clean title flag input: {clean_title_flag}")
    elif clean_title_flag.lower() == "quit":
        print("*****************************************************************************************")
        print("Thank you for predicting your car price")  # goodbye message
        break  # stops loop
    else:
        print("**********************************************************************************")
        print("Please restart and only input 1 if yes or 0 if no/unsure")
        print("**********************************************************************************")
        new_car_data = []# resetting new_car_data
        continue
    print("**********************************************************************************")
    print("\n")
    title_info_provided = input("Was title information provided'? \n"
                                "Please input '1' if Yes and '0' if No/Unsure")
    title_info_provided_num = int(title_info_provided)
    if title_info_provided in ['0', '1']:
        new_car_data.append(title_info_provided_num)
        print(f" title provided input: {title_info_provided}")
    elif title_info_provided.lower() == "quit":
        print("*****************************************************************************************")
        print("Thank you for predicting your car price")  # goodbye message
        break  # stops loop
    else:
        print("**********************************************************************************")
        print("Please restart and only input 1 if yes or 0 if no/unsure")
        print("**********************************************************************************")
        new_car_data = []# resetting new_car_data
        continue

    print("**********************************************************************************")
    print("\n")
    transmission_binary = input("What is the type of transmission of car? \n"
                                "Please input '1' if Manual and '2' Automatic")
    transmission_num = int(title_info_provided)
    if transmission_binary in ['1', '2']:
        new_car_data.append(transmission_num)
        print(f" title provided input: {transmission_binary}")
    elif transmission_binary.lower() == "quit":
        print("*****************************************************************************************")
        print("Thank you for predicting your car price")  # goodbye message
        break  # stops loop
    else:
        print("**********************************************************************************")
        print("Please restart and only input 1 if Manual and 2 if Automatic")
        print("**********************************************************************************")
        new_car_data = []# resetting new_car_data
        continue
    print("**********************************************************************************")
    print("\n")
    car_age = input("Please enter model year of car NUMBER ONLY: ")

    if car_age.lower() == "quit":
        print("*****************************************************************************************")
        print("Thank you for predicting your car price")  # goodbye message
        break  # stops loop
    elif car_age.isdigit():
        current_age = int(current_year) - int(car_age)
        new_car_data.append(current_age)
        print(f" model year input: {car_age}")
    else:
        print("**********************************************************************************")
        print("Please restart and only input a NUMBER or 'quit' to end form")
        print("**********************************************************************************")
        new_car_data = []  # resetting new_car_data
        continue

    print("******************************************************************************************************* \n")
    print("\n")
    horsepower = input("Please enter horsepower of car: NUMBER ONLY")
    if horsepower.lower() == "quit":
        print("*****************************************************************************************")
        print("Thank you for predicting your car price")  # goodbye message
        break  # stops loop
    elif horsepower.isdigit():
        new_car_data.append(horsepower)
        print(f" horsepower input: {horsepower}")
    else:
        print("**********************************************************************************")
        print("Please restart and only input a NUMBER or 'quit' to end form")
        print("**********************************************************************************")
        new_car_data = []  # resetting new_car_data
        continue

    print("******************************************************************************************************* \n")
    print("\n")
    engine_size = input("Please enter engine size of car: NUMBER ONLY")
    if engine_size.lower() == "quit":
        print("*****************************************************************************************")
        print("Thank you for predicting your car price")  # goodbye message
        break  # stops loop
    elif engine_size.isdigit():
        new_car_data.append(engine_size)
        print(f" engine size input: {engine_size}")
    else:
        print("**********************************************************************************")
        print("Please restart and only input a NUMBER or 'quit' to end form")
        print("**********************************************************************************")
        new_car_data = []  # resetting new_car_data
        continue

    print("******************************************************************************************************* \n")
    print("\n")
    cylinders = input("Please enter number of cylinders of car: NUMBER ONLY ")
    if cylinders.lower() == "quit":
        print("*****************************************************************************************")
        print("Thank you for predicting your car price")  # goodbye message
        break  # stops loop
    elif cylinders.isdigit():
        new_car_data.append(cylinders)
        print(f" cylinder input: {cylinders}")
    else:
        print("**********************************************************************************")
        print("Please restart and only input a NUMBER or 'quit' to end form")
        print("**********************************************************************************")
        new_car_data = []  # resetting new_car_data
        continue
    print("******************************************************************************************************* \n")

    # Column labels for the answers collected above, in the order they were appended
    # to new_car_data.
    new_car_data_col = [
        "milage",
        "brand_tier",
        "model_tier",
        "accident_flag",
        "accident_info_provided",
        "clean_title_flag",
        "title_info_provided",
        "transmission_binary",
        "car_age",
        "horsepower",
        "engine_size",
        "cylinders",
    ]
    # One-row dataframe holding the user's answers.
    new_car_data_df = pd.DataFrame([new_car_data], columns=new_car_data_col)

    # Continuous columns that go through the scaler.
    new_columns_to_scale = ["milage", "car_age", "horsepower", "engine_size", "cylinders"]

    # Apply the already-fitted scaler (transform only, no re-fit) and write the
    # scaled values back into the same columns.
    scaled_user_values = scaler.transform(new_car_data_df[new_columns_to_scale])
    new_car_data_df[new_columns_to_scale] = scaled_user_values

    # Ratio features derived from the single user row. The "+ 1" in each denominator
    # keeps the division defined when the denominator column is 0.
    # NOTE(review): these ratios are taken from columns that have already been scaled;
    # confirm the training features were derived the same way.
    row_milage = new_car_data_df.loc[0, 'milage']
    row_car_age = new_car_data_df.loc[0, 'car_age']
    mileage_per_year = row_milage / (row_car_age + 1)

    row_engine_size = new_car_data_df.loc[0, 'engine_size']
    row_cylinders = new_car_data_df.loc[0, 'cylinders']
    engine_per_cylinder = row_engine_size / (row_cylinders + 1)

    # Price segment mirrors the model tier: tier 1 -> 3, tier 2 -> 2, anything else -> 1.
    # NOTE(review): model_tier is compared against ints; verify it is not still a string here.
    price_segment = 3 if model_tier == 1 else (2 if model_tier == 2 else 1)

    # Labels for the derived features.
    new_df_col = ['mileage_per_year', 'engine_per_cylinder', 'price_segment']
    # One-row frame of derived features, placed to the right of the user-input columns.
    new_variables_df = pd.DataFrame(
        [[mileage_per_year, engine_per_cylinder, price_segment]],
        columns=new_df_col,
    )
    new_data_to_predict = pd.concat([new_car_data_df, new_variables_df], axis=1)

    # Echo the exact feature row that is handed to the model.
    print(f" Used Car Data to Predict: \n {new_data_to_predict}")
    print("*****************************************************************************************")

    # NEW CAR PRICE PREDICTION
    new_car_predict = stacked_model.predict(new_data_to_predict)
    # The prediction is on the log scale, so exponentiate to get dollars. Take the
    # single element out of the returned array so the user sees a formatted amount
    # instead of a NumPy array repr such as "[12345.67]".
    predicted_price = float(np.exp(np.ravel(new_car_predict)[0]))
    print(f" Predicted Car Price: $ {predicted_price:,.2f}")

    print("*****************************************************************************************")

    # Gives user the option to quit or to input new car data.
    # The answer is compared case-insensitively against "no" and "yes".
    retry = input("Do you want to predict another car price? \n"
                  "Please type 'Yes' to predict another car, 'No' to Quit.")

    if retry.lower() == "no":
        # User is done: print the goodbye banner and leave the input loop.
        print("*****************************************************************************************")
        print("Thank you for predicting your car price")  # goodbye message
        break  # stops loop
    elif retry.lower() == "yes":
        # Empty the list in place so the next pass starts with no answers,
        # then jump back to the top of the loop.
        new_car_data.clear()  # clears out new data list to allow user to add in new info
        continue
    # NOTE(review): any other answer matches neither branch above; what happens then
    # depends on the code that follows this point.