In [1]:
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
#from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()
current_year = datetime.now().year

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\nehab\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
df = pd.read_csv('final_cleaned.csv', index_col = False)
car_data_encoded = pd.get_dummies(df, columns=['Model Full Name', 'Drivetrain', 'Vehicle Class'], drop_first=True)

In [3]:
results_list = []

# Functional

In [4]:
functional_features = [
    'Horsepower (hp)', 'Curb Weight (lbs)', 'Combined MPG', 'Fuel Capacity (gallons)', 'Age'
] + [col for col in car_data_encoded.columns if col.startswith('Drivetrain_')]

vehicle_class_features = [col for col in car_data_encoded.columns if col.startswith('Vehicle Class_')]

functional_features += vehicle_class_features

features = functional_features

### Functional Unaggregated

In [5]:
#unaggregated

y_price = car_data_encoded['Average KBB Fair Price ($)']
X = car_data_encoded[features]

X_train, X_test, y_train, y_test = train_test_split(X, y_price, test_size=0.2, random_state=15)


price_model = LinearRegression()
price_model.fit(X_train, y_train)
y_pred = price_model.predict(X_test)

r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))


price_coefficients = dict(zip(X.columns, price_model.coef_))

# Print R^2 and RMSE
print(f"R^2: {r2:.4f}")
print(f"\nRMSE: {rmse:.4f}")

# Print Model Coefficients
print(f"\nModel Coefficients:")
for feature, coef in price_coefficients.items():
    print(f"  {feature}: {coef:.4f}")

# Print Intercept
print(f"\nIntercept: {price_model.intercept_:.4f}")

division_title = "Functional Unaggregated"
model_type = "Linear Regression"

# Append the results to the list as a dictionary
results_list.append({
    "Division Title": division_title,
    "Model Type": model_type,
    "Best Alpha": None,
    "R^2": r2,
    "RMSE": rmse
})

R^2: 0.7800

RMSE: 4123.7972

Model Coefficients:
  Horsepower (hp): 47.0750
  Curb Weight (lbs): -2.1557
  Combined MPG: -7.0564
  Fuel Capacity (gallons): 185.2962
  Age: -1750.4193
  Drivetrain_4WD: 4185.8522
  Drivetrain_AWD: -854.1422
  Drivetrain_FWD: -2477.9903
  Drivetrain_RWD: 578.9442
  Vehicle Class_compact-suv: -935.3245
  Vehicle Class_electric-car: 3351.7157
  Vehicle Class_electric-suv: 4715.5914
  Vehicle Class_full-size: -1152.8683
  Vehicle Class_full-size-truck: -1012.8441
  Vehicle Class_hybrid-car: 559.6436
  Vehicle Class_hybrid-suv: -3184.6762
  Vehicle Class_luxury-hybrid-suv: 3100.9584
  Vehicle Class_mid-size: 1234.1979
  Vehicle Class_mid-size-suv: 967.6763
  Vehicle Class_mid-size-truck: 2313.9656
  Vehicle Class_minivan: 1583.3936

Intercept: 25117.5797


#### Functional Unaggregated Lasso

In [6]:

y_price = car_data_encoded['Average KBB Fair Price ($)']
X = car_data_encoded[features]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)  # X is your input features

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_price, test_size=0.2, random_state=15)

alpha_values = [0.1, 1.0, 10.0, 100, 200, 300, 500, 1000]
best_rmse = float('inf')  # Initialize with a very high RMSE
best_alpha = None
best_coefficients = None
best_intercept = None

# Loop through each alpha value
for alpha in alpha_values:
    # Initialize and train the Lasso model with the current alpha
    lasso_model = Lasso(alpha=alpha)
    lasso_model.fit(X_train, y_train)
    y_pred = lasso_model.predict(X_test)
    
    # Calculate evaluation metrics
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    
    # Print RMSE and R^2 for this alpha
    print(f"\nAlpha: {alpha}")
    print(f"  R^2: {r2:.4f}")
    print(f"  RMSE: {rmse:.4f}")
    
    # Check if this model has the best RMSE so far
    if rmse < best_rmse:
        best_rmse = rmse
        best_alpha = alpha
        best_r2 = r2
        best_coefficients = dict(zip(X.columns, lasso_model.coef_))
        best_intercept = lasso_model.intercept_

# Print the best alpha and its coefficients
print(f"\nBest Alpha based on RMSE: {best_alpha}")
print(f"Best R^2: {best_r2:.4f}")
print(f"Best RMSE: {best_rmse:.4f}")

# Print coefficients for the best alpha
print("\nModel Coefficients for the Best Alpha:")
for feature, coef in best_coefficients.items():
    print(f"  {feature}: {coef:.4f}")

# Print Intercept for the best alpha
print(f"\nIntercept: {best_intercept:.4f}")

division_title = "Functional Unaggregated"
model_type = "Lasso"

results_list.append({
    "Division Title": division_title,
    "Model Type": model_type,
    "Best Alpha": best_alpha,
    "R^2": best_r2,
    "RMSE": best_rmse
})



Alpha: 0.1
  R^2: 0.7800
  RMSE: 4123.7895

Alpha: 1.0
  R^2: 0.7800
  RMSE: 4123.7285

Alpha: 10.0
  R^2: 0.7800
  RMSE: 4123.9118

Alpha: 100
  R^2: 0.7746
  RMSE: 4174.3339

Alpha: 200
  R^2: 0.7663
  RMSE: 4250.4461

Alpha: 300
  R^2: 0.7549
  RMSE: 4352.8012

Alpha: 500
  R^2: 0.7404
  RMSE: 4479.5823

Alpha: 1000
  R^2: 0.7124
  RMSE: 4715.1785

Best Alpha based on RMSE: 1.0
Best R^2: 0.7800
Best RMSE: 4123.7285

Model Coefficients for the Best Alpha:
  Horsepower (hp): 3238.6874
  Curb Weight (lbs): -1622.0267
  Combined MPG: -93.9673
  Fuel Capacity (gallons): 805.3588
  Age: -6890.2770
  Drivetrain_4WD: 1338.3041
  Drivetrain_AWD: -341.3202
  Drivetrain_FWD: -1205.0900
  Drivetrain_RWD: 197.7946
  Vehicle Class_compact-suv: -369.6838
  Vehicle Class_electric-car: 460.0448
  Vehicle Class_electric-suv: 507.4761
  Vehicle Class_full-size: -387.8504
  Vehicle Class_full-size-truck: -215.2602
  Vehicle Class_hybrid-car: 101.2008
  Vehicle Class_hybrid-suv: -434.5650
  Vehicle Cla

#### Functional Unaggregated Ridge

In [7]:

y_price = car_data_encoded['Average KBB Fair Price ($)']
X = car_data_encoded[features]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)  # X is your input features

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_price, test_size=0.2, random_state=15)


alpha_values = [0.1, 1.0, 10.0, 100, 200, 300, 500, 1000]
best_rmse = float('inf')  # Initialize with a very high RMSE
best_alpha = None
best_coefficients = None
best_intercept = None

# Loop through each alpha value
for alpha in alpha_values:
    # Initialize and train the Lasso model with the current alpha
    Ridge_model = Ridge(alpha=alpha)
    Ridge_model.fit(X_train, y_train)
    y_pred = Ridge_model.predict(X_test)
    
    # Calculate evaluation metrics
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    
    # Print RMSE and R^2 for this alpha
    print(f"\nAlpha: {alpha}")
    print(f"  R^2: {r2:.4f}")
    print(f"  RMSE: {rmse:.4f}")
    
    # Check if this model has the best RMSE so far
    if rmse < best_rmse:
        best_rmse = rmse
        best_alpha = alpha
        best_r2 = r2
        best_coefficients = dict(zip(X.columns, lasso_model.coef_))
        best_intercept = lasso_model.intercept_

# Print the best alpha and its coefficients
print(f"\nBest Alpha based on RMSE: {best_alpha}")
print(f"Best R^2: {best_r2:.4f}")
print(f"Best RMSE: {best_rmse:.4f}")

# Print coefficients for the best alpha
print("\nModel Coefficients for the Best Alpha:")
for feature, coef in best_coefficients.items():
    print(f"  {feature}: {coef:.4f}")

# Print Intercept for the best alpha
print(f"\nIntercept: {best_intercept:.4f}")

division_title = "Functional Unaggregated"
model_type = "Ridge"

results_list.append({
    "Division Title": division_title,
    "Model Type": model_type,
    "Best Alpha": best_alpha,
    "R^2": best_r2,
    "RMSE": best_rmse
})


Alpha: 0.1
  R^2: 0.7800
  RMSE: 4123.7960

Alpha: 1.0
  R^2: 0.7800
  RMSE: 4123.7857

Alpha: 10.0
  R^2: 0.7800
  RMSE: 4123.6896

Alpha: 100
  R^2: 0.7801
  RMSE: 4123.4380

Alpha: 200
  R^2: 0.7799
  RMSE: 4124.5054

Alpha: 300
  R^2: 0.7797
  RMSE: 4126.7869

Alpha: 500
  R^2: 0.7789
  RMSE: 4134.3816

Alpha: 1000
  R^2: 0.7754
  RMSE: 4166.5322

Best Alpha based on RMSE: 100
Best R^2: 0.7801
Best RMSE: 4123.4380

Model Coefficients for the Best Alpha:
  Horsepower (hp): 2357.4999
  Curb Weight (lbs): 0.0000
  Combined MPG: 0.0000
  Fuel Capacity (gallons): 0.0000
  Age: -5871.9075
  Drivetrain_4WD: 223.6079
  Drivetrain_AWD: -0.0000
  Drivetrain_FWD: -159.4827
  Drivetrain_RWD: 0.0000
  Vehicle Class_compact-suv: -0.0000
  Vehicle Class_electric-car: 0.0000
  Vehicle Class_electric-suv: 0.0000
  Vehicle Class_full-size: -0.0000
  Vehicle Class_full-size-truck: -0.0000
  Vehicle Class_hybrid-car: -0.0000
  Vehicle Class_hybrid-suv: -0.0000
  Vehicle Class_luxury-hybrid-suv: 0.000

### Functional Aggregated

In [8]:
#aggregated
aggregated_data = df.groupby(['Model Full Name', 'Year']).agg({
    'Horsepower (hp)': 'mean',
    'Curb Weight (lbs)': 'mean',
    'Combined MPG': 'mean',
    'Fuel Capacity (gallons)': 'mean',
    'Age': 'mean',
    'Average KBB Fair Price ($)': 'mean',
    'Vehicle Class': 'first',
    'Car Brand': 'first',
    'Car Model': 'first',
    'Drivetrain': 'first'
}).reset_index()

car_data_encoded_agg = pd.get_dummies(aggregated_data, columns=['Car Brand', 'Car Model', 'Drivetrain', 'Vehicle Class'], drop_first=True)

y_price = car_data_encoded_agg['Average KBB Fair Price ($)']
X = car_data_encoded_agg[functional_features]

X_train, X_test, y_train, y_test = train_test_split(X, y_price, test_size=0.2, random_state=15)


price_model = LinearRegression()
price_model.fit(X_train, y_train)
y_pred = price_model.predict(X_test)

r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))


price_coefficients = dict(zip(X.columns, price_model.coef_))

# Print R^2 and RMSE
print(f"R^2: {r2:.4f}")
print(f"\nRMSE: {rmse:.4f}")

# Print Model Coefficients
print("Model Coefficients:")
for feature, coef in price_coefficients.items():
    print(f"  {feature}: {coef:.4f}")

# Print Intercept
print(f"\nIntercept: {price_model.intercept_:.4f}")

division_title = "Functional Aggregated"
model_type = "Linear Regression"

results_list.append({
    "Division Title": division_title,
    "Model Type": model_type,
    "Best Alpha": None,
    "R^2": r2,
    "RMSE": rmse
})


R^2: 0.6385

RMSE: 5149.0639
Model Coefficients:
  Horsepower (hp): 45.7558
  Curb Weight (lbs): -0.8214
  Combined MPG: 11.3679
  Fuel Capacity (gallons): 190.6051
  Age: -2057.9485
  Drivetrain_4WD: 3148.6491
  Drivetrain_AWD: -2380.2253
  Drivetrain_FWD: -3474.6153
  Drivetrain_RWD: -453.2229
  Vehicle Class_compact-suv: -229.2713
  Vehicle Class_electric-car: 6718.1756
  Vehicle Class_electric-suv: 4723.7591
  Vehicle Class_full-size: -673.6605
  Vehicle Class_full-size-truck: -3125.2154
  Vehicle Class_hybrid-car: 1138.7061
  Vehicle Class_hybrid-suv: -2870.1388
  Vehicle Class_luxury-hybrid-suv: 5894.6217
  Vehicle Class_mid-size: 2429.0707
  Vehicle Class_mid-size-suv: 1234.6862
  Vehicle Class_mid-size-truck: 1678.9081
  Vehicle Class_minivan: 2773.7374

Intercept: 23573.2972


#### Functional Aggregated Lasso

In [9]:

y_price = car_data_encoded_agg['Average KBB Fair Price ($)']
X = car_data_encoded_agg[functional_features]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)  # X is your input features

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_price, test_size=0.2, random_state=15)

alpha_values = [0.1, 1.0, 10.0, 100, 200, 300, 500, 1000]
best_rmse = float('inf')  # Initialize with a very high RMSE
best_alpha = None
best_coefficients = None
best_intercept = None

# Loop through each alpha value
for alpha in alpha_values:
    # Initialize and train the Lasso model with the current alpha
    lasso_model = Lasso(alpha=alpha)
    lasso_model.fit(X_train, y_train)
    y_pred = lasso_model.predict(X_test)
    
    # Calculate evaluation metrics
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    
    # Print RMSE and R^2 for this alpha
    print(f"\nAlpha: {alpha}")
    print(f"  R^2: {r2:.4f}")
    print(f"  RMSE: {rmse:.4f}")
    
    # Check if this model has the best RMSE so far
    if rmse < best_rmse:
        best_rmse = rmse
        best_alpha = alpha
        best_r2 = r2
        best_coefficients = dict(zip(X.columns, lasso_model.coef_))
        best_intercept = lasso_model.intercept_

# Print the best alpha and its coefficients
print(f"\nBest Alpha based on RMSE: {best_alpha}")
print(f"Best R^2: {best_r2:.4f}")
print(f"Best RMSE: {best_rmse:.4f}")

# Print coefficients for the best alpha
print("\nModel Coefficients for the Best Alpha:")
for feature, coef in best_coefficients.items():
    print(f"  {feature}: {coef:.4f}")

# Print Intercept for the best alpha
print(f"\nIntercept: {best_intercept:.4f}")

division_title = "Functional Aggregated"
model_type = "Lasso"

results_list.append({
    "Division Title": division_title,
    "Model Type": model_type,
    "Best Alpha": best_alpha,
    "R^2": best_r2,
    "RMSE": best_rmse
})



Alpha: 0.1
  R^2: 0.6385
  RMSE: 5148.8350

Alpha: 1.0
  R^2: 0.6388
  RMSE: 5146.7798

Alpha: 10.0
  R^2: 0.6416
  RMSE: 5126.6812

Alpha: 100
  R^2: 0.6641
  RMSE: 4962.9833

Alpha: 200
  R^2: 0.6814
  RMSE: 4833.7407

Alpha: 300
  R^2: 0.6900
  RMSE: 4768.1244

Alpha: 500
  R^2: 0.6955
  RMSE: 4725.7224

Alpha: 1000
  R^2: 0.6996
  RMSE: 4693.6620

Best Alpha based on RMSE: 1000
Best R^2: 0.6996
Best RMSE: 4693.6620

Model Coefficients for the Best Alpha:
  Horsepower (hp): 3543.5295
  Curb Weight (lbs): 0.0000
  Combined MPG: 25.3977
  Fuel Capacity (gallons): 0.0000
  Age: -8084.9927
  Drivetrain_4WD: 293.3834
  Drivetrain_AWD: -0.0000
  Drivetrain_FWD: -0.0000
  Drivetrain_RWD: 0.0000
  Vehicle Class_compact-suv: -0.0000
  Vehicle Class_electric-car: 0.0000
  Vehicle Class_electric-suv: 96.3568
  Vehicle Class_full-size: -0.0000
  Vehicle Class_full-size-truck: -0.0000
  Vehicle Class_hybrid-car: -0.0000
  Vehicle Class_hybrid-suv: -0.0000
  Vehicle Class_luxury-hybrid-suv: 0.00

#### Functional Aggregated Ridge

In [10]:

y_price = car_data_encoded_agg['Average KBB Fair Price ($)']
X = car_data_encoded_agg[functional_features]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)  # X is your input features

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_price, test_size=0.2, random_state=15)

alpha_values = [0.1, 1.0, 10.0, 100, 200, 300, 500, 1000]
best_rmse = float('inf')  # Initialize with a very high RMSE
best_alpha = None
best_coefficients = None
best_intercept = None

# Loop through each alpha value
for alpha in alpha_values:
    # Initialize and train the Lasso model with the current alpha
    ridge_model = Ridge(alpha=alpha)
    ridge_model.fit(X_train, y_train)
    y_pred = ridge_model.predict(X_test)
    
    # Calculate evaluation metrics
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    
    # Print RMSE and R^2 for this alpha
    print(f"\nAlpha: {alpha}")
    print(f"  R^2: {r2:.4f}")
    print(f"  RMSE: {rmse:.4f}")
    
    # Check if this model has the best RMSE so far
    if rmse < best_rmse:
        best_rmse = rmse
        best_alpha = alpha
        best_r2 = r2
        best_coefficients = dict(zip(X.columns, lasso_model.coef_))
        best_intercept = lasso_model.intercept_

# Print the best alpha and its coefficients
print(f"\nBest Alpha based on RMSE: {best_alpha}")
print(f"Best R^2: {best_r2:.4f}")
print(f"Best RMSE: {best_rmse:.4f}")

# Print coefficients for the best alpha
print("\nModel Coefficients for the Best Alpha:")
for feature, coef in best_coefficients.items():
    print(f"  {feature}: {coef:.4f}")

# Print Intercept for the best alpha
print(f"\nIntercept: {best_intercept:.4f}")

division_title = "Functional Aggregated"
model_type = "Ridge"

results_list.append({
    "Division Title": division_title,
    "Model Type": model_type,
    "Best Alpha": best_alpha,
    "R^2": best_r2,
    "RMSE": best_rmse
})


Alpha: 0.1
  R^2: 0.6388
  RMSE: 5146.6787

Alpha: 1.0
  R^2: 0.6417
  RMSE: 5125.7161

Alpha: 10.0
  R^2: 0.6652
  RMSE: 4954.7213

Alpha: 100
  R^2: 0.7084
  RMSE: 4624.1261

Alpha: 200
  R^2: 0.6595
  RMSE: 4997.2424

Alpha: 300
  R^2: 0.5992
  RMSE: 5421.3697

Alpha: 500
  R^2: 0.4933
  RMSE: 6095.5715

Alpha: 1000
  R^2: 0.3239
  RMSE: 7041.4219

Best Alpha based on RMSE: 100
Best R^2: 0.7084
Best RMSE: 4624.1261

Model Coefficients for the Best Alpha:
  Horsepower (hp): 3543.5295
  Curb Weight (lbs): 0.0000
  Combined MPG: 25.3977
  Fuel Capacity (gallons): 0.0000
  Age: -8084.9927
  Drivetrain_4WD: 293.3834
  Drivetrain_AWD: -0.0000
  Drivetrain_FWD: -0.0000
  Drivetrain_RWD: 0.0000
  Vehicle Class_compact-suv: -0.0000
  Vehicle Class_electric-car: 0.0000
  Vehicle Class_electric-suv: 96.3568
  Vehicle Class_full-size: -0.0000
  Vehicle Class_full-size-truck: -0.0000
  Vehicle Class_hybrid-car: -0.0000
  Vehicle Class_hybrid-suv: -0.0000
  Vehicle Class_luxury-hybrid-suv: 0.000

# Functional and Experiential

### Functional and Experiental Unaggregated

In [11]:
selected_topics = ['Topic_0', 'Topic_2', 'Topic_3', 'Topic_4', 'Topic_5', 'Topic_7', 'Topic_8', 'Topic_9']

for topic in selected_topics:
    car_data_encoded[f'{topic}_Weighted'] = car_data_encoded[topic] * car_data_encoded['Sentiment Score']

weighted_topic_features = [f'{topic}_Weighted' for topic in selected_topics]
features = functional_features + weighted_topic_features + [col for col in car_data_encoded.columns if col.startswith('Model Full Name_')]

X = car_data_encoded[features]
y_price = car_data_encoded['Average KBB Fair Price ($)']

X_train, X_test, y_train, y_test = train_test_split(X, y_price, test_size=0.2, random_state=15)


price_model = LinearRegression()
price_model.fit(X_train, y_train)
y_pred = price_model.predict(X_test)

r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))


price_coefficients = dict(zip(X.columns, price_model.coef_))

print(f"R^2: {r2:.4f}")
print(f"\nRMSE: {rmse:.4f}")

# Print Model Coefficients
print(f"\nModel Coefficients:")
for feature, coef in price_coefficients.items():
    print(f"  {feature}: {coef:.4f}")

# Print Intercept
print(f"\nIntercept: {price_model.intercept_:.4f}")

division_title = "Functional & Experiential Unaggregated"
model_type = "Linear Regression"

results_list.append({
    "Division Title": division_title,
    "Model Type": model_type,
    "Best Alpha": None,
    "R^2": r2,
    "RMSE": rmse
})

R^2: 0.9119

RMSE: 2610.2972

Model Coefficients:
  Horsepower (hp): 5.6205
  Curb Weight (lbs): -4.1024
  Combined MPG: -39.8881
  Fuel Capacity (gallons): -70.7159
  Age: -1818.9150
  Drivetrain_4WD: 408.1054
  Drivetrain_AWD: -1565.4764
  Drivetrain_FWD: -4207.0753
  Drivetrain_RWD: -2462.8245
  Vehicle Class_compact-suv: 3181807109257.7056
  Vehicle Class_electric-car: -715044531176.8585
  Vehicle Class_electric-suv: 7210183252136.6016
  Vehicle Class_full-size: 2926079233543.4614
  Vehicle Class_full-size-truck: -1887711719589.7280
  Vehicle Class_hybrid-car: 11643799989635.1094
  Vehicle Class_hybrid-suv: -779796829386.9764
  Vehicle Class_luxury-hybrid-suv: 340421695847.9497
  Vehicle Class_mid-size: 49960159987.5021
  Vehicle Class_mid-size-suv: 2592475688014.8296
  Vehicle Class_mid-size-truck: 1182986181354.1604
  Vehicle Class_minivan: 5435048661774.5166
  Topic_0_Weighted: -7599.1789
  Topic_2_Weighted: -1092.6203
  Topic_3_Weighted: 5602.0576
  Topic_4_Weighted: -1684.8179

#### Functional and Experiental Unaggregated Lasso

In [12]:

X = car_data_encoded[features]
y_price = car_data_encoded['Average KBB Fair Price ($)']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)  # X is your input features

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_price, test_size=0.2, random_state=15)

alpha_values = [0.1, 1.0, 10.0, 100, 200, 300, 500, 1000]
best_rmse = float('inf')  # Initialize with a very high RMSE
best_alpha = None
best_coefficients = None
best_intercept = None

# Loop through each alpha value
for alpha in alpha_values:
    # Initialize and train the Lasso model with the current alpha
    lasso_model = Lasso(alpha=alpha)
    lasso_model.fit(X_train, y_train)
    y_pred = lasso_model.predict(X_test)
    
    # Calculate evaluation metrics
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    
    # Print RMSE and R^2 for this alpha
    print(f"\nAlpha: {alpha}")
    print(f"  R^2: {r2:.4f}")
    print(f"  RMSE: {rmse:.4f}")
    
    # Check if this model has the best RMSE so far
    if rmse < best_rmse:
        best_rmse = rmse
        best_alpha = alpha
        best_r2 = r2
        best_coefficients = dict(zip(X.columns, lasso_model.coef_))
        best_intercept = lasso_model.intercept_

# Print the best alpha and its coefficients
print(f"\nBest Alpha based on RMSE: {best_alpha}")
print(f"Best R^2: {best_r2:.4f}")
print(f"Best RMSE: {best_rmse:.4f}")

# Print coefficients for the best alpha
print("\nModel Coefficients for the Best Alpha:")
for feature, coef in best_coefficients.items():
    print(f"  {feature}: {coef:.4f}")

# Print Intercept for the best alpha
print(f"\nIntercept: {best_intercept:.4f}")

division_title = "Functional & Experiential Unaggregated"
model_type = "Lasso"

results_list.append({
    "Division Title": division_title,
    "Model Type": model_type,
    "Best Alpha": best_alpha,
    "R^2": best_r2,
    "RMSE": best_rmse
})

  model = cd_fast.enet_coordinate_descent(



Alpha: 0.1
  R^2: 0.9119
  RMSE: 2610.3225

Alpha: 1.0
  R^2: 0.9118
  RMSE: 2610.8747

Alpha: 10.0
  R^2: 0.9107
  RMSE: 2626.7807

Alpha: 100
  R^2: 0.8844
  RMSE: 2989.6112

Alpha: 200
  R^2: 0.8487
  RMSE: 3420.3871

Alpha: 300
  R^2: 0.8170
  RMSE: 3761.3364

Alpha: 500
  R^2: 0.7720
  RMSE: 4197.8319

Alpha: 1000
  R^2: 0.7293
  RMSE: 4574.4934

Best Alpha based on RMSE: 0.1
Best R^2: 0.9119
Best RMSE: 2610.3225

Model Coefficients for the Best Alpha:
  Horsepower (hp): 389.8549
  Curb Weight (lbs): -3090.7731
  Combined MPG: -580.8696
  Fuel Capacity (gallons): -301.9682
  Age: -7163.7914
  Drivetrain_4WD: 131.2370
  Drivetrain_AWD: -632.5114
  Drivetrain_FWD: -2054.6870
  Drivetrain_RWD: -827.5845
  Vehicle Class_compact-suv: -2283.6561
  Vehicle Class_electric-car: 79.3001
  Vehicle Class_electric-suv: 707.1642
  Vehicle Class_full-size: 857.7148
  Vehicle Class_full-size-truck: 304.0813
  Vehicle Class_hybrid-car: -1156.4370
  Vehicle Class_hybrid-suv: -517.4764
  Vehicle Cl

#### Functional and Experiental Unaggregated Ridge

In [13]:

X = car_data_encoded[features]
y_price = car_data_encoded['Average KBB Fair Price ($)']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)  # X is your input features

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_price, test_size=0.2, random_state=15)

alpha_values = [0.1, 1.0, 10.0, 100, 200, 300, 500, 1000]
best_rmse = float('inf')  # Initialize with a very high RMSE
best_alpha = None
best_coefficients = None
best_intercept = None

# Loop through each alpha value
for alpha in alpha_values:
    # Initialize and train the Lasso model with the current alpha
    ridge_model = Ridge(alpha=alpha)
    ridge_model.fit(X_train, y_train)
    y_pred = ridge_model.predict(X_test)
    
    # Calculate evaluation metrics
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    
    # Print RMSE and R^2 for this alpha
    print(f"\nAlpha: {alpha}")
    print(f"  R^2: {r2:.4f}")
    print(f"  RMSE: {rmse:.4f}")
    
    # Check if this model has the best RMSE so far
    if rmse < best_rmse:
        best_rmse = rmse
        best_alpha = alpha
        best_r2 = r2
        best_coefficients = dict(zip(X.columns, lasso_model.coef_))
        best_intercept = lasso_model.intercept_

# Print the best alpha and its coefficients
print(f"\nBest Alpha based on RMSE: {best_alpha}")
print(f"Best R^2: {best_r2:.4f}")
print(f"Best RMSE: {best_rmse:.4f}")

# Print coefficients for the best alpha
print("\nModel Coefficients for the Best Alpha:")
for feature, coef in best_coefficients.items():
    print(f"  {feature}: {coef:.4f}")

# Print Intercept for the best alpha
print(f"\nIntercept: {best_intercept:.4f}")

division_title = "Functional & Experiential Unaggregated"
model_type = "Ridge"

results_list.append({
    "Division Title": division_title,
    "Model Type": model_type,
    "Best Alpha": best_alpha,
    "R^2": best_r2,
    "RMSE": best_rmse
})


Alpha: 0.1
  R^2: 0.9119
  RMSE: 2610.2977

Alpha: 1.0
  R^2: 0.9119
  RMSE: 2610.3006

Alpha: 10.0
  R^2: 0.9119
  RMSE: 2610.4060

Alpha: 100
  R^2: 0.9114
  RMSE: 2616.8996

Alpha: 200
  R^2: 0.9105
  RMSE: 2630.3706

Alpha: 300
  R^2: 0.9094
  RMSE: 2646.5772

Alpha: 500
  R^2: 0.9069
  RMSE: 2682.0127

Alpha: 1000
  R^2: 0.9003
  RMSE: 2775.6619

Best Alpha based on RMSE: 0.1
Best R^2: 0.9119
Best RMSE: 2610.2977

Model Coefficients for the Best Alpha:
  Horsepower (hp): 2182.5473
  Curb Weight (lbs): 0.0000
  Combined MPG: 0.0000
  Fuel Capacity (gallons): 0.0000
  Age: -5889.8643
  Drivetrain_4WD: 284.2387
  Drivetrain_AWD: -0.0000
  Drivetrain_FWD: -166.5648
  Drivetrain_RWD: 0.0000
  Vehicle Class_compact-suv: -0.0000
  Vehicle Class_electric-car: 0.0000
  Vehicle Class_electric-suv: 0.0000
  Vehicle Class_full-size: 0.0000
  Vehicle Class_full-size-truck: -0.0000
  Vehicle Class_hybrid-car: -0.0000
  Vehicle Class_hybrid-suv: -0.0000
  Vehicle Class_luxury-hybrid-suv: 0.0000

### Functional and Experiental Aggregated

In [14]:
#aggregated
selected_topics = ['Topic_0', 'Topic_2', 'Topic_3', 'Topic_4', 'Topic_5', 'Topic_7', 'Topic_8', 'Topic_9']

for topic in selected_topics:
    df[f'{topic}_Weighted'] = df[topic] * df['Sentiment Score']

df = pd.get_dummies(df, columns=['Drivetrain','Car Model', 'Car Brand', 'Vehicle Class'], drop_first=True)

aggregation_dict = {
    feature: 'mean' if feature != 'Drivetrain' or feature != 'Car Brand' or feature != 'Vehicle Class' or feature != 'Car Model' else 'first'
    for feature in functional_features + [f'{topic}_Weighted' for topic in selected_topics] + ['Sentiment Score', 'Average KBB Fair Price ($)']
}
aggregated_data = df.groupby(['Model Full Name', 'Year']).agg(aggregation_dict).reset_index()

car_data_encoded = aggregated_data



weighted_topic_features = [f'{topic}_Weighted' for topic in selected_topics]
features = functional_features + weighted_topic_features + [col for col in car_data_encoded.columns if col.startswith('Model Full Name_')]

X = car_data_encoded[features]
y_price = car_data_encoded['Average KBB Fair Price ($)']

X_train, X_test, y_train, y_test = train_test_split(X, y_price, test_size=0.2, random_state=15)


price_model = LinearRegression()
price_model.fit(X_train, y_train)
y_pred = price_model.predict(X_test)

r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))


price_coefficients = dict(zip(X.columns, price_model.coef_))

# Print R^2 and RMSE
print(f"R^2: {r2:.4f}")
print(f"\nRMSE: {rmse:.4f}")

# Print Model Coefficients
print(f"\nModel Coefficients:")
for feature, coef in price_coefficients.items():
    print(f"  {feature}: {coef:.4f}")

# Print Intercept
print(f"\nIntercept: {price_model.intercept_:.4f}")

division_title = "Functional & Experiential Aggregated"
model_type = "Linear Regression"

results_list.append({
    "Division Title": division_title,
    "Model Type": model_type,
    "Best Alpha": None,
    "R^2": r2,
    "RMSE": rmse
})


R^2: 0.6470

RMSE: 5088.1908

Model Coefficients:
  Horsepower (hp): 44.9353
  Curb Weight (lbs): -0.7144
  Combined MPG: 3.8321
  Fuel Capacity (gallons): 91.3450
  Age: -2089.2104
  Drivetrain_4WD: 2942.7200
  Drivetrain_AWD: -2071.4461
  Drivetrain_FWD: -2715.7629
  Drivetrain_RWD: -5.8770
  Vehicle Class_compact-suv: 211.6559
  Vehicle Class_electric-car: 6447.1981
  Vehicle Class_electric-suv: 5062.8956
  Vehicle Class_full-size: -1104.0260
  Vehicle Class_full-size-truck: -1284.0530
  Vehicle Class_hybrid-car: 764.9949
  Vehicle Class_hybrid-suv: -3033.5552
  Vehicle Class_luxury-hybrid-suv: 5664.2482
  Vehicle Class_mid-size: 2191.8208
  Vehicle Class_mid-size-suv: 939.7780
  Vehicle Class_mid-size-truck: 3492.4755
  Vehicle Class_minivan: 1924.3700
  Topic_0_Weighted: -81750.3118
  Topic_2_Weighted: -166701.4937
  Topic_3_Weighted: 203352.7398
  Topic_4_Weighted: 42828.8157
  Topic_5_Weighted: -40744.1437
  Topic_7_Weighted: -133470.6638
  Topic_8_Weighted: 71093.5622
  Topic_9

#### Functional and Experiental Aggregated lasso

In [15]:

X = car_data_encoded[features]
y_price = car_data_encoded['Average KBB Fair Price ($)']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)  # X is your input features

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_price, test_size=0.2, random_state=15)

alpha_values = [0.1, 1.0, 10.0, 100, 200, 300, 500, 1000]
best_rmse = float('inf')  # Initialize with a very high RMSE
best_alpha = None
best_coefficients = None
best_intercept = None

# Loop through each alpha value
for alpha in alpha_values:
    # Initialize and train the Lasso model with the current alpha
    lasso_model = Lasso(alpha=alpha)
    lasso_model.fit(X_train, y_train)
    y_pred = lasso_model.predict(X_test)
    
    # Calculate evaluation metrics
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    
    # Print RMSE and R^2 for this alpha
    print(f"\nAlpha: {alpha}")
    print(f"  R^2: {r2:.4f}")
    print(f"  RMSE: {rmse:.4f}")
    
    # Check if this model has the best RMSE so far
    if rmse < best_rmse:
        best_rmse = rmse
        best_alpha = alpha
        best_r2 = r2
        best_coefficients = dict(zip(X.columns, lasso_model.coef_))
        best_intercept = lasso_model.intercept_

# Print the best alpha and its coefficients
print(f"\nBest Alpha based on RMSE: {best_alpha}")
print(f"Best R^2: {best_r2:.4f}")
print(f"Best RMSE: {best_rmse:.4f}")

# Print coefficients for the best alpha
print("\nModel Coefficients for the Best Alpha:")
for feature, coef in best_coefficients.items():
    print(f"  {feature}: {coef:.4f}")

# Print Intercept for the best alpha
print(f"\nIntercept: {best_intercept:.4f}")

division_title = "Functional & Experiential Aggregated"
model_type = "Lasso"

results_list.append({
    "Division Title": division_title,
    "Model Type": model_type,
    "Best Alpha": best_alpha,
    "R^2": best_r2,
    "RMSE": best_rmse
})



Alpha: 0.1
  R^2: 0.6470
  RMSE: 5087.9646

Alpha: 1.0
  R^2: 0.6473
  RMSE: 5085.5279

Alpha: 10.0
  R^2: 0.6505
  RMSE: 5062.5979

Alpha: 100
  R^2: 0.6766
  RMSE: 4870.0405

Alpha: 200
  R^2: 0.6940
  RMSE: 4736.7433

Alpha: 300
  R^2: 0.7022
  RMSE: 4672.9310

Alpha: 500
  R^2: 0.7027
  RMSE: 4669.4612

Alpha: 1000
  R^2: 0.6996
  RMSE: 4693.6620

Best Alpha based on RMSE: 500
Best R^2: 0.7027
Best RMSE: 4669.4612

Model Coefficients for the Best Alpha:
  Horsepower (hp): 3563.3705
  Curb Weight (lbs): 0.0000
  Combined MPG: 511.1445
  Fuel Capacity (gallons): 0.0000
  Age: -8347.3837
  Drivetrain_4WD: 749.1527
  Drivetrain_AWD: -0.0000
  Drivetrain_FWD: -277.1480
  Drivetrain_RWD: 121.2650
  Vehicle Class_compact-suv: -170.2542
  Vehicle Class_electric-car: 0.0000
  Vehicle Class_electric-suv: 83.0012
  Vehicle Class_full-size: -0.2334
  Vehicle Class_full-size-truck: -0.0000
  Vehicle Class_hybrid-car: -0.0000
  Vehicle Class_hybrid-suv: -24.5466
  Vehicle Class_luxury-hybrid-su

#### Functional and Experiental Aggregated Ridge

In [16]:

X = car_data_encoded[features]
y_price = car_data_encoded['Average KBB Fair Price ($)']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)  # X is your input features

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_price, test_size=0.2, random_state=15)

alpha_values = [0.1, 1.0, 10.0, 100, 200, 300, 500, 1000]
best_rmse = float('inf')  # Initialize with a very high RMSE
best_alpha = None
best_coefficients = None
best_intercept = None

# Loop through each alpha value
for alpha in alpha_values:
    # Initialize and train the Lasso model with the current alpha
    ridge_model = Ridge(alpha=alpha)
    ridge_model.fit(X_train, y_train)
    y_pred = ridge_model.predict(X_test)
    
    # Calculate evaluation metrics
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    
    # Print RMSE and R^2 for this alpha
    print(f"\nAlpha: {alpha}")
    print(f"  R^2: {r2:.4f}")
    print(f"  RMSE: {rmse:.4f}")
    
    # Check if this model has the best RMSE so far
    if rmse < best_rmse:
        best_rmse = rmse
        best_alpha = alpha
        best_r2 = r2
        best_coefficients = dict(zip(X.columns, lasso_model.coef_))
        best_intercept = lasso_model.intercept_

# Print the best alpha and its coefficients
print(f"\nBest Alpha based on RMSE: {best_alpha}")
print(f"Best R^2: {best_r2:.4f}")
print(f"Best RMSE: {best_rmse:.4f}")

# Print coefficients for the best alpha
print("\nModel Coefficients for the Best Alpha:")
for feature, coef in best_coefficients.items():
    print(f"  {feature}: {coef:.4f}")

# Print Intercept for the best alpha
print(f"\nIntercept: {best_intercept:.4f}")

division_title = "Functional & Experiential Aggregated"
model_type = "Ridge"

results_list.append({
    "Division Title": division_title,
    "Model Type": model_type,
    "Best Alpha": best_alpha,
    "R^2": best_r2,
    "RMSE": best_rmse
})


Alpha: 0.1
  R^2: 0.6470
  RMSE: 5087.7660

Alpha: 1.0
  R^2: 0.6475
  RMSE: 5084.2992

Alpha: 10.0
  R^2: 0.6492
  RMSE: 5071.8508

Alpha: 100
  R^2: 0.6333
  RMSE: 5185.3411

Alpha: 200
  R^2: 0.6125
  RMSE: 5330.9658

Alpha: 300
  R^2: 0.5855
  RMSE: 5513.2996

Alpha: 500
  R^2: 0.5255
  RMSE: 5898.7645

Alpha: 1000
  R^2: 0.3960
  RMSE: 6655.5024

Best Alpha based on RMSE: 10.0
Best R^2: 0.6492
Best RMSE: 5071.8508

Model Coefficients for the Best Alpha:
  Horsepower (hp): 3543.5295
  Curb Weight (lbs): 0.0000
  Combined MPG: 25.3977
  Fuel Capacity (gallons): 0.0000
  Age: -8084.9927
  Drivetrain_4WD: 293.3834
  Drivetrain_AWD: -0.0000
  Drivetrain_FWD: -0.0000
  Drivetrain_RWD: 0.0000
  Vehicle Class_compact-suv: -0.0000
  Vehicle Class_electric-car: 0.0000
  Vehicle Class_electric-suv: 96.3568
  Vehicle Class_full-size: -0.0000
  Vehicle Class_full-size-truck: -0.0000
  Vehicle Class_hybrid-car: -0.0000
  Vehicle Class_hybrid-suv: -0.0000
  Vehicle Class_luxury-hybrid-suv: 0.00

# All Feature Model

In [17]:
df.columns.values

array(['Unnamed: 0', 'id', 'Year', 'Review', 'Rating', 'Value Rating',
       'Performance Rating', 'Quality Rating', 'Comfort Rating',
       'Reliability Rating', 'Styling Rating', 'Model Full Name',
       'Horsepower (hp)', 'Curb Weight (lbs)', 'Combined MPG',
       'Fuel Capacity (gallons)', 'Recommended Fuel',
       'Average MSRP Price ($)', 'Average KBB Fair Price ($)',
       'Review_Topics', 'Topic_0', 'Topic_1', 'Topic_2', 'Topic_3',
       'Topic_4', 'Topic_5', 'Topic_6', 'Topic_7', 'Topic_8', 'Topic_9',
       'Sentiment Score', 'Depreciated Value', 'Sentiment Bin',
       'Weighted Sentiment Score', 'Topic_0_Weighted', 'Topic_2_Weighted',
       'Topic_3_Weighted', 'Topic_4_Weighted', 'Topic_5_Weighted',
       'Topic_7_Weighted', 'Topic_8_Weighted', 'Topic_9_Weighted', 'Age',
       'Drivetrain_4WD', 'Drivetrain_AWD', 'Drivetrain_FWD',
       'Drivetrain_RWD', 'Car Model_3-series', 'Car Model_300',
       'Car Model_4runner', 'Car Model_5-series', 'Car Model_500x',
    

## Results

In [18]:
results_df = pd.DataFrame(results_list).round({"R^2": 2, "RMSE": 2})
print("\nResults Summary:")
results_df


Results Summary:


Unnamed: 0,Division Title,Model Type,Best Alpha,R^2,RMSE
0,Functional Unaggregated,Linear Regression,,0.78,4123.8
1,Functional Unaggregated,Lasso,1.0,0.78,4123.73
2,Functional Unaggregated,Ridge,100.0,0.78,4123.44
3,Functional Aggregated,Linear Regression,,0.64,5149.06
4,Functional Aggregated,Lasso,1000.0,0.7,4693.66
5,Functional Aggregated,Ridge,100.0,0.71,4624.13
6,Functional & Experiential Unaggregated,Linear Regression,,0.91,2610.3
7,Functional & Experiential Unaggregated,Lasso,0.1,0.91,2610.32
8,Functional & Experiential Unaggregated,Ridge,0.1,0.91,2610.3
9,Functional & Experiential Aggregated,Linear Regression,,0.65,5088.19


In [19]:
car_data_encoded

Unnamed: 0,Model Full Name,Year,Horsepower (hp),Curb Weight (lbs),Combined MPG,Fuel Capacity (gallons),Age,Drivetrain_4WD,Drivetrain_AWD,Drivetrain_FWD,...,Topic_0_Weighted,Topic_2_Weighted,Topic_3_Weighted,Topic_4_Weighted,Topic_5_Weighted,Topic_7_Weighted,Topic_8_Weighted,Topic_9_Weighted,Sentiment Score,Average KBB Fair Price ($)
0,Acura Rdx,2010,240.0,3931.0,19.0,18.0,14.0,0.0,1.0,0.0,...,0.010983,0.008371,0.007901,0.007013,0.000398,0.001676,0.010811,0.009977,0.552172,7210.0
1,Acura Rdx,2013,273.0,3838.0,22.0,16.0,11.0,0.0,1.0,0.0,...,0.014091,0.006550,0.005801,0.010964,0.000553,0.005570,0.006619,0.007627,0.499811,10589.0
2,Acura Rdx,2019,272.0,3783.0,24.0,17.1,5.0,0.0,0.0,1.0,...,0.009899,0.004513,-0.000620,0.002953,0.001104,0.002835,0.003356,0.007962,0.321741,23046.5
3,Acura Tlx,2015,206.0,3483.0,28.0,17.2,9.0,0.0,0.0,1.0,...,0.018985,0.007476,0.004036,0.008199,0.000478,0.008157,0.001749,0.010193,0.633109,11905.5
4,Acura Tlx,2021,272.0,3709.0,25.0,15.9,3.0,0.0,0.0,1.0,...,0.016929,0.002959,0.005005,0.005594,0.000507,0.006455,0.001271,0.003832,0.675924,29701.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
321,Volvo Xc40,2019,187.0,3574.0,27.0,14.2,5.0,0.0,0.0,1.0,...,0.020192,0.001120,0.001512,0.001359,0.000159,0.008877,0.003691,0.028231,0.698563,18941.0
322,Volvo Xc60,2010,235.0,4012.0,21.0,18.5,14.0,0.0,0.0,0.0,...,0.012794,0.006283,0.007155,0.009381,0.000846,0.002356,0.005801,0.010385,0.613741,7055.0
323,Volvo Xc60,2018,316.0,4074.0,23.0,18.8,6.0,0.0,1.0,0.0,...,0.007880,0.000921,-0.000955,0.005646,0.000010,0.003236,0.000951,0.003517,0.281155,23469.0
324,Volvo Xc90,2010,235.0,4751.0,17.0,21.1,14.0,0.0,1.0,0.0,...,0.008383,0.004934,0.010953,0.008031,0.001703,0.001079,0.006141,0.003632,0.557620,6471.0
