In [1]:
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
#from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()
current_year = datetime.now().year

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\nehab\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
df = pd.read_csv('final_cleaned.csv', index_col = False)
car_data_encoded = pd.get_dummies(df, columns=['Model Full Name', 'Drivetrain'], drop_first=True)

In [3]:
results_list = []

# Functional

In [4]:
functional_features = [
    'Horsepower (hp)', 'Curb Weight (lbs)', 'Combined MPG', 'Fuel Capacity (gallons)', 'Age'
] + [col for col in car_data_encoded.columns if col.startswith('Drivetrain_')]
features = functional_features

### Unaggregated

In [5]:
#unaggregated

y_price = car_data_encoded['Average KBB Fair Price ($)']
X = car_data_encoded[features]

X_train, X_test, y_train, y_test = train_test_split(X, y_price, test_size=0.2, random_state=15)


price_model = LinearRegression()
price_model.fit(X_train, y_train)
y_pred = price_model.predict(X_test)

r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))


price_coefficients = dict(zip(X.columns, price_model.coef_))

# Print R^2 and RMSE
print(f"R^2: {r2:.4f}")
print(f"\nRMSE: {rmse:.4f}")

# Print Model Coefficients
print(f"\nModel Coefficients:")
for feature, coef in price_coefficients.items():
    print(f"  {feature}: {coef:.4f}")

# Print Intercept
print(f"\nIntercept: {price_model.intercept_:.4f}")

division_title = "Functional Unaggregated"
model_type = "Linear Regression"

# Append the results to the list as a dictionary
results_list.append({
    "Division Title": division_title,
    "Model Type": model_type,
    "Best Alpha": None,
    "R^2": r2,
    "RMSE": rmse
})

R^2: 0.7588

RMSE: 4318.3028

Model Coefficients:
  Horsepower (hp): 48.6318
  Curb Weight (lbs): -2.1142
  Combined MPG: 31.0233
  Fuel Capacity (gallons): 184.6320
  Age: -1722.8755
  Drivetrain_4WD: 3729.0607
  Drivetrain_AWD: -423.9212
  Drivetrain_FWD: -1954.2304
  Drivetrain_RWD: 763.4948

Intercept: 23434.2587


#### Lasso

In [6]:

y_price = car_data_encoded['Average KBB Fair Price ($)']
X = car_data_encoded[features]

X_train, X_test, y_train, y_test = train_test_split(X, y_price, test_size=0.2, random_state=15)

alpha_values = [0.001, 0.01, 0.1, 1.0, 10.0]
best_rmse = float('inf')  # Initialize with a very high RMSE
best_alpha = None
best_coefficients = None
best_intercept = None

# Loop through each alpha value
for alpha in alpha_values:
    # Initialize and train the Lasso model with the current alpha
    lasso_model = Lasso(alpha=alpha)
    lasso_model.fit(X_train, y_train)
    y_pred = lasso_model.predict(X_test)
    
    # Calculate evaluation metrics
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    
    # Print RMSE and R^2 for this alpha
    print(f"\nAlpha: {alpha}")
    print(f"  R^2: {r2:.4f}")
    print(f"  RMSE: {rmse:.4f}")
    
    # Check if this model has the best RMSE so far
    if rmse < best_rmse:
        best_rmse = rmse
        best_alpha = alpha
        best_r2 = r2
        best_coefficients = dict(zip(X.columns, lasso_model.coef_))
        best_intercept = lasso_model.intercept_

# Print the best alpha and its coefficients
print(f"\nBest Alpha based on RMSE: {best_alpha}")
print(f"Best R^2: {best_r2:.4f}")
print(f"Best RMSE: {best_rmse:.4f}")

# Print coefficients for the best alpha
print("\nModel Coefficients for the Best Alpha:")
for feature, coef in best_coefficients.items():
    print(f"  {feature}: {coef:.4f}")

# Print Intercept for the best alpha
print(f"\nIntercept: {best_intercept:.4f}")

division_title = "Functional Unaggregated"
model_type = "Lasso"

results_list.append({
    "Division Title": division_title,
    "Model Type": model_type,
    "Best Alpha": best_alpha,
    "R^2": best_r2,
    "RMSE": best_rmse
})



Alpha: 0.001
  R^2: 0.7588
  RMSE: 4318.3027

Alpha: 0.01
  R^2: 0.7588
  RMSE: 4318.3015

Alpha: 0.1
  R^2: 0.7588
  RMSE: 4318.2905

Alpha: 1.0
  R^2: 0.7588
  RMSE: 4318.1830

Alpha: 10.0
  R^2: 0.7589
  RMSE: 4317.3874

Best Alpha based on RMSE: 10.0
Best R^2: 0.7589
Best RMSE: 4317.3874

Model Coefficients for the Best Alpha:
  Horsepower (hp): 48.9345
  Curb Weight (lbs): -2.1108
  Combined MPG: 31.0625
  Fuel Capacity (gallons): 184.1724
  Age: -1720.6839
  Drivetrain_4WD: 3629.9590
  Drivetrain_AWD: -374.4815
  Drivetrain_FWD: -1915.5861
  Drivetrain_RWD: 677.7930

Intercept: 23332.3001


#### Ridge

In [7]:

y_price = car_data_encoded['Average KBB Fair Price ($)']
X = car_data_encoded[features]

X_train, X_test, y_train, y_test = train_test_split(X, y_price, test_size=0.2, random_state=15)

alpha_values = [0.0001, 0.001, 0.01, 0.1, 1.0]
best_rmse = float('inf')  # Initialize with a very high RMSE
best_alpha = None
best_coefficients = None
best_intercept = None

# Loop through each alpha value
for alpha in alpha_values:
    # Initialize and train the Lasso model with the current alpha
    Ridge_model = Ridge(alpha=alpha)
    Ridge_model.fit(X_train, y_train)
    y_pred = Ridge_model.predict(X_test)
    
    # Calculate evaluation metrics
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    
    # Print RMSE and R^2 for this alpha
    print(f"\nAlpha: {alpha}")
    print(f"  R^2: {r2:.4f}")
    print(f"  RMSE: {rmse:.4f}")
    
    # Check if this model has the best RMSE so far
    if rmse < best_rmse:
        best_rmse = rmse
        best_alpha = alpha
        best_r2 = r2
        best_coefficients = dict(zip(X.columns, lasso_model.coef_))
        best_intercept = lasso_model.intercept_

# Print the best alpha and its coefficients
print(f"\nBest Alpha based on RMSE: {best_alpha}")
print(f"Best R^2: {best_r2:.4f}")
print(f"Best RMSE: {best_rmse:.4f}")

# Print coefficients for the best alpha
print("\nModel Coefficients for the Best Alpha:")
for feature, coef in best_coefficients.items():
    print(f"  {feature}: {coef:.4f}")

# Print Intercept for the best alpha
print(f"\nIntercept: {best_intercept:.4f}")

division_title = "Functional Unaggregated"
model_type = "Ridge"

results_list.append({
    "Division Title": division_title,
    "Model Type": model_type,
    "Best Alpha": best_alpha,
    "R^2": best_r2,
    "RMSE": best_rmse
})


Alpha: 0.0001
  R^2: 0.7588
  RMSE: 4318.3028

Alpha: 0.001
  R^2: 0.7588
  RMSE: 4318.3028

Alpha: 0.01
  R^2: 0.7588
  RMSE: 4318.3026

Alpha: 0.1
  R^2: 0.7588
  RMSE: 4318.3006

Alpha: 1.0
  R^2: 0.7588
  RMSE: 4318.2809

Best Alpha based on RMSE: 1.0
Best R^2: 0.7588
Best RMSE: 4318.2809

Model Coefficients for the Best Alpha:
  Horsepower (hp): 48.9345
  Curb Weight (lbs): -2.1108
  Combined MPG: 31.0625
  Fuel Capacity (gallons): 184.1724
  Age: -1720.6839
  Drivetrain_4WD: 3629.9590
  Drivetrain_AWD: -374.4815
  Drivetrain_FWD: -1915.5861
  Drivetrain_RWD: 677.7930

Intercept: 23332.3001


### Aggregated

In [8]:
#aggregated
aggregated_data = df.groupby(['Model Full Name']).agg({
    'Horsepower (hp)': 'mean',
    'Curb Weight (lbs)': 'mean',
    'Combined MPG': 'mean',
    'Fuel Capacity (gallons)': 'mean',
    'Age': 'mean',
    'Average KBB Fair Price ($)': 'mean',
    'Drivetrain': 'first'
}).reset_index()

car_data_encoded_agg = pd.get_dummies(aggregated_data, columns=['Model Full Name', 'Drivetrain'], drop_first=True)

y_price = car_data_encoded_agg['Average KBB Fair Price ($)']
X = car_data_encoded_agg[functional_features]

X_train, X_test, y_train, y_test = train_test_split(X, y_price, test_size=0.2, random_state=15)

price_model = LinearRegression()
price_model.fit(X_train, y_train)
y_pred = price_model.predict(X_test)

r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))


price_coefficients = dict(zip(X.columns, price_model.coef_))

# Print R^2 and RMSE
print(f"R^2: {r2:.4f}")
print(f"\nRMSE: {rmse:.4f}")

# Print Model Coefficients
print("Model Coefficients:")
for feature, coef in price_coefficients.items():
    print(f"  {feature}: {coef:.4f}")

# Print Intercept
print(f"\nIntercept: {price_model.intercept_:.4f}")

division_title = "Functional Aggregated"
model_type = "Linear Regression"

results_list.append({
    "Division Title": division_title,
    "Model Type": model_type,
    "Best Alpha": None,
    "R^2": r2,
    "RMSE": rmse
})


R^2: 0.6676

RMSE: 5500.9378
Model Coefficients:
  Horsepower (hp): 43.6321
  Curb Weight (lbs): -0.3676
  Combined MPG: 68.7316
  Fuel Capacity (gallons): 230.0007
  Age: -1975.5590
  Drivetrain_4WD: 3691.9952
  Drivetrain_AWD: -1794.9761
  Drivetrain_FWD: -1417.8011
  Drivetrain_RWD: -891.4719

Intercept: 18283.2862


#### Lasso

In [9]:

y_price = car_data_encoded_agg['Average KBB Fair Price ($)']
X = car_data_encoded_agg[functional_features]

X_train, X_test, y_train, y_test = train_test_split(X, y_price, test_size=0.2, random_state=15)

alpha_values = [0.1, 1.0, 10.0, 100, 1000]
best_rmse = float('inf')  # Initialize with a very high RMSE
best_alpha = None
best_coefficients = None
best_intercept = None

# Loop through each alpha value
for alpha in alpha_values:
    # Initialize and train the Lasso model with the current alpha
    lasso_model = Lasso(alpha=alpha)
    lasso_model.fit(X_train, y_train)
    y_pred = lasso_model.predict(X_test)
    
    # Calculate evaluation metrics
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    
    # Print RMSE and R^2 for this alpha
    print(f"\nAlpha: {alpha}")
    print(f"  R^2: {r2:.4f}")
    print(f"  RMSE: {rmse:.4f}")
    
    # Check if this model has the best RMSE so far
    if rmse < best_rmse:
        best_rmse = rmse
        best_alpha = alpha
        best_r2 = r2
        best_coefficients = dict(zip(X.columns, lasso_model.coef_))
        best_intercept = lasso_model.intercept_

# Print the best alpha and its coefficients
print(f"\nBest Alpha based on RMSE: {best_alpha}")
print(f"Best R^2: {best_r2:.4f}")
print(f"Best RMSE: {best_rmse:.4f}")

# Print coefficients for the best alpha
print("\nModel Coefficients for the Best Alpha:")
for feature, coef in best_coefficients.items():
    print(f"  {feature}: {coef:.4f}")

# Print Intercept for the best alpha
print(f"\nIntercept: {best_intercept:.4f}")

division_title = "Functional Aggregated"
model_type = "Lasso"

results_list.append({
    "Division Title": division_title,
    "Model Type": model_type,
    "Best Alpha": best_alpha,
    "R^2": best_r2,
    "RMSE": best_rmse
})



Alpha: 0.1
  R^2: 0.6676
  RMSE: 5500.6065

Alpha: 1.0
  R^2: 0.6680
  RMSE: 5497.6274

Alpha: 10.0
  R^2: 0.6715
  RMSE: 5468.1091

Alpha: 100
  R^2: 0.6834
  RMSE: 5368.0047

Alpha: 1000
  R^2: 0.7023
  RMSE: 5205.9640

Best Alpha based on RMSE: 1000
Best R^2: 0.7023
Best RMSE: 5205.9640

Model Coefficients for the Best Alpha:
  Horsepower (hp): 49.4808
  Curb Weight (lbs): 0.6178
  Combined MPG: 60.9700
  Fuel Capacity (gallons): 71.4483
  Age: -1914.5963
  Drivetrain_4WD: 0.0000
  Drivetrain_AWD: -0.0000
  Drivetrain_FWD: -0.0000
  Drivetrain_RWD: -0.0000

Intercept: 15130.8722


#### Ridge

In [10]:

y_price = car_data_encoded_agg['Average KBB Fair Price ($)']
X = car_data_encoded_agg[functional_features]

X_train, X_test, y_train, y_test = train_test_split(X, y_price, test_size=0.2, random_state=15)

alpha_values = [0.1, 1.0, 10.0, 100, 1000]
best_rmse = float('inf')  # Initialize with a very high RMSE
best_alpha = None
best_coefficients = None
best_intercept = None

# Loop through each alpha value
for alpha in alpha_values:
    # Initialize and train the Lasso model with the current alpha
    ridge_model = Ridge(alpha=alpha)
    ridge_model.fit(X_train, y_train)
    y_pred = ridge_model.predict(X_test)
    
    # Calculate evaluation metrics
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    
    # Print RMSE and R^2 for this alpha
    print(f"\nAlpha: {alpha}")
    print(f"  R^2: {r2:.4f}")
    print(f"  RMSE: {rmse:.4f}")
    
    # Check if this model has the best RMSE so far
    if rmse < best_rmse:
        best_rmse = rmse
        best_alpha = alpha
        best_r2 = r2
        best_coefficients = dict(zip(X.columns, lasso_model.coef_))
        best_intercept = lasso_model.intercept_

# Print the best alpha and its coefficients
print(f"\nBest Alpha based on RMSE: {best_alpha}")
print(f"Best R^2: {best_r2:.4f}")
print(f"Best RMSE: {best_rmse:.4f}")

# Print coefficients for the best alpha
print("\nModel Coefficients for the Best Alpha:")
for feature, coef in best_coefficients.items():
    print(f"  {feature}: {coef:.4f}")

# Print Intercept for the best alpha
print(f"\nIntercept: {best_intercept:.4f}")

division_title = "Functional Aggregated"
model_type = "Ridge"

results_list.append({
    "Division Title": division_title,
    "Model Type": model_type,
    "Best Alpha": best_alpha,
    "R^2": best_r2,
    "RMSE": best_rmse
})


Alpha: 0.1
  R^2: 0.6679
  RMSE: 5498.2777

Alpha: 1.0
  R^2: 0.6705
  RMSE: 5476.4569

Alpha: 10.0
  R^2: 0.6834
  RMSE: 5368.2621

Alpha: 100
  R^2: 0.6948
  RMSE: 5270.6930

Alpha: 1000
  R^2: 0.6432
  RMSE: 5699.3143

Best Alpha based on RMSE: 100
Best R^2: 0.6948
Best RMSE: 5270.6930

Model Coefficients for the Best Alpha:
  Horsepower (hp): 49.4808
  Curb Weight (lbs): 0.6178
  Combined MPG: 60.9700
  Fuel Capacity (gallons): 71.4483
  Age: -1914.5963
  Drivetrain_4WD: 0.0000
  Drivetrain_AWD: -0.0000
  Drivetrain_FWD: -0.0000
  Drivetrain_RWD: -0.0000

Intercept: 15130.8722


# Functional and Experiential

### Unaggregated

In [11]:
selected_topics = ['Topic_0', 'Topic_2', 'Topic_3', 'Topic_4', 'Topic_5', 'Topic_7', 'Topic_8', 'Topic_9']

for topic in selected_topics:
    car_data_encoded[f'{topic}_Weighted'] = car_data_encoded[topic] * car_data_encoded['Sentiment Score']

weighted_topic_features = [f'{topic}_Weighted' for topic in selected_topics]
features = functional_features + weighted_topic_features + [col for col in car_data_encoded.columns if col.startswith('Model Full Name_')]

X = car_data_encoded[features]
y_price = car_data_encoded['Average KBB Fair Price ($)']

X_train, X_test, y_train, y_test = train_test_split(X, y_price, test_size=0.2, random_state=15)

price_model = LinearRegression()
price_model.fit(X_train, y_train)
y_pred = price_model.predict(X_test)

r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))


price_coefficients = dict(zip(X.columns, price_model.coef_))

print(f"R^2: {r2:.4f}")
print(f"\nRMSE: {rmse:.4f}")

# Print Model Coefficients
print(f"\nModel Coefficients:")
for feature, coef in price_coefficients.items():
    print(f"  {feature}: {coef:.4f}")

# Print Intercept
print(f"\nIntercept: {price_model.intercept_:.4f}")

division_title = "Functional & Experiential Unaggregated"
model_type = "Linear Regression"

results_list.append({
    "Division Title": division_title,
    "Model Type": model_type,
    "Best Alpha": None,
    "R^2": r2,
    "RMSE": rmse
})

R^2: 0.9119

RMSE: 2610.2974

Model Coefficients:
  Horsepower (hp): 5.6206
  Curb Weight (lbs): -4.1024
  Combined MPG: -39.8883
  Fuel Capacity (gallons): -70.7150
  Age: -1818.9153
  Drivetrain_4WD: 408.0925
  Drivetrain_AWD: -1565.4814
  Drivetrain_FWD: -4207.0753
  Drivetrain_RWD: -2462.8216
  Topic_0_Weighted: -7599.1892
  Topic_2_Weighted: -1092.6125
  Topic_3_Weighted: 5602.0659
  Topic_4_Weighted: -1684.7965
  Topic_5_Weighted: -3175.9618
  Topic_7_Weighted: -639.9738
  Topic_8_Weighted: 716.6864
  Topic_9_Weighted: -86.9938
  Model Full Name_Acura Tlx: -802.1584
  Model Full Name_Alfa-romeo Giulia: -698.0654
  Model Full Name_Alfa-romeo Stelvio: -2780.3955
  Model Full Name_Audi A4: -342.5302
  Model Full Name_Audi A6: 301.2338
  Model Full Name_Audi Q3: -3598.4843
  Model Full Name_Audi Q5: -129.4084
  Model Full Name_Audi Q7: 3581.5054
  Model Full Name_Bmw 3-series: -954.9996
  Model Full Name_Bmw 5-series: 53.8052
  Model Full Name_Bmw X3: -922.8784
  Model Full Name_Bmw 

#### Lasso

In [12]:

X = car_data_encoded[features]
y_price = car_data_encoded['Average KBB Fair Price ($)']

X_train, X_test, y_train, y_test = train_test_split(X, y_price, test_size=0.2, random_state=15)

alpha_values = [0.1, 1.0, 10.0, 100, 1000]
best_rmse = float('inf')  # Initialize with a very high RMSE
best_alpha = None
best_coefficients = None
best_intercept = None

# Loop through each alpha value
for alpha in alpha_values:
    # Initialize and train the Lasso model with the current alpha
    lasso_model = Lasso(alpha=alpha)
    lasso_model.fit(X_train, y_train)
    y_pred = lasso_model.predict(X_test)
    
    # Calculate evaluation metrics
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    
    # Print RMSE and R^2 for this alpha
    print(f"\nAlpha: {alpha}")
    print(f"  R^2: {r2:.4f}")
    print(f"  RMSE: {rmse:.4f}")
    
    # Check if this model has the best RMSE so far
    if rmse < best_rmse:
        best_rmse = rmse
        best_alpha = alpha
        best_r2 = r2
        best_coefficients = dict(zip(X.columns, lasso_model.coef_))
        best_intercept = lasso_model.intercept_

# Print the best alpha and its coefficients
print(f"\nBest Alpha based on RMSE: {best_alpha}")
print(f"Best R^2: {best_r2:.4f}")
print(f"Best RMSE: {best_rmse:.4f}")

# Print coefficients for the best alpha
print("\nModel Coefficients for the Best Alpha:")
for feature, coef in best_coefficients.items():
    print(f"  {feature}: {coef:.4f}")

# Print Intercept for the best alpha
print(f"\nIntercept: {best_intercept:.4f}")

division_title = "Functional & Experiential Unaggregated"
model_type = "Lasso"

results_list.append({
    "Division Title": division_title,
    "Model Type": model_type,
    "Best Alpha": best_alpha,
    "R^2": best_r2,
    "RMSE": best_rmse
})


Alpha: 0.1
  R^2: 0.9118
  RMSE: 2611.6903

Alpha: 1.0
  R^2: 0.9078
  RMSE: 2669.6840

Alpha: 10.0
  R^2: 0.8637
  RMSE: 3245.9577

Alpha: 100
  R^2: 0.7569
  RMSE: 4334.8105

Alpha: 1000
  R^2: 0.7281
  RMSE: 4584.6565

Best Alpha based on RMSE: 0.1
Best R^2: 0.9118
Best RMSE: 2611.6903

Model Coefficients for the Best Alpha:
  Horsepower (hp): 7.2497
  Curb Weight (lbs): -4.0376
  Combined MPG: -37.1475
  Fuel Capacity (gallons): -46.3838
  Age: -1817.2866
  Drivetrain_4WD: 459.7536
  Drivetrain_AWD: -1538.5852
  Drivetrain_FWD: -4217.1825
  Drivetrain_RWD: -2336.3824
  Topic_0_Weighted: -6987.1901
  Topic_2_Weighted: -735.9657
  Topic_3_Weighted: 5012.3151
  Topic_4_Weighted: -1298.8123
  Topic_5_Weighted: -2833.2141
  Topic_7_Weighted: -485.4700
  Topic_8_Weighted: 370.0426
  Topic_9_Weighted: -0.0000
  Model Full Name_Acura Tlx: -586.1320
  Model Full Name_Alfa-romeo Giulia: -670.2321
  Model Full Name_Alfa-romeo Stelvio: -2686.8654
  Model Full Name_Audi A4: -92.3178
  Model Fu

#### Ridge

In [13]:

X = car_data_encoded[features]
y_price = car_data_encoded['Average KBB Fair Price ($)']

X_train, X_test, y_train, y_test = train_test_split(X, y_price, test_size=0.2, random_state=15)

alpha_values = [0.1, 1.0, 10.0, 100, 1000]
best_rmse = float('inf')  # Initialize with a very high RMSE
best_alpha = None
best_coefficients = None
best_intercept = None

# Loop through each alpha value
for alpha in alpha_values:
    # Initialize and train the Lasso model with the current alpha
    ridge_model = Ridge(alpha=alpha)
    ridge_model.fit(X_train, y_train)
    y_pred = ridge_model.predict(X_test)
    
    # Calculate evaluation metrics
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    
    # Print RMSE and R^2 for this alpha
    print(f"\nAlpha: {alpha}")
    print(f"  R^2: {r2:.4f}")
    print(f"  RMSE: {rmse:.4f}")
    
    # Check if this model has the best RMSE so far
    if rmse < best_rmse:
        best_rmse = rmse
        best_alpha = alpha
        best_r2 = r2
        best_coefficients = dict(zip(X.columns, lasso_model.coef_))
        best_intercept = lasso_model.intercept_

# Print the best alpha and its coefficients
print(f"\nBest Alpha based on RMSE: {best_alpha}")
print(f"Best R^2: {best_r2:.4f}")
print(f"Best RMSE: {best_rmse:.4f}")

# Print coefficients for the best alpha
print("\nModel Coefficients for the Best Alpha:")
for feature, coef in best_coefficients.items():
    print(f"  {feature}: {coef:.4f}")

# Print Intercept for the best alpha
print(f"\nIntercept: {best_intercept:.4f}")

division_title = "Functional & Experiential Unaggregated"
model_type = "Ridge"

results_list.append({
    "Division Title": division_title,
    "Model Type": model_type,
    "Best Alpha": best_alpha,
    "R^2": best_r2,
    "RMSE": best_rmse
})


Alpha: 0.1
  R^2: 0.9117
  RMSE: 2612.2637

Alpha: 1.0
  R^2: 0.9092
  RMSE: 2649.4697

Alpha: 10.0
  R^2: 0.8938
  RMSE: 2865.3707

Alpha: 100
  R^2: 0.8438
  RMSE: 3474.7130

Alpha: 1000
  R^2: 0.7739
  RMSE: 4181.1080

Best Alpha based on RMSE: 0.1
Best R^2: 0.9117
Best RMSE: 2612.2637

Model Coefficients for the Best Alpha:
  Horsepower (hp): 54.9404
  Curb Weight (lbs): -0.8939
  Combined MPG: 16.6959
  Fuel Capacity (gallons): 50.7800
  Age: -1623.9290
  Drivetrain_4WD: 0.0000
  Drivetrain_AWD: -0.0000
  Drivetrain_FWD: -0.0000
  Drivetrain_RWD: 0.0000
  Topic_0_Weighted: -0.0000
  Topic_2_Weighted: -0.0000
  Topic_3_Weighted: 0.0000
  Topic_4_Weighted: 0.0000
  Topic_5_Weighted: 0.0000
  Topic_7_Weighted: -0.0000
  Topic_8_Weighted: 0.0000
  Topic_9_Weighted: 0.0000
  Model Full Name_Acura Tlx: -0.0000
  Model Full Name_Alfa-romeo Giulia: -0.0000
  Model Full Name_Alfa-romeo Stelvio: -0.0000
  Model Full Name_Audi A4: 0.0000
  Model Full Name_Audi A6: -0.0000
  Model Full Name_

### Aggregated

In [14]:
#aggregated
selected_topics = ['Topic_0', 'Topic_2', 'Topic_3', 'Topic_4', 'Topic_5', 'Topic_7', 'Topic_8', 'Topic_9']

for topic in selected_topics:
    df[f'{topic}_Weighted'] = df[topic] * df['Sentiment Score']

df = pd.get_dummies(df, columns=['Drivetrain'], drop_first=True)

aggregation_dict = {
    feature: 'mean' if feature != 'Drivetrain' else 'first'
    for feature in functional_features + [f'{topic}_Weighted' for topic in selected_topics] + ['Sentiment Score', 'Average KBB Fair Price ($)']
}
aggregated_data = df.groupby(['Model Full Name']).agg(aggregation_dict).reset_index()

car_data_encoded = pd.get_dummies(aggregated_data, columns=['Model Full Name'], drop_first=True)

weighted_topic_features = [f'{topic}_Weighted' for topic in selected_topics]
features = functional_features + weighted_topic_features + [col for col in car_data_encoded.columns if col.startswith('Model Full Name_')]

X = car_data_encoded[features]
y_price = car_data_encoded['Average KBB Fair Price ($)']

X_train, X_test, y_train, y_test = train_test_split(X, y_price, test_size=0.2, random_state=15)

price_model = LinearRegression()
price_model.fit(X_train, y_train)
y_pred = price_model.predict(X_test)

r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))


price_coefficients = dict(zip(X.columns, price_model.coef_))

# Print R^2 and RMSE
print(f"R^2: {r2:.4f}")
print(f"\nRMSE: {rmse:.4f}")

# Print Model Coefficients
print(f"\nModel Coefficients:")
for feature, coef in price_coefficients.items():
    print(f"  {feature}: {coef:.4f}")

# Print Intercept
print(f"\nIntercept: {price_model.intercept_:.4f}")

division_title = "Functional & Experiential Aggregated"
model_type = "Linear Regression"

results_list.append({
    "Division Title": division_title,
    "Model Type": model_type,
    "Best Alpha": None,
    "R^2": r2,
    "RMSE": rmse
})


R^2: 0.7089

RMSE: 5147.9545

Model Coefficients:
  Horsepower (hp): 44.6699
  Curb Weight (lbs): -0.1154
  Combined MPG: 63.7918
  Fuel Capacity (gallons): 74.6505
  Age: -1883.8054
  Drivetrain_4WD: 4668.9724
  Drivetrain_AWD: -676.3371
  Drivetrain_FWD: -1527.3354
  Drivetrain_RWD: -813.3311
  Topic_0_Weighted: 39.2581
  Topic_2_Weighted: -158.9767
  Topic_3_Weighted: 233.5348
  Topic_4_Weighted: 378.0856
  Topic_5_Weighted: 374.8409
  Topic_7_Weighted: -278.9404
  Topic_8_Weighted: 122.8339
  Topic_9_Weighted: 107.0679
  Model Full Name_Acura Tlx: 90.4122
  Model Full Name_Alfa-romeo Giulia: -1633.3511
  Model Full Name_Alfa-romeo Stelvio: -2984.2098
  Model Full Name_Audi A4: 1238.1802
  Model Full Name_Audi A6: 37.2312
  Model Full Name_Audi Q3: -0.0000
  Model Full Name_Audi Q5: -346.3178
  Model Full Name_Audi Q7: -939.7253
  Model Full Name_Bmw 3-series: -0.0000
  Model Full Name_Bmw 5-series: -1297.0023
  Model Full Name_Bmw X3: -771.0912
  Model Full Name_Bmw X5: -0.0000
  M

#### lasso

In [15]:

X = car_data_encoded[features]
y_price = car_data_encoded['Average KBB Fair Price ($)']

X_train, X_test, y_train, y_test = train_test_split(X, y_price, test_size=0.2, random_state=15)

alpha_values = [0.1, 1.0, 10.0, 100, 1000]
best_rmse = float('inf')  # Initialize with a very high RMSE
best_alpha = None
best_coefficients = None
best_intercept = None

# Loop through each alpha value
for alpha in alpha_values:
    # Initialize and train the Lasso model with the current alpha
    lasso_model = Lasso(alpha=alpha)
    lasso_model.fit(X_train, y_train)
    y_pred = lasso_model.predict(X_test)
    
    # Calculate evaluation metrics
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    
    # Print RMSE and R^2 for this alpha
    print(f"\nAlpha: {alpha}")
    print(f"  R^2: {r2:.4f}")
    print(f"  RMSE: {rmse:.4f}")
    
    # Check if this model has the best RMSE so far
    if rmse < best_rmse:
        best_rmse = rmse
        best_alpha = alpha
        best_r2 = r2
        best_coefficients = dict(zip(X.columns, lasso_model.coef_))
        best_intercept = lasso_model.intercept_

# Print the best alpha and its coefficients
print(f"\nBest Alpha based on RMSE: {best_alpha}")
print(f"Best R^2: {best_r2:.4f}")
print(f"Best RMSE: {best_rmse:.4f}")

# Print coefficients for the best alpha
print("\nModel Coefficients for the Best Alpha:")
for feature, coef in best_coefficients.items():
    print(f"  {feature}: {coef:.4f}")

# Print Intercept for the best alpha
print(f"\nIntercept: {best_intercept:.4f}")

division_title = "Functional & Experiential Aggregated"
model_type = "Lasso"

results_list.append({
    "Division Title": division_title,
    "Model Type": model_type,
    "Best Alpha": best_alpha,
    "R^2": best_r2,
    "RMSE": best_rmse
})



Alpha: 0.1
  R^2: 0.6958
  RMSE: 5262.2706

Alpha: 1.0
  R^2: 0.6421
  RMSE: 5707.3696

Alpha: 10.0
  R^2: 0.6359
  RMSE: 5757.0751

Alpha: 100
  R^2: 0.6842
  RMSE: 5361.5088

Alpha: 1000
  R^2: 0.7023
  RMSE: 5205.9640

Best Alpha based on RMSE: 1000
Best R^2: 0.7023
Best RMSE: 5205.9640

Model Coefficients for the Best Alpha:
  Horsepower (hp): 49.4808
  Curb Weight (lbs): 0.6178
  Combined MPG: 60.9700
  Fuel Capacity (gallons): 71.4483
  Age: -1914.5963
  Drivetrain_4WD: 0.0000
  Drivetrain_AWD: -0.0000
  Drivetrain_FWD: -0.0000
  Drivetrain_RWD: -0.0000
  Topic_0_Weighted: -0.0000
  Topic_2_Weighted: -0.0000
  Topic_3_Weighted: 0.0000
  Topic_4_Weighted: 0.0000
  Topic_5_Weighted: 0.0000
  Topic_7_Weighted: -0.0000
  Topic_8_Weighted: 0.0000
  Topic_9_Weighted: 0.0000
  Model Full Name_Acura Tlx: -0.0000
  Model Full Name_Alfa-romeo Giulia: -0.0000
  Model Full Name_Alfa-romeo Stelvio: -0.0000
  Model Full Name_Audi A4: 0.0000
  Model Full Name_Audi A6: -0.0000
  Model Full Name

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


#### Ridge

In [16]:

X = car_data_encoded[features]
y_price = car_data_encoded['Average KBB Fair Price ($)']

X_train, X_test, y_train, y_test = train_test_split(X, y_price, test_size=0.2, random_state=15)

alpha_values = [0.1, 1.0, 10.0, 100, 1000]
best_rmse = float('inf')  # Initialize with a very high RMSE
best_alpha = None
best_coefficients = None
best_intercept = None

# Loop through each alpha value
for alpha in alpha_values:
    # Initialize and train the Lasso model with the current alpha
    ridge_model = Ridge(alpha=alpha)
    ridge_model.fit(X_train, y_train)
    y_pred = ridge_model.predict(X_test)
    
    # Calculate evaluation metrics
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    
    # Print RMSE and R^2 for this alpha
    print(f"\nAlpha: {alpha}")
    print(f"  R^2: {r2:.4f}")
    print(f"  RMSE: {rmse:.4f}")
    
    # Check if this model has the best RMSE so far
    if rmse < best_rmse:
        best_rmse = rmse
        best_alpha = alpha
        best_r2 = r2
        best_coefficients = dict(zip(X.columns, lasso_model.coef_))
        best_intercept = lasso_model.intercept_

# Print the best alpha and its coefficients
print(f"\nBest Alpha based on RMSE: {best_alpha}")
print(f"Best R^2: {best_r2:.4f}")
print(f"Best RMSE: {best_rmse:.4f}")

# Print coefficients for the best alpha
print("\nModel Coefficients for the Best Alpha:")
for feature, coef in best_coefficients.items():
    print(f"  {feature}: {coef:.4f}")

# Print Intercept for the best alpha
print(f"\nIntercept: {best_intercept:.4f}")

division_title = "Functional & Experiential Aggregated"
model_type = "Ridge"

results_list.append({
    "Division Title": division_title,
    "Model Type": model_type,
    "Best Alpha": best_alpha,
    "R^2": best_r2,
    "RMSE": best_rmse
})


Alpha: 0.1
  R^2: 0.6821
  RMSE: 5379.0355

Alpha: 1.0
  R^2: 0.6785
  RMSE: 5409.6371

Alpha: 10.0
  R^2: 0.6898
  RMSE: 5314.0398

Alpha: 100
  R^2: 0.6960
  RMSE: 5260.3338

Alpha: 1000
  R^2: 0.6432
  RMSE: 5698.8766

Best Alpha based on RMSE: 100
Best R^2: 0.6960
Best RMSE: 5260.3338

Model Coefficients for the Best Alpha:
  Horsepower (hp): 49.4808
  Curb Weight (lbs): 0.6178
  Combined MPG: 60.9700
  Fuel Capacity (gallons): 71.4483
  Age: -1914.5963
  Drivetrain_4WD: 0.0000
  Drivetrain_AWD: -0.0000
  Drivetrain_FWD: -0.0000
  Drivetrain_RWD: -0.0000
  Topic_0_Weighted: -0.0000
  Topic_2_Weighted: -0.0000
  Topic_3_Weighted: 0.0000
  Topic_4_Weighted: 0.0000
  Topic_5_Weighted: 0.0000
  Topic_7_Weighted: -0.0000
  Topic_8_Weighted: 0.0000
  Topic_9_Weighted: 0.0000
  Model Full Name_Acura Tlx: -0.0000
  Model Full Name_Alfa-romeo Giulia: -0.0000
  Model Full Name_Alfa-romeo Stelvio: -0.0000
  Model Full Name_Audi A4: 0.0000
  Model Full Name_Audi A6: -0.0000
  Model Full Name_

In [17]:
results_df = pd.DataFrame(results_list).round({"R^2": 2, "RMSE": 2})
print("\nResults Summary:")
results_df


Results Summary:


Unnamed: 0,Division Title,Model Type,Best Alpha,R^2,RMSE
0,Functional Unaggregated,Linear Regression,,0.76,4318.3
1,Functional Unaggregated,Lasso,10.0,0.76,4317.39
2,Functional Unaggregated,Ridge,1.0,0.76,4318.28
3,Functional Aggregated,Linear Regression,,0.67,5500.94
4,Functional Aggregated,Lasso,1000.0,0.7,5205.96
5,Functional Aggregated,Ridge,100.0,0.69,5270.69
6,Functional & Experiential Unaggregated,Linear Regression,,0.91,2610.3
7,Functional & Experiential Unaggregated,Lasso,0.1,0.91,2611.69
8,Functional & Experiential Unaggregated,Ridge,0.1,0.91,2612.26
9,Functional & Experiential Aggregated,Linear Regression,,0.71,5147.95


In [18]:
car_data_encoded

Unnamed: 0,Horsepower (hp),Curb Weight (lbs),Combined MPG,Fuel Capacity (gallons),Age,Drivetrain_4WD,Drivetrain_AWD,Drivetrain_FWD,Drivetrain_RWD,Topic_0_Weighted,...,Model Full Name_Volkswagen Arteon,Model Full Name_Volkswagen Atlas,Model Full Name_Volkswagen Golf-gti,Model Full Name_Volkswagen Id-4,Model Full Name_Volkswagen Jetta,Model Full Name_Volkswagen Passat,Model Full Name_Volkswagen Tiguan,Model Full Name_Volvo Xc40,Model Full Name_Volvo Xc60,Model Full Name_Volvo Xc90
0,262.060606,3849.206061,21.715152,17.015758,9.927273,0.0,0.660606,0.339394,0.0,0.011670,...,False,False,False,False,False,False,False,False,False,False
1,224.000000,3544.636364,27.181818,16.845455,7.363636,0.0,0.000000,1.000000,0.0,0.018425,...,False,False,False,False,False,False,False,False,False,False
2,280.000000,3761.000000,27.000000,15.300000,7.000000,0.0,0.000000,0.000000,1.0,0.012348,...,False,False,False,False,False,False,False,False,False,False
3,280.000000,3761.000000,24.000000,16.900000,6.000000,0.0,1.000000,0.000000,0.0,0.018979,...,False,False,False,False,False,False,False,False,False,False
4,202.031250,3626.937500,28.135417,16.388542,11.010417,0.0,0.000000,1.000000,0.0,0.015253,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162,211.890625,3281.984375,25.296875,18.500000,11.640625,0.0,0.000000,1.000000,0.0,0.010684,...,False,False,False,False,False,True,False,False,False,False
163,192.000000,3704.000000,22.000000,16.050000,10.000000,0.0,0.500000,0.500000,0.0,0.014438,...,False,False,False,False,False,False,True,False,False,False
164,187.000000,3574.000000,27.000000,14.200000,5.000000,0.0,0.000000,1.000000,0.0,0.020192,...,False,False,False,False,False,False,False,True,False,False
165,275.500000,4043.000000,22.000000,18.650000,10.000000,0.0,0.500000,0.000000,0.0,0.010337,...,False,False,False,False,False,False,False,False,True,False
