In [None]:
# List the conda environments known to this installation.
conda env list


: 

In [None]:
# If using a non-conda environment
python -m pip install <xyz> -U --force-reinstall
# If using a conda environment
conda install --name <environment name> <xyz> --update-deps --force-reinstall

: 

In [None]:
!pip install xgboost
!pip install statsmodels

: 

In [None]:
import pandas as pd
import numpy as np
import math

# 迴歸模型
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

# 資料切分 & 指標
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import spearmanr


# statsmodels for OLS
import statsmodels.api as sm
import statsmodels.formula.api as smf

# ============= 1. Load the data =============
# The CSV is expected to provide at least these columns:
# "rank" (continuous), "average_rating", "price", "sentiment_score", "0","1","2","3"
df = pd.read_csv('amazon_music_reviews_with_sentiment.csv')

# ============= 2. Features (X) and target (y) =============
# Column groups used to assemble the two feature matrices below.
base_feature_cols = ['average_rating', 'price', 'sentiment_score']
topic_feature_cols = ['0', '1', '2', '3']

# Target: the continuous "rank" column
y = df['rank']

# (A) Baseline feature matrix: no topic columns, missing values replaced by 0
X_baseline = df[base_feature_cols].fillna(0)

# (B) Topic feature matrix: baseline columns followed by columns "0".."3",
#     missing values replaced by 0
X_topic = df[base_feature_cols + topic_feature_cols].fillna(0)

# ============= 3. Helper that trains, evaluates and prints the results =============
def train_and_evaluate(model, X, y, model_name="model", info=""):
    """Fit ``model`` on a train split and print hold-out regression metrics.

    The data is split 80/20 with a fixed seed (``random_state=42``), the model
    is fitted on the training part, and the MSE, R², Spearman rank correlation
    and RMSE of its predictions on the test part are printed.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is
            fitted in place by this call.
        X: Feature table accepted by ``train_test_split``.
        y: Target values aligned row-by-row with ``X``.
        model_name: Label shown in the printed header.
        info: Extra tag shown in parentheses after ``model_name``.

    Returns:
        None. The metrics are only printed.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Train the model
    model.fit(X_train, y_train)

    # Predict on the held-out part
    y_pred = model.predict(X_test)

    # Metrics: MSE, R^2, Spearman, RMSE
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    # spearmanr returns (correlation, p-value); only the correlation is reported.
    # The previous code unpacked the tuple (y_test, y_pred) without calling it.
    spearman_corr, _ = spearmanr(y_test, y_pred)
    rmse = math.sqrt(mse)

    print(f"=== {model_name} ({info}) ===")
    print(f"MSE = {mse:.6f}")
    print(f"R²  = {r2:.6f}")
    print(f"Spearman = {spearman_corr:.6f}")
    print(f"rmse = {rmse:.6f}\n")

# ============= 4. Three regressors: Baseline vs With Topic =============
lin_reg = LinearRegression()
rf_reg = RandomForestRegressor(random_state=42)
xgb_reg = XGBRegressor(random_state=42)

# (estimator, display name) pairs, in the order they are evaluated.
regressors = [
    (lin_reg, "LinearRegression"),
    (rf_reg, "RandomForestRegressor"),
    (xgb_reg, "XGBRegressor"),
]

# (banner, feature matrix, tag) for each feature configuration.
feature_configs = [
    (">> 不加主題欄位 (Baseline) <<\n", X_baseline, "Baseline"),
    (">> 加入主題欄位 (With Topic) <<\n", X_topic, "With Topic"),
]

# Each estimator instance is re-fitted for every feature configuration.
for banner, features, tag in feature_configs:
    print(banner)
    for estimator, estimator_name in regressors:
        train_and_evaluate(estimator, features, y, estimator_name, tag)

# ============= 5. Extra: statsmodels OLS to inspect p-values =============
# The formula below refers to the topic columns as topic0..topic3, so the
# columns '0','1','2','3' are renamed before fitting.
topic_column_names = {'0': 'topic0', '1': 'topic1', '2': 'topic2', '3': 'topic3'}
df_renamed = df.rename(columns=topic_column_names)

# Only the "With Topic" linear model is fitted here, to look at the
# coefficient and p-value of each topic.
# NOTE(review): unlike X_baseline / X_topic, this frame is not passed through
# fillna(0), so missing values are handled by statsmodels rather than being
# zero-filled — confirm this difference is intended.
formula = "rank ~ average_rating + price + sentiment_score + topic0 + topic1 + topic2 + topic3"

ols_model = smf.ols(formula=formula, data=df_renamed).fit()

print("=== OLS Regression (With Topic) ===")
print(ols_model.summary())


: 