In [18]:
# =============================================
# 1) 安裝 & Import 套件
# =============================================
# !pip install jieba scikit-learn pandas numpy

import pandas as pd
import numpy as np
import jieba
import re
from collections import Counter

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score

In [33]:
# =============================================
# 2) 讀取資料
# =============================================

# ⚠️ 請把 train.csv 換成你的檔案
df = pd.read_csv("train.csv")

df = df[['name', 'price']].dropna()
df['name'] = df['name'].astype(str)
df.head()

Unnamed: 0,name,price
0,! 十八麻油鴨粽-熱,50
1,! 大福 小米肉粽-凍,104
2,! 筍芋竹香粽-凍,298
3,! 總 蛋黃肉粽-熱,42
4,! 鮑魚干貝荷飯-凍,258


In [48]:
# =============================================
# 3) 商品名稱清洗（去除符號、全形括號等）
# =============================================
# def preprocess_name(text):
#     """
#     商品名稱前處理（保留小數 + 中文單位）
#     """
#     if not isinstance(text, str):
#         text = str(text)

#     text = text.lower()

#     # ✅ 保留「小數 + 可選中文字單位」的數值，例如：1.1米、2.5吋、0.75公斤
#     text = re.sub(r"(\d+\.\d+)(?=[\u4e00-\u9fa5])", r"<DECIMAL_UNIT:\1>", text)

#     # ✅ 保留純小數，例如：1.5
#     text = re.sub(r"(\d+\.\d+)", r"<DECIMAL:\1>", text)

#     # 移除非 中英數 / 空白 / 中文
#     text = re.sub(r"[^a-z0-9\u4e00-\u9fa5\s]", " ", text)

#     # 還原小數
#     text = re.sub(r"<DECIMAL_UNIT:([\d\.]+)>", r" \1 ", text)
#     text = re.sub(r"<DECIMAL:([\d\.]+)>", r" \1 ", text)

#     # 英文與數字分詞
#     text = re.sub(r"([a-z]+)", r" \1 ", text)
#     text = re.sub(r"(\d+)", r" \1 ", text)

#     # 中文逐字分詞
#     text = re.sub(r"([\u4e00-\u9fa5])", r" \1 ", text)

#     # 移除多餘空白
#     text = re.sub(r"\s+", " ", text).strip()

#     return text

def tokenize_text(text):
    # 1. 清理：只保留中文、英文、數字，其餘用空格取代
    # [^\u4e00-\u9fa5a-zA-Z0-9] 的意思是 "不是中文、不是大小寫英文、不是數字" 的任何字元
    text = re.sub(r'[^\u4e00-\u9fa5a-zA-Z0-9]', ' ', text)
    
    # 2. 轉為小寫
    text = text.lower()
    
    # 3. 使用 jieba 斷詞，並過濾掉多餘的空格
    # jieba.lcut 會回傳一個 list, e.g., ['iphone', '12', 'promax', '亮面', '保護貼']
    words = jieba.lcut(text)
    words = [word for word in words if word.strip()] # 移除只包含空白的元素
    
    # 4. 將詞彙用空格重新組合起來
    return ' '.join(words)

df['name_clean'] = df['name'].apply(tokenize_text)
df.head(20)

Unnamed: 0,name,price,name_clean
0,! 十八麻油鴨粽-熱,50,十八 麻油 鴨 粽 熱
1,! 大福 小米肉粽-凍,104,大福 小米 肉 粽 凍
2,! 筍芋竹香粽-凍,298,筍芋 竹香 粽 凍
3,! 總 蛋黃肉粽-熱,42,總 蛋黃肉 粽 熱
4,! 鮑魚干貝荷飯-凍,258,鮑魚 干貝 荷飯 凍
5,!(呷)香菇粽-熱,57,呷 香菇 粽 熱
6,!新生活家具!《心之綠洲》沙發床 亞麻布 三段式調整 4色可選,4200,新 生活 家具 心之綠洲 沙發床 亞 麻布 三段式 調整 4 色可選
7,!新生活家具!《桃樂斯》沙發床 亞麻布 三段式調整 4色可選,4300,新 生活 家具 桃樂斯 沙發床 亞 麻布 三段式 調整 4 色可選
8,!藍寶3效超濃縮洗衣粉[2kg],98,藍寶 3 效超 濃縮 洗衣粉 2kg
9,#09-PILOT可擦印章 FRIXION stamp,60,09 pilot 可擦 印章 frixion stamp


In [None]:
# =============================================
# 4) 建立品牌詞庫（自動產生候選清單）
# =============================================

# 取得出現頻率最高的前 400 個詞（可能包含品牌）
candidates = Counter(df['name_clean'].str.split().str[0])
brand_candidates = pd.Series(dict(candidates)).sort_values(ascending=False).head(400)


# ⚠️ 手動篩選掉不是品牌的詞
# 建議：你把這些結果 print 出來看一下 → 然後留下真正的品牌
brand_list = [
    "muji","nike","adidas","nintendo","panasonic","3m",
    "apple","xiaomi","asus","acer","sony","lenovo",
    "uniqlo","ikea","dyson","logitech","canon","hp",
    "msi","samsung","philips"
]

brands = sorted(set(brand_list), key=len, reverse=True)

def extract_brand(name):
    name_low = name.lower()
    for b in brands:
        if b in name_low:
            return b
    return "other"

df['brand'] = df['name_clean'].apply(extract_brand)

AttributeError: 'int' object has no attribute 'lower'

In [22]:
# =============================================
# 5) 型號解析（B+ 核心 Feature）
# =============================================
def extract_model(name):
    name = name.upper()
    pattern = r'\b([A-Z0-9]+(?:[-_][A-Z0-9]+)*\d*[A-Z0-9]*(?:\([A-Z0-9\-]+\))?)\b'
    hits = re.findall(pattern, name)

    blacklist = {"GB", "ML", "CM", "KG", "MM", "L"}
    hits = [h for h in hits if h not in blacklist]

    if len(hits) == 0:
        return "nomodel"
    return hits[0]

df['model'] = df['name_clean'].apply(extract_model)

In [23]:
# =============================================
# 6) 規格解析：容量 / 重量 / 尺寸 / 件數
# =============================================
def parse_specs(name):
    n = name.lower()

    gb = re.findall(r'(\d+)\s*gb', n)
    gb = int(gb[0]) if gb else 0

    tb = re.findall(r'(\d+)\s*tb', n)
    gb += int(tb[0]) * 1024 if tb else 0  # TB → GB

    w = re.findall(r'(\d+)\s*(g|kg|ml|l)\b', n)
    if w:
        value, unit = w[0]
        value = float(value)
        if unit == "kg":
            value *= 1000
        elif unit == "l":
            value *= 1000
        weight = value
    else:
        weight = 0

    sz = re.findall(r'(\d+)\s*[x\*]\s*(\d+)', n)
    size = int(sz[0][0]) * int(sz[0][1]) if sz else 0

    qty = re.findall(r'(\d+)\s*(入|件|pcs)', n)
    qty = int(qty[0][0]) if qty else 1

    return pd.Series([gb, weight, size, qty])

df[['gb', 'weight', 'size', 'qty']] = df['name_clean'].apply(parse_specs)
df['weight'] = np.log1p(df['weight'])
df['size']   = np.log1p(df['size'])

In [24]:
# =============================================
# 7) 中文斷詞（Jieba 斷詞）
# =============================================
import jieba

def jieba_cut(s):
    return " ".join(jieba.cut(str(s)))

df['name_cut'] = df['name_clean'].apply(jieba_cut)

In [25]:
# =============================================
# 8) Train / Test Split
# =============================================
X = df[['name_cut', 'brand', 'model', 'gb', 'weight', 'size', 'qty']]
y = np.log1p(df['price'])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [26]:
# =============================================
# 9) 建立模型（不用深度學習，使用 Ridge）
# =============================================
preprocess = ColumnTransformer([
    ("tfidf", TfidfVectorizer(max_features=30000, ngram_range=(1,2), min_df=3), 'name_cut'),
    ("brand_ohe", OneHotEncoder(handle_unknown='ignore'), ['brand']),
    ("model_ohe", OneHotEncoder(handle_unknown='ignore'), ['model'])
], remainder='passthrough')

model = Pipeline([
    ("prep", preprocess),
    ("reg", Ridge(alpha=1.0))
])

model.fit(X_train, y_train)

0,1,2
,steps,"[('prep', ...), ('reg', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('tfidf', ...), ('brand_ohe', ...), ...]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,alpha,1.0
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,


In [29]:
# =============================================
# 10) 評估模型
# =============================================
def smape(y_true, y_pred):
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    return np.mean(2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred) + 1e-8)) * 100

y_pred = np.expm1(model.predict(X_test))
y_true = np.expm1(y_test)

print("MAE:", mean_absolute_error(y_true, y_pred))
print("SMAPE (%):", smape(y_true, y_pred))
print("R²:", r2_score(y_true, y_pred))

MAE: 1216.0076228441671
SMAPE (%): 84.05278159041465
R²: -0.02296786929524597


In [28]:
df['price'].describe()

count    4.953870e+05
mean     1.451716e+03
std      2.698696e+04
min      0.000000e+00
25%      1.880000e+02
50%      4.350000e+02
75%      9.990000e+02
max      1.417623e+07
Name: price, dtype: float64

======================================================================================

In [36]:
from collections import Counter
import pandas as pd
import re

# 取得第一個詞
first_words = df["name_clean"].str.split().str[0]

# 統計出現頻率
counts = Counter(first_words)

# 取前 2000 個候選（數量越大越能自動學習）
brand_candidates = pd.Series(counts).sort_values(ascending=False).head(2000)
print(brand_candidates)

MUJI           3242
NIKE           2879
代收代付           2311
3M             1768
adidas官方旗艦館    1498
               ... 
titan            35
crayola          35
克補               35
日本FURIMORI       35
Balea            35
Length: 2000, dtype: int64


In [39]:
BLACKWORDS = set([
    "新款","原廠","正版","專用","通用","超值","特價","加厚","最新","限量","公司貨",
    "大","小","男","女","中","兒童","加大","雙人","進口", "代收代付", "日本", "美國", 
    "即期品", "德國", "台灣", "台製", "台灣製", "台灣公司貨", "台灣現貨", "現貨", "現貨供應",
    "快速出貨", "快速到貨", "免運", "免運費", "送貨到府", "到府安裝", "韓國"
])

def is_valid_brand(word):
    if len(word) <= 1: return False
    if re.fullmatch(r"\d+", word): return False
    if re.search(r"\d", word): return False  # 含數字 → 多半不是品牌
    if word.lower() in BLACKWORDS: return False
    return True

auto_brand_list = [w.lower() for w in brand_candidates.index if is_valid_brand(w)]
print(auto_brand_list)

['muji', 'nike', 'adidas官方旗艦館', 'apple', 'panasonic', 'ec', 'nintendo', 'asus', 'adidas', 'new', 'sony', 'samsung', 'kinyo', 'coach', 'skechers', '初色', 'casio', 'philips', '曼黛瑪璉', 'gap', 'sandisk', 'under', 'hp', 'iphone', '冷藏肉', 'my', '上野物產', '群加', '特力屋', '綠聯', '生活工場', 'baseus', 'philips', 'lego', 'aibo', 'playboy', 'porter', 'sampo', 'one', 'e.dot', 'cookpower', 'iris', 'jiago', 'locknlock樂扣樂扣', '玉如阿姨', 'swear', 'easy', 'epson', 'udilife', '義美', 'disney', 'michael', 'samsung', 'puma', 'push', 'shiseido', 'the', 'adisi', 'uag', 'giordano', 'fila', '瑪登瑪朵', '索樂生活', 'kose', 'mk馬克', 'anden', 'wd', 'tomica', '世一', 'canon', 'triumph', 'transcend', 'cap', 'bosch', 'powersync', '犀牛盾', '熊媽媽買菜網', '魔法baby', '極鮮配', 'ns', 'treny', 'atunas', 'lg', 'the', '太星電工', '理膚寶水', 'kolin', 'kipling', '風車圖書', '福利品', 'antian', '完美主義', 'baby', '英國', 'benevo', 'keyway', 'lanni', '橘魔法', 'oxo', 'tefal', 'elecom', 'bk', 'ot', 'dike', 'innisfree', 'lion', 'mizuno', 'fp', 'ainmax', 'vxtra', '蔬菜工坊', '○糊塗鞋匠○', '妙管家', '米

In [51]:
def extract_price_from_name(name):
    # 這個 regex 會尋找 $、NT、元、價 等符號前後的數字
    # \d+[\.,]?\d* 會匹配 1690, 1,690, 1.5 這樣的數字
    patterns = [
        r'[$NTD元$價特價]\s*(\d+[\.,]?\d*)', # $1690, 1690元, 特價1290
        r'(\d+[\.,]?\d*)\s*[元$]' # 1290 元
    ]
    
    for pattern in patterns:
        match = re.search(pattern, name)
        if match:
            # 找到匹配，清理掉逗號並轉成數字
            price_str = match.group(1).replace(',', '')
            try:
                return float(price_str)
            except ValueError:
                continue # 如果轉換失敗，嘗試下一個 pattern
                
    # 如果都沒找到，回傳 NaN (Not a Number)
    return np.nan

# 建立新欄位 'price_in_name'
df['price_in_name'] = df['name'].apply(extract_price_from_name)

# 顯示結果
print("--- 步驟 1：擷取名稱中的價格 ---")
print(df[['name', 'price_in_name', 'price']].head(100))

--- 步驟 1：擷取名稱中的價格 ---
                    name  price_in_name  price
0             ! 十八麻油鴨粽-熱            NaN     50
1            ! 大福 小米肉粽-凍            NaN    104
2              ! 筍芋竹香粽-凍            NaN    298
3             ! 總 蛋黃肉粽-熱            NaN     42
4             ! 鮑魚干貝荷飯-凍            NaN    258
..                   ...            ...    ...
95        #小財神撲滿擺飾[1入一包]            NaN    129
96       #彩金元寶壁貼-寶[1入一包]            NaN     99
97  #彩金大掛軸[1入一包 60*91cm]            NaN    499
98                #恰好香香豆            NaN     71
99               #手摘果物梅肉            NaN     35

[100 rows x 3 columns]
