In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import gc
import time
from scipy.sparse import csr_matrix, hstack
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split, cross_val_score
# import seaborn as sns
# sns.set(style="white")

In [2]:
# Load the tab-separated training data with the C parser.
train = pd.read_table('train.tsv', engine='c')

# Summary statistics of the price column, rounded to whole numbers for display.
price_summary = train.price.describe().round()
print('價格數序性統計\n' + str(price_summary))

價格數序性統計
count    1482535.0
mean          27.0
std           39.0
min            0.0
25%           10.0
50%           17.0
75%           29.0
max         2009.0
Name: price, dtype: float64


In [3]:
# Preview only the first rows; rendering the whole frame is unnecessary for a sanity check.
train.head(10)

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity
5,5,Bundled items requested for Ruie,3,Women/Other/Other,,59.0,0,"Banana republic bottoms, Candies skirt with ma..."
6,6,Acacia pacific tides santorini top,3,Women/Swimwear/Two-Piece,Acacia Swimwear,64.0,0,Size small but straps slightly shortened to fi...
7,7,Girls cheer and tumbling bundle of 7,3,Sports & Outdoors/Apparel/Girls,Soffe,6.0,1,You get three pairs of Sophie cheer shorts siz...
8,8,Girls Nike Pro shorts,3,Sports & Outdoors/Apparel/Girls,Nike,19.0,0,Girls Size small Plus green. Three shorts total.
9,9,Porcelain clown doll checker pants VTG,3,Vintage & Collectibles/Collectibles/Doll,,8.0,0,I realized his pants are on backwards after th...


In [4]:
# Brand cutoff: keep only brands that occur more than 4 times; `cutting` maps
# every other brand to 'missing'.
# NOTE(review): the original comment says the threshold 4 was judged from the
# median brand frequency - confirm before changing it.
dataset = train
brand_counts = dataset['brand_name'].value_counts()
brandnum = brand_counts[brand_counts.index != 'missing']
NUM_BRANDS = int((brandnum > 4).sum())

# Number of most-frequent labels kept per category level; the rest become 'missing'.
NUM_CATEGORIES = 1000

# A token must appear in at least this many item names to enter the
# bag-of-words vocabulary for `name`.
NAME_MIN_DF = 10

# Upper bound on the TF-IDF vocabulary built from `item_description`.
# (An earlier comment mentioned 50000; the value actually used is 10000.)
MAX_FEATURES_ITEM_DESCRIPTION = 10000

In [5]:
# RMSLE is used as the loss / evaluation metric.
def rmsle(y_true, y_pred):
    """Return the root mean squared logarithmic error of two equal-length inputs.

    Both inputs are passed through ``log1p`` before differencing, so they are
    expected to be on the original (non-log) scale.

    Raises:
        AssertionError: if the two inputs differ in length.
    """
    assert len(y_true) == len(y_pred)
    log_gap = np.log1p(y_true) - np.log1p(y_pred)
    return np.sqrt(np.mean(log_gap ** 2))

# Split a "/"-delimited category path into exactly three levels.
def split_cat(text):
    """Split a category path into (general, subcategory 1, subcategory 2).

    Args:
        text: a category string such as "Men/Tops/T-shirts", or a missing
            value (NaN / None / any non-string).

    Returns:
        A 3-tuple of strings. Non-string input yields three "No Label"
        placeholders. Paths with fewer than three levels are padded with
        "No Label"; levels beyond the third are dropped. The fixed length
        guarantees that ``zip(*series.apply(split_cat))`` always unpacks into
        three columns.
    """
    placeholder = "No Label"
    # Explicit type check instead of a bare `except:`, which would also hide
    # unrelated errors (and KeyboardInterrupt).
    if not isinstance(text, str):
        return (placeholder, placeholder, placeholder)
    levels = text.split("/")[:3]
    levels += [placeholder] * (3 - len(levels))
    return tuple(levels)

# Some columns contain missing values; fill them with the 'missing' sentinel.
def handle_missing_inplace(dataset):
    """Replace missing values in the text/category columns with 'missing'.

    The frame is modified in place and nothing is returned.

    Args:
        dataset: DataFrame containing the columns ``general_cat``,
            ``subcat_1``, ``subcat_2``, ``brand_name`` and
            ``item_description``.
    """
    columns_to_fill = (
        'general_cat',
        'subcat_1',
        'subcat_2',
        'brand_name',
        'item_description',
    )
    for column in columns_to_fill:
        # Assign the filled column back explicitly. `df[col].fillna(..., inplace=True)`
        # is chained assignment on an intermediate object and does not update the
        # parent frame when pandas Copy-on-Write is active.
        dataset[column] = dataset[column].fillna('missing')

# Keep only the most frequent brands and category labels; everything else
# is relabelled as 'missing'.
def cutting(dataset):
    """Collapse infrequent brand and category labels to 'missing', in place.

    For ``brand_name`` the ``NUM_BRANDS`` most frequent labels are kept; for
    each of ``general_cat``, ``subcat_1`` and ``subcat_2`` the
    ``NUM_CATEGORIES`` most frequent labels are kept. The 'missing' label
    itself is never counted among the kept labels.
    """
    def _top_labels(column, limit):
        # value_counts() is sorted by descending frequency, so slicing the
        # index yields the `limit` most common labels.
        counts = dataset[column].value_counts()
        counts = counts[counts.index != 'missing']
        return counts.index[:limit]

    keep_brands = _top_labels('brand_name', NUM_BRANDS)
    dataset.loc[~dataset['brand_name'].isin(keep_brands), 'brand_name'] = 'missing'

    # Each category level is independent of the others, so it can be
    # ranked and rewritten one column at a time.
    for column in ('general_cat', 'subcat_1', 'subcat_2'):
        keep_labels = _top_labels(column, NUM_CATEGORIES)
        dataset.loc[~dataset[column].isin(keep_labels), column] = 'missing'

# Convert the category-like columns to the pandas `category` dtype.
def to_categorical(dataset):
    """Cast the three category levels and the item condition to `category`, in place."""
    for column in ('general_cat', 'subcat_1', 'subcat_2', 'item_condition_id'):
        dataset[column] = dataset[column].astype('category')
    
# Reference timestamp; every progress message below (and in later cells)
# reports seconds elapsed since this point.
start_time = time.time()

# Row count recorded under the name "test".
# NOTE(review): this is the size of `train` *before* the price filter below,
# and `nrow_test` is not read again in the visible cells - confirm whether it
# is still needed.
nrow_test = train.shape[0] #-dftt.shape[0]

# Separate out items priced under $1 (considered of no value to the platform).
dftt = train[(train.price < 1.0)]
train = train.drop(train[(train.price < 1.0)].index)

# Row count of `train` after the sub-$1 rows were dropped.
nrow_train = train.shape[0] #-dftt.shape[0]

#y = train["price"]
# NOTE(review): `dftt` (the sub-$1 rows) is concatenated back here, so those
# rows are still part of `merge` and of the target `y`; only their position
# (now at the end) changes. Confirm whether they were meant to be excluded.
merge: pd.DataFrame = pd.concat([train, dftt])
# The regression target is log1p(price); predictions must be mapped back with expm1.
y = np.log1p(merge["price"])
del merge['price']
#merge: pd.DataFrame = pd.concat([train, dftt, test])

# Free the original frame. After this point `train` no longer exists, so any
# cell that reads `train` (including this one) needs the data reloaded first.
del train
gc.collect()

# Split category_name into three levels, then drop the original column.
merge['general_cat'], merge['subcat_1'], merge['subcat_2'] = zip(*merge['category_name'].apply(lambda x: split_cat(x)))
merge.drop('category_name', axis=1, inplace=True)
print('{} 完成切分時間'.format(time.time() - start_time))

# Fill missing values.
handle_missing_inplace(merge)
print('[{}] 處理遺失值的時間.'.format(time.time() - start_time))

# Keep the popular brands/categories and relabel the rare ones as 'missing'
# to reduce the risk of overfitting.
cutting(merge)
print('[{}] 處理熱門品牌時間.'.format(time.time() - start_time))

# Cast the category-like columns to the `category` dtype.
to_categorical(merge)
print('[{}] 轉換成名目變數'.format(time.time() - start_time))

# Bag-of-words over item names; tokens below the NAME_MIN_DF document
# frequency are left out of the vocabulary.
cv = CountVectorizer(min_df=NAME_MIN_DF)
X_name = cv.fit_transform(merge['name'])
print('[{}] Count vectorize `name` completed.'.format(time.time() - start_time))

# Bag-of-words over each category level.
# `cv` is rebound here (the name-vectorizer above is no longer reachable) and
# the same instance is refit for every level, so after this cell `cv` holds
# only the vocabulary learned from `subcat_2`.
cv = CountVectorizer()
X_category1 = cv.fit_transform(merge['general_cat'])
X_category2 = cv.fit_transform(merge['subcat_1'])
X_category3 = cv.fit_transform(merge['subcat_2'])
print('[{}] categories詞袋完成.'.format(time.time() - start_time))

6.307417154312134 完成切分時間
[6.900678873062134] 處理遺失值的時間.
[17.551525115966797] 處理熱門品牌時間.
[18.63574242591858] 轉換成名目變數
[27.838124990463257] Count vectorize `name` completed.
[42.496277809143066] categories詞袋完成.


In [6]:
# TF-IDF over item_description: uni- to tri-grams, English stop words removed,
# vocabulary capped at MAX_FEATURES_ITEM_DESCRIPTION terms.
tv = TfidfVectorizer(
    max_features=MAX_FEATURES_ITEM_DESCRIPTION,
    ngram_range=(1, 3),
    stop_words='english',
)
description_text = merge['item_description']
X_description = tv.fit_transform(description_text)
print('[{}] TFIDF於 item_description花費時間.'.format(time.time() - start_time))

# One-hot encode brand_name as a sparse matrix.
lb = LabelBinarizer(sparse_output=True)
brand_labels = merge['brand_name']
X_brand = lb.fit_transform(brand_labels)
print('[{}] one hot encoding於 brand_name花費時間 .'.format(time.time() - start_time))

# Dummy-encode item_condition_id together with shipping.
# NOTE(review): `.values` produces a dense array before it is wrapped in a
# CSR matrix - fine for a handful of columns, worth revisiting if more are added.
condition_shipping = merge[['item_condition_id', 'shipping']]
X_dummies = csr_matrix(pd.get_dummies(condition_shipping, sparse=True).values.astype(int))
print('[{}] 將item_condition_id 及 shipping 轉變成類別變數的時間.'.format(time.time() - start_time))

# Stack every feature block side by side into one CSR design matrix.
# The column order below defines the feature layout used by the models.
feature_blocks = (
    X_dummies,
    X_description,
    X_brand,
    X_category1,
    X_category2,
    X_category3,
    X_name,
)
sparse_merge = hstack(feature_blocks).tocsr()
print('[{}] 將所有非結構化變數轉變成稀疏-結構化矩陣'.format(time.time() - start_time))

[227.58718848228455] TFIDF於 item_description花費時間.
[236.77531599998474] one hot encoding於 brand_name花費時間 .
[240.1822907924652] 將item_condition_id 及 shipping 轉變成類別變數的時間.
[259.1296684741974] 將所有非結構化變數轉變成稀疏-結構化矩陣


In [7]:
# Recompute the dummy encoding of item_condition_id and shipping.
# NOTE(review): this repeats a step from the cell that builds `sparse_merge`;
# `sparse_merge` is not rebuilt here, so this cell only rebinds `X_dummies`.
condition_shipping = merge[['item_condition_id', 'shipping']]
X_dummies = csr_matrix(pd.get_dummies(condition_shipping, sparse=True).values.astype(int))
print('[{}] 將item_condition_id 及 shipping 轉變成類別變數的時間.'.format(time.time() - start_time))

[307.4716639518738] 將item_condition_id 及 shipping 轉變成類別變數的時間.


In [8]:
# Hold out 10% of the rows for evaluation; the seed keeps the split reproducible.
train_X, test_X, train_y, test_y = train_test_split(sparse_merge, y, test_size=0.1, random_state=144)

# Ridge regression on the log1p(price) target.
# The `normalize` argument is intentionally not passed: it was removed from
# Ridge in scikit-learn 1.2 (passing it raises TypeError) and its old default
# was False, so the fitted model is unchanged.
model = Ridge(alpha=.5, copy_X=True, fit_intercept=True, max_iter=100,
              random_state=101, solver='auto', tol=0.01)

model.fit(train_X, train_y)
print('[{}] Train ridge completed'.format(time.time() - start_time))
predsR = model.predict(X=test_X)
print('[{}] Predict ridge completed'.format(time.time() - start_time))

# Targets and predictions live in log1p space; map both back to prices before
# scoring. (Feeding the log-space values to rmsle would apply the log twice.)
true_price = np.expm1(test_y)
pred_price = np.expm1(predsR)

rmsleR = rmsle(true_price, pred_price)
# Plain RMSE on the price scale, kept for reference alongside the RMSLE.
rmseR = np.sqrt(mean_squared_error(true_price, pred_price))
print(rmsleR)

[352.45053148269653] Train ridge completed
[352.46204805374146] Predict ridge completed
0.47356770978


In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with 200 trees on the same train/test split.
# A fixed random_state makes the fitted forest reproducible between runs.
# NOTE(review): this cell rebinds `model` and `predsR`, so any later cell that
# reads them reflects whichever model was fitted last.
model = RandomForestRegressor(n_estimators=200, random_state=101)

model.fit(train_X, train_y)
print('[{}] Train random forest completed'.format(time.time() - start_time))
predsR = model.predict(X=test_X)
print('[{}] Predict random forest completed'.format(time.time() - start_time))

# Targets and predictions live in log1p space; map both back to prices before
# scoring. (Feeding the log-space values to rmsle would apply the log twice.)
true_price = np.expm1(test_y)
pred_price = np.expm1(predsR)

rmsleR = rmsle(true_price, pred_price)
# Plain RMSE on the price scale, kept for reference alongside the RMSLE.
rmseR = np.sqrt(mean_squared_error(true_price, pred_price))
print(rmsleR)

In [9]:
# Side-by-side table of predicted vs. true prices for the held-out rows.
# Both inputs are in log1p space, so they are mapped back with expm1.
pred_log = pd.Series(predsR)
# Drop the original row labels so both columns align on a fresh 0..n-1 index.
true_log = test_y.reset_index(drop=True)

pred_true_df = pd.DataFrame({
    'Pred_price': np.expm1(pred_log),
    'true_price': np.expm1(true_log),
})

pred_true_df.head(10)

Unnamed: 0,Pred_price,true_price
0,9.649394,9.0
1,58.924672,11.0
2,24.16013,40.0
3,14.870223,22.0
4,21.98841,84.0
5,13.354307,12.0
6,16.567349,14.0
7,37.055872,26.0
8,29.321192,23.0
9,29.490493,36.0


In [10]:
true_col = pred_true_df['true_price']
pred_col = pred_true_df['Pred_price']

# Number of held-out items whose true price exceeds the prediction (under-predicted).
under_predicted = int((true_col > pred_col).sum())
print('真實值比預測值高的有' + str(under_predicted) + '個')

# Number of held-out items whose prediction exceeds the true price (over-predicted).
over_predicted = int((true_col < pred_col).sum())
print('預測值高比真實值比高的有' + str(over_predicted) + '個')

# Mean signed error (predicted minus true price); a negative value means the
# model under-predicts on average.
(pred_col - true_col).mean()

真實值比預測值高的有71755個
預測值高比真實值比高的有76499個


-3.6154141544640614