#  Feature engineering in MPSC

In [32]:
# import package
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import IncrementalPCA
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error

import time
import math

In [2]:
# Read the tab-separated train and test files into two DataFrames.
train, test = (
    pd.read_csv(tsv_path, delimiter='\t')
    for tsv_path in ('./data/train.tsv', './data/test.tsv')
)

In [4]:
# Stack train on top of test; ignore_index renumbers the rows 0..n-1.
dataset = pd.concat([train, test], ignore_index=True)

## 1 item_description
因为item_description的缺失值只有4条，因此直接丢弃这些行，然后用TF-IDF + SVD降维进行处理。最后生成一个DataFrame：**itemsvd**

In [17]:
# Drop the rows whose item_description is missing.
print(dataset.shape)
# notna() states the intent directly (the original relied on NaN != NaN), and
# reset_index(drop=True) renumbers the surviving rows 0..n-1 in a new frame
# instead of overwriting the index of the filtered result by hand.
dataset = dataset.loc[dataset['item_description'].notna()].reset_index(drop=True)
print(dataset.shape)

(2175890, 9)
(2175890, 9)


In [11]:
# Extract TF-IDF features from item_description and report the elapsed time.
started_at = time.time()
print("Tf-idf start...")

vectorizer = TfidfVectorizer(stop_words='english')
item_Tfidf = vectorizer.fit_transform(dataset["item_description"].tolist())

finished_at = time.time()
print("Tf-idf end")
print("Tf-idf cost ", finished_at - started_at, "s")

Tf-idf start...
Tf-idf end
Tf-idf cost  74.70364570617676 s


In [30]:
# Show the container type and the shape of the TF-IDF matrix; the printed
# label reads "data size before dimensionality reduction".
print(type(item_Tfidf))
print("降维前数据大小：", item_Tfidf.shape)

<class 'scipy.sparse.csr.csr_matrix'>
降维前数据大小： (2175890, 197497)


In [37]:
# Reduce the item_description TF-IDF matrix with truncated SVD and time it.
started_at = time.time()
print("Dimensionality reduction start...")

n_comp = 20  # number of dimensions kept after the reduction
trunsvd = TruncatedSVD(n_components=n_comp, algorithm='arpack')
# One column per component, named subitem_0 .. subitem_{n_comp-1}.
itemsvd = pd.DataFrame(
    trunsvd.fit_transform(item_Tfidf),
    columns=['subitem_' + str(k) for k in range(n_comp)],
)

finished_at = time.time()
print("Dimensionality reduction end")
print("Dimensionality reduction cost ", finished_at - started_at, "s")

Dimensionality reduction start...
Dimensionality reduction end
Dimensionality reduction cost  41.22044324874878 s


In [38]:
# Show the type, the shape and the first three rows of the reduced
# item_description features; the printed label reads "data size after
# dimensionality reduction".
print(type(itemsvd))
print("降维后数据大小：", itemsvd.shape)
itemsvd.head(3)

<class 'pandas.core.frame.DataFrame'>
降维后数据大小： (2175890, 20)


Unnamed: 0,subitem_0,subitem_1,subitem_2,subitem_3,subitem_4,subitem_5,subitem_6,subitem_7,subitem_8,subitem_9,subitem_10,subitem_11,subitem_12,subitem_13,subitem_14,subitem_15,subitem_16,subitem_17,subitem_18,subitem_19
0,0.999997,-0.00179,0.00031,-6.9e-05,-0.00083,-0.000124,-2.6e-05,0.000374,0.000142,0.000101,6.4e-05,0.000128,-1.9e-05,0.000111,9.4e-05,-0.000115,0.0001,4.1e-05,5e-06,-6.9e-05
1,0.000141,0.068881,-0.068222,0.081429,-0.025125,0.006459,-0.031117,-0.063037,0.08244,-0.005979,0.086945,-0.062698,-0.026055,0.038097,0.01711,0.021093,-0.026514,0.017586,-0.03163,0.022994
2,0.000132,0.038013,-0.022175,-0.024939,0.03418,-0.006637,0.005821,-0.032253,0.005179,-0.084634,-0.07778,-0.056461,-0.003477,0.014078,-0.044704,-0.041864,-0.038415,0.024993,0.001792,-0.047669


## 2 name
用TF-IDF + SVD降维处理，处理方法与item_description相同。最后生成一个DataFrame：**namesvd**

In [42]:
# Extract TF-IDF features from name and report the elapsed time.
started_at = time.time()
print("Tf-idf start...")

vectorizer = TfidfVectorizer(stop_words='english')
name_Tfidf = vectorizer.fit_transform(dataset["name"].tolist())

finished_at = time.time()
print("Tf-idf end")
print("Tf-idf cost ", finished_at - started_at, "s")

Tf-idf start...
Tf-idf end
Tf-idf cost  22.021522998809814 s


In [43]:
# Show the container type and the shape of the name TF-IDF matrix; the
# printed label reads "data size before dimensionality reduction".
print(type(name_Tfidf))
print("降维前数据大小：", name_Tfidf.shape)

<class 'scipy.sparse.csr.csr_matrix'>
降维前数据大小： (2175890, 130775)


In [45]:
# Reduce the name TF-IDF matrix with truncated SVD and time it.
started_at = time.time()
print("Dimensionality reduction start...")

n_comp = 20  # number of dimensions kept after the reduction
trunsvd = TruncatedSVD(n_components=n_comp, algorithm='arpack')
# One column per component, named subname_0 .. subname_{n_comp-1}.
namesvd = pd.DataFrame(
    trunsvd.fit_transform(name_Tfidf),
    columns=['subname_' + str(k) for k in range(n_comp)],
)

finished_at = time.time()
print("Dimensionality reduction end")
print("Dimensionality reduction cost ", finished_at - started_at, "s")

Dimensionality reduction start...
Dimensionality reduction end
Dimensionality reduction cost  22.48158025741577 s


In [47]:
# Show the type, the shape and the first three rows of the reduced name
# features; the printed label reads "data size after dimensionality reduction".
print(type(namesvd))
print("降维后数据大小：", namesvd.shape)
namesvd.head(3)

<class 'pandas.core.frame.DataFrame'>
降维后数据大小： (2175890, 20)


Unnamed: 0,subname_0,subname_1,subname_2,subname_3,subname_4,subname_5,subname_6,subname_7,subname_8,subname_9,subname_10,subname_11,subname_12,subname_13,subname_14,subname_15,subname_16,subname_17,subname_18,subname_19
0,0.051746,0.029468,0.019688,-0.069712,0.126573,-0.026622,0.092028,0.103965,0.126855,0.143488,-0.043068,-0.05408,-0.005593,-0.09001,-0.00946,0.026508,0.024064,-0.007097,-0.042367,-0.000171
1,7.5e-05,2.2e-05,4e-05,-0.000113,0.000137,0.000621,-9.1e-05,-0.000122,-0.000149,9.5e-05,-0.000113,0.000157,-0.000103,1.5e-05,-1e-06,-1.2e-05,-3.4e-05,-1e-05,-4.4e-05,2.6e-05
2,0.001794,0.001015,0.001415,-0.002837,0.004432,0.001054,0.00596,-0.000219,-0.001012,0.000202,-0.000159,-0.003266,0.000844,-0.00082,-0.000521,-0.000137,-0.001059,-8e-05,0.003946,-0.00216


## 3 brand_name
生成两个特征，**brand_bool**和**brand_label**

In [49]:
# generated brand_bool
def if_brand(x):
    """Return 1 when x compares equal to itself, otherwise 0.

    NaN is the value that fails the self-comparison, so a missing
    brand_name (NaN) gives 0 and every other value gives 1.
    """
    return 1 if x == x else 0
    
# Flag every row with 1/0 depending on whether brand_name is present, then
# print how many rows fall in each group.
dataset["brand_bool"] = dataset["brand_name"].apply(if_brand)
print(dataset["brand_bool"].value_counts())

1    1247686
0     928204
Name: brand_bool, dtype: int64


In [51]:
# generated brand_label
# Distinct brand names with the missing values removed.
keys = dataset['brand_name'].dropna().unique()
# Labels start at 1; brand_label uses 0 for values that have no entry.
indexs = [position + 1 for position in range(len(keys))]

brand_dict = {brand: code for brand, code in zip(keys, indexs)}

def brand_label(x):
    """Return the integer label stored in brand_dict for x, or 0 if x is not a key.

    brand_dict is built from the non-missing brand names with labels starting
    at 1, so 0 is left for values without an entry (e.g. a NaN brand_name).
    """
    # dict.get replaces the bare `except:`, which turned every error (not only
    # a missing key) into a silent 0.
    return brand_dict.get(x, 0)

# Map every brand_name to its integer label.
dataset["brand_label"] = dataset['brand_name'].apply(brand_label)

In [52]:
# Show the shape and the first three rows after adding the brand features.
print(dataset.shape)
dataset.head(3)

(2175890, 11)


Unnamed: 0,brand_name,category_name,item_condition_id,item_description,name,price,shipping,test_id,train_id,brand_bool,brand_label
0,,Men/Tops/T-shirts,3,No description yet,MLB Cincinnati Reds T Shirt Size XL,10.0,1,,0.0,0,0
1,Razer,Electronics/Computers & Tablets/Components & P...,3,This keyboard is in great condition and works ...,Razer BlackWidow Chroma Keyboard,52.0,0,,1.0,1,1
2,Target,Women/Tops & Blouses/Blouse,1,Adorable top with a hint of lace and a key hol...,AVA-VIV Blouse,10.0,1,,2.0,1,2


##  4 category_name
首先将category_name分割成三个子特征subcat_1、subcat_2和subcat_3，再为每个子特征生成一个新特征：**cat1_label**、**cat2_label**、**cat3_label**；
<br>然后再根据category_name生成**cat_bool**特征，总共新生成了4个特征。具体做法与brand_name相同。

In [63]:
# generator subcat_1 subcat_2 and subcat_3
def split_cat(text):
    """Split a category path such as "a/b/c" into exactly three levels.

    The string is split on the first two "/" only, so any further "/" stays
    inside the third level. A path with fewer than three levels is padded
    with "NoLabel" so the result always has three elements, which the
    three-way unpacking of the results relies on.

    Returns a list of three strings for a string input, and the tuple
    ("NoLabel", "NoLabel", "NoLabel") for any non-string input (e.g. NaN).
    """
    # Explicit type check instead of a bare `except:` around .split().
    if not isinstance(text, str):
        return ("NoLabel", "NoLabel", "NoLabel")
    parts = text.split("/", 2)
    # Pad short paths; without this a two-level category returns two items.
    parts.extend(["NoLabel"] * (3 - len(parts)))
    return parts
    
# zip(*rows) regroups the per-row results: it collects the first elements, the
# second elements and the third elements of all rows into three tuples (zip
# stops at the shortest row), which become the three new columns.
dataset['subcat_1'], dataset['subcat_2'], dataset['subcat_3'] = zip(*dataset['category_name'].apply(split_cat))

In [64]:
# making dictionaries for different categories
# Each dictionary maps the distinct values of one sub-category column to the
# integers 1..k, in the order in which unique() returns them.
keys = dataset["subcat_1"].unique()
indexs = [position + 1 for position in range(len(keys))]
cat1_dict = {name: code for name, code in zip(keys, indexs)}

keys2 = dataset["subcat_2"].unique()
indexs2 = [position + 1 for position in range(len(keys2))]
cat2_dict = {name: code for name, code in zip(keys2, indexs2)}

keys3 = dataset["subcat_3"].unique()
indexs3 = [position + 1 for position in range(len(keys3))]
cat3_dict = {name: code for name, code in zip(keys3, indexs3)}

In [65]:
# Print the first-level category -> label mapping for a quick visual check.
print(cat1_dict)

{'NoLabel': 10, 'Sports & Outdoors': 5, 'Kids': 9, 'Beauty': 7, 'Women': 3, 'Vintage & Collectibles': 6, 'Home': 4, 'Electronics': 2, 'Handmade': 11, 'Men': 1, 'Other': 8}


In [66]:
# generate cat1_label cat2_label and cat3_label
def cat_label(row,cat1_dict = cat1_dict, cat2_dict = cat2_dict, cat3_dict = cat3_dict):
    """Return the (cat1, cat2, cat3) integer labels for one row.

    row must provide 'subcat_1', 'subcat_2' and 'subcat_3'; each value is
    looked up in the matching dictionary. A value that is not a key of its
    dictionary gets the label 0 for that level only.

    The result is always a 3-tuple. The original returned the scalar 0 on
    any error, which cannot be unpacked into three label columns.
    """
    # Per-level dict.get replaces the bare `except:`; a row that lacks one of
    # the subcat_* fields now raises instead of being hidden.
    return (
        cat1_dict.get(row['subcat_1'], 0),
        cat2_dict.get(row['subcat_2'], 0),
        cat3_dict.get(row['subcat_3'], 0),
    )

# Compute the three labels row by row and spread them over three columns.
dataset["cat1_label"], dataset["cat2_label"], dataset["cat3_label"] = zip(*dataset.apply(cat_label, axis=1))

In [67]:
# generate cat_bool
def if_category(x):
    """Return 0 when x does not compare equal to itself (NaN), otherwise 1.

    Used to flag whether category_name is present: a missing value read as
    NaN gives 0, any other value gives 1.
    """
    if not (x == x):
        return 0
    return 1
    
# Flag every row with 1/0 depending on whether category_name is present, then
# print how many rows fall in each group.
dataset["cat_bool"] = dataset["category_name"].apply(if_category)
print(dataset["cat_bool"].value_counts())

1    2166505
0       9385
Name: cat_bool, dtype: int64


In [68]:
# drop subcat_1 subcat_2 and subcat_3
# inplace=True removes the columns from dataset itself instead of returning a copy.
dataset.drop(columns=["subcat_1", "subcat_2", "subcat_3"], inplace=True)

In [71]:
# Show the shape and the first three rows after adding the category features.
print(dataset.shape)
dataset.head(3)

(2175890, 15)


Unnamed: 0,brand_name,category_name,item_condition_id,item_description,name,price,shipping,test_id,train_id,brand_bool,brand_label,cat1_label,cat2_label,cat_bool,cat3_label
0,,Men/Tops/T-shirts,3,No description yet,MLB Cincinnati Reds T Shirt Size XL,10.0,1,,0.0,0,0,1,1,1,1
1,Razer,Electronics/Computers & Tablets/Components & P...,3,This keyboard is in great condition and works ...,Razer BlackWidow Chroma Keyboard,52.0,0,,1.0,1,1,2,2,1,2
2,Target,Women/Tops & Blouses/Blouse,1,Adorable top with a hint of lace and a key hol...,AVA-VIV Blouse,10.0,1,,2.0,1,2,3,3,1,3


# write to csv

In [72]:
# drop brand_name category_name item_description and name
# inplace=True removes the columns from dataset itself instead of returning a copy.
dataset.drop(columns=["brand_name", "category_name", "item_description", "name"], inplace=True)

In [73]:
# Show the shape and the first rows once the raw text columns are removed.
print(dataset.shape)
dataset.head()

(2175890, 11)


Unnamed: 0,item_condition_id,price,shipping,test_id,train_id,brand_bool,brand_label,cat1_label,cat2_label,cat_bool,cat3_label
0,3,10.0,1,,0.0,0,0,1,1,1,1
1,3,52.0,0,,1.0,1,1,2,2,1,2
2,1,10.0,1,,2.0,1,2,3,3,1,3
3,1,35.0,1,,3.0,0,0,4,4,1,4
4,1,44.0,0,,4.0,0,0,3,5,1,5


In [74]:
# Print the three shapes side by side; the column-wise concat in the next
# cell aligns on the index, so the row counts are expected to match.
print(itemsvd.shape, namesvd.shape, dataset.shape)

(2175890, 20) (2175890, 20) (2175890, 11)


In [75]:
# Append the itemsvd and namesvd columns to dataset and time the operation.
started_at = time.time()
print('Concat start...')

dataset_new = pd.concat((dataset, itemsvd, namesvd), axis='columns')

finished_at = time.time()
print('Concat end')
print("Concat cost ", finished_at - started_at, "s")

Concat start...
Concat end
Concat cost  1.757620096206665 s


In [80]:
# Print the shape of the combined frame and the number of missing values per column.
print(dataset_new.shape)
print(dataset_new.isnull().sum())

(2175890, 51)
item_condition_id          0
price                 693359
shipping                   0
test_id              1482531
train_id              693359
brand_bool                 0
brand_label                0
cat1_label                 0
cat2_label                 0
cat_bool                   0
cat3_label                 0
subitem_0                  0
subitem_1                  0
subitem_2                  0
subitem_3                  0
subitem_4                  0
subitem_5                  0
subitem_6                  0
subitem_7                  0
subitem_8                  0
subitem_9                  0
subitem_10                 0
subitem_11                 0
subitem_12                 0
subitem_13                 0
subitem_14                 0
subitem_15                 0
subitem_16                 0
subitem_17                 0
subitem_18                 0
subitem_19                 0
subname_0                  0
subname_1                  0
subname_2                  0


In [81]:
# List the column names of the combined frame and display its first rows.
print(dataset_new.columns)
dataset_new.head()

Index(['item_condition_id', 'price', 'shipping', 'test_id', 'train_id',
       'brand_bool', 'brand_label', 'cat1_label', 'cat2_label', 'cat_bool',
       'cat3_label', 'subitem_0', 'subitem_1', 'subitem_2', 'subitem_3',
       'subitem_4', 'subitem_5', 'subitem_6', 'subitem_7', 'subitem_8',
       'subitem_9', 'subitem_10', 'subitem_11', 'subitem_12', 'subitem_13',
       'subitem_14', 'subitem_15', 'subitem_16', 'subitem_17', 'subitem_18',
       'subitem_19', 'subname_0', 'subname_1', 'subname_2', 'subname_3',
       'subname_4', 'subname_5', 'subname_6', 'subname_7', 'subname_8',
       'subname_9', 'subname_10', 'subname_11', 'subname_12', 'subname_13',
       'subname_14', 'subname_15', 'subname_16', 'subname_17', 'subname_18',
       'subname_19'],
      dtype='object')


Unnamed: 0,item_condition_id,price,shipping,test_id,train_id,brand_bool,brand_label,cat1_label,cat2_label,cat_bool,...,subname_10,subname_11,subname_12,subname_13,subname_14,subname_15,subname_16,subname_17,subname_18,subname_19
0,3,10.0,1,,0.0,0,0,1,1,1,...,-0.043068,-0.05408,-0.005593,-0.09001,-0.00946,0.026508,0.024064,-0.007097,-0.042367,-0.000171
1,3,52.0,0,,1.0,1,1,2,2,1,...,-0.000113,0.000157,-0.000103,1.5e-05,-1e-06,-1.2e-05,-3.4e-05,-1e-05,-4.4e-05,2.6e-05
2,1,10.0,1,,2.0,1,2,3,3,1,...,-0.000159,-0.003266,0.000844,-0.00082,-0.000521,-0.000137,-0.001059,-8e-05,0.003946,-0.00216
3,1,35.0,1,,3.0,0,0,4,4,1,...,-0.011362,-0.01351,0.002793,-0.002177,0.000622,-0.001707,-0.008709,0.003225,0.023608,0.017805
4,1,44.0,0,,4.0,0,0,3,5,1,...,-0.009359,-0.006795,0.010102,0.001983,-0.000288,0.000131,0.00157,0.010042,0.00258,0.001851


In [82]:
# Write the combined feature table to disk (without the index column) and
# report how long the write took.
started_at = time.time()
print('Write start...')

dataset_new.to_csv('./data/dataset_new.csv', index=False)

finished_at = time.time()
print('Write end')
print("Write cost ", finished_at - started_at, "s")

Write start...
Write end
Write cost  257.78454303741455 s
