In [1]:
import pandas as pd
import numpy as np

import scipy

In [28]:
from sklearn.linear_model import Ridge , LogisticRegression
from sklearn.model_selection import train_test_split , cross_val_score
from sklearn.feature_extraction.text import CountVectorizer , TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer

import gc

In [3]:
NUM_BRANDS = 2500
NAME_MIN_DF = 10
MAX_FEAT_DESCP = 50000

In [4]:
df_train = pd.read_csv('train.tsv' , sep='\t')
df_test = pd.read_csv('test.tsv' , sep='\t')

In [5]:
df = pd.concat([df_train , df_test] , axis=0)

In [6]:
nrow_train = df_train.shape[0]
nrow_train
#训练数据样本数量

1482535

In [7]:
y_train = np.log1p(df_train.price)
#对数除偏

In [8]:
del df_train

gc.collect()

28

In [9]:
df.memory_usage(deep = True)

Index                 17407152
brand_name           112333748
category_name        190129667
item_condition_id     17407152
item_description     491095076
name                 181881830
price                 17407152
shipping              17407152
test_id               17407152
train_id              17407152
dtype: int64

In [10]:
df.category_name = df.category_name.fillna('Other').astype('category')
df.brand_name = df.brand_name.fillna('unknown')

In [11]:
pop_brands = df.brand_name.value_counts().index[:NUM_BRANDS]
#前2500名的品牌

In [13]:
df.brand_name.value_counts()

unknown                 928207
Nike                     79277
PINK                     79092
Victoria's Secret        70508
LuLaRoe                  45598
Apple                    25435
FOREVER 21               22327
Nintendo                 22156
Lululemon                21391
Michael Kors             20335
American Eagle           19418
Rae Dunn                 18031
Sephora                  17849
Disney                   15425
Coach                    15309
Bath & Body Works        15187
Adidas                   15002
Funko                    13568
Under Armour             12414
Sony                     11729
Old Navy                 11089
Hollister                10182
Carter's                  9289
The North Face            9137
Urban Decay               8979
Independent               8681
Too Faced                 8479
Xbox                      8406
Brandy Melville           8366
Kate Spade                7863
                         ...  
Caravelli                    1
BOSS Bla

In [14]:
df.loc[~ df.brand_name.isin(pop_brands) , 'brand_name'] = 'Other'
#将2500名以外的品牌置为other

In [16]:
df.item_description = df.item_description.fillna('None')
df.item_condition_id = df.item_condition_id.astype('category')
df.brand_name = df.brand_name.astype('category')

In [17]:
print(df.memory_usage(deep=True))

Index                 17407152
brand_name             4519098
category_name          4468732
item_condition_id      2175934
item_description     491095192
name                 181881830
price                 17407152
shipping              17407152
test_id               17407152
train_id              17407152
dtype: int64


In [18]:
count = CountVectorizer(min_df = NAME_MIN_DF)
X_name = count.fit_transform(df.name)

In [19]:
X_name

<2175894x21257 sparse matrix of type '<class 'numpy.int64'>'
	with 8946093 stored elements in Compressed Sparse Row format>

In [25]:
unique_categories = pd.Series('/'.join(df.category_name.unique().astype('str')).split('/')).unique()
count_category = CountVectorizer()
X_category = count_category.fit_transform(df.category_name)

In [20]:
df.head()

Unnamed: 0,brand_name,category_name,item_condition_id,item_description,name,price,shipping,test_id,train_id
0,unknown,Men/Tops/T-shirts,3,No description yet,MLB Cincinnati Reds T Shirt Size XL,10.0,1,,0.0
1,Razer,Electronics/Computers & Tablets/Components & P...,3,This keyboard is in great condition and works ...,Razer BlackWidow Chroma Keyboard,52.0,0,,1.0
2,Target,Women/Tops & Blouses/Blouse,1,Adorable top with a hint of lace and a key hol...,AVA-VIV Blouse,10.0,1,,2.0
3,unknown,Home/Home Décor/Home Décor Accents,1,New with tags. Leather horses. Retail for [rm]...,Leather Horse Statues,35.0,1,,3.0
4,unknown,Women/Jewelry/Necklaces,1,Complete with certificate of authenticity,24K GOLD plated rose,44.0,0,,4.0


In [24]:
gc.collect()

243

In [30]:
#tfidf
count_descp = TfidfVectorizer(max_features = MAX_FEAT_DESCP,
                              ngram_range = (1,3),
                              stop_words='english')


X_descp = count_descp.fit_transform(df.item_description)

In [31]:
X_descp

<2175894x50000 sparse matrix of type '<class 'numpy.float64'>'
	with 48510021 stored elements in Compressed Sparse Row format>

In [32]:
vect_brand = LabelBinarizer(sparse_output=True)
X_brand = vect_brand.fit_transform(df.brand_name)
#使用二进制位的one-hot
#logN bits binary

In [33]:
X_brand

<2175894x2501 sparse matrix of type '<class 'numpy.int32'>'
	with 2175894 stored elements in Compressed Sparse Row format>

In [34]:
df.head()

Unnamed: 0,brand_name,category_name,item_condition_id,item_description,name,price,shipping,test_id,train_id
0,unknown,Men/Tops/T-shirts,3,No description yet,MLB Cincinnati Reds T Shirt Size XL,10.0,1,,0.0
1,Razer,Electronics/Computers & Tablets/Components & P...,3,This keyboard is in great condition and works ...,Razer BlackWidow Chroma Keyboard,52.0,0,,1.0
2,Target,Women/Tops & Blouses/Blouse,1,Adorable top with a hint of lace and a key hol...,AVA-VIV Blouse,10.0,1,,2.0
3,unknown,Home/Home Décor/Home Décor Accents,1,New with tags. Leather horses. Retail for [rm]...,Leather Horse Statues,35.0,1,,3.0
4,unknown,Women/Jewelry/Necklaces,1,Complete with certificate of authenticity,24K GOLD plated rose,44.0,0,,4.0


In [35]:
X_dummies = scipy.sparse.csr_matrix(
pd.get_dummies(df[['item_condition_id','shipping']],
              sparse=True).values)



In [36]:
X = scipy.sparse.hstack((X_dummies,
                        X_descp,
                        X_brand,
                        X_category,
                        X_name)).tocsr()

In [37]:
X.shape

(2175894, 74785)

In [38]:
X_dummies.shape

(2175894, 6)

In [39]:
X_descp

<2175894x50000 sparse matrix of type '<class 'numpy.float64'>'
	with 48510021 stored elements in Compressed Sparse Row format>

In [41]:
X_train = X[: nrow_train]

model = Ridge(solver='lsqr' , fit_intercept=False)

In [42]:
model.fit(X_train , y_train)

X_test = X[nrow_train:]
preds = model.predict(X_test)



In [44]:
df_test['price'] = np.expm1(preds)

df_test[['test_id' , 'price']].to_csv('mm.csv',
                                     index=False)