In [31]:
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import mean_squared_log_error
from sklearn.preprocessing import LabelEncoder

In [46]:
# Both input files are tab-separated; parse them with the same reader call.
train = pd.read_csv('train.tsv', sep='\t')
test = pd.read_csv('test.tsv', sep='\t')

In [37]:
# Preview the first rows of the raw training data to check how the columns were parsed.
train.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity


In [38]:
# Preview the first rows of the raw test data; unlike train it has no price column.
test.head()

Unnamed: 0,test_id,name,item_condition_id,category_name,brand_name,shipping,item_description
0,0,"Breast cancer ""I fight like a girl"" ring",1,Women/Jewelry/Rings,,1,Size 7
1,1,"25 pcs NEW 7.5""x12"" Kraft Bubble Mailers",1,Other/Office supplies/Shipping Supplies,,1,"25 pcs NEW 7.5""x12"" Kraft Bubble Mailers Lined..."
2,2,Coach bag,1,Vintage & Collectibles/Bags and Purses/Handbag,Coach,1,Brand new coach bag. Bought for [rm] at a Coac...
3,3,Floral Kimono,2,Women/Sweaters/Cardigan,,0,-floral kimono -never worn -lightweight and pe...
4,4,Life after Death,3,Other/Books/Religion & Spirituality,,1,Rediscovering life after the loss of a loved o...


In [39]:
# (rows, columns) of each frame.
train.shape, test.shape

((1482535, 8), (693359, 7))

In [40]:
# Column dtypes and non-null counts for the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1482535 entries, 0 to 1482534
Data columns (total 8 columns):
train_id             1482535 non-null int64
name                 1482535 non-null object
item_condition_id    1482535 non-null int64
category_name        1476208 non-null object
brand_name           849853 non-null object
price                1482535 non-null float64
shipping             1482535 non-null int64
item_description     1482531 non-null object
dtypes: float64(1), int64(3), object(4)
memory usage: 90.5+ MB


In [41]:
# Count the missing values in each column of the training data.
train.isnull().sum()

train_id                  0
name                      0
item_condition_id         0
category_name          6327
brand_name           632682
price                     0
shipping                  0
item_description          4
dtype: int64

In [47]:
# splitting the category name with '/' and making three new columns out of it

def split_cat(text):
    """Split a '/'-separated category path into exactly three levels.

    Parameters
    ----------
    text : str or missing value
        Category path such as ``"Men/Tops/T-shirts"``. Anything that is not
        a string (for example the float NaN pandas stores for an empty cell)
        is treated as missing.

    Returns
    -------
    tuple of str
        ``(general_cat, subcat_1, subcat_2)``. Levels beyond the third are
        dropped and absent levels are filled with ``"missing"``, so every
        input yields the same length and a three-way ``zip(*...)`` unpacking
        of the results cannot fail on short or long paths.
    """
    # Explicit type check instead of a bare ``except`` so unrelated errors
    # are not silently converted into "missing".
    if not isinstance(text, str):
        return ("missing", "missing", "missing")

    levels = text.split("/")[:3]
    # Pad paths that have fewer than three levels.
    levels += ["missing"] * (3 - len(levels))
    return tuple(levels)
    
# Expand category_name into three level columns on both frames.
# zip(*...) transposes the per-row results of split_cat into per-column tuples.
train['general_cat'], train['subcat_1'], train['subcat_2'] = zip(*train['category_name'].apply(split_cat))
test['general_cat'], test['subcat_1'], test['subcat_2'] = zip(*test['category_name'].apply(split_cat))

# The original path column is no longer needed once it has been split.
train = train.drop(columns='category_name')
test = test.drop(columns='category_name')

# Regression target: log1p of the price.
y_train = np.log1p(train['price'])

# Fill missing text fields with a placeholder. The result is assigned back to
# the frame because calling fillna(inplace=True) on a column obtained via
# attribute access is chained assignment: it warns on recent pandas and does
# not modify the frame when Copy-on-Write is enabled.
text_fill = {'brand_name': 'missing', 'item_description': 'missing'}
train = train.fillna(text_fill)
test = test.fillna(text_fill)

In [48]:
# Re-inspect train: category_name is now replaced by general_cat / subcat_1 / subcat_2.
train.head()

Unnamed: 0,train_id,name,item_condition_id,brand_name,price,shipping,item_description,general_cat,subcat_1,subcat_2
0,0,MLB Cincinnati Reds T Shirt Size XL,3,missing,10.0,1,No description yet,Men,Tops,T-shirts
1,1,Razer BlackWidow Chroma Keyboard,3,Razer,52.0,0,This keyboard is in great condition and works ...,Electronics,Computers & Tablets,Components & Parts
2,2,AVA-VIV Blouse,1,Target,10.0,1,Adorable top with a hint of lace and a key hol...,Women,Tops & Blouses,Blouse
3,3,Leather Horse Statues,1,missing,35.0,1,New with tags. Leather horses. Retail for [rm]...,Home,Home Décor,Home Décor Accents
4,4,24K GOLD plated rose,1,missing,44.0,0,Complete with certificate of authenticity,Women,Jewelry,Necklaces


In [49]:
# Verify that no missing values remain in train after the fill step.
train.isnull().sum()

train_id             0
name                 0
item_condition_id    0
brand_name           0
price                0
shipping             0
item_description     0
general_cat          0
subcat_1             0
subcat_2             0
dtype: int64

In [50]:
# Frequency of each brand in train; "missing" is the placeholder used for rows without a brand.
train.brand_name.value_counts()

missing                    632682
PINK                        54088
Nike                        54043
Victoria's Secret           48036
LuLaRoe                     31024
Apple                       17322
FOREVER 21                  15186
Nintendo                    15007
Lululemon                   14558
Michael Kors                13928
American Eagle              13254
Rae Dunn                    12305
Sephora                     12172
Coach                       10463
Disney                      10360
Bath & Body Works           10354
Adidas                      10202
Funko                        9237
Under Armour                 8461
Sony                         7994
Old Navy                     7567
Hollister                    6948
Carter's                     6385
Urban Decay                  6210
The North Face               6172
Independent                  5902
Too Faced                    5794
Xbox                         5709
Brandy Melville              5680
Kate Spade    

In [51]:
print("Handling categorical variables...")

# Integer-encode every categorical column. The encoder is refit per column on
# the stacked train and test values so both frames share one label mapping.
encoder = LabelEncoder()
for column in ('brand_name', 'subcat_1', 'subcat_2', 'general_cat'):
    encoder.fit(np.hstack([train[column], test[column]]))
    train[column] = encoder.transform(train[column])
    test[column] = encoder.transform(test[column])

# Remove the helper names so they do not linger in the notebook namespace.
del encoder, column

train.head()

Handling categorical variables...


Unnamed: 0,train_id,name,item_condition_id,brand_name,price,shipping,item_description,general_cat,subcat_1,subcat_2
0,0,MLB Cincinnati Reds T Shirt Size XL,3,5265,10.0,1,No description yet,5,102,773
1,1,Razer BlackWidow Chroma Keyboard,3,3889,52.0,0,This keyboard is in great condition and works ...,1,30,215
2,2,AVA-VIV Blouse,1,4588,10.0,1,Adorable top with a hint of lace and a key hol...,9,103,97


In [54]:
# Show train again now that the categorical columns hold integer codes.
train.head()

Unnamed: 0,train_id,name,item_condition_id,brand_name,price,shipping,item_description,general_cat,subcat_1,subcat_2
0,0,MLB Cincinnati Reds T Shirt Size XL,3,5265,10.0,1,No description yet,5,102,773
1,1,Razer BlackWidow Chroma Keyboard,3,3889,52.0,0,This keyboard is in great condition and works ...,1,30,215
2,2,AVA-VIV Blouse,1,4588,10.0,1,Adorable top with a hint of lace and a key hol...,9,103,97
3,3,Leather Horse Statues,1,5265,35.0,1,New with tags. Leather horses. Retail for [rm]...,3,55,410
4,4,24K GOLD plated rose,1,5265,44.0,0,Complete with certificate of authenticity,9,58,542


In [53]:
# Number of distinct values per text / categorical column, in the listed order.
tuple(
    train[column].nunique()
    for column in ('name', 'brand_name', 'item_description',
                   'subcat_1', 'subcat_2', 'general_cat')
)

(1225273, 4810, 1281427, 114, 871, 11)

In [11]:
def get_rmsle(y_true, y_pred):
    """Root mean squared logarithmic error for values given in log1p space.

    Both arguments are expected to already be ``log1p``-transformed, so the
    RMSLE of the underlying values equals the plain RMSE of these inputs.
    Computing it directly avoids the ``expm1`` -> ``log1p`` round trip and does
    not fail when a prediction is slightly below zero in log space.

    Parameters
    ----------
    y_true : array-like
        Ground-truth targets in log1p space.
    y_pred : array-like
        Predicted targets in log1p space, same shape as ``y_true``.

    Returns
    -------
    numpy.float64
        The RMSLE of the back-transformed values.

    Raises
    ------
    ValueError
        If the inputs are empty or their shapes differ.
    """
    # np.asarray drops any pandas index so the comparison is positional.
    true_log = np.asarray(y_true, dtype=float)
    pred_log = np.asarray(y_pred, dtype=float)

    if true_log.shape != pred_log.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, "
            f"got {true_log.shape} and {pred_log.shape}")
    if true_log.size == 0:
        raise ValueError("y_true and y_pred must not be empty")

    return np.sqrt(np.mean((true_log - pred_log) ** 2))

# NOTE(review): X_train is not defined in any cell shown here; presumably it is
# the feature matrix built from `train` in another cell. Confirm it exists
# before running this cell on a fresh kernel.
cv = KFold(n_splits=10, shuffle=True, random_state=42)

# Index the target by position so the fold indices produced by KFold line up
# with rows regardless of the index y_train carries.
y_all = np.asarray(y_train)

for train_ids, valid_ids in cv.split(X_train):
    # `normalize` is intentionally not passed: False was its default and the
    # parameter was removed from Ridge in scikit-learn 1.2.
    model = Ridge(
        solver='auto',
        fit_intercept=True,
        alpha=0.5,
        max_iter=100,
        tol=0.05)
    model.fit(X_train[train_ids], y_all[train_ids])
    y_pred_valid = model.predict(X_train[valid_ids])
    # Arguments follow get_rmsle's declared order: (y_true, y_pred).
    rmsle = get_rmsle(y_all[valid_ids], y_pred_valid)
    print(f'valid rmsle: {rmsle:.5f}')
    # Only the first of the ten folds is evaluated (a single 90/10 hold-out).
    break

valid rmsle: 0.45331


In [10]:
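# NOTE: disabled experiment, kept for reference only. MultinomialNB is a
# classifier while y_train holds continuous log prices, and X_train_dtm /
# X_test_dtm are not defined in the cells shown, so this does not run as written.
# from sklearn.naive_bayes import MultinomialNB
# nb = MultinomialNB()
# nb.fit(X_train_dtm, y_train)
# y_pred_class = nb.predict(X_test_dtm)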