In [None]:
# Credit: 
# https://www.kaggle.com/apapiu/ridge-script/code
# https://www.kaggle.com/thykhuely/mercari-interactive-eda-topic-modelling

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import scipy

from sklearn.feature_extraction.text import CountVectorizer #convert text to word count vectors
from sklearn.feature_extraction.text import TfidfVectorizer #convert text to word frequency vectors
from sklearn.feature_extraction.text import HashingVectorizer #convert text to unique integers 
from sklearn.preprocessing import LabelBinarizer

from sklearn.linear_model import Ridge, LogisticRegression

In [None]:
#NUM_BRANDS = 2500  # set a maximum number of brand names recorded in the dataset - pop_brand
# Vectorizer settings used by the model cells below.
NAME_MIN_DF = 5          # minimum document frequency for tokens taken from the item name
MAX_FEAT_DESCP = 60000   # vocabulary cap for the TF-IDF over item descriptions

# Display options: never truncate cell text and show up to 300 rows.
# `None` is the supported "no limit" value; the old `-1` sentinel is deprecated
# since pandas 1.0 and rejected by newer releases.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 300)

In [None]:
from nltk.corpus import stopwords

# Words taken back out of NLTK's English stop-word list so that the text
# vectorizers (which receive `stopWords`) still see them as tokens.
words_to_keep = {
    'with', 'not', 'doesn\'t', 'don\'t', 'didn\'t', 'under', 'above', 'all',
    'aren', 'aren\'t', 'below', 'both', 'but', 'couldn', 'couldn\'t', 'didn',
    'doesn', 'don', 'down', 'few', 'hadn', 'hadn\'t', 'hasn',
    'hasn\'t', 'haven', 'haven\'t', 'isn', 'isn\'t', 'mightn', 'mightn\'t',
    'more', 'most', 'mustn', 'mustn\'t', 'needn', 'needn\'t', 'no', 'nor',
    'off', 'once', 'only', 'other', "shouldn't", 'some', 'up', 'very', "wasn't",
    "weren't", "won't", "wouldn't",
}
stopWords = set(stopwords.words('english')) - words_to_keep
#stopWords.update(['like','"','go','don\'t','get','know','wikipedia','people'])
len(stopWords)

In [None]:
# Load the tab-separated train and test sets.
train = pd.read_csv('../input/train.tsv', sep='\t')
test = pd.read_csv('../input/test.tsv', sep='\t')

In [None]:
#features = test.columns.values # test.tsv test_id vs train_id in train.tsv
# Keep only training items that have a strictly positive price.
has_positive_price = train['price'] > 0.0
train = train.loc[has_positive_price]

In [None]:
#train.category_name[train.category_name.str.contains('Women', na=False)].value_counts()
# Categories with more than 1000 training items. value_counts() is sorted by
# descending frequency, so these are exactly its first `m` index entries.
category_counts = train['category_name'].value_counts()
m = (category_counts > 1000).sum()
top_categories = category_counts.index[:m]
#top_categories

In [None]:
# Disabled experiment: the whole cell is a bare string literal, so none of it
# executes. It sketches a manual split of train/test into "Women's Handbags",
# "Smartphones" and everything else.
'''
# Split the datasets for segmented linear regression
# 'Electronics', 'Women/Jewelry', 'Women/Women's Handbags', 'Women/Tops & Blouses', 'Men', other
train2 = train[train.category_name.str.contains('Women\'s Handbags',na=False)]
train1 = train[train.category_name.str.contains('Smartphones',na=False)]
train0 = train[-train.category_name.str.contains('Smartphones|Women\'s Handbags',na=False)]
#n2 = len(train2)
#n1 = len(train1)
#n0 = len(train0)
#y_train1 = np.log1p(train.price[train.category_name.str.contains('Electronics',na=False)]) # target: price
#y_train0 = np.log1p(train.price[-train.category_name.str.contains('Electronics',na=False)]) # target: price
test2 = test[test.category_name.str.contains('Women\'s Handbags',na=False)]
test1 = test[test.category_name.str.contains('Smartphones',na=False)]
test0 = test[-test.category_name.str.contains('Smartphones|Women\'s Handbags',na=False)]
'''


In [None]:
# Disabled experiment: a bare string literal, so nothing here executes. It
# would stack each manual train/test split into one frame per segment.
'''
df2 = pd.concat([train2, test2], 0)
df1 = pd.concat([train1, test1], 0)
df0 = pd.concat([train0, test0], 0)
'''

In [None]:

def segmented_linear_regression(df, num_brands=None):
    """Fit a ridge regression on one data segment and price its test rows.

    ``df`` must hold the segment's training rows first, followed by its test
    rows (e.g. ``pd.concat([train_part, test_part])``). Training rows are
    counted as the rows with a non-null ``train_id``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``name``, ``category_name``, ``brand_name``,
        ``item_description``, ``item_condition_id``, ``shipping``, ``price``,
        ``train_id`` and ``test_id``. The frame is not modified.
    num_brands : int or None, optional
        Keep only the ``num_brands`` most frequent brands and relabel the
        rest as "Other". ``None`` (default) keeps every brand.

    Returns
    -------
    tuple
        ``(predictions, train_sum_sqr_error, n)`` where ``predictions`` is a
        DataFrame with ``test_id`` and the predicted ``price`` of the test
        rows, ``train_sum_sqr_error`` is the sum of squared residuals on the
        training rows measured on ``log1p(price)``, and ``n`` is the number
        of training rows.
    """
    # Work on a private copy so the caller's frame is never mutated.
    df = df.copy()

    df["category_name"] = df["category_name"].fillna("unknown").astype('category')
    df["brand_name"] = df["brand_name"].fillna("unknown")
    pop_brands = df["brand_name"].value_counts().index[:num_brands]
    df.loc[~df["brand_name"].isin(pop_brands), "brand_name"] = "Other"
    df["brand_name"] = df["brand_name"].astype("category")
    df["item_description"] = df["item_description"].fillna("None")
    df["item_condition_id"] = df["item_condition_id"].astype("category")

    # Recent scikit-learn versions only accept a list (not a set) for stop_words.
    stop_words = sorted(stopWords)

    # 'name' - count vectorizer
    count_name = CountVectorizer(min_df=NAME_MIN_DF, ngram_range=(1, 3),
                                 stop_words=stop_words)
    X_name = count_name.fit_transform(df["name"])

    # 'category_name' - count vectorizer
    count_cat = CountVectorizer(ngram_range=(1, 3), stop_words=stop_words)
    X_category = count_cat.fit_transform(df["category_name"])

    # 'item_description' - Tfidf
    tfidf_descp = TfidfVectorizer(max_features=MAX_FEAT_DESCP,
                                  ngram_range=(1, 3),
                                  stop_words=stop_words)
    X_descp = tfidf_descp.fit_transform(df["item_description"])

    # 'brand_name' - Label Binarizer
    bnrz_brand = LabelBinarizer(sparse_output=True)
    X_brand = bnrz_brand.fit_transform(df["brand_name"])

    # 'item_condition_id': one-hot encoding; 'shipping' is passed through as is.
    # Cast to float explicitly: a mixed bool/int frame would otherwise yield an
    # object array that csr_matrix cannot convert.
    condition_dummies = pd.get_dummies(df[["item_condition_id", "shipping"]])
    X_condition = scipy.sparse.csr_matrix(
        condition_dummies.astype(np.float64).values)

    X = scipy.sparse.hstack((X_condition,
                             X_descp,
                             X_brand,
                             X_category,
                             X_name)).tocsr()

    # Training rows come first; they are the ones carrying a train_id.
    n = df["train_id"].count()
    X_train = X[:n]
    y_train = np.log1p(df["price"].iloc[:n])

    model = Ridge(solver="lsqr", fit_intercept=False)
    print("Fitting Model")
    model.fit(X_train, y_train)

    residuals = model.predict(X_train) - y_train.values
    train_sum_sqr_error = np.square(residuals).sum()

    X_test = X[n:]
    # Explicit copy: assigning into a plain slice of `df` is chained assignment.
    df_test = df.iloc[n:].copy()
    if X_test.shape[0] > 0:
        # Targets were log1p-transformed, so map predictions back to prices.
        df_test["price"] = np.expm1(model.predict(X_test))
    return (df_test[["test_id", "price"]], train_sum_sqr_error, n)
    

In [None]:
# Fit one ridge model per frequent category, then one model for all remaining
# rows, and pool the per-segment test predictions and training errors.
segment_predictions = []
train_sum_sqr_error = 0
for cat in top_categories:
    train_by_cat = train[train.category_name == cat]
    test_by_cat = test[test.category_name == cat]
    # `axis` must be passed by keyword: the positional form is deprecated and
    # no longer accepted by newer pandas.
    df = pd.concat([train_by_cat, test_by_cat], axis=0)
    df_seg, error, n = segmented_linear_regression(df)
    segment_predictions.append(df_seg)
    print(cat, n, (error/n)**0.5)
    train_sum_sqr_error += error

# Everything outside the top categories (rows with a missing category included,
# since isin() is False for NaN) forms the final "Others" segment.
train_others = train[~train.category_name.isin(top_categories)]
test_others = test[~test.category_name.isin(top_categories)]
df = pd.concat([train_others, test_others], axis=0)
df_seg, error, n = segmented_linear_regression(df)
segment_predictions.append(df_seg)
print("Others", n, (error/n)**0.5)
train_sum_sqr_error += error

# Concatenate once instead of growing a DataFrame inside the loop.
test_new = pd.concat(segment_predictions, axis=0)

# Root mean squared error over all training rows, on the log1p(price) scale.
train_error = (train_sum_sqr_error/len(train))**0.5
print("train error:", train_error)

In [None]:
# Disabled experiment: a bare string literal, so nothing here executes. It
# would run the regression over df2/df1/df0, which are themselves only
# defined inside another disabled cell.
'''
test_new = pd.DataFrame()
train_sum_sqr_error = 0
for df in [df2, df1, df0]:
    df_seg, error, n = segmented_linear_regression(df)
    test_new = pd.concat([test_new,df_seg], 0)
    print(error, n, (error/n)**0.5)
    train_sum_sqr_error += error 
train_error = (train_sum_sqr_error/len(train))**0.5
print("train error:", train_error)
'''

In [None]:
# Order predictions by test_id, store the id as int64, and write the
# two-column submission file.
test_new = (
    test_new
    .sort_values('test_id')
    .assign(test_id=lambda frame: frame['test_id'].astype('int64'))
)
submission_columns = ["test_id", "price"]
test_new[submission_columns].to_csv("submission_ridge.csv", index=False)

In [None]:
# Preview the first rows of the prediction frame.
test_new.head()