# https://www.kaggle.com/c/mercari-price-suggestion-challenge 

In [None]:
import matplotlib

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.pipeline import make_union, make_pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet, SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import make_scorer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from lightgbm import LGBMRegressor

%matplotlib inline
# Apply seaborn's default theme first: it resets matplotlib rcParams, so the
# figure size has to be assigned afterwards to take effect.
sns.set()
plt.rcParams["figure.figsize"] = (20, 10)
# Render floats with two decimals in pandas output.
pd.options.display.float_format = '{:.2f}'.format

### Используйте параметр nrows, чтобы уменьшить выборку и сделать базовый разведочный анализ данных

In [None]:
# Load the tab-separated training set; the first column becomes the index.
df = pd.read_csv('../input/train.tsv', sep='\t', index_col=0)

And why would index_col usage result in any comparisons?

In [None]:
# Summary of the training frame's columns.
df.info()

In [None]:
# Preview the first rows of the training frame.
df.head()

In [None]:
# Load the tab-separated test set; the first column becomes the index.
df_test = pd.read_csv('../input/test.tsv', sep='\t', index_col=0)

In [None]:
# Summary of the test frame's columns.
df_test.info()

In [None]:
# Preview the first rows of the test frame.
df_test.head()

# Задание
1. Сделать baseline submission
  * Исследовать признак price. 
  * Исследовать признак price в зависимости от brand_name или других признаков
2. Реализовать цикл анализа
  * признаки -> модель -> настройка параметров -> лучшая модель и ее значение метрики качества на кросс-валидации
3. Провести простые преобразования признаков и добавление простых признаков
  * разделить category_name на уровни
  * посмотреть на наличие числовых значений в описании и имени
  * ...
4. Составить план по применению нескольких моделей на разных признаках
  * спроектируйте эксперимент. Нужно заранее спланировать порядок перебора признаков и моделей. Потом только писать код. Обратный порядок вызывает необходимость переписывать существующий код, это трата времени
5. Просмотреть 1-5 kernel на kaggle. (только в таком порядке. сначала работаем самостоятельно, потом смотрим идеи других. при появлении опыта можно сразу начинать с них)
6. Скорректировать план
  * убрать пункты, которые кажутся неудачными
  * добавить идеи из kernel, кажущиеся удачными
7. Построить модель, выбрать лучшую
8. Построить ансамбль, настроить параметры. Сравнить с другими моделями.
9. Применить и засабмитить лучшую на cv модель
10. Прислать блокнот и свой ник в лидерборде

### Exploration

#### name

In [None]:
# Count missing item names. The Series method sums in vectorized code,
# whereas the builtin sum() iterates over every row in Python.
df["name"].isnull().sum()

It contains a formal description of an item; TF-IDF over the most frequent words should do just fine.

#### item_condition_id

In [None]:
# Distinct values of the item condition code.
df.item_condition_id.unique()

Ready-to-use feature; we just need to one-hot encode (OHE) it.

#### category_name

In [None]:
# Number of distinct full category strings (a missing value counts as one).
df.category_name.unique().shape

In [None]:
# Count missing categories. The Series method sums in vectorized code,
# whereas the builtin sum() iterates over every row in Python.
df["category_name"].isnull().sum()

In [None]:
# Number of distinct top-level categories. Missing values become "//" so
# that all three levels exist (as empty strings). `.str[0]` gives NaN rather
# than raising if a path is shorter than expected.
df.category_name.fillna("//").str.split("/").str[0].unique().shape

In [None]:
# Number of distinct second-level categories. Missing values become "//" so
# that all three levels exist (as empty strings). `.str[1]` gives NaN rather
# than raising IndexError if a path has fewer than two levels.
df.category_name.fillna("//").str.split("/").str[1].unique().shape

In [None]:
# Number of distinct third-level categories. Missing values become "//" so
# that all three levels exist (as empty strings). `.str[2]` gives NaN rather
# than raising IndexError if a path has fewer than three levels.
df.category_name.fillna("//").str.split("/").str[2].unique().shape

In [None]:
# Ten categories with the highest average price.
(
    df.groupby("category_name")[["price"]]
    .mean()
    .sort_values("price", ascending=False)
    .head(10)
)

It makes sense to split categories and their subcategories and OHE them all, CountVectorizer should make it almost automatically.

#### brand_name

In [None]:
# Count missing brand names. The Series method sums in vectorized code,
# whereas the builtin sum() iterates over every row in Python.
df["brand_name"].isnull().sum()

In [None]:
# Number of distinct brand names (a missing value counts as one).
df.brand_name.unique().shape

In [None]:
# Ten brands with the highest average price.
(
    df.groupby("brand_name")[["price"]]
    .mean()
    .sort_values("price", ascending=False)
    .head(10)
)

Brand *clearly* has a significant influence on the price.  
We need to replace NaNs with *some* class, then perform label encoding and OHE.

#### price

In [None]:
# Count missing prices. The Series method sums in vectorized code,
# whereas the builtin sum() iterates over every row in Python.
df["price"].isnull().sum()

Whew, no NaNs

In [None]:
# Summary statistics of the target.
df.price.describe()

In [None]:
# 90th percentile of price.
df.price.quantile(0.9)

In [None]:
# 99th percentile of price.
df.price.quantile(0.99)

Very skewed distribution

In [None]:
# Count listings priced at exactly zero. The Series method sums in
# vectorized code, whereas the builtin sum() iterates over every row.
(df["price"] == 0).sum()

Why? Does someone just give away goods?

In [None]:
# Histogram of log10(price + 1) with a logarithmic count axis; the +1 keeps
# zero-priced listings finite.
plot = np.log10(df.price + 1).hist(bins=20, log=True)
plot.set_ylabel("Count")
# The label names the quantity actually plotted (price + 1, not price).
plot.set_xlabel("log10(price + 1)")

Using this as a target with some linear regressors could pose a problem due to skewness. One solution is to use `log(price)` instead or, even better, `log(price+1)` (because of the zeroes).

#### shipping

In [None]:
# Number of distinct shipping values.
df.shipping.unique().shape

In [None]:
# Average price for each shipping value.
df.groupby("shipping")[["price"]].mean()

Just a nice binary feature; it can be used as-is.

#### item_description

In [None]:
# Count missing descriptions. The Series method sums in vectorized code,
# whereas the builtin sum() iterates over every row in Python.
df["item_description"].isnull().sum()

It's likely that this feature contains some meaningful free text that could greatly improve prediction.  
There are many possible approaches. One of them is to TF-IDF it with sensible constraints, maybe even try n-grams.  
However, it could take more than one attempt to find out how generous the Kaggle execution environment is.

### Data preparation

In [None]:
class LabelEncoderPipelineFriendly(LabelEncoder):
    """LabelEncoder that can sit in the middle of a feature pipeline.

    The methods accept (and ignore) the ``y`` argument that pipelines pass to
    every step, and the encoded labels are returned as a single column of
    shape ``(n_samples, 1)`` so a following transformer receives 2-D input.
    """

    def __init__(self, **kwargs):
        super(LabelEncoderPipelineFriendly, self).__init__(**kwargs)

    def fit(self, X, y=None):
        """Learn the label classes from ``X``; ``y`` is ignored.

        Returns:
            self, so that calls can be chained like any other estimator.
        """
        super(LabelEncoderPipelineFriendly, self).fit(X)
        return self

    def transform(self, X, y=None):
        """Encode ``X`` and return the codes as an ``(n_samples, 1)`` array."""
        return super(LabelEncoderPipelineFriendly, self).transform(X).reshape(-1, 1)

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return its codes as an ``(n_samples, 1)`` array."""
        # transform() already reshapes, so no second reshape is needed here.
        return self.fit(X).transform(X)

In [None]:
def prepare_data(train, test):
    """Turn the raw train and test frames into aligned feature matrices.

    Both frames are stacked and passed through one feature union, so every
    vocabulary and categorical level is learned from train and test together
    and the two outputs share the same columns.

    Args:
        train: DataFrame with the columns ``name``, ``item_condition_id``,
            ``category_name``, ``brand_name``, ``shipping`` and
            ``item_description``.
        test: DataFrame with the same feature columns.

    Returns:
        Tuple ``(train_features, test_features)``: the leading
        ``len(train)`` rows of the transformed matrix and the remaining rows.
    """

    def get_name_col(frame):
        return frame["name"]

    def get_condition_col(frame):
        return frame[["item_condition_id"]]

    def get_category_col(frame):
        return frame["category_name"].fillna("None").astype("category")

    def get_brand_col(frame):
        return frame["brand_name"].fillna("None").astype("category")

    def get_shipping_col(frame):
        return frame[["shipping"]]

    def get_desc_col(frame):
        return frame["item_description"].fillna("None")

    features = make_union(
        # min_df drops rare tokens so the name vocabulary stays manageable.
        make_pipeline(FunctionTransformer(get_name_col, validate=False),
                      TfidfVectorizer(min_df=15)),
        make_pipeline(FunctionTransformer(get_condition_col, validate=False),
                      OneHotEncoder()),
        make_pipeline(FunctionTransformer(get_category_col, validate=False),
                      CountVectorizer()),
        # OneHotEncoder produces sparse output by default, so no `sparse=`
        # keyword is passed (it was removed in scikit-learn 1.4).
        make_pipeline(FunctionTransformer(get_brand_col, validate=False),
                      LabelEncoderPipelineFriendly(),
                      OneHotEncoder()),
        make_pipeline(FunctionTransformer(get_shipping_col, validate=False)),
        make_pipeline(FunctionTransformer(get_desc_col, validate=False),
                      TfidfVectorizer(ngram_range=(1, 3),
                                      stop_words="english",
                                      max_features=10000)),
    )

    train_rows = train.shape[0]
    combined = pd.concat([train, test], axis=0)
    transformed = features.fit_transform(combined)
    # Rows keep the concat order: train first, then test.
    return transformed[:train_rows], transformed[train_rows:]

In [None]:
# Both frames go through the feature pipeline together so that every
# categorical level from either set is reliably encoded.
X, X_test = prepare_data(train=df, test=df_test)
y = np.array(df["price"])
# Regression target: log(price + 1).
log_y = np.log1p(y)

In [None]:
# Shapes of the train features, test features and target.
X.shape, X_test.shape, y.shape

#### Baseline

In [None]:
# https://www.kaggle.com/wiki/RootMeanSquaredLogarithmicError
def rmsle(h, y):
    """Return the root mean squared logarithmic error between ``h`` and ``y``.

    Args:
        h: Predicted values, array-like, each greater than -1.
        y: True values, array-like of the same shape, each greater than -1.

    Returns:
        ``sqrt(mean((log(h + 1) - log(y + 1)) ** 2))``.
    """
    # asarray accepts plain lists and tuples; log1p stays accurate near zero.
    log_h = np.log1p(np.asarray(h, dtype=float))
    log_y_true = np.log1p(np.asarray(y, dtype=float))
    return np.sqrt(np.mean(np.square(log_h - log_y_true)))
# RMSLE is a loss, hence greater_is_better=False.
scorer = make_scorer(rmsle, greater_is_better=False)

In [None]:
# Baseline candidates as (display name, estimator class, parameter grid):
# three linear models and one tree.
models = [
    (
        "Lasso",
        Lasso,
        dict(
            alpha=[0.1, 0.5],  # bigger alpha -> bigger error
            random_state=[0],
            max_iter=[1000],
            tol=[0.001],
            selection=["random"],
            fit_intercept=[False],  # everything is normalized, thanks to TF-IDF
        ),
    ),
    (
        "Ridge",
        Ridge,
        dict(
            # svd doesn't support sparse matrices; cholesky, sag and saga are slow
            solver=["lsqr", "sparse_cg"],
            alpha=[0.1, 0.5, 1],
            random_state=[0],
            tol=[0.001],
            fit_intercept=[False],
        ),
    ),
    (
        "ElasticNet",
        ElasticNet,
        dict(
            alpha=[0.1, 0.5],  # bigger alpha -> bigger error
            l1_ratio=[0.1, 0.5, 0.9],
            random_state=[0],
            max_iter=[1000],
            tol=[0.001],
            selection=["random"],
            fit_intercept=[False],
        ),
    ),
    (
        "DecisionTreeRegressor",
        DecisionTreeRegressor,
        dict(
            max_depth=[3, 7],  # deeper trees take longer to train
            random_state=[0],
        ),
    ),
]

In [None]:
def _log_space_rmse(y_true, y_pred):
    """RMSE between log1p-scale targets and predictions.

    Because the targets are log1p(price), this equals RMSLE on the price
    scale.
    """
    residual = np.asarray(y_true, dtype=float) - np.asarray(y_pred, dtype=float)
    return np.sqrt(np.mean(np.square(residual)))


# The estimators are fitted on log_y = log1p(price). Scoring them with an
# RMSLE function would apply the logarithm a second time and return NaN for
# any prediction below -1, so the error is measured directly in log space.
log_space_scorer = make_scorer(_log_space_rmse, greater_is_better=False)

# Each entry: (name, estimator class, refitted best estimator, best params,
# best cross-validated score).
best_models = []

for name, model_class, params in models:
    gs = GridSearchCV(model_class(), params, scoring=log_space_scorer, cv=5, n_jobs=1, refit=True)
    gs.fit(X, log_y)
    best_models.append((name, model_class, gs.best_estimator_, gs.best_params_, gs.best_score_))

In [None]:
# Write one submission file per tuned baseline, mapping the predictions from
# log1p space back to prices.
for entry in best_models:
    model_name, estimator = entry[0], entry[2]
    predicted = np.expm1(estimator.predict(X_test))
    submission = pd.DataFrame({"price": predicted}, index=df_test.index)
    submission.to_csv("baseline_{}.csv".format(model_name), sep=",")

### Ensembles

Anyone can just apply XGBoost or RandomForest. Let's stack our very own ensemble with the baseline models and LGBMRegressor!  
Alas, we're really out of time here, so, no CV for ensemble.

In [None]:
lgr = LGBMRegressor(n_jobs=-1, n_estimators=100)
# No eval_metric is passed: without an eval_set LightGBM never evaluates it,
# and the plain rmsle(h, y) function does not return the
# (name, value, is_higher_better) tuple a custom LightGBM metric must return.
lgr.fit(X, log_y)

In [None]:
# Register the LightGBM model in the same five-field layout as the
# grid-searched baselines.
# NOTE(review): fit() was called without an eval_set, so best_score_ is
# presumably empty here — confirm before relying on the last field.
best_models.append(("LGBMRegressor", LGBMRegressor, lgr, lgr.get_params(), lgr.best_score_))

In [None]:
# Column i holds the predictions of the i-th model on the training rows.
all_models_preds_train = np.zeros((X.shape[0], len(best_models)))
for i, model_entry in enumerate(best_models):
    estimator = model_entry[2]
    all_models_preds_train[:, i] = estimator.predict(X)

In [None]:
# Column i holds the predictions of the i-th model on the test rows.
all_models_preds_test = np.zeros((X_test.shape[0], len(best_models)))
for i, model_entry in enumerate(best_models):
    estimator = model_entry[2]
    all_models_preds_test[:, i] = estimator.predict(X_test)

In [None]:
# Blend the base-model predictions with a linear meta-regressor fitted on the
# log1p target, then map the blended prediction back to the price scale.
lr = LinearRegression().fit(all_models_preds_train, log_y)
final_preds = np.expm1(lr.predict(all_models_preds_test))

In [None]:
# Write the ensemble submission, indexed like the test frame.
ensemble_submission = pd.DataFrame({"price": final_preds}, index=df_test.index)
ensemble_submission.to_csv("ensemble.csv", sep=",")