**Hi Kagglers, I'm new to Python and Data Science, so any comments or advice would help me a lot!**

# Get Input files

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import math
import re

from subprocess import check_output
print(check_output(["ls", "../input"]).decode("utf8"))
import os

In [None]:
# Load the competition files. train/test are tab-separated (.tsv).
train = pd.read_csv('../input/train.tsv', sep='\t', encoding='utf-8')
test = pd.read_csv('../input/test.tsv', sep='\t', encoding='utf-8')

# The sample submission is a comma-separated .csv file; parsing it with
# sep='\t' would collapse every row into a single unnamed-looking column.
sample = pd.read_csv('../input/sample_submission.csv', encoding='utf-8')

# Preparation

1. Add a log-price column (`logprice = log(price + 1)`)
1. Fill missing (`NaN`) category names with the placeholder "Other/Other/Other"
1. Add 1st- and 2nd-level category labels

In [None]:
# Log-transform the target: logprice = log(price + 1).
# np.log1p is the numerically accurate form of np.log(x + 1).
train["logprice"] = np.log1p(train["price"])

# Placeholder used for rows that have no category at all.
MISSING_CATEGORY = "Other/Other/Other"

# Apply exactly the same category preparation to both splits so that the
# train and test labels can never drift apart.
for frame in (train, test):
    # Merge rows without a category into the catch-all placeholder.
    frame["category_name"] = frame["category_name"].fillna(MISSING_CATEGORY)

    # 1st level = text before the first "/", 2nd level = first two segments.
    # expand=False makes str.extract return a Series regardless of the pandas
    # version (the default for `expand` changed between releases).
    frame["1st_category"] = frame["category_name"].str.extract('([^/]+)/[^/]+/[^/]+', expand=False)
    frame["2nd_category"] = frame["category_name"].str.extract('([^/]+/[^/]+)/[^/]+', expand=False)

train.head(20)

# All categories overview

Now, we have 3 levels of categories.
For example:

 * 1st category: Beauty
 * 2nd category: Beauty/Skin Care
 * 3rd category:  Beauty/Skin Care/Face ( = original category_name)

In [None]:
# Number of distinct (non-null) labels at each category depth:
# 1st level, 2nd level, and the full category name.
for level_column in ("1st_category", "2nd_category", "category_name"):
    print(train[level_column].nunique())

The 1st level has only the 10 categories listed below.
The 2nd level has 138 categories, and the 3rd level has 1287.

|1st_category|
|-------|
|Men|
|Electronics|
|Women|
|Home|
|Sports & Outdoors|
|Vintage & Collectibles|
|Beauty|
|Other|
|Kids|
|Handmade|

## variance / deviation of (log price)

In [None]:
# Standard deviation of log price inside every group, for each category depth
# (c1: 1st level, c2: 2nd level, c3: full category name).
c1, c2, c3 = (
    train.groupby([level_column])["logprice"].std()
    for level_column in ("1st_category", "2nd_category", "category_name")
)

# Average the per-group deviations to get one number per depth.
for per_group_std in (c1, c2, c3):
    print(per_group_std.mean())

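More detailed category levels have a smaller average standard deviation (and hence variance) of log price.
Judging by this information alone, predicting the price from the 3rd-level category should give a smaller error than predicting it from the 1st-level category.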

## category difference in train and test

Are there any categories that appear only in the test data? => **Yes**

In [None]:
# Toy example of set difference: elements of the left set that are missing
# from the right set.
{"hoge", "piyo", "fuga"} - {"hoge", "piyo", "foo"}

In [None]:
# Distinct full category names seen in each split (order of first appearance).
train_cat = train["category_name"].unique().tolist()
test_cat = test["category_name"].unique().tolist()

# Categories that occur in test but never in train.
set(test_cat) - set(train_cat)

The categories listed above (↑) appear only in the test data, not in the train data.
The number below (↓) is the count of categories that appear only in the train data, not in the test data.

In [None]:
# Only the count of train-only categories is shown; the full list is long.
len(set(train_cat) - set(test_cat))

In [None]:
# Distinct 2nd-level categories seen in each split.
train_cat2 = train["2nd_category"].unique().tolist()
test_cat2 = test["2nd_category"].unique().tolist()

train_cat2_set, test_cat2_set = set(train_cat2), set(test_cat2)
print(test_cat2_set - train_cat2_set)   # present in test only
print(train_cat2_set - test_cat2_set)   # present in train only

**For the 2nd-level categories, there is no difference between the train and test sets.**

# 1st category details

In [None]:
# Per-1st-level-category summary of price and log price,
# ordered from the most to the least populated category.
group1 = train.groupby(["1st_category"])
price1 = group1["price"]
logprice1 = group1["logprice"]

# The first column is the mean price and keeps the name "price".
cat1 = price1.mean().to_frame()

cat1_columns = [
    ("num", group1["1st_category"].count()),   # items per category
    ("logprice", logprice1.mean()),
    ("logstd", logprice1.std()),
    ("min", price1.min()),
    ("max", price1.max()),
    ("std", price1.std()),
    ("median", price1.median()),
]
for column_name, column_values in cat1_columns:
    cat1[column_name] = column_values

cat1 = cat1.sort_values(by='num', ascending=False)
cat1

In [None]:
# Letter-value plot of log price for every 1st-level category.
# seaborn renamed lvplot to boxenplot and newer releases no longer ship the
# old name, so prefer boxenplot and fall back to lvplot on old installs.
letter_value_plot = getattr(sns, "boxenplot", None) or sns.lvplot

f1 = train[["logprice", "1st_category"]]
plt.figure(figsize=(15, 8))
ax = letter_value_plot(y=f1["1st_category"], x=f1["logprice"])

 * **all 1st categories seem to have enough samples** (Sports & Outdoors has the smallest, but still 25342)

# 2nd category details

In [None]:
# Per-2nd-level-category summary of price and log price,
# ordered from the most to the least populated category.
group2 = train.groupby(["2nd_category"])
price2 = group2["price"]
logprice2 = group2["logprice"]

# The first column is the mean price and keeps the name "price".
cat2 = price2.mean().to_frame()

cat2_columns = [
    ("num", group2["2nd_category"].count()),   # items per category
    ("logprice", logprice2.mean()),
    ("logstd", logprice2.std()),
    ("min", price2.min()),
    ("max", price2.max()),
    ("std", price2.std()),
    ("median", price2.median()),
]
for column_name, column_values in cat2_columns:
    cat2[column_name] = column_values

cat2 = cat2.sort_values(by='num', ascending=False)
cat2.head(20)

In [None]:
# Number of items in each 2nd-level category, in cat2's row order
# (most populated category first).
fig, ax = plt.subplots(figsize=(15, 8))
sns.stripplot(x=cat2.index, y=cat2["num"], color="Red", ax=ax)
ax.set_ylim(0,)
# The x axis holds 2nd-level categories (cat2's index), not full
# category_name values, so label it accordingly.
ax.set(xlabel='2nd_category', ylabel='Items in category')
# Hide the tick labels (one per category); assigning the result keeps the
# return value out of the cell output.
d = ax.set(xticklabels=[])

Zoom in on the last 60 (least populated) categories. ↓

In [None]:
# Zoom in on the 60 least populated 2nd-level categories
# (cat2 is sorted by item count, largest first, so these are its last rows).
cat2_tail = cat2.tail(60)

fig, ax = plt.subplots(figsize=(15, 8))
sns.stripplot(x=cat2_tail.index, y=cat2_tail["num"], color="Red", ax=ax)
ax.set_ylim(0,)
# The x axis holds 2nd-level categories, not full category_name values.
ax.set(xlabel='2nd_category', ylabel='Items in category')
# Hide the tick labels; assigning the result keeps the return value out of
# the cell output.
d = ax.set(xticklabels=[])

**Except for a few categories, the 2nd-level categories seem to have a reasonable number of samples.**

In [None]:
# Letter-value plot of log price for every 2nd-level category, with the
# categories in alphabetical order.
# seaborn renamed lvplot to boxenplot and newer releases no longer ship the
# old name, so prefer boxenplot and fall back to lvplot on old installs.
letter_value_plot = getattr(sns, "boxenplot", None) or sns.lvplot

f2 = train[["logprice", "2nd_category"]].sort_values(by=["2nd_category"])
plt.figure(figsize=(9, 27))  # tall figure: one row per 2nd-level category
ax = letter_value_plot(y=f2["2nd_category"], x=f2["logprice"])

# 3rd level category details

Now we investigate the data by `category_name` (the full 3-level category).
Below are rough statistics for each category.

In [None]:
# Per-full-category (3rd level) summary of price and log price,
# ordered from the most to the least populated category.
group3 = train.groupby(["category_name"])
price3 = group3["price"]
logprice3 = group3["logprice"]

# The first column is the mean price and keeps the name "price".
cat3 = price3.mean().to_frame()

cat3_columns = [
    ("num", group3["category_name"].count()),   # items per category
    ("logprice", logprice3.mean()),
    ("logstd", logprice3.std()),
    ("min", price3.min()),
    ("max", price3.max()),
    ("std", price3.std()),
    ("median", price3.median()),
]
for column_name, column_values in cat3_columns:
    cat3[column_name] = column_values

cat3 = cat3.sort_values(by='num', ascending=False)
cat3.head(20)

In [None]:
# Number of items in each full category, in cat3's row order
# (most populated category first).
fig, ax = plt.subplots(figsize=(15, 8))
sns.stripplot(x=cat3.index, y=cat3["num"], color="Red", ax=ax)
ax.set_ylim(0,)
ax.set(xlabel='category_name', ylabel='Items in category')
# Hide the tick labels (one per category); assigning the result keeps the
# return value out of the cell output.
d = ax.set(xticklabels=[])

In [None]:
# How many full categories there are, and how many of them hold at most
# 100 / at most 15 items.
# Count directly on the boolean mask: the previous where(...).dropna() form
# also dropped every category whose std/logstd is NaN (a category with a
# single item has no sample standard deviation), undercounting exactly the
# smallest categories.
print(len(cat3.index)) # 1287
print(int((cat3["num"] <= 100).sum()))
print(int((cat3["num"] <= 15).sum()))

**Most 3rd-level categories do not have enough samples**

Most categories contain only a few examples. About half of the categories (600/1287) have 100 items or fewer, and about 300 categories have 15 items or fewer.

Visualize the (log) price distribution of the top 30 categories.

In [None]:
# Letter-value plot of log price for the 30 most populated full categories.
# seaborn renamed lvplot to boxenplot and newer releases no longer ship the
# old name, so prefer boxenplot and fall back to lvplot on old installs.
letter_value_plot = getattr(sns, "boxenplot", None) or sns.lvplot

# cat3 is sorted by item count (largest first), so its head is the top 30.
cat3_top30 = list(cat3.head(30).index) # top 30 categories

# Build the plotting frame in one chain; .assign returns a new frame, which
# avoids assigning a column onto a slice of `train` (SettingWithCopyWarning).
f3 = (
    train.loc[train["category_name"].isin(cat3_top30), ["price", "category_name"]]
    .sort_values(by="category_name")
    .assign(logprice=lambda top: np.log(top["price"] + 1))
)

plt.figure(figsize=(12, 9))
ax = letter_value_plot(y=f3["category_name"], x=f3["logprice"])
d = ax.set(ylabel='category_name(top 30)', xlabel='logprice')

In [None]:
# Spread (standard deviation) of log price for every full category, plotted
# in cat3's row order.
group3 = train.groupby(["category_name"])
cat3["logstd"] = group3["logprice"].std()   # aligned to cat3 by category name

fig, ax = plt.subplots(figsize=(12, 9))
sns.stripplot(y=cat3["logstd"], x=cat3.index, ax=ax)
ax.set(ylabel='logstd', xlabel='<- popular  category    unpopular->')
# Hide the tick labels (one per category); assigning the result keeps the
# return value out of the cell output.
d = ax.set(xticklabels=[])

As expected, categories with few samples show a wide range of standard deviations (and variances), because each value is dominated by the prices of just a few items.

# Making optimized category label

**Category feature engineering**

e.g.
  * By default, use the 2nd-level category as the price marker
  * Use the 3rd-level category instead when:
      * It has enough samples in train
      * It appears in both train and test

**Next task: make an optimized category label**