In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import squarify

In [None]:
# Load the training set (tab-separated); item_description is forced to be read as text.
train = pd.read_csv(
    "../input/train.tsv",
    sep="\t",
    dtype={"item_description": str},
)

In [None]:
# Load the test set (tab-separated); item_description is forced to be read as text.
test = pd.read_csv(
    "../input/test.tsv",
    sep="\t",
    dtype={"item_description": str},
)

In [None]:
# Preview the first rows of the test set.
test.head()

**First, we'll check some general information about our dataset:**

In [None]:
# (number of rows, number of columns) of the training set.
train.shape

In [None]:
# Print pandas' summary of the training frame (columns, dtypes, non-null counts).
train.info()

In [None]:
# Number of missing values in each column of the training set.
train.isna().sum()

In [None]:
# Preview the first five rows of the training set.
train.head(5)

## 观察结果 ##

1.  训练集有1482535行，8列
2.  变量大概有6种，其中有连续变量、类别变量、二元变量
3.  3列有缺失: category_name、item_description、brand_name

## name

问题: 有多少个name?

In [None]:
# Count the distinct product names (missing values are not counted).
n_names = train["name"].nunique()
print("name 的个数:", n_names)

 在产品中,存在name相同的产品

# item_condition_id #

In [None]:
# Bar chart of the number of listings per item condition, with each bar
# annotated by its share of all listings.
total = float(len(train.item_condition_id))

plt.figure(figsize=(17,10))
# Pass the column explicitly as ``x=``. Recent seaborn releases made the
# plotting parameters keyword-only, so a positional vector is no longer
# interpreted as ``x``; the keyword form works on old and new versions alike.
ax = sns.countplot(x=train.item_condition_id)

plt.title("Repartition of conditions", fontsize = 25)
plt.ylabel("Number of items", fontsize = 20)
plt.xlabel("Item condition ID", fontsize = 20)

# Write the percentage just above the top of every bar, centred horizontally.
for p in ax.patches:
    height = p.get_height()
    ax.text(p.get_x() + p.get_width() / 2,
            height + 3,
            "{:.2f}%".format((height / total) * 100),
            ha="center")

# category_name #

问题 ：有多少个类别名?

In [None]:
# Count the distinct category names (missing values are not counted).
nb_cat = train["category_name"].nunique()
print("类别名的个数:", nb_cat)

**前10的产品类别**

In [None]:
# Horizontal bar chart of the 10 most frequent categories.
top_categories = train.category_name.value_counts().iloc[:10].index

plt.figure(figsize=(17,10))
# Passing the column as ``y=`` already makes the bars horizontal; the former
# ``orient="v"`` argument contradicted that and has been dropped.
sns.countplot(y=train.category_name, order=top_categories)
plt.title("Top 10 categories",fontsize=25)
plt.ylabel("Category name",fontsize=20)
plt.xlabel("Number of product in the category",fontsize=20)

**Treemap of the categories :**

In [None]:
# Treemap of the 20 most frequent categories: tile area and colour follow the
# number of listings; each label also shows the category's mean price.

# Size of each category.
# value_counts() is already ordered by frequency (descending), so head(20) is
# the top 20. The columns are named explicitly because the default names that
# reset_index() produces for a value_counts() result differ between pandas
# versions; sorting by a guessed column name could silently sort by category
# name instead of by count.
cats = (
    train["category_name"]
    .value_counts()
    .head(20)
    .rename_axis("category_name")
    .reset_index(name="size")
)

# Price by category
group = train.groupby(train.category_name)
mean_price = group.price.mean().reset_index()

# Merging
cats = pd.merge(cats, mean_price, how='left', on='category_name')

# Colors setting: map each category's size onto the viridis colormap.
cmap = matplotlib.cm.viridis
mini = cats['size'].min()
maxi = cats['size'].max()
norm = matplotlib.colors.Normalize(vmin=mini, vmax=maxi)
colors = [cmap(norm(value)) for value in cats['size']]

# Labels setting (``%d`` truncates the mean price to an integer).
labels = ["%s\n%d items\n Mean price : %d$" % row
          for row in zip(cats['category_name'], cats['size'], cats['price'])]

# Plotting
plt.figure(figsize=(30,20))
# NOTE: plt.rc changes the default font size globally, i.e. for later figures too.
plt.rc('font', size=15)
squarify.plot(sizes = cats['size'], label = labels, alpha = .7, color=colors)
plt.axis('off')

**前20类别产品的平均价格**

## category-price ##

In [None]:
# Rank the categories by mean listing price and keep the 20 most expensive.
group = train.groupby(train.category_name)
mean_price = (
    group.price.mean()
    .to_frame()
    .sort_values(by="price", ascending=False)
    .head(20)
    .reset_index(level=0)
)

# Horizontal bars: one per category, bar length = mean price.
plt.figure(figsize=(17, 20))
sns.barplot(data=mean_price, x="price", y="category_name", orient="h")
plt.title("Top 20 categories with higher mean price", fontsize=30)
plt.ylabel("Categories", fontsize=25)
plt.xlabel("Mean price", fontsize=25)

**类别价格的分布**

In [None]:
# Density of the per-category mean price.
# The grouping is computed here instead of reusing the notebook-global
# ``group``: that name is rebound to a brand-level groupby further down, so
# re-running this cell out of order would silently plot brand means.
mean_price_2 = (
    train.groupby("category_name")["price"]
    .mean()
    .reset_index()
)

plt.figure(figsize =(12,7))
# NOTE(review): ``shade`` is deprecated in favour of ``fill`` in recent seaborn;
# kept as-is because the seaborn version this notebook targets is not visible here.
sns.kdeplot(mean_price_2.price, shade = True)
plt.title('Mean price by category distribution', fontsize = 20)
plt.xlabel('Mean price of each category', fontsize = 16)

正如我们所期望的那样，大部分类别的产品都是便宜的(<50),少部分的类别的产品是昂贵的

## brand_name ##

问题1: 有多少个brand_name ?

In [None]:
# Count the distinct brand names (missing values are not counted).
n_brands = train["brand_name"].nunique()
print("品牌名的个数:", n_brands)

**Top 10 brands by number of products :**

In [None]:
# Horizontal bar chart of the 10 brands with the most listings.
top_brands = train.brand_name.value_counts().iloc[:10].index

plt.figure(figsize=(17,10))
# Passing the column as ``y=`` already makes the bars horizontal; the former
# ``orient="v"`` argument contradicted that and has been dropped.
sns.countplot(y=train.brand_name, order=top_brands)
plt.title("Top 10 brand",fontsize=25)
plt.ylabel("Brand name",fontsize=20)
plt.xlabel("Number of product of the brand",fontsize=20)

**哪个品牌是最贵的?**

In [None]:
# Rank the brands by mean listing price and plot the 15 most expensive.
group = train.groupby(train.brand_name)
ranking = pd.DataFrame(group.price.mean())
ranking.reset_index(level = 0, inplace=True)
ranking = ranking.sort_values(by='price', ascending = False).head(15)

plt.figure(figsize=(14,12))
sns.barplot(x="price",y="brand_name",data=ranking,orient="h")
plt.title("Top 15 most expensive brands", fontsize = 30)
# The y axis shows brand names, so label it accordingly (it was "Categories").
plt.ylabel("Brands",fontsize = 25)
plt.xlabel("Mean price",fontsize = 25)

疑问: 采用均值衡量品牌的价格是否合理?

**More details on brands with a treemap :**

In [None]:
# Treemap of the 15 brands with the most listings: tile area and colour follow
# the number of listings; each label also shows the brand's mean price.

# Brands sorted by number of item.
# value_counts() is already ordered by frequency (descending), so head(15) is
# the top 15. The columns are named explicitly because the default names that
# reset_index() produces for a value_counts() result differ between pandas
# versions; sorting by a guessed column name could silently sort by brand name
# instead of by count.
brands = (
    train["brand_name"]
    .value_counts()
    .head(15)
    .rename_axis("brand_name")
    .reset_index(name="number_of_item")
)

# Brands by price
group = train.groupby(train.brand_name)
brands_prices = group.price.mean().reset_index()

# Merging
brands = pd.merge(brands, brands_prices, how = 'left', on = 'brand_name')

# Colors setting: build the palette from this cell's own brand counts rather
# than reusing the ``colors`` list left behind by the category treemap, which
# was scaled to category sizes and only exists if that cell ran first.
brand_norm = matplotlib.colors.Normalize(vmin=brands['number_of_item'].min(),
                                         vmax=brands['number_of_item'].max())
brand_colors = [matplotlib.cm.viridis(brand_norm(value))
                for value in brands['number_of_item']]

# Labels setting (``%d`` truncates the mean price to an integer).
labels = ["%s\n%d items\n Mean price : %d$" % row
          for row in zip(brands['brand_name'], brands['number_of_item'], brands['price'])]

# Plotting
plt.figure(figsize=(22,13))
# NOTE: plt.rc changes the default font size globally, i.e. for later figures too.
plt.rc('font', size=18)
squarify.plot(sizes = brands['number_of_item'], label = labels, alpha = .7, color=brand_colors)
plt.title('Brands treemap', fontsize = 35)
plt.axis('off')

**价格**

In [None]:
# Display floats with two decimals from here on (this is a global pandas option).
# Note the dot: "{:2f}" would mean "minimum width 2, default six decimals".
pd.options.display.float_format = "{:.2f}".format
train.price.describe()

In [None]:
# Price below which 99% of the listings fall.
price_q99 = train["price"].quantile(q=0.99)
print('The 99th quantile is :', price_q99)

In [None]:
# Kernel density estimate of the listing price, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(17, 10))
sns.kdeplot(train["price"], shade=True, ax=ax)
ax.set_title("Simple distribution plot of the price", fontsize=20)

大多数产品的价格都比较低，还有不少产品的价格为0

**统计价格为0的个数**

In [None]:
# Number of listings whose price is exactly zero (summing a boolean mask counts the True values).
n_zero_price = (train["price"] == 0).sum()
print (n_zero_price, 'items have a price of zero.')

**哪些类别的产品的价格为0**

In [None]:
# Horizontal bar chart of the 10 most frequent categories among zero-price listings.
price_of_zero = train.loc[train.price == 0]
top_zero_categories = price_of_zero.category_name.value_counts().iloc[:10].index

plt.figure(figsize=(17,10))
# Passing the column as ``y=`` already makes the bars horizontal; the former
# ``orient="v"`` argument contradicted that and has been dropped.
sns.countplot(y=price_of_zero.category_name, order=top_zero_categories)

plt.title("Top 10 categories of items with a price of 0",fontsize = 25)
plt.ylabel("Category name",fontsize = 20)
plt.xlabel("Number of product in the category", fontsize=20)

In [None]:
# NOTE(review): this cell draws the same chart as the cell directly before it
# (top 10 categories among zero-price listings); one of the two can probably be removed.
price_of_zero = train.loc[train.price == 0]
top_zero_categories = price_of_zero.category_name.value_counts().iloc[:10].index

plt.figure(figsize=(17,10))
# Passing the column as ``y=`` already makes the bars horizontal; the former
# ``orient='v'`` argument contradicted that and has been dropped.
sns.countplot(y = price_of_zero.category_name, order = top_zero_categories)
plt.title('Top 10 categories of items with a price of 0', fontsize = 25)
plt.ylabel('Category name',  fontsize = 20)
plt.xlabel('Number of product in the category',  fontsize = 20)

It's actually very similar to the top 10 categories of the whole dataset. So, having a price of zero is not specific to a category.

# shipping #

In [None]:
# Bar chart of the number of listings per shipping flag, with each bar
# annotated by its share of all listings.
total = float(len(train.shipping))

plt.figure(figsize=(10,7))
# Pass the column explicitly as ``x=``. Recent seaborn releases made the
# plotting parameters keyword-only, so a positional vector is no longer
# interpreted as ``x``; the keyword form works on old and new versions alike.
ax = sns.countplot(x=train.shipping)
plt.title("shipping fee paid by seller(1) or by buyer (0)",fontsize = 25)
plt.ylabel("Number of products", fontsize = 20)

# Write the percentage just above the top of every bar, centred horizontally.
for p in ax.patches:
    height = p.get_height()
    ax.text(p.get_x() + p.get_width() / 2,
            height + 3,
            "{:.2f}%".format(100 * height / total),
            ha="center")

问题: 价格是否依赖于shipping?

In [None]:
# Number of listings in each shipping group (0 = buyer pays, 1 = seller pays).
# Count rows via the boolean masks: DataFrame.size is rows * columns, so it
# overstated each group by a factor equal to the number of columns.
(train.shipping == 0).sum(), (train.shipping == 1).sum()

In [None]:
# Box plot of price for each shipping flag; outliers are hidden (showfliers=False).
fig, ax = plt.subplots(figsize=(10, 10))
sns.boxplot(
    x=train["shipping"],
    y=train["price"],
    showfliers=False,
    orient='v',
    ax=ax,
)
ax.set_title('Does shipping depend of prices ?', fontsize=25)
ax.set_xlabel('Shipping fee paid by seller (1) or by buyer (0)', fontsize=20)
ax.set_ylabel('Price without outliers', fontsize=20)

We use the option "showfliers=False" to flatten the graph (a few very high prices would otherwise stretch the axis).
As expected, sellers pay the shipping fees more often when an item is more expensive.

# item_description #

问题: 有多少是"No description yet"

In [None]:
# Flag listings whose description is the placeholder text "No description yet".
# NOTE: this adds the ``no_descrip`` column (0/1) to ``train`` in place; the
# cells below read it. Missing descriptions compare unequal and are flagged 0.
train["no_descrip"] = (train.item_description == "No description yet").astype("int64")

# The mean of a 0/1 flag is the share of flagged rows. Unlike
# value_counts(normalize=True).iloc[1], this does not assume the flagged rows
# are the less frequent class, and it does not fail when no row is flagged.
no_descrip_share = str(round(train["no_descrip"].mean() * 100, 2)) + "%"
print(no_descrip_share, "of the items have no description")

问题：是否是价格越贵，描述越完整?(或不完整的描述大多数发生在哪个范围)

In [None]:
# Kernel density estimate of the price of listings flagged as having no description.
no_description_price = train.loc[train["no_descrip"] == 1, "price"]

fig, ax = plt.subplots(figsize=(17, 10))
sns.kdeplot(no_description_price, shade=True, ax=ax)
ax.set_xlabel("No description product price")

In [None]:
# Compare how often the description is missing for cheap versus expensive listings.

# Price cut-off separating the two groups. The code previously split at 200
# while the printed text said 100$; the message is now built from this value
# so the two cannot drift apart.
# NOTE(review): 200 keeps the split the code actually computed - confirm the intended cut-off.
PRICE_THRESHOLD = 200

# Flag listings whose description is the placeholder text (adds/overwrites the
# ``no_descrip`` column of ``train`` in place).
train['no_descrip'] = (train.item_description == 'No description yet').astype('int64')

# The mean of a 0/1 flag is the share of flagged rows.
i = str(round(train['no_descrip'].mean() * 100, 2)) + '%'
print(i, 'of the items have no description. \n')

# Share of flagged listings WITHIN each price group. Dividing by the size of
# the whole dataset (as before) gave each group's contribution to the overall
# share, not the rate among items of that price range that the message describes.
i1 = str(round(train.no_descrip[train.price > PRICE_THRESHOLD].mean() * 100, 2)) + '%'
i2 = str(round(train.no_descrip[train.price <= PRICE_THRESHOLD].mean() * 100, 2)) + '%'

print(i2, 'of the items with a price of at most %d$ have no description, \n while' % PRICE_THRESHOLD,
      i1, 'of the items with a price above %d$ have no description.' % PRICE_THRESHOLD)

1. 似乎存在一个有趣的现象,没有描述的产品的价格分布和价格的总体分布相似
2. 从图中看出,是否价格为0的产品几乎都没有描述

**What words do people use ?**

In [None]:
# Word cloud of the words used across all item descriptions.
wc = WordCloud(background_color="white", max_words=5000,
               stopwords=STOPWORDS, max_font_size= 50)

# Drop missing descriptions before joining: str(NaN) is "nan", which would
# otherwise be counted as a real word in the cloud.
wc.generate(" ".join(train.item_description.dropna().astype(str)))

plt.figure(figsize=(20,12))
plt.imshow(wc)
plt.axis('off')
plt.show()