# 商品信息可视化与文本处理结果可视化展示

In [None]:
# Launch with: jupyter notebook --NotebookApp.iopub_data_rate_limit=1.0e10
import nltk
import string
import re
import numpy as np
import pandas as pd
import pickle

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='white')

from nltk.stem.porter import *
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction import _stop_words

from collections import Counter
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

import  plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objects as go
import plotly.tools as tls
%matplotlib inline

import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.models import ColumnDataSource
from bokeh.plotting import figure, show, output_notebook

import warnings
import logging

# Silence library warnings so they do not clutter the notebook output.
warnings.filterwarnings('ignore')
# Only show WARNING and above from the 'lda' logger. The name previously read
# '1da' (digit one), which configured a logger nothing writes to.
# NOTE(review): presumably the topic-modelling logger is meant — confirm.
logging.getLogger('lda').setLevel(logging.WARNING)

In [None]:
# Load the tab-separated training and test files from the working directory.
train, test = (
    pd.read_csv(csv_path, sep='\t') for csv_path in ('train.csv', 'test.csv')
)

In [None]:
# Shapes (rows, columns) of the training and test sets, one per line.
print(train.shape, test.shape, sep='\n')

In [None]:
# Column data types in the training set: categorical (strings) and numeric.
# Left as a bare expression so the notebook renders it as the cell output.
train.dtypes

In [None]:
# Preview the first rows of the training set (rendered as the cell output).
train.head()

#### 对我们将要提供的建议价格进行处理，使用log变换

In [None]:
# Summary statistics of the price column.
train['price'].describe()

#### 价格属性转换前和转换后的分布情况对比

In [None]:
# Compare the raw price histogram with its log(price + 1) transform, side by
# side. The figure size is set once here instead of in each hist() call.
plt.figure(figsize=(20, 10))

# Left panel: raw prices; the histogram range limits the bins to [0, 250].
plt.subplot(1, 2, 1)
train['price'].plot.hist(bins=50, edgecolor='white', range=[0, 250])
plt.xlabel('price', fontsize=17)
plt.ylabel('frequency', fontsize=17)
plt.tick_params(labelsize=15)
plt.title('Price Distribution - Training Set', fontsize=17)

# Right panel: log(price + 1); the +1 keeps zero prices finite.
plt.subplot(1, 2, 2)
np.log(train['price'] + 1).plot.hist(bins=50, edgecolor='white')
plt.xlabel('log(price+1)', fontsize=17)
plt.ylabel('frequency', fontsize=17)
plt.tick_params(labelsize=15)
plt.title('Log(Price) Distribution - Training Set', fontsize=17)
plt.show()

#### 运费承担:大概有55%的卖家是承担运费的

In [None]:
# Fraction of all training rows for each value of the shipping flag.
shipping_counts = train['shipping'].value_counts()
shipping_counts / len(train)

#### 看一下运费不同情况的价格变化

In [None]:
# Split prices by the shipping flag (1 and 0); the variable names record
# which party pays in each case.
seller_pays = train['shipping'] == 1
buyer_pays = train['shipping'] == 0
prc_shipBySeller = train.loc[seller_pays, 'price']
prc_shipByBuyer = train.loc[buyer_pays, 'price']

In [None]:
# Overlay the log(price + 1) histograms of the two shipping groups.
fig, ax = plt.subplots(figsize=(20, 10))
ax.hist(np.log(prc_shipBySeller + 1), color='#8CB4E1', alpha=1.0, bins=50,
        label='Price when Seller pays Shipping')
# Drawn second and semi-transparent so the first histogram stays visible.
ax.hist(np.log(prc_shipByBuyer + 1), color='#007D00', alpha=0.7, bins=50,
        label='Price when Buyer pays Shipping')

# Title and axis labels are set exactly once; a previous ax.set(title=...,
# ylabel=...) call was immediately overwritten and never shown.
ax.set_title('Price Distribution by Shipping Type', fontsize=17)
ax.set_xlabel('log(price+1)', fontsize=17)
ax.set_ylabel('frequency', fontsize=17)
ax.tick_params(labelsize=15)
ax.legend()
plt.show()

#### 商品类别

In [None]:
# nunique() returns the single integer that %d needs; unique() returns the
# array of distinct values and cannot be formatted as one number.
print("There are %d unique values in the category column." % train['category_name'].nunique())

In [None]:
# Five most frequent raw category strings.
train['category_name'].value_counts().head(5)

In [None]:
# Count the rows whose category is missing.
unlabeled_count = train['category_name'].isnull().sum()
print("There are %d items that do not have a label." % unlabeled_count)

#### 类别细分一下

In [None]:
def spilt_cat(text):
    """Split a '/'-separated category path into exactly three levels.

    Args:
        text: Category path such as "main/sub1/sub2". Values without a
            ``split`` method (e.g. NaN or None for a missing category) are
            treated as unlabeled.

    Returns:
        A three-element sequence (general, subcat_1, subcat_2). Levels past
        the third are dropped and absent levels are filled with "No Label",
        so callers can always unpack three values.
    """
    missing = "No Label"
    try:
        levels = text.split('/')
    except AttributeError:
        # Non-string input: no category was recorded for this row.
        return (missing, missing, missing)
    # Normalise to three levels so zip(*...) in callers never comes up short.
    levels = levels[:3]
    levels.extend([missing] * (3 - len(levels)))
    return levels

# Expand the category path into three new columns, one per level.
train_levels = train['category_name'].apply(spilt_cat)
train['general_cat'], train['subcat_1'], train['subcat_2'] = zip(*train_levels)
train.head()

In [None]:
# Apply the same three-level category split to the test set.
test_levels = test['category_name'].apply(spilt_cat)
test['general_cat'], test['subcat_1'], test['subcat_2'] = zip(*test_levels)

In [None]:
# Number of distinct first-level sub-categories.
n_subcat_1 = train['subcat_1'].nunique()
print("There are %d unique first sub-categories." % n_subcat_1)

In [None]:
# Number of distinct second-level sub-categories.
n_subcat_2 = train['subcat_2'].nunique()
print("There are %d unique second sub-categories." % n_subcat_2)

#### 总体来说，我们有7个类别(第一个子类别中的114个和第二个子类别中有871个)：女性和美容项目是最受欢迎的两类(超过50％的观察)，其次是儿童和电子产品
#### 各大主类别分布情况:

In [None]:
# Item counts per main category, computed once and reused for both axes.
general_counts = train['general_cat'].value_counts()
x = general_counts.index.values.astype('str')
# Series.values is an ndarray attribute, not a method; calling it raises
# TypeError.
y = general_counts.values
# Share of all training rows per category, formatted like "12.34%".
pct = [("%.2f" % (v * 100)) + "%" for v in (y / len(train))]

In [None]:
# Interactive bar chart of item counts per main category; each bar's text
# carries its percentage share.
trace1 = go.Bar(x=x, y=y, text=pct)
layout = {
    'title': 'Number of Items by Main Category',
    'yaxis': {'title': 'Count'},
    'xaxis': {'title': 'Category'},
}
fig = {'data': [trace1], 'layout': layout}
py.iplot(fig)

#### subcat_1类别分布情况

In [None]:
# Counts for the 15 most frequent first-level sub-categories.
subcat_counts = train['subcat_1'].value_counts().head(15)
x = subcat_counts.index.values.astype('str')
# Series.values is an ndarray attribute, not a method; calling it raises
# TypeError.
y = subcat_counts.values
# Share of all training rows per sub-category, formatted like "12.34%".
pct = [("%.2f" % (v * 100)) + "%" for v in (y / len(train))]

In [None]:
# Bar chart of the top-15 sub-categories, coloured by count on the
# 'Portland' colour scale with a visible colour bar.
bar_marker = {
    'color': y,
    'colorscale': 'Portland',
    'showscale': True,
    'reversescale': False,
}
trace1 = go.Bar(x=x, y=y, text=pct, marker=bar_marker)
layout = {
    'title': 'Number of Items by Sub Category (Top 15)',
    'yaxis': {'title': 'Count'},
    'xaxis': {'title': 'Category'},
}
fig = {'data': [trace1], 'layout': layout}
py.iplot(fig)
# Author's note on the leading entries: athletic apparel, makeup, women's tops.

In [None]:
# Distinct main categories and, in the same order, the price series of each.
general_cats = train['general_cat'].unique()
x = [
    train.loc[train['general_cat'].eq(cat), 'price']
    for cat in general_cats
]

In [None]:
# One box trace of log(price + 1) per main category, named after the category.
data = [
    go.Box(x=np.log(cat_prices + 1), name=cat_name)
    for cat_name, cat_prices in zip(general_cats, x)
]

In [None]:
# The box traces receive their values through `x`, so the boxes are drawn
# horizontally: the x axis carries log(price + 1) and the y axis lists the
# categories. The axis titles are set to match that orientation.
layout = dict(title = 'Price Distribution by General Category',
              yaxis = dict(title = 'Category'),
              xaxis = dict(title = 'log(price+1)'))
fig = dict(data=data, layout=layout)
py.iplot(fig)

#### 品牌名字

In [None]:
# Number of distinct brand names in the training set.
n_brands = train['brand_name'].nunique()
print("There are %d unique brand names in the training dataset." % n_brands)

In [None]:
# Names and item counts of the 10 most frequent brands.
top_brands = train['brand_name'].value_counts().head(10)
x = top_brands.index.values.astype('str')
y = top_brands.values

In [None]:
# Bar chart of the 10 most frequent brands, coloured by item count.
trace1 = go.Bar(x = x, y = y,
                marker=dict(
                    color = y, colorscale = 'Portland', showscale = True,
                    reversescale = False
                ))
# x holds the brand names and y the counts, so the axis titles must follow
# that assignment (they were previously swapped).
layout = dict(title = 'Top 10 Brand by Number of Items',
              yaxis = dict(title = 'Count'),
              xaxis = dict(title = 'Brand Name'))
fig = dict(data=[trace1], layout=layout)
py.iplot(fig)

商品描述：由于它是非结构化数据，因此解析这个特定字段会更具有挑战性。这是否意味着更详细、更长的描述会带来更高的价格？我们将删除所有的标点符号和数字，删除一些英文停用词(如"a","the"等)以及长度不超过3的任何其他词

In [None]:
# Compiled once instead of on every call: matches punctuation, digits and
# carriage-return / tab / newline characters.
_NON_WORD_CHARS = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]')


def wordCount(text):
    """Count the informative words in a piece of text.

    The text is lower-cased, punctuation/digits/line breaks are replaced by
    spaces, and the result is split on single spaces. A token is counted
    when it is longer than three characters and is not in
    ``_stop_words.ENGLISH_STOP_WORDS``.

    Args:
        text: The text to analyse. Non-string values (e.g. NaN for a missing
            description) are accepted.

    Returns:
        The number of counted tokens, or 0 when ``text`` is not a string.
    """
    try:
        cleaned = _NON_WORD_CHARS.sub(" ", text.lower())
    except (AttributeError, TypeError):
        # Not a str (missing value, number, bytes): nothing to count.
        return 0
    return sum(
        1
        for w in cleaned.split(" ")
        if len(w) > 3 and w not in _stop_words.ENGLISH_STOP_WORDS
    )

In [None]:
# Add the cleaned description word count to both the training and test sets.
train['desc_len'] = train['item_description'].apply(wordCount)
# Each frame is measured from its own descriptions; the test column was
# previously filled from the training descriptions.
test['desc_len'] = test['item_description'].apply(wordCount)

In [None]:
# Preview the training set again, now including the desc_len column.
train.head()

In [None]:
# Mean price for every description length, as a two-column frame.
mean_price_by_len = train.groupby('desc_len')['price'].mean()
df = mean_price_by_len.reset_index()

#### 描述长短与价格有关吗？

In [None]:
# Line-and-marker plot of log(mean price + 1) against description length.
avg_log_price = np.log(df['price'] + 1)
trace1 = go.Scatter(x=df['desc_len'], y=avg_log_price,
                    mode='lines+markers', name='lines+markers')

layout = {
    'title': 'Average Log(Price) by Description Length',
    'yaxis': {'title': 'Average Log(Price)'},
    'xaxis': {'title': 'Description Length'},
}
fig = {'data': [trace1], 'layout': layout}
py.iplot(fig)

In [None]:
# Number of rows with a missing item description.
train['item_description'].isnull().sum()

In [None]:
# Keep only the rows that have an item description.
has_description = train['item_description'].notnull()
train = train[has_description]

In [None]:
# Build a dictionary mapping each main category to its tokenized descriptions.
# NOTE(review): 'tokenizers/punkt/english.pickle' is presumably NLTK's Punkt
# sentence tokenizer, so the entries below would be sentences rather than
# single words — confirm this is intended.
tokenize = nltk.data.load('tokenizers/punkt/english.pickle')
cat_desc = {}
for cat in general_cats:
    # Series.values is an ndarray attribute, not a method; calling it raises
    # TypeError.
    text = " ".join(train.loc[train['general_cat'] == cat, 'item_description'].values)
    cat_desc[cat] = tokenize.tokenize(text)

# Flatten the per-category token lists and count every token overall.
flat_lst = [item for sublist in cat_desc.values() for item in sublist]
allWordsCount = Counter(flat_lst)
# NOTE: despite its name, this holds the 20 most common entries.
all_top10 = allWordsCount.most_common(20)