## VITAMIN FOR WOMEN
**Pre-processing**

In [None]:
from functools import lru_cache

import jieba
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [None]:
# Never truncate rows when a frame or series is displayed.
pd.options.display.max_rows = None

# Cleaned inputs, one CSV per platform.
tmall_path = '../data/clean/tmall_clean.csv'
xhs_path = '../data/clean/xhs_clean_female.csv'

xhs = pd.read_csv(xhs_path)
tmall = pd.read_csv(tmall_path)

In [None]:
# Preview the first rows of the XHS data.
xhs.head()

In [None]:
# Preview the first rows of the Tmall data.
tmall.head()

In [None]:
# Load the top-100 SKU table and display it in full.
top100 = pd.read_csv('../data/clean/top_100_sku.csv')
top100

### Merge Two Platforms

In [None]:
# Stack the XHS 'text' column on top of the Tmall 'comment' column.
combine = pd.concat([xhs['text'], tmall['comment']])

# Turn the stacked series into a one-column frame named 'text' and replace
# the index inherited from the two source frames with a fresh 0..n-1 range.
df = combine.to_frame(name='text').reset_index(drop=True)

In [None]:
# (rows, columns) of the merged corpus.
df.shape

### Pre-processing
- 自定义stop word
- 结巴分词 --(自定义字典)
- tfidf
- 选取关键词

1. 删除停用词

In [None]:
def load_stopword(path='../asset/stop_words.txt'):
    """Load the custom stop-word list.

    Args:
        path: UTF-8 text file with one stop word per line. Defaults to the
            project's own Chinese stop-word list.

    Returns:
        list of str: every line of the file, in order, with leading and
        trailing whitespace removed (blank lines become empty strings).

    Raises:
        OSError: if the file cannot be opened.
    """
    # The context manager closes the file even if reading or decoding fails.
    with open(path, encoding='utf-8') as f_stop:
        return [line.strip() for line in f_stop]

2. 分词

In [None]:
@lru_cache(maxsize=None)
def _load_userdict_once(path):
    """Register the custom jieba dictionary at *path*, once per distinct path."""
    jieba.load_userdict(path)


@lru_cache(maxsize=None)
def _stopword_set():
    """Return the stop words as a frozenset, reading the file only on first use."""
    return frozenset(load_stopword())


def seg_word(sentence):
    """Segment *sentence* with jieba and drop stop words.

    The custom dictionary and the stop-word list are loaded on the first call
    and reused afterwards, instead of being re-read for every sentence.
    Re-run this cell to pick up edits to either file.

    Args:
        sentence: the raw text to segment (a str).

    Returns:
        str: the kept tokens, each followed by a single space (so a non-empty
        result ends with a trailing space; an input with no kept tokens
        gives '').
    """
    _load_userdict_once('../asset/dict.txt')  # custom dictionary
    stopwords = _stopword_set()

    kept = (
        word
        for word in jieba.cut(sentence.strip())
        # The original compared against '/t', which can never equal a tab
        # token; '\t' is what the filter was meant to remove.
        if word not in stopwords and word != '\t'
    )
    return ''.join(word + ' ' for word in kept)

### Combined XHS + Tmall

In [None]:
# Segment every text with seg_word and store the space-separated tokens
# in a new 'segmented' column.
df['segmented'] = df['text'].apply(seg_word)

In [None]:
# Preview the raw text next to its segmented form.
df.head()

#### One word

In [None]:
# Unigram TF-IDF over the combined XHS + Tmall corpus.
# - stop_words='english': drop scikit-learn's built-in English stop words
#   (the custom stop-word list was already applied in seg_word).
# - min_df=10: ignore terms that occur in fewer than 10 documents.
# - max_features=3000: keep only the 3000 most frequent terms.
# NOTE(review): the default token_pattern keeps only tokens of two or more
# characters, so single-character words are dropped - confirm this is intended.
tvec = TfidfVectorizer(
    stop_words='english',
    min_df=10,
    max_features=3000,
)

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) is its replacement.
# .toarray() densifies the result: one row per document, one column per term.
df_tvec = pd.DataFrame(
    tvec.fit_transform(df['segmented']).toarray(),
    columns=tvec.get_feature_names_out(),
)

df_tvec.head()

In [None]:
# Rank terms by their TF-IDF weight summed over all documents; show the top 1000.
df_tvec.sum().sort_values(ascending=False).head(1000)

#### Bigram

In [None]:
# Bigram-only TF-IDF over the combined corpus: ngram_range=(2, 2) keeps
# sequences of exactly two consecutive tokens. Terms must occur in at least
# 10 documents (min_df) and the vocabulary is capped at 3000 (max_features).
tvec_bigram = TfidfVectorizer(
    stop_words='english',
    min_df=10,
    max_features=3000,
    ngram_range=(2, 2),
)

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) is its replacement.
df_tvec_bigram = pd.DataFrame(
    tvec_bigram.fit_transform(df['segmented']).toarray(),
    columns=tvec_bigram.get_feature_names_out(),
)

df_tvec_bigram.head()

In [None]:
# Rank bigrams by their TF-IDF weight summed over all documents; show the top 1000.
df_tvec_bigram.sum().sort_values(ascending=False).head(1000)

#### One word & bigram

In [None]:
# Unigram + bigram TF-IDF over the combined corpus: ngram_range=(1, 2) keeps
# single tokens and pairs of consecutive tokens. The vocabulary cap is raised
# to 5000 (max_features); terms must still occur in at least 10 documents.
tvec_ngram = TfidfVectorizer(
    stop_words='english',
    min_df=10,
    max_features=5000,
    ngram_range=(1, 2),
)

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) is its replacement.
df_tvec_ngram = pd.DataFrame(
    tvec_ngram.fit_transform(df['segmented']).toarray(),
    columns=tvec_ngram.get_feature_names_out(),
)

df_tvec_ngram.head()

In [None]:
# Rank unigrams and bigrams together by summed TF-IDF weight; show the top 1000.
df_tvec_ngram.sum().sort_values(ascending=False).head(1000)

### XHS only

#### One word

In [None]:
# One-column copy of the XHS texts, plus their segmented form.
xhs_seg = xhs[['text']].copy()
xhs_seg['segmented'] = xhs_seg['text'].apply(seg_word)

# Unigram TF-IDF over XHS only. Compared with the combined-corpus run this
# also sets max_df=0.99, ignoring terms present in more than 99% of documents.
# NOTE: this rebinds `tvec`, replacing the combined-corpus vectorizer.
# NOTE(review): the default token_pattern keeps only tokens of two or more
# characters, so single-character words are dropped - confirm this is intended.
tvec = TfidfVectorizer(
    stop_words='english',
    max_df=0.99,
    min_df=10,
    max_features=3000,
)

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) is its replacement.
xhs_tvec = pd.DataFrame(
    tvec.fit_transform(xhs_seg['segmented']).toarray(),
    columns=tvec.get_feature_names_out(),
)

xhs_tvec.head()

In [None]:
# Row truncation is already disabled by the 'display.max_rows' option set
# when the data was loaded, so all 1000 rows are shown.

# Rank XHS terms by their TF-IDF weight summed over all documents.
xhs_tvec.sum().sort_values(ascending=False).head(1000)

#### Bigram

In [None]:
# Bigram-only TF-IDF over XHS: ngram_range=(2, 2) keeps sequences of exactly
# two consecutive tokens. Terms must occur in at least 10 documents (min_df)
# and in at most 99% of them (max_df); the vocabulary is capped at 3000.
# NOTE: this rebinds `tvec_bigram`, replacing the combined-corpus vectorizer.
tvec_bigram = TfidfVectorizer(
    stop_words='english',
    max_df=0.99,
    min_df=10,
    max_features=3000,
    ngram_range=(2, 2),
)

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) is its replacement.
xhs_tvec_bigram = pd.DataFrame(
    tvec_bigram.fit_transform(xhs_seg['segmented']).toarray(),
    columns=tvec_bigram.get_feature_names_out(),
)

xhs_tvec_bigram.head()

In [None]:
# Rank XHS bigrams by their TF-IDF weight summed over all documents; show the top 1000.
xhs_tvec_bigram.sum().sort_values(ascending=False).head(1000)

#### One word & bigram

In [None]:
# Unigram + bigram TF-IDF over XHS: ngram_range=(1, 2) keeps single tokens
# and pairs of consecutive tokens. Vocabulary capped at 6000 (max_features);
# terms must occur in at least 10 documents and in at most 99% of them.
# NOTE: this rebinds `tvec_ngram`, replacing the combined-corpus vectorizer.
tvec_ngram = TfidfVectorizer(
    stop_words='english',
    max_df=0.99,
    min_df=10,
    max_features=6000,
    ngram_range=(1, 2),
)

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) is its replacement.
xhs_tvec_ngram = pd.DataFrame(
    tvec_ngram.fit_transform(xhs_seg['segmented']).toarray(),
    columns=tvec_ngram.get_feature_names_out(),
)

xhs_tvec_ngram.head()

In [None]:
# Rank XHS unigrams and bigrams together by summed TF-IDF weight; show the top 1000.
xhs_tvec_ngram.sum().sort_values(ascending=False).head(1000)

### Tmall only

#### One word

In [None]:
# One-column frame of the Tmall comments with their segmented form added
# alongside; the source frame `tmall` is left untouched.
tmall_seg = tmall[['comment']].assign(
    segmented=lambda frame: frame['comment'].apply(seg_word)
)

In [None]:
# Unigram TF-IDF over Tmall only (ngram_range=(1, 1) is the default, spelled
# out here). Terms must occur in at least 10 documents (min_df) and in at
# most 99% of them (max_df); the vocabulary is capped at 3000 (max_features).
# NOTE(review): the default token_pattern keeps only tokens of two or more
# characters, so single-character words are dropped - confirm this is intended.
tvec_t = TfidfVectorizer(
    stop_words='english',
    max_df=0.99,
    min_df=10,
    max_features=3000,
    ngram_range=(1, 1),
)

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) is its replacement.
tmall_tvec = pd.DataFrame(
    tvec_t.fit_transform(tmall_seg['segmented']).toarray(),
    columns=tvec_t.get_feature_names_out(),
)

tmall_tvec.head()

In [None]:
# Rank Tmall terms by their TF-IDF weight summed over all documents; show the top 1000.
tmall_tvec.sum().sort_values(ascending=False).head(1000)

#### Bigram

In [None]:
# Bigram-only TF-IDF over Tmall: ngram_range=(2, 2) keeps sequences of exactly
# two consecutive tokens. Terms must occur in at least 10 documents (min_df)
# and in at most 99% of them (max_df); the vocabulary is capped at 3000.
# NOTE: this rebinds `tvec_bigram`, replacing any earlier vectorizer of that name.
tvec_bigram = TfidfVectorizer(
    stop_words='english',
    max_df=0.99,
    min_df=10,
    max_features=3000,
    ngram_range=(2, 2),
)

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) is its replacement.
tmall_tvec_bigram = pd.DataFrame(
    tvec_bigram.fit_transform(tmall_seg['segmented']).toarray(),
    columns=tvec_bigram.get_feature_names_out(),
)

tmall_tvec_bigram.head()

In [None]:
# Rank Tmall bigrams by their TF-IDF weight summed over all documents; show the top 1000.
tmall_tvec_bigram.sum().sort_values(ascending=False).head(1000)

#### One word & bigram

In [None]:
# Unigram + bigram TF-IDF over Tmall: ngram_range=(1, 2) keeps single tokens
# and pairs of consecutive tokens. Vocabulary capped at 6000 (max_features);
# terms must occur in at least 10 documents and in at most 99% of them.
# NOTE: this rebinds `tvec_ngram`, replacing any earlier vectorizer of that name.
tvec_ngram = TfidfVectorizer(
    stop_words='english',
    max_df=0.99,
    min_df=10,
    max_features=6000,
    ngram_range=(1, 2),
)

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) is its replacement.
tmall_tvec_ngram = pd.DataFrame(
    tvec_ngram.fit_transform(tmall_seg['segmented']).toarray(),
    columns=tvec_ngram.get_feature_names_out(),
)

tmall_tvec_ngram.head()

In [None]:
# Rank Tmall unigrams and bigrams together by summed TF-IDF weight; show the top 1000.
tmall_tvec_ngram.sum().sort_values(ascending=False).head(1000)