In [3]:
import numpy as np
import pandas as pd
import nltk
import scipy as sp
import sklearn as sk
import os,time,re,string
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split,KFold
from sklearn import metrics

* 大概思路：利用正负面描述的 **词语频率** 预测Amazon的商品评价
    1. 读取文字描述，查看大概数据情况
    2. 整理标签(>3 stars = positive)
    3. 整理summary文字，并建立语料库
    4. 建模
    5. 结果

In [4]:
# Load the raw Amazon review export into a DataFrame; the path is relative to
# the notebook's working directory.
reviews_csv_path = './data/Reviews.csv'
review_raw = pd.read_csv(reviews_csv_path)

In [5]:
# Peek at the first five reviews to see what each column holds.
review_raw.head(n=5)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [6]:
# Print per-column dtype and non-null count plus overall memory usage.
review_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 568454 entries, 0 to 568453
Data columns (total 10 columns):
Id                        568454 non-null int64
ProductId                 568454 non-null object
UserId                    568454 non-null object
ProfileName               568438 non-null object
HelpfulnessNumerator      568454 non-null int64
HelpfulnessDenominator    568454 non-null int64
Score                     568454 non-null int64
Time                      568454 non-null int64
Summary                   568427 non-null object
Text                      568454 non-null object
dtypes: int64(5), object(5)
memory usage: 43.4+ MB


In [9]:
# Missing values per column (column-wise sum of the boolean NA mask).
review_raw.isna().sum()

Id                         0
ProductId                  0
UserId                     0
ProfileName               16
HelpfulnessNumerator       0
HelpfulnessDenominator     0
Score                      0
Time                       0
Summary                   27
Text                       0
dtype: int64

In [24]:
# Number of distinct star ratings present in the Score column.
len(review_raw['Score'].unique())

5

In [25]:
# Total number of rows (reviews) in the raw frame; reused later as the
# denominator of the uniqueness percentage.
n_samples = len(review_raw)
n_samples

568454

In [12]:
# Empty 10-row scaffold for the per-column summary table; every cell starts
# as NaN and is filled in by a later cell.
uni_columns = [
    'Feature',
    'No. of NAs',
    'No. of Unique Values',
    '# of total samples',
    'Unique_Percentage(%)',
]
df_uni = pd.DataFrame(columns=uni_columns, index=range(10))


In [13]:
# Display the summary table as it stands at this point in the notebook.
df_uni

Unnamed: 0,Feature,No. of NAs,No. of Unique Values,# of total samples,Unique_Percentage(%)
0,,,,,
1,,,,,
2,,,,,
3,,,,,
4,,,,,
5,,,,,
6,,,,,
7,,,,,
8,,,,,
9,,,,,


In [26]:
# Summarise every column of review_raw: missing-value count, distinct-value
# count, and distinct values as a percentage of all rows (n_samples).
# Rows are collected first and the frame is built once, so the result always
# has exactly one row per column. The previous version wrote with .iloc into a
# frame pre-allocated with 10 rows in another cell: more than 10 columns made
# .iloc raise, fewer left blank rows behind, and every column was object dtype.
uni_columns = ['Feature', 'No. of NAs', 'No. of Unique Values',
               '# of total samples', 'Unique_Percentage(%)']
uni_rows = []
for feat in review_raw.columns:
    feat_na = review_raw[feat].isnull().sum()
    # unique() keeps NaN as a distinct value, so a column with missing data
    # counts it once here.
    feat_uni = len(review_raw[feat].unique())
    perc = feat_uni / n_samples * 100
    uni_rows.append([feat, feat_na, feat_uni, n_samples, perc])
df_uni = pd.DataFrame(uni_rows, columns=uni_columns)

In [30]:
# Show the per-column summary (NA count, distinct values, uniqueness %).
df_uni

Unnamed: 0,Feature,No. of NAs,No. of Unique Values,# of total samples,Unique_Percentage(%)
0,Id,0,568454,568454,100.0
1,ProductId,0,74258,568454,13.0632
2,UserId,0,256059,568454,45.0448
3,ProfileName,16,218417,568454,38.423
4,HelpfulnessNumerator,0,231,568454,0.0406365
5,HelpfulnessDenominator,0,234,568454,0.0411643
6,Score,0,5,568454,0.000879579
7,Time,0,3168,568454,0.557301
8,Summary,27,295743,568454,52.0258
9,Text,0,393579,568454,69.2367


In [31]:
# Keep only rows with no missing value in any column (drop a row as soon as
# one field is NaN). review_raw itself is left untouched.
# NOTE(review): this also discards the 16 rows whose only gap is ProfileName;
# if only Score/Summary are needed downstream, dropna(subset=[...]) would keep
# them - confirm against later cells before changing.
review = review_raw.dropna(axis=0, how='any')

In [34]:
# (rows, columns) left after dropping rows with missing values.
review.shape

(568411, 10)

### 2. 整理label, 作为正负评价的标签
* 将结果分为两类: positive (score > 3) and negative (score <= 3)

In [35]:
# Binary sentiment label: 1 (positive) when Score > 3, otherwise 0 (negative).
# A vectorised comparison replaces the per-row Python lambda; the result is
# still an int64 Series named 'Score' on the same index as `review`.
label = (review['Score'] > 3).astype('int64')

In [36]:
# Display the 0/1 labels (pandas truncates the middle of long Series).
# NOTE(review): label.value_counts() would show the class balance more compactly.
label

0         1
1         0
2         1
3         0
4         1
5         1
6         1
7         1
8         1
9         1
10        1
11        1
12        0
13        1
14        1
15        1
16        0
17        1
18        1
19        1
20        1
21        1
22        1
23        1
24        1
25        1
26        0
27        1
28        1
29        1
         ..
568424    1
568425    1
568426    0
568427    1
568428    1
568429    1
568430    1
568431    0
568432    0
568433    0
568434    0
568435    0
568436    1
568437    1
568438    1
568439    1
568440    1
568441    1
568442    1
568443    1
568444    1
568445    1
568446    0
568447    1
568448    1
568449    1
568450    0
568451    1
568452    1
568453    1
Name: Score, Length: 568411, dtype: int64

### 3. 整理summary文字栏

* 去标点等符号
* 分词
* stopwords,去掉stopwords
* 词干提取Stemming与词形还原Lemmatization
* 清洗结果对比

In [39]:
# Pull out the short review headline column that the text cleaning works on,
# then preview its first five entries.
summary = review.loc[:, 'Summary']
summary.head(5)

0    Good Quality Dog Food
1        Not as Advertised
2    "Delight" says it all
3           Cough Medicine
4              Great taffy
Name: Summary, dtype: object

#### 3.1 数据清洗: 去掉标点符号
* 不要忘记import string
* import string
* string.punctuation

In [40]:
# ASCII punctuation characters provided by the standard library.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [44]:
# Translation table for str.translate: every ASCII punctuation code point is
# mapped to None, i.e. deleted; no character is replaced by another.
a = {ord(punct_char): None for punct_char in string.punctuation}

# issue 1 -  str.maketrans('','',string.punctuation)
```python
import string
# This uses the 3-argument version of str.maketrans with arguments (x, y, z) where 'x' and 'y'
# must be equal-length strings and characters in 'x' are replaced by characters in 'y'. 
# 'z' is a string(string.punctuation here) where each character in the string is mapped to None
translator = str.maketrans('', '', string.punctuation)
s = 'string with "punctuation" inside of it! Does this work? I hope so.'
print(s.translate(translator))
# => string with punctuation inside of it Does this work I hope so
```

# Explanation

```python
str.maketrans('abc', 'xyz', 'hij')

# This is the same as the two argument version, except that the characters from the third string are removed, 
# as if they were mapped to None. 
# So your table is saying "Don't replace anything, but remove the characters that show up in this string".
```
**Don't replace anything, but remove the characters that show up in this string**

reference: [How to explain the str.maketrans function in Python 3.6?](https://stackoverflow.com/questions/41535571/how-to-explain-the-str-maketrans-function-in-python-3-6#41536036)