In [None]:
#原始数据集
#数据来源：http://jmcauley.ucsd.edu/data/amazon/
#与UCSD下载的数据集有些出入
#JSON格式

#课上使用的数据集
#未去除特殊符号，不影响本project
#仅提取reviewertext和overall两列

In [1]:
%matplotlib inline

import time
import functools
import sqlite3
import pandas as pd
import numpy as np
import nltk
import string
import matplotlib.pyplot as plt
import numpy as np
import scipy.sparse as sparse

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from nltk.corpus import stopwords
from sklearn.cross_validation import StratifiedKFold
from sklearn.naive_bayes import BernoulliNB
from sklearn import linear_model
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import recall_score
from sklearn.model_selection import train_test_split



### Pipeline

![Pipeline](pipeline.png)

In [None]:
#pipeline每个步骤都有相应的package
#可以尝试不同的方法，或方法组合

#Vectorization
#BoW，磁带方法, Bag of Words
#N-Gram，举例N=2
#Word2Vec，自己尝试，效果比BoW好

#Model
#Max Entropy，本质上是logistic regression
#SVM,经典模型
#LSTM,long shortern memory,长短记忆法，难一点

### Get Familiar with dataset

In [2]:
# Load csv file into DataFrame
kindle_data = pd.read_csv('sampled_data.csv')
type(kindle_data)

pandas.core.frame.DataFrame

In [3]:
# Print first row
# Format: data_frame.col_nam[row]
print("overall    :", kindle_data.overall[0])
print("reviewText :", kindle_data.reviewText[0])

overall    : pos
reviewText : This book ended even before it started and it made me want for more. Oh oh such a teaser. I want the book now please. So exciting.


In [4]:
# Length of kindle_data
len(kindle_data)

126871

In [5]:
# Get a sample (head) of the data frame
kindle_data.head()

Unnamed: 0,overall,reviewText
0,pos,This book ended even before it started and it ...
1,pos,This is a great read with so much emotion you ...
2,pos,"It&#8217;s Christmas Eve and miraculously, Sal..."
3,pos,I enjoyed meeting the character of Cassandra. ...
4,pos,"Can I be the next Hunter wife? Again, I have ..."


In [6]:
# Statics on tags
kindle_data.overall.value_counts()

pos    64559
neg    62312
Name: overall, dtype: int64

See [Pandas DataFrame](http://pandas.pydata.org/pandas-docs/stable/10min.html?highlight=data%20frame) for more details.

In [7]:
# Split complete data set into [pos, neg]
def splitPosNeg(data_):
    neg = data_.loc[data_.overall=='neg']
    pos = data_.loc[data_.overall=='pos']
    return [pos,neg]

[pos,neg] = splitPosNeg(kindle_data)

In [8]:
print(type(pos))
print("pos:", len(pos), ", neg:", len(neg))

<class 'pandas.core.frame.DataFrame'>
pos: 64559 , neg: 62312


In [None]:
#大型数据处理，debug很麻烦，两种简单的处理方法：
#1.逐步验证
#  图像处理，验证结果矩阵
#2.先用小数据集验证程序是否正确

### Preprocessing

In [None]:
#1、去掉无意义的标点符号
#2、去掉对情感极性分析无实际意义的词，stopwords

In [None]:
**********************************************************************
  Resource 'corpora/stopwords' not found.  Please use the NLTK
  Downloader to obtain the resource:  >>> nltk.download()
  Searched in:
    - '/home/wing/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************

In [9]:
#定义没有实际意义的词，此处使用nltk中默认的英文中的stopwords
lemmatizer = nltk.WordNetLemmatizer()
stop = stopwords.words('english')

#定义翻译表，将标点符号替换为长度相同的空格
translation = str.maketrans(string.punctuation,' '*len(string.punctuation))

In [10]:
print(stop)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'no

In [11]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [12]:
#翻译表使用示例
transtbl = str.maketrans('abc','def')
'ababc'.translate(transtbl)

'dedef'

In [14]:
#将每一条review转化为tokens
#
#
def preprocessing(line):
    tokens=[]
    line = str(line).translate(translation)  # Replace punctuation
    line = nltk.word_tokenize(line.lower())  # Tokenization，传入转化为小写的line
    
    for t in line:
        # Remove stopwords
        if t not in stop:
            stemmed = lemmatizer.lemmatize(t)#词形还原
            tokens.append(stemmed)
    
    return ' '.join(tokens)

In [13]:
# Yet a more compact way to write the code
def preprocessing(line: str) -> str:
    #line: str, python3新增的定义方法，指定输入变量的类型，有一个强制检查，输入不是str会有warning
    #-> str, python3新增标记方法，指定输出的类型
    line = str(line).translate(translation)
    line = nltk.word_tokenize(line.lower())
    
    line = [lemmatizer.lemmatize(t) for t in line if t not in stop]
    #list comprehension, python中的语法，类似于sql
    #lemmatizer.lemmatize(t), 相当于select子句
    #for，t的来源，相当于from子句
    #if，t需满足的条件，相当于where子句
    return ' '.join(line)

In [14]:
test_str = "I bought it yesterday and I really love it!"
preprocessing(test_str)

'bought yesterday really love'

In [None]:
#注意：此处bought没有被还原
#因lemmatizer.lemmatize（）默认只对名词进行还原
#若需对动词还原，需设置lemmatizer.lemmatize(word, pos='n')，pos='v'

In [15]:
# 对动词也进行还原的预处理
def preprocessing2(line: str) -> str:
    line = str(line).translate(translation)
    line = nltk.word_tokenize(line.lower())
    
    line = [lemmatizer.lemmatize(t, pos='v') for t in line if t not in stop]
    return ' '.join(line)

In [16]:
preprocessing2(test_str)

'buy yesterday really love'

In [17]:
# Preprocess all data
pos_data = [preprocessing(p) for p in pos['reviewText']]
neg_data = [preprocessing(p) for p in neg['reviewText']]

In [20]:
# Yet a more modern way to write code
#map()参数1，处理方法
#map()参数2，可迭代的数据
pos_data = list(map(preprocessing, pos['reviewText']))
neg_data = list(map(preprocessing, neg['reviewText']))

### Some modern functions to introduce
- map
- reduce
- filter

They are very useful when running the project on a cluster or distributed compute system like Hadoop or Spark.

In [None]:
#现在编程语言，如java，都加入了map和lambda
#大数据处理平台，hadoop-->spark，将数据操作简化为3类
#map，对数据进行同样的处理，映射
#reduce，对应统计的方法，比如平均值，最大值等
#filter，对数据进行筛选
#便于平台间数据接口

In [21]:
# Some useful modern functions
l = [0,1,2,3,4,5,6,7,8,9]

# Map
def square(x: int) -> int:
    return x * x

print( list(map(square, l)) )

[0, 1, 4, 9, 16, 25, 36, 49, 64, 81]


In [22]:
# Using lambda function
print( list(map(lambda x: x * x, l)) )

[0, 1, 4, 9, 16, 25, 36, 49, 64, 81]


In [23]:
# Reduce
# reduce function is moved to functools
def add(x, y):
    return x + y

rst = functools.reduce(add, l)
print ("reduce", l, "by add:", rst)

reduce [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] by add: 45


In [24]:
# Using lambda function
# reduce is moved to functools in Python 3
rst = functools.reduce(lambda x, y: x + y, l)
print ("reduce", l, "by add:", rst)

reduce [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] by add: 45


In [25]:
rst = functools.reduce(lambda x, y: max(x, y), l)
print ("reduce", l, "by max:", rst)

reduce [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] by max: 9


In [26]:
# Filter
# Much faster than loop, similar with list comprehension
list(filter(lambda x: x < 5, l))

[0, 1, 2, 3, 4]

### Split Training Data & Test Data

In [27]:
data = pos_data + neg_data
# remember this is sampled
labels = np.concatenate((pos['overall'].values,neg['overall'].values))

In [28]:
# Split data into training set and testing set (20:80)
# stratify: make sure pos/neg remains the same in training set and testing set
#用于需要迭代优化的算法，SVM，LogisticRegression，便于结果预估
#朴素贝耶斯不需要，因为不涉及迭代优化过程
train_data, test_data, train_labels, test_labels = \
train_test_split(
    data, 
    labels, 
    test_size=0.2, #切分比例，test_data的占比为20%
    stratify=labels, #随机切分，防止抽样不均衡，此处=labels，表示pos:neg与原始数据labels中比例相同
    random_state=1234
)

In [29]:
print("training size = ", len(train_data), "testing size = ", len(test_data))

training size =  101496 testing size =  25375


#### Underfitting vs Overfitting
![](http://scikit-learn.org/stable/_images/sphx_glr_plot_underfitting_overfitting_001.png)

Common Method:
- 20:80 Split
- K-fold

To estimate accuracy (f-score):
- 20:20:60 Split
- 10:20:70 Split

In [None]:
#10:20:70更常用
#10，验证
#20，测试
#70，训练

In [27]:
# Push all tokens and compute frequency of words
#将所有tokens展开放入一维数组
t = []
for line in train_data:
    l = nltk.word_tokenize(line)
    for w in l:
        t.append(w)

#统计频数        
word_features = nltk.FreqDist(t)

In [31]:
# Yet another more python-y style
tokens = [word for line in train_data \
               for word in nltk.word_tokenize(line)]

word_features = nltk.FreqDist(tokens)

In [32]:
print(word_features)

<FreqDist with 84617 samples and 4195256 outcomes>


In [33]:
word_features.most_common(10)

[('book', 124073),
 ('story', 62132),
 ('read', 55777),
 ('one', 36880),
 ('like', 32032),
 ('character', 30265),
 ('good', 28930),
 ('would', 27660),
 ('author', 24824),
 ('love', 24805)]

In [34]:
#认为限制前10000个词
topwords = [fpair[0] for fpair in list(word_features.most_common(10000))]

### Vectorizer

In [35]:
cnt_vec = CountVectorizer()
cnt_vec

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [None]:
#ngram_range=(1, 1),BAG of words,默认
#ngram_range=(2, 2),bi-gram
#ngram_range=(1, 2),(Uni+Bi)-Gram

In [36]:
# Create our BAG of words (specify words we care about)
#一维矩阵，即单词表， bag
cnt_fit = cnt_vec.fit_transform([' '.join(topwords)])
cnt_fit

<1x9969 sparse matrix of type '<class 'numpy.int64'>'
	with 9969 stored elements in Compressed Sparse Row format>

#### Tf–idf term weighting

- Tf: term-frequency
- idf: inverse document-frequency
- Tf-idf = $tf(t,d) \times idf(t)$

$$
idf(t) = log{\frac{1 + nd}{1 + df(d, t)}} + 1
$$

![](http://www.onemathematicalcat.org/Math/Algebra_II_obj/Graphics/log_base_gt1.gif)

> Sentent 1: The boy **love** the toy

> Sentent 2: The boy **hate** the toy

In [None]:
#并非出现次数越多，对分类结果影响越大，比如toy，没有倾向性
#love仅在pos出现，对结果的区分度更大
#而naive bayes中，出现次数越多的词，对结果的影响越大

#引入调整权重的方法， Tf–idf term weighting
#公式中，nd是document数量，df(d,t)是document frequency
#在上面的例子中，
#love出现1次，df(d,t)=1，nd=2（两个句子），其idf=log(1+2)/(1+1)+1>1
#boy,其idf=log(1+2)/(1+2)+1=1

In [38]:
#Naive Bayes，独立性假设
transformer = TfidfTransformer(smooth_idf=False)#smooth_idf，平滑操作，防止违反独立性假设出现0，导致连乘的计算结果为0的情况
transformer

TfidfTransformer(norm='l2', smooth_idf=False, sublinear_tf=False,
         use_idf=True)

In [39]:
counts = [[3, 0, 1],
          [2, 0, 0],
          [3, 0, 0],
          [4, 0, 0],
          [3, 2, 0],
          [3, 0, 2]]
tfidf = transformer.fit_transform(counts)
tfidf

<6x3 sparse matrix of type '<class 'numpy.float64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [None]:
#feature1，每行出现多次，说明没什么意义
#feature3、feature3，仅在不同行出现，对分类结果比较有意义

In [40]:
tfidf.toarray()

array([[ 0.81940995,  0.        ,  0.57320793],
       [ 1.        ,  0.        ,  0.        ],
       [ 1.        ,  0.        ,  0.        ],
       [ 1.        ,  0.        ,  0.        ],
       [ 0.47330339,  0.88089948,  0.        ],
       [ 0.58149261,  0.        ,  0.81355169]])

In [41]:
tf_trans = TfidfTransformer()
tf_fit = tf_trans.fit_transform(cnt_fit)
tf_fit

<1x9969 sparse matrix of type '<class 'numpy.float64'>'
	with 9969 stored elements in Compressed Sparse Row format>

In [42]:
# Since CountVectorizer and TfidTransformer are often used together，可用于替代之前的两个步骤
# There is a class named TfidfVectorizer that combine these two steps
tf_vec = TfidfVectorizer()
tf_fit = tf_vec.fit_transform([' '.join(topwords)])
tf_fit

<1x9969 sparse matrix of type '<class 'numpy.float64'>'
	with 9969 stored elements in Compressed Sparse Row format>

In [None]:
#数据的使用
#1.train_data-->tokens-->word_features-->topwords-->cnt_fit-->tf_fit，得到单词表；tf_vec，得到向量器
#2.tf_vec(train_data)-->train_features-->mnb.model-->mnb.predict(test_data)

### Feature Extraction

In [43]:
# Extract features from training set
# Vocabulary is from topwords
train_features = tf_vec.transform(train_data)

# cnt_train_features = cnt_vec.transform(train_data)
# train_features = tf_trans.transform(cnt_train_features)

In [44]:
# Array[n_train_data * n_features]
train_features.shape

(101496, 9969)

In [None]:
#列数9969即单词表的长度

In [45]:
# Extract features from test set
test_features = tf_vec.transform(test_data)

# cnt_test_features = cnt_vec.transform(test_data)
# test_features = tf_trans.transform(cnt_test_features)

### [Multinomial NB](http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html)

The multinomial Naive Bayes classifier is suitable for **classification with discrete features** (e.g., word counts for text classification). The multinomial distribution normally requires integer feature counts. However, in practice, fractional counts such as tf-idf may also work.

In [None]:
#实际使用发现，tf-idf比wordcounts效果要好
#http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html

In [46]:
from sklearn.naive_bayes import MultinomialNB

In [47]:
mnb_model = MultinomialNB()
mnb_model

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [None]:
#alpha,Laplace拉普拉斯平滑因子， (0 for no smoothing)，若单词从未出现过则假设出现了1次
#class_prior
#fit_prior，是否要事先学习train与test的比例

In [48]:
# Train Model
start = time.time()
mnb_model.fit(train_features, train_labels)
end = time.time()

print("Multinomial NB model trained in %f seconds" % (end-start))

Multinomial NB model trained in 0.270455 seconds


In [None]:
#naive bayes的一个极大优势，非常快，几乎没有训练时间
#SVM，特别慢
#linear SVM，会快一些

In [50]:
# Predict
pred = mnb_model.predict(test_features)
print(pred)

['neg' 'pos' 'pos' ..., 'neg' 'pos' 'neg']


In [51]:
# Metrics，统计模型参数的包
# metrics.accuracy_score(y_true, y_pred)，参数顺序颠倒亦可
accuracy = metrics.accuracy_score(pred,test_labels)
print(accuracy)

0.8199408867


In [52]:
# Use keyword arguments to set arguments explicitly
print(metrics.classification_report(y_true=test_labels, y_pred=pred))

             precision    recall  f1-score   support

        neg       0.83      0.80      0.81     12463
        pos       0.81      0.84      0.83     12912

avg / total       0.82      0.82      0.82     25375



In [None]:
#precision vs. accuracy
#一般用平均的f-score

In [53]:
# Example from sklearn documentation
#三分类问题
y_true = [0, 1, 2, 2, 2]
y_pred = [0, 0, 2, 2, 1]
target_names = ['class 0', 'class 1', 'class 2']
print(metrics.classification_report(y_true, y_pred, target_names=target_names))

             precision    recall  f1-score   support

    class 0       0.50      1.00      0.67         1
    class 1       0.00      0.00      0.00         1
    class 2       1.00      0.67      0.80         3

avg / total       0.70      0.60      0.61         5



### Predict new sentences

In [54]:
# Predict a new sentence
# vectorizer needs to be pre-fitted
# At the end of the project, the function signature should be something like:
# predict_new(sentent: str, vec, model) -> str

def predict_new(sentence: str):
    sentence = preprocessing(sentence)
    features = tf_vec.transform([sentence])
    pred = mnb_model.predict(features)
    return pred[0]

In [55]:
predict_new("Not bad")

'neg'

In [56]:
predict_new("This product is bad")

'neg'

In [None]:
#这种情况怎么处理？

### Save model

In [57]:
import pickle

In [58]:

#？模型
# Save vectorizer
with open('tf_vec.pkl', 'wb') as pkl_file:
    pickle.dump(tf_vec, pkl_file)

In [59]:
#naive bayes模型
with open('mnb_model.pkl', 'wb') as pkl_file:
    pickle.dump(mnb_model, pkl_file)

#### Train & test using Uni-Gram + Bi-Gram features

In [60]:
# (Uni+Bi)-Gram
bg_tf_vec = TfidfVectorizer(ngram_range=(1,2))
bg_tf_vec.fit([' '.join(topwords)])
bg_train_features = bg_tf_vec.transform(train_data)

bg_train_features.shape

# Array[n_train_data * (uni_gram_features + bi_gram_features)]

(101496, 19937)

In [62]:
bg_test_features = bg_tf_vec.transform(test_data)

In [44]:
# Extract (uni+bi)-gram test featuresransform(test_data)

In [63]:
# Train & test using (uni+bi)-gram features
bg_mnb_model = MultinomialNB()
bg_mnb_model.fit(bg_train_features, train_labels)
bg_pred = bg_mnb_model.predict(bg_test_features)
print(bg_pred)

['neg' 'pos' 'pos' ..., 'neg' 'pos' 'neg']


In [64]:
# Statistics
bg_accuracy = metrics.accuracy_score(bg_pred,test_labels)
print(bg_accuracy)

0.819783251232


In [65]:
print(metrics.classification_report(y_true=test_labels, y_pred=bg_pred))

             precision    recall  f1-score   support

        neg       0.83      0.80      0.81     12463
        pos       0.81      0.84      0.83     12912

avg / total       0.82      0.82      0.82     25375



In [None]:
#爬数据，scrapy.org

#数据来源：
#1、darksky.net/dev
#2、学校官网，如UCSD