## 1. 任务介绍
> 介绍任务的基本内容，以及问题的公式化
---
文本分类是自然语言处理中最基础的任务之一，主要是通过分类器将给定的文本划分到特定的类，比如情绪分类、垃圾邮件分类、电影评论分类等。具体任务公式化如下：
$$
\begin{aligned}
\text{文本：} & X = (x_1,x_2,\dots,x_n) \\
\text{类标签：} & Y = (y_1,y_2,\dots,y_n) \\
\text{模型：} & f: x_i \mapsto y_i, \hspace{1em} i = 1,2,\dots,n
\end{aligned}
$$
本文选用Kaggle的电影评论情感分析来作为任务。

## 2. 环境准备

In [1]:
import re
import numpy as np
import pandas as pd

from bs4 import BeautifulSoup

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics 

import nltk
from nltk.corpus import stopwords

## 3. 数据预处理
> 需要对数据进行清洗
---

处理步骤大致如下：
    1. 去除html标签
    2. 去除标点
    3. 切分成词
    4. 去除停用词
    5. 重组为新的句子

In [2]:
# 0. Load the labeled training data (tab-separated, backslash as escape character)
file_path = '../data/IMDB/labeledTrainData.tsv'
df = pd.read_csv(file_path, sep='\t', escapechar='\\')
print('Number of samples:{}'.format(df.shape[0]))
df.head()

Number of samples:25000


Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"""The Classic War of the Worlds"" by Timothy Hin..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [3]:
eng_stopwords = stopwords.words('english')  # English stop-word list from NLTK
# Snapshot of the list as a frozenset so each membership test in text_clean is
# O(1) instead of a linear scan; the list itself is kept for any other code
# that reads `eng_stopwords`.
_eng_stopword_set = frozenset(eng_stopwords)


def text_clean(text):
    """Normalize one raw review into a space-separated string of lowercase words.

    Steps, in order: strip HTML markup, replace every non-letter character
    with a space, lowercase and split on whitespace, drop English stop
    words, and re-join the remaining words with single spaces.

    Args:
        text: Raw review text, possibly containing HTML.

    Returns:
        The cleaned review as a single string.
    """
    text = BeautifulSoup(text, 'html.parser').get_text()  # strip HTML tags
    text = re.sub(r'[^a-zA-Z]', ' ', text)  # anything that is not a letter becomes a space
    words = text.lower().split()  # lowercase, then split on whitespace
    words = [w for w in words if w not in _eng_stopword_set]  # drop stop words
    return ' '.join(words)  # rebuild the sentence


df['clean_review'] = df.review.apply(text_clean)
df.head()

Unnamed: 0,id,sentiment,review,clean_review
0,5814_8,1,With all this stuff going down at the moment w...,stuff going moment mj started listening music ...
1,2381_9,1,"""The Classic War of the Worlds"" by Timothy Hin...",classic war worlds timothy hines entertaining ...
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,film starts manager nicholas bell giving welco...
3,3630_4,0,It must be assumed that those who praised this...,must assumed praised film greatest filmed oper...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,superbly trashy wondrously unpretentious explo...


## 4. 文本特征表示 
> 文本向量化，并提取特征，分为离散法和分布式法(此处主要讲离散法)
---
离散法主要有以下几种方法：
 1. 词袋模型(Bag of Words): 用单词频数来表示文本，不考虑文本的语法结构和单词顺序
 2. 独热编码(One-hot): 将文本表示成整个词表长度的向量，出现过的词为1，否则为0
 3. n元语法(n-gram): 对词袋模型的一种改进，即用n个词或词组组成的字符串作为特征，然后再用词袋模型的方法将文本表示为向量形式
 4. TF-IDF： 用词的TF-IDF来表示文本
 
<span style="color:red">注：其实上述方法都可以看作是词袋模型</span>

In [4]:
# 1. Bag-of-words: raw term counts, vocabulary capped at 5000 features
vectorizer_feq = CountVectorizer(max_features=5000)
train_freq = vectorizer_feq.fit_transform(df['clean_review']).toarray()
print("词频为特征的文本-单词矩阵维度:", train_freq.shape)

# 2. Bigrams: counts of two-word sequences, vocabulary capped at 1000 features
vectorizer_bigram = CountVectorizer(
    ngram_range=(2, 2),
    max_features=1000,
    token_pattern=r'\b\w+\b',
    min_df=1,
)
# To inspect the bigrams produced for a single review:
# analyze = vectorizer_bigram.build_analyzer()
# print("bi-gram示例：",analyze(df.clean_review[0]))
train_bigram = vectorizer_bigram.fit_transform(df['clean_review']).toarray()
print("bi-gram为特征的文本-单词矩阵维度：", train_bigram.shape)

# 3. TF-IDF weights, vocabulary capped at 5000 features
vectorizer_tfidf = TfidfVectorizer(max_features=5000)
train_tfidf = vectorizer_tfidf.fit_transform(df['clean_review']).toarray()

print("TF-IDF为特征的文本-单词矩阵维度：", train_tfidf.shape)

词频为特征的文本-单词矩阵维度: (25000, 5000)
bi-gram为特征的文本-单词矩阵维度： (25000, 1000)
TF-IDF为特征的文本-单词矩阵维度： (25000, 5000)


## 5. 辅助函数
> 包括数据批量生成器，softmax函数，预测函数，评估函数

In [5]:
# Mini-batch data generator
def batch_generator(data, batch_size, shuffle=True):
    """Yield successive (X, Y) mini-batches from a pair of index-aligned arrays.

    Args:
        data: Tuple ``(X, Y)``; both are indexed with the same integer index
            array, and ``X.shape[0]`` gives the number of samples.
        batch_size: Maximum number of samples per batch.
        shuffle: When true, the sample order is shuffled once (via
            ``np.random.shuffle``) before batching.

    Yields:
        ``(X[idx], Y[idx])`` for each consecutive slice of the sample order.
    """
    features, targets = data
    total = features.shape[0]
    order = np.arange(total)
    if shuffle:
        np.random.shuffle(order)  # visit the samples in random order

    # A slice that runs past the end is clipped, so the last batch is simply shorter.
    for lo in range(0, total, batch_size):
        picked = order[lo:lo + batch_size]
        yield features[picked], targets[picked]
        
# Softmax function
def softmax(scores):
    """Return the softmax of ``scores`` taken along axis 1.

    Args:
        scores: Array of class scores with samples on axis 0 and classes on
            axis 1.

    Returns:
        Array of the same shape whose entries along axis 1 are non-negative
        and sum to 1.
    """
    # Subtracting the per-row maximum does not change the result
    # mathematically, but keeps np.exp from overflowing to inf (and the
    # division from producing nan) when scores are large.
    shifted = scores - np.max(scores, axis=1, keepdims=True)
    exp_scores = np.exp(shifted)
    return exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

# Prediction function
def predict(w, b, x):
    """Return the index of the highest-probability class for each row of ``x``.

    Args:
        w: Weight matrix; its transpose is multiplied with ``x``, so each row
            of ``w`` yields one column of scores.
        b: Bias added to the scores.
        x: Feature matrix, one sample per row.

    Returns:
        Column vector (shape ``(-1, 1)``) of argmax indices along axis 1.
    """
    class_probs = softmax(np.dot(x, w.T) + b)
    winners = np.argmax(class_probs, axis=1)
    return winners.reshape(-1, 1)
    
# Evaluation function: cross-entropy loss on held-out data
# (it does not compute precision, recall or F1)
def evaluate(w, val_x, val_y):
    """Return the mean per-batch cross-entropy loss of weights ``w``.

    The data is walked in order (no shuffling) in batches of 32; each batch
    contributes its own average loss, and the mean of those batch losses is
    returned. No bias term is used, matching the training loop in this file.

    Args:
        w: Weight matrix with one row per class; ``w.shape[0]`` is used as
            the number of classes for the one-hot encoding.
        val_x: Feature matrix, one sample per row.
        val_y: Integer class labels aligned with ``val_x``.

    Returns:
        Mean of the per-batch losses.
    """
    # Derive the class count from the weights rather than relying on
    # one_hot's default of 2, so multi-class weights are scored correctly.
    n_classes = w.shape[0]
    val_loss = []
    val_gen = batch_generator((val_x, val_y), batch_size=32, shuffle=False)
    for batch_x, batch_y in val_gen:
        scores = np.dot(batch_x, w.T)
        prob = softmax(scores)

        y_one_hot = one_hot(batch_y, n_classes)
        # Clip before the log: a probability that underflowed to 0 would
        # otherwise give 0 * -inf = nan inside the sum.
        log_prob = np.log(np.clip(prob, 1e-12, 1.0))
        # Cross-entropy averaged over the samples of this batch
        loss = - (1.0 / len(batch_x)) * np.sum(y_one_hot * log_prob)
        val_loss.append(loss)

    return np.mean(val_loss)
    
    
def one_hot(batch_y, n_classes=2):
    """Encode integer class labels as one-hot rows.

    Args:
        batch_y: Array of integer labels; ``batch_y.shape[0]`` sets the
            number of output rows.
        n_classes: Number of columns in the encoding.

    Returns:
        Float array of shape ``(batch_y.shape[0], n_classes)`` that is zero
        everywhere except for a 1 at each row's label column.
    """
    n_rows = batch_y.shape[0]
    encoded = np.zeros((n_rows, n_classes))
    row_idx = np.arange(n_rows)
    # The transpose lets an (n, 1) column of labels broadcast against row_idx.
    encoded[row_idx, batch_y.T] = 1
    return encoded
    

## 6. 构建分类器
> 此处以softmax regression作为分类器

In [6]:
def logistic_regression(x, y, n_classes=2, lr=0.01, val_split=0.2, batch_size=128, epochs=5000, early_stop=None):
    """Train a bias-free softmax-regression classifier with mini-batch gradient descent.

    The samples are randomly split into a training part and a validation
    part. After every epoch the validation loss is computed with
    ``evaluate`` and the weights with the lowest validation loss seen so far
    are remembered.

    Args:
        x: Feature matrix of shape ``(n_samples, n_features)``.
        y: Integer class labels aligned with the rows of ``x``.
        n_classes: Number of classes (rows of the weight matrix).
        lr: Learning rate of the gradient step.
        val_split: Fraction of the samples held out for validation.
        batch_size: Number of samples per training mini-batch.
        epochs: Maximum number of passes over the training part.
        early_stop: If not None, training stops once the validation loss has
            failed to improve for more than this many consecutive epochs.

    Returns:
        Tuple ``(best_w, train_all_loss, val_all_loss)``: the weights with
        the lowest validation loss (falling back to the final weights if no
        epoch recorded one), and the per-epoch mean training and validation
        losses.
    """
    n_samples, n_features = x.shape

    w = np.random.rand(n_classes, n_features)
    train_all_loss = []
    val_all_loss = []
    not_improved = 0
    best_val_loss = np.inf
    best_w = None

    indices = np.random.permutation(n_samples)
    split = int(n_samples * (1 - val_split))
    training_idx = indices[:split]
    # Start exactly at `split`; `split + 1` would silently drop one sample
    # from both the training and the validation part.
    valid_idx = indices[split:]

    train_x = x[training_idx]
    train_y = y[training_idx]

    valid_x = x[valid_idx]
    valid_y = y[valid_idx]

    for epoch in range(epochs):
        # Honour the caller's batch_size instead of a hard-coded 128.
        training_gen = batch_generator((train_x, train_y), batch_size=batch_size)
        train_loss = []
        for batch_x, batch_y in training_gen:
            scores = np.dot(batch_x, w.T)
            prob = softmax(scores)

            y_one_hot = one_hot(batch_y, n_classes)
            # Cross-entropy loss; clip so a probability that underflowed to 0
            # does not turn the sum into 0 * -inf = nan.
            log_prob = np.log(np.clip(prob, 1e-12, 1.0))
            loss = - (1.0 / len(batch_x)) * np.sum(y_one_hot * log_prob)
            train_loss.append(loss)

            # Gradient-descent step
            dw = -(1.0 / len(batch_x)) * np.dot((y_one_hot - prob).T, batch_x)
            w = w - lr * dw

        val_loss = evaluate(w, valid_x, valid_y)

        print("Epoch = {0},the train loss = {1:.4f}, the val loss = {2:.4f}".format(
            epoch, np.mean(train_loss), val_loss))

        train_all_loss.append(np.mean(train_loss))
        val_all_loss.append(val_loss)

        # Track the best weights on every run, not only when early stopping
        # is enabled, so the function never returns None for the weights.
        # `w` is rebound to a new array on each step, so storing the
        # reference is safe.
        if val_loss <= best_val_loss:
            best_val_loss = val_loss
            best_w = w
            not_improved = 0
        else:
            not_improved += 1

        if early_stop is not None and not_improved > early_stop:
            print("Validation performance didn\'t improve for {} epochs. "
                    "Training stops.".format(early_stop))
            break

    # No epoch recorded a best (e.g. epochs == 0 or the validation loss was
    # never comparable): fall back to the current weights.
    if best_w is None:
        best_w = w

    return best_w, train_all_loss, val_all_loss
        

## 7. 训练

### 7.1 以BOW为特征进行训练 

In [7]:
# Labels come from the `sentiment` column; features are the word-count matrix.
label = df['sentiment'].values
w, train_all_loss, val_all_loss = logistic_regression(
    train_freq,
    label,
    early_stop=10,
)

Epoch = 0,the train loss = 1.8449, the val loss = 1.6766
Epoch = 1,the train loss = 1.5352, the val loss = 1.4716
Epoch = 2,the train loss = 1.3741, the val loss = 1.3359
Epoch = 3,the train loss = 1.2604, the val loss = 1.2362
Epoch = 4,the train loss = 1.1764, the val loss = 1.1614
Epoch = 5,the train loss = 1.1069, the val loss = 1.1012
Epoch = 6,the train loss = 1.0524, the val loss = 1.0521
Epoch = 7,the train loss = 1.0012, the val loss = 1.0110
Epoch = 8,the train loss = 0.9608, the val loss = 0.9761
Epoch = 9,the train loss = 0.9240, the val loss = 0.9462
Epoch = 10,the train loss = 0.8943, the val loss = 0.9198
Epoch = 11,the train loss = 0.8667, the val loss = 0.8960
Epoch = 12,the train loss = 0.8401, the val loss = 0.8749
Epoch = 13,the train loss = 0.8166, the val loss = 0.8557
Epoch = 14,the train loss = 0.7988, the val loss = 0.8381
Epoch = 15,the train loss = 0.7771, the val loss = 0.8218
Epoch = 16,the train loss = 0.7600, the val loss = 0.8067
Epoch = 17,the train los

Epoch = 141,the train loss = 0.2972, the val loss = 0.4530
Epoch = 142,the train loss = 0.2973, the val loss = 0.4525
Epoch = 143,the train loss = 0.2959, the val loss = 0.4521
Epoch = 144,the train loss = 0.2949, the val loss = 0.4514
Epoch = 145,the train loss = 0.2943, the val loss = 0.4509
Epoch = 146,the train loss = 0.2931, the val loss = 0.4503
Epoch = 147,the train loss = 0.2917, the val loss = 0.4498
Epoch = 148,the train loss = 0.2929, the val loss = 0.4491
Epoch = 149,the train loss = 0.2909, the val loss = 0.4486
Epoch = 150,the train loss = 0.2897, the val loss = 0.4481
Epoch = 151,the train loss = 0.2893, the val loss = 0.4476
Epoch = 152,the train loss = 0.2886, the val loss = 0.4470
Epoch = 153,the train loss = 0.2876, the val loss = 0.4464
Epoch = 154,the train loss = 0.2865, the val loss = 0.4460
Epoch = 155,the train loss = 0.2865, the val loss = 0.4456
Epoch = 156,the train loss = 0.2855, the val loss = 0.4450
Epoch = 157,the train loss = 0.2845, the val loss = 0.44

Epoch = 280,the train loss = 0.2256, the val loss = 0.4089
Epoch = 281,the train loss = 0.2252, the val loss = 0.4088
Epoch = 282,the train loss = 0.2250, the val loss = 0.4087
Epoch = 283,the train loss = 0.2254, the val loss = 0.4086
Epoch = 284,the train loss = 0.2245, the val loss = 0.4082
Epoch = 285,the train loss = 0.2254, the val loss = 0.4081
Epoch = 286,the train loss = 0.2250, the val loss = 0.4079
Epoch = 287,the train loss = 0.2235, the val loss = 0.4078
Epoch = 288,the train loss = 0.2231, the val loss = 0.4076
Epoch = 289,the train loss = 0.2241, the val loss = 0.4075
Epoch = 290,the train loss = 0.2233, the val loss = 0.4075
Epoch = 291,the train loss = 0.2221, the val loss = 0.4072
Epoch = 292,the train loss = 0.2227, the val loss = 0.4074
Epoch = 293,the train loss = 0.2225, the val loss = 0.4070
Epoch = 294,the train loss = 0.2219, the val loss = 0.4068
Epoch = 295,the train loss = 0.2212, the val loss = 0.4066
Epoch = 296,the train loss = 0.2212, the val loss = 0.40

Epoch = 419,the train loss = 0.1937, the val loss = 0.3952
Epoch = 420,the train loss = 0.1943, the val loss = 0.3951
Epoch = 421,the train loss = 0.1943, the val loss = 0.3952
Epoch = 422,the train loss = 0.1943, the val loss = 0.3950
Epoch = 423,the train loss = 0.1938, the val loss = 0.3948
Epoch = 424,the train loss = 0.1938, the val loss = 0.3949
Epoch = 425,the train loss = 0.1937, the val loss = 0.3948
Epoch = 426,the train loss = 0.1933, the val loss = 0.3950
Epoch = 427,the train loss = 0.1931, the val loss = 0.3947
Epoch = 428,the train loss = 0.1926, the val loss = 0.3949
Epoch = 429,the train loss = 0.1928, the val loss = 0.3946
Epoch = 430,the train loss = 0.1926, the val loss = 0.3946
Epoch = 431,the train loss = 0.1924, the val loss = 0.3944
Epoch = 432,the train loss = 0.1926, the val loss = 0.3945
Epoch = 433,the train loss = 0.1915, the val loss = 0.3944
Epoch = 434,the train loss = 0.1918, the val loss = 0.3943
Epoch = 435,the train loss = 0.1920, the val loss = 0.39

Epoch = 558,the train loss = 0.1761, the val loss = 0.3914
Epoch = 559,the train loss = 0.1753, the val loss = 0.3915
Epoch = 560,the train loss = 0.1756, the val loss = 0.3915
Epoch = 561,the train loss = 0.1749, the val loss = 0.3915
Epoch = 562,the train loss = 0.1752, the val loss = 0.3914
Epoch = 563,the train loss = 0.1748, the val loss = 0.3914
Epoch = 564,the train loss = 0.1754, the val loss = 0.3916
Epoch = 565,the train loss = 0.1751, the val loss = 0.3913
Epoch = 566,the train loss = 0.1747, the val loss = 0.3912
Epoch = 567,the train loss = 0.1747, the val loss = 0.3914
Epoch = 568,the train loss = 0.1745, the val loss = 0.3913
Epoch = 569,the train loss = 0.1739, the val loss = 0.3914
Epoch = 570,the train loss = 0.1742, the val loss = 0.3913
Epoch = 571,the train loss = 0.1745, the val loss = 0.3912
Epoch = 572,the train loss = 0.1738, the val loss = 0.3913
Epoch = 573,the train loss = 0.1742, the val loss = 0.3912
Epoch = 574,the train loss = 0.1733, the val loss = 0.39

NameError: name 'printnt' is not defined