# 基于深度学习的文本分类 

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import warnings
warnings.filterwarnings('ignore')


## 获取数据

In [2]:
train_df = pd.read_csv('./input/train.tsv', sep='\t')
train_df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


### 划为训练集和验证集

In [3]:
index = np.arange(len(train_df))
np.random.shuffle(index)

train_size = 0.8
train_num = int(len(index)*train_size)

train_df.iloc[index[:train_num], :].to_csv('./input/task2_train.csv', index=False)
train_df.iloc[index[train_num:], :].to_csv('./input/task2_valid.csv', index=False)

In [4]:
from torchtext import data
TEXT = data.Field(sequential=True, batch_first=True, lower=True, fix_length=100)
LABEL = data.Field(sequential=False, batch_first=True, use_vocab=False)

In [5]:
data_fields = [('PhraseId', None),
              ('SentenceId', None),
              ('Phrase', TEXT),
              ('Sentiment', LABEL)]

train_data = data.TabularDataset(path='./input/task2_train.csv', format='csv', skip_header=True, fields=data_fields)
valid_data = data.TabularDataset(path='./input/task2_valid.csv', format='csv', skip_header=True, fields=data_fields)

In [19]:
TEXT.build_vocab(train_data, vectors='glove.6B.100d')

In [7]:
batch_size = 32
train_iter = data.Iterator(train_data, batch_size=batch_size, shuffle=True)
valid_iter = data.Iterator(valid_data, batch_size=batch_size)

## CNN模型

In [8]:
num_embedding = len(TEXT.vocab)
embedding_dim = 100 # 根据使用的glove

In [9]:
class CNNModel(nn.Module):
    def __init__(self):
        super(CNNModel, self).__init__()
        self.channel_num = 16
        self.class_num = 5
        self.dropout = 0.1
        self.embedding = nn.Embedding(num_embedding, embedding_dim).from_pretrained(TEXT.vocab.vectors)
        self.conv1 = nn.Conv2d(1, self.channel_num, kernel_size=(3, embedding_dim), padding=(2, 0))
        self.conv2 = nn.Conv2d(1, self.channel_num, kernel_size=(4, embedding_dim), padding=(3, 0))
        self.conv3 = nn.Conv2d(1, self.channel_num, kernel_size=(5, embedding_dim), padding=(4, 0))
        self.dropout = nn.Dropout(self.dropout)
        self.fc = nn.Linear(3*self.channel_num, self.class_num)
        
    def forward(self, x):
        # x [batch_size, sequence_length]
        x = self.embedding(x)
        # [batch_size, sequence_length, embedding_dim]
        x = x.unsqueeze(1)
        # [batch_size, 1, sequence_length, embedding_dim]
        x1 = self.conv_and_pool(x, self.conv1)
        x2 = self.conv_and_pool(x, self.conv2)
        x3 = self.conv_and_pool(x, self.conv3)
        # [batch_size, channel_num]
        x = torch.cat((x1, x2, x3), dim=1)
        # [batch_size, channel_num*3]
        x = self.dropout(x)
        out = self.fc(x)
        # [batch_size, class_num]
        return out
        
    @staticmethod 
    def conv_and_pool(x, conv):
        # x [batch_size, 1, sequence_length, embedding_dim]
        x = conv(x)
        # [batch_size, channel_num, H_out, W_out]  
        # H_out = (sequence_length+2*padding[0]-(kernel_size[0]-1)-1)/stride[0] + 1
        # W_out = (embedding_dim+2*padding[1]-(kernel_size[1]-1)-1)/stride[1] + 1 = 1
        x = F.relu(x.squeeze(3))
        # [batch_size, channel_num, H_out]
        x = F.max_pool1d(x, kernel_size=x.size(2))
        # [batch_size, channel_num, L_out]
        # L_out = (H_out+2*padding-(kernel_size-1)-1) / stride + 1 = 1
        x = x.squeeze(2)
        # [batch_size, channel_num]
        return x
        
        
        

## 训练

In [10]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [11]:
model = CNNModel().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

In [12]:
epoch = 3
for e in range(epoch):
    # 训练
    model.train()
    total_loss = 0.0
    accuracy = 0
    total_train_num = len(train_iter.dataset)
    for i, batch in enumerate(train_iter):
        text = batch.Phrase.to(device)
        label = batch.Sentiment.to(device)
        
        optimizer.zero_grad()

        out =model(text)
        loss = criterion(out, label) 
        
        batch_loss = loss.item()
        total_loss += batch_loss
        batch_accuracy = (torch.argmax(out, dim=1)==label).sum().item()
        accuracy += batch_accuracy
        
        loss.backward()
        optimizer.step()
        if i % 200 == 0:
            print('Epoch_{} Batch_{}, Train Loss:{}, Accuracy:{}'.format(e, i, batch_loss/len(label), batch_accuracy/len(label)))
    print('>>> Epoch_{}, Train Loss:{}, Accuracy:{}'.format(e, total_loss/total_train_num, accuracy/total_train_num))
    
    # 每训练完一个epoch看下
    model.eval()
    total_loss = 0.0
    accuracy = 0
    total_valid_num = len(valid_iter.dataset)
    for batch in valid_iter:
        text = batch.Phrase.to(device)
        label = batch.Sentiment.to(device)
        
        out = model(text)
        
        loss = criterion(out, label)
        total_loss += loss.item()
        accuracy += (torch.argmax(out, dim=1)==label).sum().item()
        
    print('>>> Epoch_{}, Valid loss:{}, Accuracy:{} \n'.format(e, total_loss/total_valid_num, accuracy/total_valid_num))

    
    

Epoch_0 Batch_0, Train Loss:0.04957478493452072, Accuracy:0.28125
Epoch_0 Batch_200, Train Loss:0.025076357647776604, Accuracy:0.6875
Epoch_0 Batch_400, Train Loss:0.040925655514001846, Accuracy:0.34375
Epoch_0 Batch_600, Train Loss:0.03395047411322594, Accuracy:0.5625
Epoch_0 Batch_800, Train Loss:0.03413496911525726, Accuracy:0.5625
Epoch_0 Batch_1000, Train Loss:0.029053203761577606, Accuracy:0.625
Epoch_0 Batch_1200, Train Loss:0.02809777483344078, Accuracy:0.71875
Epoch_0 Batch_1400, Train Loss:0.02457335963845253, Accuracy:0.65625
Epoch_0 Batch_1600, Train Loss:0.027460245415568352, Accuracy:0.625
Epoch_0 Batch_1800, Train Loss:0.024487830698490143, Accuracy:0.65625
Epoch_0 Batch_2000, Train Loss:0.024921800941228867, Accuracy:0.75
Epoch_0 Batch_2200, Train Loss:0.041361577808856964, Accuracy:0.4375
Epoch_0 Batch_2400, Train Loss:0.02737363614141941, Accuracy:0.6875
Epoch_0 Batch_2600, Train Loss:0.03153635561466217, Accuracy:0.65625
Epoch_0 Batch_2800, Train Loss:0.0266115739941

看上去效果不是很理想

## RNN模型 

In [13]:
class RNNModel(nn.Module):
    def __init__(self):
        super(RNNModel, self).__init__()
        self.hidden_size = 55
        self.num_layers = 2
        self.dropout = 0.1
        self.bidirectional = True
        self.class_num = 5
        self.embedding = nn.Embedding(num_embedding, embedding_dim).from_pretrained(TEXT.vocab.vectors)
        self.lstm = nn.LSTM(embedding_dim, hidden_size=self.hidden_size, num_layers = self.num_layers,
                           batch_first=True, dropout=self.dropout, bidirectional=self.bidirectional)
        self.dropout = nn.Dropout(self.dropout)
        self.fc = nn.Linear(self.hidden_size*(2 if self.bidirectional else 1), self.class_num)
        
    def forward(self, x):
        # x [batch_size, squence_length]
        h0 = torch.zeros(self.num_layers*(2 if self.bidirectional else 1), x.shape[0], self.hidden_size).to(device)
        c0 = torch.zeros(self.num_layers*(2 if self.bidirectional else 1), x.shape[0], self.hidden_size).to(device)
        
        x = self.embedding(x)
        # [batch_size, squence_length, embedding_dim]
        
        out, hidden = self.lstm(x, (h0, c0))
        # out [batch_size, squence_length, hidden_size*2 (bidirectional)]
        # hidden  (h1, c1)

        out =self.dropout(out)
        
        out = self.fc(out[:,-1,:]) # 取最后一个输出
        
        return out 
    
    

In [14]:
model = RNNModel().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

In [15]:
epoch = 3
for e in range(epoch):
    # 训练
    model.train()
    total_loss = 0.0
    accuracy = 0
    total_train_num = len(train_iter.dataset)
    for i, batch in enumerate(train_iter):
        text = batch.Phrase.to(device)
        label = batch.Sentiment.to(device)
        
        optimizer.zero_grad()

        out =model(text)
        loss = criterion(out, label) 
        
        batch_loss = loss.item()
        total_loss += batch_loss
        batch_accuracy = (torch.argmax(out, dim=1)==label).sum().item()
        accuracy += batch_accuracy
        
        loss.backward()
        optimizer.step()
        if i % 200 == 0:
            print('Epoch_{} Batch_{}, Train Loss:{}, Accuracy:{}'.format(e, i, batch_loss/len(label), batch_accuracy/len(label)))
    print('>>> Epoch_{}, Train Loss:{}, Accuracy:{}'.format(e, total_loss/total_train_num, accuracy/total_train_num))
    
    # 每训练完一个epoch看下
    model.eval()
    total_loss = 0.0
    accuracy = 0
    total_valid_num = len(valid_iter.dataset)
    for batch in valid_iter:
        text = batch.Phrase.to(device)
        label = batch.Sentiment.to(device)
        
        out = model(text)
        
        loss = criterion(out, label)
        total_loss += loss.item()
        accuracy += (torch.argmax(out, dim=1)==label).sum().item()
        
    print('>>> Epoch_{}, Valid loss:{}, Accuracy:{} \n'.format(e, total_loss/total_valid_num, accuracy/total_valid_num))

    

Epoch_0 Batch_0, Train Loss:0.048784174025058746, Accuracy:0.625
Epoch_0 Batch_200, Train Loss:0.040376611053943634, Accuracy:0.53125
Epoch_0 Batch_400, Train Loss:0.04513956978917122, Accuracy:0.375
Epoch_0 Batch_600, Train Loss:0.036570582538843155, Accuracy:0.5
Epoch_0 Batch_800, Train Loss:0.03312668576836586, Accuracy:0.59375
Epoch_0 Batch_1000, Train Loss:0.046475473791360855, Accuracy:0.34375
Epoch_0 Batch_1200, Train Loss:0.04132869467139244, Accuracy:0.53125
Epoch_0 Batch_1400, Train Loss:0.03976373001933098, Accuracy:0.53125
Epoch_0 Batch_1600, Train Loss:0.04671654850244522, Accuracy:0.34375
Epoch_0 Batch_1800, Train Loss:0.035003092139959335, Accuracy:0.6875
Epoch_0 Batch_2000, Train Loss:0.038686737418174744, Accuracy:0.59375
Epoch_0 Batch_2200, Train Loss:0.04222581908106804, Accuracy:0.5625
Epoch_0 Batch_2400, Train Loss:0.04316255822777748, Accuracy:0.46875
Epoch_0 Batch_2600, Train Loss:0.03934739902615547, Accuracy:0.4375
Epoch_0 Batch_2800, Train Loss:0.0390295386314