In [6]:
import pandas as pd
import jieba

In [4]:
# Read the tab-separated train and test splits of the competition data set.
train_df, test_df = (
    pd.read_csv(csv_path, sep='\t')
    for csv_path in (
        './疫情微博情绪识别挑战赛公开数据/train.csv',
        './疫情微博情绪识别挑战赛公开数据/test.csv',
    )
)

In [12]:
# Preview the first two rows of the training frame.
train_df.iloc[:2]

Unnamed: 0,text,label,words
0,这是在向世界上所有的母亲宣战！[怒] //@子小亻青loukas妈:听说后一直不想看，耐不住...,0,"[这, 是, 在, 向, 世界, 上, 所有, 的, 母亲, 宣战, ！, [, 怒, ],..."
1,和少奶奶@5棒冰 一起收拾完衣柜，就躺在听她给我讲#步步惊心#的情感纠葛「四和八喜欢她」，「...,0,"[和, 少奶奶, @, 5, 棒冰, , 一起, 收拾, 完, 衣柜, ，, 就, 躺, ..."


In [15]:
# Segment every training text with jieba and keep the tokens as one
# space-separated string per row.
train_df['words'] = train_df['text'].map(lambda text: ' '.join(jieba.cut(text)))

In [16]:
# Segment every test text with jieba and keep the tokens as one
# space-separated string per row (same preprocessing as the training frame).
test_df['words'] = test_df['text'].map(lambda text: ' '.join(jieba.cut(text)))

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline # 组合流水线

In [18]:
# Baseline model: TF-IDF features over the space-separated jieba tokens,
# fed into a logistic-regression classifier.
# NOTE(review): TfidfVectorizer's default token_pattern only keeps tokens of
# two or more word characters, so single-character Chinese words are probably
# dropped here - confirm this is intended.
pipline = make_pipeline(TfidfVectorizer(), LogisticRegression())
pipline.fit(
    X=train_df['words'].tolist(),
    y=train_df['label'].tolist(),
)

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),
                ('logisticregression', LogisticRegression())])

In [22]:
# Predict test labels with the TF-IDF + logistic-regression baseline and write
# them as the submission file (original author's note: scores around 86).
pd.DataFrame(
    pipline.predict(test_df['words']),
    columns=['label'],
).to_csv('lr_submit.csv', index=None)

In [25]:
# Requires the `transformers` package (pip install transformers), which
# provides loading and usage of BERT-family models.
from transformers import BertTokenizer

# Tokenizer (and its vocabulary) for the 'bert-base-chinese' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

# Encode the raw texts of both splits with identical settings: truncate to at
# most 128 tokens and pad the sequences within each split.
train_encoding, test_encoding = (
    tokenizer(frame['text'].tolist(), truncation=True, padding=True, max_length=128)
    for frame in (train_df, test_df)
)

In [31]:
from torch.utils.data import Dataset, DataLoader, TensorDataset
import torch

# Dataset wrapper around the tokenizer output
class NewsDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    Args:
        encodings: Mapping from field name (e.g. ``input_ids``) to a sequence
            holding one entry per sample; must support ``.items()``.
        labels: Iterable with one label per sample, in the same order as the
            encodings. Each label must be convertible with ``int()``.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        # Materialise as a plain list so that __getitem__ indexes by POSITION.
        # A pandas Series would be indexed by label instead, which raises
        # KeyError (or picks the wrong row) when its index is not 0..n-1.
        self.labels = list(labels)

    # Fetch a single sample
    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, including a ``labels`` entry."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(int(self.labels[idx]))
        return item

    def __len__(self):
        """Return the number of samples (one per label)."""
        return len(self.labels)

# Wrap the encoded splits as datasets. The test split has no ground truth, so
# it is given an all-zero placeholder label per row to satisfy the interface.
train_dataset = NewsDataset(encodings=train_encoding, labels=train_df['label'])
test_dataset = NewsDataset(encodings=test_encoding, labels=[0 for _ in range(len(test_df))])

In [32]:
# Accuracy computation
def flat_accuracy(preds, labels):
    """Return the fraction of samples whose arg-max prediction equals its label.

    Args:
        preds: Array-like of per-class scores with samples on axis 0 and
            classes on axis 1.
        labels: Array-like of class ids, one per sample (any shape; flattened).

    Returns:
        Accuracy as a float between 0 and 1.
    """
    # Imported locally because the notebook does not import numpy at the top
    # level; without this the function raises NameError on `np`.
    import numpy as np

    # asarray lets plain Python lists be passed as well as arrays.
    pred_flat = np.argmax(np.asarray(preds), axis=1).flatten()
    labels_flat = np.asarray(labels).flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [45]:
from transformers import BertForSequenceClassification, get_linear_schedule_with_warmup
# AdamW comes from torch: the copy that used to be exported by `transformers`
# was deprecated in favour of torch.optim.AdamW, and importing it from
# `transformers` may fail on recent releases.
from torch.optim import AdamW

# Sequence-classification BERT with two output labels, moved to the GPU when
# one is available.
model = BertForSequenceClassification.from_pretrained('bert-base-chinese', num_labels=2)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Go from per-sample access to batched access. Only the training loader is
# shuffled; the test loader keeps row order so predictions line up with test_df.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Optimizer. eps and weight_decay are set explicitly to what I recall as the
# defaults of the former transformers.AdamW (torch's own defaults are 1e-8 and
# 1e-2), so the training hyper-parameters should stay as before.
# NOTE(review): confirm these two values against the transformers release that
# was originally used.
optim = AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
# Number of optimisation steps for 1 epoch.
# NOTE(review): neither total_steps nor get_linear_schedule_with_warmup is used
# anywhere in the visible notebook - presumably a learning-rate scheduler was
# planned; confirm whether it should be added.
total_steps = len(train_loader) * 1

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [46]:
# Training routine
def train(epoch=None):
    """Run one full pass over ``train_loader``, updating ``model`` in place.

    Relies on the notebook-level ``model``, ``optim``, ``train_loader`` and
    ``device``. Prints the current batch loss every 100 iterations and the
    mean batch loss once the pass is finished.

    Args:
        epoch: Epoch number shown in the progress messages. When omitted, the
            function falls back to a notebook-level ``epoch`` variable if one
            exists (the original calling convention), otherwise to 0.
    """
    if epoch is None:
        # Backward-compatible fallback for callers that still use `train()`
        # inside a `for epoch in ...` loop.
        epoch = globals().get('epoch', 0)

    model.train()
    total_train_loss = 0
    total_iter = len(train_loader)
    for iter_num, batch in enumerate(train_loader, start=1):
        # Forward pass
        optim.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_train_loss += loss.item()

        # Backward pass; gradient norm is clipped to 1.0 before the update
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Parameter update
        optim.step()

        if iter_num % 100 == 0:
            print("epoth: %d, iter_num: %d, loss: %.4f, %.2f%%" % (epoch, iter_num, loss.item(), iter_num/total_iter*100))

    # max(..., 1) avoids a ZeroDivisionError when the loader yields no batches.
    print("Epoch: %d, Average training loss: %.4f"%(epoch, total_train_loss/max(total_iter, 1)))

# Fine-tune for a single epoch, passing the epoch number explicitly instead of
# relying on train() picking up the loop variable as a global.
for epoch in range(1):
    print("------------Epoch: %d ----------------" % epoch)
    train(epoch)

------------Epoch: 0 ----------------
epoth: 0, iter_num: 100, loss: 0.0293, 2.67%
epoth: 0, iter_num: 200, loss: 0.0087, 5.33%
epoth: 0, iter_num: 300, loss: 0.1835, 8.00%
epoth: 0, iter_num: 400, loss: 0.0722, 10.67%
epoth: 0, iter_num: 500, loss: 0.0275, 13.33%
epoth: 0, iter_num: 600, loss: 0.0207, 16.00%
epoth: 0, iter_num: 700, loss: 0.0315, 18.67%
epoth: 0, iter_num: 800, loss: 0.0209, 21.33%
epoth: 0, iter_num: 900, loss: 0.4200, 24.00%
epoth: 0, iter_num: 1000, loss: 0.1209, 26.67%
epoth: 0, iter_num: 1100, loss: 0.0093, 29.33%
epoth: 0, iter_num: 1200, loss: 0.0229, 32.00%
epoth: 0, iter_num: 1300, loss: 0.0164, 34.67%
epoth: 0, iter_num: 1400, loss: 0.1712, 37.33%
epoth: 0, iter_num: 1500, loss: 0.0070, 40.00%
epoth: 0, iter_num: 1600, loss: 0.3227, 42.67%
epoth: 0, iter_num: 1700, loss: 0.2320, 45.33%
epoth: 0, iter_num: 1800, loss: 0.0102, 48.00%
epoth: 0, iter_num: 1900, loss: 0.0195, 50.67%
epoth: 0, iter_num: 2000, loss: 0.4099, 53.33%
epoth: 0, iter_num: 2100, loss: 0.

In [47]:
# Predict a label for every test sample.
# Switch to evaluation mode first: train() leaves the model in training mode,
# in which layers such as dropout stay active and make predictions noisy.
model.eval()
pred_label = []
with torch.no_grad():
    for batch in test_dataloader:
        # Forward pass only; no gradients are tracked, so nothing needs zeroing.
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        # Index of the highest logit per sample, moved to the CPU as plain ints.
        pred_label.extend(outputs.logits.argmax(dim=1).cpu().tolist())

In [48]:
# Write the fine-tuned BERT predictions as the submission file
# (original author's note: scores around 96).
pd.DataFrame(
    pred_label,
    columns=['label'],
).to_csv('bert_submit.csv', index=None)