# In [None]:
# Import the required libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer
import datetime
import numpy as np
import io
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from keras.models import Model
from keras.layers import Input, Embedding, Dense, Conv2D, MaxPool2D,LSTM,Bidirectional,Activation,Conv1D,GRU
from keras.layers import Reshape, Flatten, Concatenate, concatenate,Dropout, SpatialDropout1D
from keras.layers import GlobalMaxPooling1D, MaxPooling1D, Add, Flatten,GlobalAveragePooling1D
from keras.preprocessing import text, sequence
from keras.callbacks import Callback
import os
from keras import initializers, regularizers, constraints, optimizers, layers, callbacks
from keras.callbacks import EarlyStopping,ModelCheckpoint
from keras.optimizers import Adam
# Wall-clock start of the run.
starttime = datetime.datetime.now()

# Name of the column holding the raw comment text, and the label columns
# that the models predict.
text_name = 'comment_text'
class_names = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

# Load the training and test sets; missing cells are replaced by a single space.
train, test = (
    pd.read_csv(csv_path).fillna(' ')
    for csv_path in ('train.csv', 'test.csv')
)

def LR():
    """Fit a TF-IDF + logistic-regression baseline and predict the test set.

    Word-level and character-level TF-IDF features are fitted on the
    concatenated train and test comments, stacked side by side, and used to
    train one independent ``LogisticRegression(solver='sag')`` per entry of
    ``class_names``.  For every label the mean 3-fold cross-validated ROC-AUC
    is printed, followed by the mean over all labels.

    Reads the module-level ``train``, ``test``, ``class_names`` and
    ``text_name``.

    Returns:
        pandas.DataFrame: the test ``id`` column plus one column per label
        holding ``predict_proba(...)[:, 1]`` for the test comments.
    """
    print("start LR...")
    train_text = train[text_name]
    test_text = test[text_name]
    # Both vectorizers are fitted on train + test text so the vocabulary also
    # covers terms that only occur in the test comments.
    all_text = pd.concat([train_text, test_text])

    # Word-level TF-IDF over single tokens.
    word_vectorizer = TfidfVectorizer(
        sublinear_tf=True,
        strip_accents='unicode',
        analyzer='word',
        ngram_range=(1, 1),
        token_pattern=r'\w{1,}',
        stop_words='english',
        max_features=10000)
    word_vectorizer.fit(all_text)
    x_word_train = word_vectorizer.transform(train_text)
    x_word_test = word_vectorizer.transform(test_text)

    # Character-level TF-IDF over 2- to 6-grams.  ``stop_words`` is omitted on
    # purpose: scikit-learn only applies it when ``analyzer='word'``, so
    # passing it here had no effect on the features.
    char_vectorizer = TfidfVectorizer(
        sublinear_tf=True,
        strip_accents='unicode',
        analyzer='char',
        ngram_range=(2, 6),
        max_features=50000)
    char_vectorizer.fit(all_text)
    x_char_train = char_vectorizer.transform(train_text)
    x_char_test = char_vectorizer.transform(test_text)

    # Final feature matrix: character features followed by word features.
    # Built as CSR once so the per-label fits below do not each have to
    # convert the stacked matrix again.
    x_train = hstack([x_char_train, x_word_train], format='csr')
    x_test = hstack([x_char_test, x_word_test], format='csr')

    scores = []
    submission = pd.DataFrame.from_dict({'id': test['id']})
    for class_name in class_names:
        y_train = train[class_name]
        classifier = LogisticRegression(solver='sag')
        # Cross-validated estimate of the ROC-AUC for this label.
        cv_score = np.mean(cross_val_score(classifier, x_train, y_train, cv=3, scoring='roc_auc'))
        scores.append(cv_score)
        print('LR score for class {} is {}'.format(class_name, cv_score))
        # Refit on the full training set before predicting the test set.
        classifier.fit(x_train, y_train)
        submission[class_name] = classifier.predict_proba(x_test)[:, 1]

    print('Total LR score is {}'.format(np.mean(scores)))
    print('end LR...')
    return submission

    # The returned frame is written to a CSV file by the caller.
# Vocabulary cap: passed to the tokenizer as ``num_words`` and used as the
# number of rows of the embedding layers.
max_features = 100000

# Length every tokenised comment is padded / truncated to.
maxlen = 200

# Dimensionality of the pretrained word vectors.
embed_size = 300

def open_file(fname, dim=None):
    """Load a space-separated text embedding file into a dict.

    Every data line is expected to look like ``<token> <v1> ... <vD>``.
    Compared with splitting on every space, this loader

    * skips a leading ``<count> <dim>`` header line (two integer fields), so
      the header is not stored as a bogus one-element "word vector", and
    * splits from the right at most ``D`` times, so tokens that themselves
      contain spaces stay intact instead of breaking the float conversion.

    Args:
        fname: path of the UTF-8 encoded embedding file.
        dim: vector dimensionality ``D``.  When ``None`` it is taken from the
            header line if there is one, otherwise from the number of fields
            on the first non-empty line.

    Returns:
        dict mapping each token to a 1-D ``float32`` numpy array of length
        ``D``.  Empty lines and lines whose last ``D`` fields are not all
        numeric are skipped.
    """
    embeddings_index = {}
    first_line_pending = True
    with open(fname, encoding='utf8') as f:
        for line in f:
            line = line.rstrip()
            if not line:
                continue
            if first_line_pending:
                first_line_pending = False
                fields = line.split(' ')
                # A header consists of exactly two non-negative integers.
                if len(fields) == 2 and all(field.isdigit() for field in fields):
                    if dim is None:
                        dim = int(fields[1])
                    continue
                if dim is None:
                    dim = len(fields) - 1
            # Limit the split so spaces inside the token are preserved.
            parts = line.rsplit(' ', dim)
            if len(parts) != dim + 1:
                continue
            try:
                coefs = np.asarray(parts[1:], dtype='float32')
            except ValueError:
                # Malformed line: the trailing fields are not all numbers.
                continue
            embeddings_index[parts[0]] = coefs
    return embeddings_index

def getEmbeddingMatrix(fname):
    """Tokenise the comments and build the matching pretrained embedding matrix.

    A Keras ``Tokenizer`` limited to ``max_features`` words is fitted on the
    train and test comments together.  Both sets are converted to integer
    sequences and padded / truncated to ``maxlen``.  The embedding matrix has
    one row per token id, filled from the vectors loaded with ``open_file``;
    ids without a pretrained vector keep an all-zero row.

    Reads the module-level ``train``, ``test``, ``text_name``,
    ``max_features``, ``maxlen`` and ``embed_size``.

    Args:
        fname: path of the pretrained embedding text file.

    Returns:
        tuple ``(X_train, X_test, embedding_matrix)`` where the first two are
        the padded sequences and ``embedding_matrix`` has shape
        ``(max_features, embed_size)``.
    """
    train_texts = train[text_name].values
    test_texts = test[text_name].values
    embeddings_index = open_file(fname)

    # Fit the tokenizer on all comments so train and test share one word index.
    tokenizer = text.Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(list(train_texts) + list(test_texts))

    # Text -> list of token ids -> fixed-length integer array.
    X_train = sequence.pad_sequences(tokenizer.texts_to_sequences(train_texts), maxlen=maxlen)
    X_test = sequence.pad_sequences(tokenizer.texts_to_sequences(test_texts), maxlen=maxlen)

    # Always allocate ``max_features`` rows.  Token ids start at 1, so sizing
    # the matrix by ``len(word_index)`` would overflow for a small vocabulary,
    # and the models build their Embedding layer with ``max_features`` rows.
    embedding_matrix = np.zeros((max_features, embed_size))
    for word, i in tokenizer.word_index.items():
        if i >= max_features:
            continue
        embedding_vector = embeddings_index.get(word)
        # Ignore vectors of the wrong length instead of letting numpy
        # broadcast them across the whole row.
        if embedding_vector is not None and np.shape(embedding_vector) == (embed_size,):
            embedding_matrix[i] = embedding_vector
    return X_train, X_test, embedding_matrix

class RocAucEvaluation(Callback):
    """Keras callback that prints the ROC-AUC on a held-out set during training.

    Args:
        validation_data: ``(X_val, y_val)`` pair.  It is unpacked in the
            constructor, so it must contain exactly two items; the empty
            default raises ``ValueError``.
        interval: the score is computed on epochs whose zero-based index is a
            multiple of this value.
    """
    def __init__(self, validation_data=(), interval=1):
        # Name this class (not ``Callback``) in super() so that
        # ``Callback.__init__`` itself runs; ``super(Callback, self)`` would
        # skip it and go straight to the next class in the MRO.
        super(RocAucEvaluation, self).__init__()
        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        # ``logs`` is unused; ``None`` replaces the shared mutable default ``{}``.
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            # ``epoch`` is zero-based; report it one-based.
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))

def textCNN():
    """Train a multi-kernel text CNN on pretrained vectors and predict the test set.

    The comments are embedded with the vectors from ``crawl-300d-2M.vec``.
    Four parallel ``Conv2D`` branches with kernel heights 1, 2, 3 and 5 (each
    spanning the full embedding width) are max-pooled over the whole sequence,
    concatenated and fed to a 6-unit sigmoid layer.  4% of the training data
    is held out for validation and its ROC-AUC is printed after every epoch.

    Reads the module-level ``train``, ``test``, ``class_names``,
    ``max_features``, ``maxlen`` and ``embed_size``.

    Returns:
        pandas.DataFrame: the test ``id`` column followed by one column per
        entry of ``class_names`` holding the model outputs.
    """
    print('start CNN...')
    num_filters = 32  # filters per convolution branch
    kernel_heights = (1, 2, 3, 5)  # number of consecutive tokens each branch sees

    Y_train = train[class_names].values
    X_train, X_test, embedding_matrix = getEmbeddingMatrix("crawl-300d-2M.vec")

    inp = Input(shape=(maxlen, ))
    # Look up the pretrained vector of every token id.
    x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
    # Drops whole embedding channels rather than single values.
    x = SpatialDropout1D(0.2)(x)
    # Add a trailing channel axis so the sequence can be treated as an image.
    x = Reshape((maxlen, embed_size, 1))(x)

    # One convolution per kernel height; all convolutions are created before
    # the pooling layers, in the same order as the hand-written version.
    convs = [
        Conv2D(num_filters, kernel_size=(height, embed_size),
               kernel_initializer='normal', activation='elu')(x)
        for height in kernel_heights
    ]
    # A "valid" convolution of height h leaves maxlen - h + 1 positions;
    # pooling over all of them keeps one value per filter.
    pools = [
        MaxPool2D(pool_size=(maxlen - height + 1, 1))(conv)
        for height, conv in zip(kernel_heights, convs)
    ]

    z = Concatenate(axis=1)(pools)
    z = Flatten()(z)
    z = Dropout(0.1)(z)
    # Output layer: six sigmoid units, one per label.
    outp = Dense(6, activation="sigmoid")(z)

    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    # Hold out 4% of the training data for validation.
    x_train, x_val, y_train, y_val = train_test_split(X_train, Y_train, train_size=0.96)
    RocAuc = RocAucEvaluation(validation_data=(x_val, y_val), interval=1)
    # verbose=2 prints one line per epoch.
    model.fit(x_train, y_train, batch_size=256, epochs=3, validation_data=(x_val, y_val),
              callbacks=[RocAuc], verbose=2)

    y_pred = model.predict(X_test)
    # Take the ids from the same frame the predictions were computed from (as
    # LR() does) instead of relying on the row order of a separate file.
    submission = pd.DataFrame(y_pred, columns=class_names)
    submission.insert(0, 'id', test['id'].values)
    print('end CNN...')
    return submission

def textRNN():
    """Train a bidirectional-GRU model on pretrained vectors and predict the test set.

    The comments are embedded with the frozen vectors from
    ``glove.840B.300d.txt``, passed through a bidirectional GRU and a 1-D
    convolution, then reduced by global average and global max pooling.  The
    two pooled vectors are concatenated and fed to a 6-unit sigmoid layer.
    4% of the training data is held out for validation and its ROC-AUC is
    printed after every epoch.

    Reads the module-level ``train``, ``test``, ``class_names``,
    ``max_features``, ``maxlen`` and ``embed_size``.

    Returns:
        pandas.DataFrame: the test ``id`` column followed by one column per
        entry of ``class_names`` holding the model outputs.
    """
    print('start RNN...')
    Y_train = train[class_names].values
    X_train, X_test, embedding_matrix = getEmbeddingMatrix("glove.840B.300d.txt")

    sequence_input = Input(shape=(maxlen, ))
    # trainable=False keeps the pretrained vectors fixed during training.
    x = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable=False)(sequence_input)
    x = SpatialDropout1D(0.2)(x)
    x = Bidirectional(GRU(128, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
    x = Conv1D(64, kernel_size=3, padding="valid", kernel_initializer="glorot_uniform")(x)
    # Summarise the sequence in two ways and use both summaries.
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    x = concatenate([avg_pool, max_pool])
    # Output layer: six sigmoid units, one per label.
    preds = Dense(6, activation="sigmoid")(x)
    model = Model(sequence_input, preds)
    model.compile(loss='binary_crossentropy', optimizer=Adam(lr=1e-3), metrics=['accuracy'])

    # Hold out 4% of the training data for validation.
    x_train, x_val, y_train, y_val = train_test_split(X_train, Y_train, train_size=0.96)
    RocAuc = RocAucEvaluation(validation_data=(x_val, y_val), interval=1)

    model.fit(x_train, y_train, batch_size=128, epochs=4, validation_data=(x_val, y_val),
              callbacks=[RocAuc], verbose=2)
    y_pred = model.predict(X_test, batch_size=1024, verbose=1)

    # Take the ids from the same frame the predictions were computed from (as
    # LR() does) instead of relying on the row order of a separate file.
    submission = pd.DataFrame(y_pred, columns=class_names)
    submission.insert(0, 'id', test['id'].values)
    print('end RNN...')
    return submission

# Run the three models in turn and save each model's predictions.
lr = LR()
lr.to_csv('submission_LR.csv', index=False)

cnn = textCNN()
cnn.to_csv('submission_CNN.csv', index=False)

rnn = textRNN()
rnn.to_csv('submission_RNN.csv', index=False)

# Every column of the LR frame except 'id' is a prediction column.
col = lr.columns.tolist()
col.remove('id')

# Weighted average of the three models: 0.3 * LR + 0.2 * CNN + 0.5 * RNN,
# computed for all prediction columns at once.
blend = lr.copy()
blend[col] = (3 * lr[col] + 2 * cnn[col] + 5 * rnn[col]) / 10

blend.to_csv('325submission.csv', index=False)

# Report the total running time.
endtime = datetime.datetime.now()
print(endtime - starttime)