In [1]:
from sklearn import datasets
from sklearn import svm
import random
import csv
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
import numpy as np
import gensim
import tensorflow as tf
import load_data as ld
import count_data as cd

unable to import 'smart_open.gcs', disabling that module


In [2]:
# Compute macro-averaged precision and recall, and the F1 derived from them
def evaluate(actual, pred):
    """Score predictions with macro-averaged precision, recall and F1.

    The F1 value is the harmonic mean of the two macro averages. A small
    constant is added to the denominator so the division stays defined when
    both averages are zero.

    Args:
        actual: Ground-truth labels.
        pred: Predicted labels, aligned with ``actual``.

    Returns:
        list: ``[f1, precision, recall]``.
    """
    precision, recall = [
        score_fn(actual, pred, average='macro')
        for score_fn in (metrics.precision_score, metrics.recall_score)
    ]
    harmonic_denominator = precision + recall + 1e-7
    return [2 * precision * recall / harmonic_denominator, precision, recall]

# Build and fit the SVM classifier
def train_clf(train_data, train_tags, kernel='rbf'):
    """Fit a class-balanced ``svm.SVC`` on the given features and labels.

    Args:
        train_data: Feature matrix accepted by ``SVC.fit``.
        train_tags: Labels for ``train_data``; converted with ``np.asarray``
            before fitting.
        kernel: Kernel name forwarded to ``SVC`` (default ``'rbf'``).

    Returns:
        The fitted ``svm.SVC`` instance.
    """
    clf = svm.SVC(
        C=300.0,
        cache_size=2000,
        class_weight='balanced',
        coef0=0.0,
        # scikit-learn only accepts 'ovo' or 'ovr' here; the former value
        # None is not a valid option. 'ovr' is the library default.
        decision_function_shape='ovr',
        degree=3,
        gamma='auto',
        kernel=kernel,
        max_iter=-1,
        probability=False,
        random_state=None,
        shrinking=True,
        tol=0.001,
        verbose=False,
    )
    clf.fit(train_data, np.asarray(train_tags))

    return clf

In [3]:
# Select and load the embedding model / vectorizer
def get_model(model_type, model_name, embedding_dim=100):
    """Load or construct the feature model identified by ``model_type``.

    Args:
        model_type: One of ``'word2vec'``, ``'doc2vec'``, ``'google'`` or
            ``'tfidf'``.
        model_name: Base file name under ``data/`` for the ``'word2vec'``
            (``.w2v``) and ``'doc2vec'`` (``.d2v``) models; ignored by the
            other types.
        embedding_dim: Unused by this function; kept so existing callers
            that pass it keep working.

    Returns:
        The word vectors (``.wv``) of the loaded Word2Vec model, the loaded
        Doc2Vec model, the Google News ``KeyedVectors``, or a new, unfitted
        ``TfidfVectorizer``.

    Raises:
        ValueError: If ``model_type`` is not one of the supported names.
    """
    if model_type == 'word2vec':
        return gensim.models.word2vec.Word2Vec.load("data/{}.w2v".format(model_name)).wv
    if model_type == 'doc2vec':
        return gensim.models.doc2vec.Doc2Vec.load("data/{}.d2v".format(model_name))
    if model_type == 'google':
        return gensim.models.KeyedVectors.load_word2vec_format(
            '../input/GoogleNews-vectors-negative300.bin', binary=True)
    if model_type == 'tfidf':
        return TfidfVectorizer(min_df=5,
                               max_df=0.8,
                               sublinear_tf=True,
                               use_idf=True)
    # Previously an unknown type fell through to `return model` and failed
    # with an UnboundLocalError that did not name the offending value.
    raise ValueError(
        "Unknown model_type {!r}; expected 'word2vec', 'doc2vec', 'google' or 'tfidf'".format(model_type))

In [4]:
# Load the dataset
def prepare_train(data_path='data/clean_data_0.5_no_repeat.csv'):
    """Load the train/test split and convert both label sets.

    Args:
        data_path: CSV file handed to ``ld.load_data``. Defaults to the
            dataset this notebook was written against, so existing
            zero-argument calls behave as before.

    Returns:
        tuple: ``((x_train, y_train), (x_test, y_test))`` where both label
        sets have been passed through ``ld.transform_to_multi_class``
        (presumably yielding one label vector per category, since the
        scoring code indexes the labels by category — confirm in load_data).
    """
    (x_train, y_train), (x_test, y_test) = ld.load_data(data_path)
    y_train = ld.transform_to_multi_class(y_train)
    y_test = ld.transform_to_multi_class(y_test)
    return (x_train, y_train), (x_test, y_test)

In [5]:
# Turn raw text into feature representations that can be fed to the SVM
def prepare_vec(model, x_train, x_test, model_name, embedding_dim):
    """Vectorize the train and test texts with the given model.

    Args:
        model: Object matching ``model_name``: word vectors exposing
            ``vector_size`` and item lookup for ``'word2vec'``, an object
            with ``infer_vector`` for ``'doc2vec'``, or a vectorizer with
            ``fit_transform`` / ``transform`` for ``'tfidf'``.
        x_train: Iterable of training texts.
        x_test: Iterable of test texts.
        model_name: ``'word2vec'``, ``'doc2vec'`` or ``'tfidf'``.
        embedding_dim: Output size of the embedding layer; only used by the
            ``'word2vec'`` branch, where it must equal ``model.vector_size``.

    Returns:
        tuple: ``(x_train, x_test)`` feature representations. The
        ``'word2vec'`` branch returns the max-pooled embedding layer outputs,
        ``'doc2vec'`` returns lists of inferred vectors and ``'tfidf'``
        returns the vectorizer's outputs.

    Raises:
        ValueError: If ``model_name`` is not supported, or if
            ``embedding_dim`` disagrees with ``model.vector_size``.
    """
    if model_name == 'word2vec':
        # word2vec (including the Google News pretrained vectors)
        if embedding_dim != model.vector_size:
            # The embedding matrix below has model.vector_size columns, so a
            # different layer size could never accept it as weights.
            raise ValueError(
                "embedding_dim ({}) does not match model.vector_size ({})".format(
                    embedding_dim, model.vector_size))

        max_length = 200
        tokenizer = Tokenizer(oov_token='<OOV>')
        tokenizer.fit_on_texts(x_train)
        word_index = tokenizer.word_index
        vocab_size = len(word_index) + 1

        x_train = tokenizer.texts_to_sequences(x_train)
        x_train = pad_sequences(x_train, padding='post', maxlen=max_length)

        x_test = tokenizer.texts_to_sequences(x_test)
        x_test = pad_sequences(x_test, padding='post', maxlen=max_length)

        embedding_matrix = np.zeros((vocab_size, model.vector_size))
        for word, index in word_index.items():
            try:
                embedding_matrix[int(index)] = model[str(word)]
            except KeyError:
                # Words missing from the pretrained vectors keep a zero row.
                continue

        embedder = tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length,
                                             weights=[embedding_matrix], trainable=False)
        x_train = embedder(x_train)
        x_test = embedder(x_test)
        # Collapse the sequence axis so each text becomes a single vector.
        x_train = tf.keras.layers.GlobalMaxPooling1D()(x_train)
        x_test = tf.keras.layers.GlobalMaxPooling1D()(x_test)
    elif model_name == 'doc2vec':
        # doc2vec: infer one vector per whitespace-tokenized text
        x_train = [segment.split() for segment in x_train]
        x_test = [segment.split() for segment in x_test]
        x_train = [model.infer_vector(segment) for segment in x_train]
        x_test = [model.infer_vector(segment) for segment in x_test]
    elif model_name == 'tfidf':
        # tf-idf: fit on the training texts only, then reuse that
        # vocabulary for the test texts
        x_train = model.fit_transform(x_train)
        x_test = model.transform(x_test)
    else:
        raise ValueError(
            "Unknown model_name {!r}; expected 'word2vec', 'doc2vec' or 'tfidf'".format(model_name))
    return (x_train, x_test)

In [6]:
# Train one classifier per category and collect its scores
def get_score(x_train, y_train, x_test, y_test, kernel='rbf'):
    """Fit and evaluate one SVM per category.

    Args:
        x_train: Training features, shared by every per-category classifier.
        y_train: Per-category training labels, indexable by category index
            (0-11). Each entry may be an object exposing ``.numpy()`` or any
            array-like accepted by ``train_clf``.
        x_test: Test features.
        y_test: Per-category test labels, indexable by category index.
        kernel: SVM kernel name forwarded to ``train_clf``.

    Returns:
        list: One ``[f1, precision, recall]`` entry per category, ordered by
        category index. The table is also printed.
    """
    # Category name -> index of that category's labels in y_train / y_test.
    type_dic = {'Introductory/Generic': 0, 'Practice not covered': 1,
                'Privacy contact information': 2, 'User Access, Edit and Deletion': 3,
                'Data Security': 4, 'International and Specific Audiences': 5,
                'Do Not Track': 6, 'User Choice/Control': 7,
                'Data Retention': 8, 'Policy Change': 9,
                'First Party Collection/Use': 10, 'Third Party Sharing/Collection': 11}
    table_result = []
    for index in sorted(type_dic.values()):
        train_labels = y_train[index]
        # Accept both tensor-like labels and plain arrays/lists.
        if hasattr(train_labels, 'numpy'):
            train_labels = train_labels.numpy()
        # Each classifier is only needed for its own predictions, so it is
        # not retained once the category has been scored.
        classifier = train_clf(x_train, train_labels, kernel)
        predictions = classifier.predict(x_test)
        result = evaluate(np.asarray(y_test[index]), predictions)
        table_result.append([result[0], result[1], result[2]])
    print(table_result)
    return table_result

In [7]:
# Average the scores over all runs and save them; for the SVM a single run is actually enough
def save_result(result, file_name, run_num):
    """Average per-category scores over the runs and write them to a CSV file.

    Args:
        result: ``result[run][category]`` is ``[f1, precision, recall]`` for
            each of the first ``run_num`` runs and each of the 12 categories.
        file_name: Output path without extension; ``.csv`` is appended.
        run_num: Number of runs to average over.

    Side effects:
        Prints the formatted table and writes it to ``<file_name>.csv`` with
        one ``f1/precision/recall`` cell per row (two decimals each): twelve
        category rows followed by one row holding the mean over categories.
    """
    category_count = 12
    metric_count = 3

    def mean_over_runs(category, metric):
        # Plain left-to-right accumulation starting from integer zero.
        total = 0
        for run in range(run_num):
            total = total + result[run][category][metric]
        return total / run_num

    def as_cell(values):
        return ['/'.join([str('%.2f' % value) for value in values])]

    averages = [
        [mean_over_runs(category, metric) for metric in range(metric_count)]
        for category in range(category_count)
    ]
    table_result = [as_cell(row) for row in averages]

    # Final row: mean of the per-category averages.
    overall = [0] * metric_count
    for row in averages:
        for metric in range(metric_count):
            overall[metric] = overall[metric] + row[metric]
    table_result.append(as_cell([total / 12 for total in overall]))

    print(table_result)
    with open(r'{}.csv'.format(file_name), 'w', encoding='gbk', newline='') as f:
        writer = csv.writer(f, dialect=csv.excel, delimiter=',')
        writer.writerows(table_result)

In [None]:
# Run the experiment and save the results; the logic is similar to 10_run_average
# Per-run score tables returned by get_score, one entry per run.
result = []
# Number of runs whose scores are averaged by save_result.
run_num = 10
kernel = 'rbf'
# The pretrained vectors are loaded once, outside the loop, and reused by every run.
model = get_model(model_type='google',model_name='word2vec')
for i in range(run_num):
    # The data is reloaded and re-vectorized on every run.
    (x_train, y_train), (x_test, y_test) = prepare_train()
    # model_name='word2vec' selects the word-vector branch of prepare_vec,
    # which is the one that handles the Google News vectors.
    (x_train, x_test) = prepare_vec(model=model,
                                    embedding_dim=300, 
                                    x_train=x_train, 
                                    x_test=x_test, 
                                    model_name='word2vec')
    result.append(get_score(x_train=x_train,y_train=y_train,x_test=x_test,y_test=y_test,kernel=kernel))
# Writes the averaged table to svm_google_embed300_len200_10run.csv.
save_result(result=result,file_name='svm_google_embed300_len200_10run',run_num=run_num)

[[0.6860150149288475, 0.6425811573747353, 0.7357462697451904], [0.6493339345069844, 0.6120912547528518, 0.6914024056861673], [0.7887332826470623, 0.7681181690708976, 0.8104855751051004], [0.7298641013044551, 0.6777338122586966, 0.7906823322442824], [0.7390586828205026, 0.6894344290844453, 0.7963808557415863], [0.8601731084592654, 0.8248847926267281, 0.8986157145642425], [0.8883509339683862, 0.998639455782313, 0.8], [0.729939961507733, 0.6931451323624364, 0.7708603191770314], [0.6987526256864274, 0.65276369168357, 0.7517129010695187], [0.7529486033758102, 0.7150558336542164, 0.7950823003454582], [0.7597487930333204, 0.7617362415137946, 0.7577717879604672], [0.7472207638950172, 0.738980350504514, 0.7556471306471306]]
[[0.6860150149288475, 0.6425811573747353, 0.7357462697451904], [0.6493339345069844, 0.6120912547528518, 0.6914024056861673], [0.7887332826470623, 0.7681181690708976, 0.8104855751051004], [0.7298641013044551, 0.6777338122586966, 0.7906823322442824], [0.7390586828205026, 0.689

In [None]:
# Runs an environment-specific script after the experiment; presumably it
# powers off the remote machine — confirm before running this cell elsewhere.
!sh /root/shutdown.sh