In [1]:
from utils import tokenize, load_curpus
import numpy as np

#### 加载数据

In [2]:
import pandas as pd

# Load both corpus splits in a fixed order: train first, then test.
train_data, test_data = (
    load_curpus(path)
    for path in ("weibo2018/train.txt", "weibo2018/test.txt")
)
# Mirror each split as a two-column DataFrame (content, sentiment).
train_df, test_df = (
    pd.DataFrame(rows, columns=["content", "sentiment"])
    for rows in (train_data, test_data)
)

Building prefix dict from the default dictionary ...
Dumping model to file cache /var/folders/dy/xjy0y7v97js5x1bghby2fnkm0000gn/T/jieba.cache
Loading model cost 1.098 seconds.
Prefix dict has been built successfully.


加载停用词

In [3]:
# Read the stop-word list: one entry per line, surrounding whitespace removed.
with open("stopwords.txt", "r", encoding="utf8") as f:
    stopwords = [line.strip() for line in f]

TF-IDF

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Re-join every tokenised sample with single spaces so the vectorizer can
# split it back into tokens with `token_pattern`.
data_str = [" ".join(content) for content, _ in train_data]
data_str += [" ".join(content) for content, _ in test_data]

# The pattern is a raw string: "\[" and "\w" are not valid Python string
# escapes, so the non-raw form triggers an invalid-escape warning on modern
# interpreters. The regex itself is unchanged: a run of word characters with
# an optional leading "[" and an optional trailing "]".
# NOTE(review): the vectorizer is fitted on train + test text together, so the
# IDF statistics also see the test split -- confirm this is intended.
tfidf = TfidfVectorizer(token_pattern=r'\[?\w+\]?', stop_words=stopwords)
tfidf_fit = tfidf.fit_transform(data_str)

  'stop_words.' % sorted(inconsistent))


加载之前训练好的FastText模型

In [5]:
from gensim.models import FastText
# Load the previously trained FastText model from disk; later cells look up
# word vectors in it.
model = FastText.load("model/model_100.txt")

每个句子最多只保留 TF-IDF 值最高的前 key_words 个词

In [6]:
# Per sentence, only the `key_words` terms with the highest TF-IDF weight
# contribute to the sentence vector.
key_words = 30

#### 用每个词的 TF-IDF 值作为权重, 对 FastText 词向量加权后取平均, 得到表征每个句子的向量

In [7]:
# Build one feature vector per training sample: the mean of the FastText
# vectors of its words, each scaled by that word's TF-IDF weight.
#
# Word vectors are read through `model.wv`; the model-level `w in model` /
# `model[w]` shortcuts are deprecated in gensim and removed in gensim 4.
word_vectors = model.wv
vocabulary = tfidf.vocabulary_

X_train, y_train = [], []
for content, sentiment in train_data:
    # Dense TF-IDF weights of this sample, one entry per vocabulary column.
    weights = tfidf.transform([" ".join(content)]).toarray()[0]
    # Columns of the `key_words` largest weights; a set gives O(1) membership.
    top_columns = set(np.argsort(-weights)[:key_words].tolist())

    weighted = []
    for w in content:
        column = vocabulary.get(w)
        if column is not None and column in top_columns and w in word_vectors:
            weighted.append(word_vectors[w] * weights[column])

    # A sample with no usable word is skipped entirely, which keeps
    # X_train and y_train aligned.
    if weighted:
        X_train.append(np.mean(weighted, axis=0))
        y_train.append(sentiment)

  import sys
  


In [8]:
# Build one feature vector per test sample: the mean of the FastText vectors
# of its words, each scaled by that word's TF-IDF weight.
#
# Word vectors are read through `model.wv`; the model-level `w in model` /
# `model[w]` shortcuts are deprecated in gensim and removed in gensim 4.
word_vectors = model.wv
vocabulary = tfidf.vocabulary_

X_test, y_test = [], []
for content, sentiment in test_data:
    # Dense TF-IDF weights of this sample, one entry per vocabulary column.
    weights = tfidf.transform([" ".join(content)]).toarray()[0]
    # Columns of the `key_words` largest weights; a set gives O(1) membership.
    top_columns = set(np.argsort(-weights)[:key_words].tolist())

    weighted = []
    for w in content:
        column = vocabulary.get(w)
        if column is not None and column in top_columns and w in word_vectors:
            weighted.append(word_vectors[w] * weights[column])

    # NOTE(review): a test sample with no usable word is skipped, so it is
    # never scored; the reported metrics cover only the remaining samples.
    if weighted:
        X_test.append(np.mean(weighted, axis=0))
        y_test.append(sentiment)

  import sys
  


### SVM

In [9]:
from sklearn import svm

# Support-vector classifier with per-class weights: class 1 is weighted 0.95
# against 1.0 for class 0.
# `gamma` is pinned to "auto" because the library default for this parameter
# differs between scikit-learn releases (the run recorded in this notebook
# used the legacy default, shown as 'auto_deprecated'); pinning it keeps the
# fitted model independent of the installed version.
clf = svm.SVC(C=1, gamma="auto", class_weight={1: .95, 0: 1.})
clf.fit(X_train, y_train)



SVC(C=1, cache_size=200, class_weight={0: 1.0, 1: 0.95}, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [10]:
# Predict a label for every test-set feature vector with the fitted classifier.
result = clf.predict(X_test)

In [11]:
from sklearn import metrics
# Per-class precision / recall / F1 and support for the test predictions.
print(metrics.classification_report(y_test, result))
# Overall accuracy of the predictions (the printed label means "accuracy").
print("准确率:", metrics.accuracy_score(y_test, result))

              precision    recall  f1-score   support

           0       0.70      0.75      0.72       155
           1       0.89      0.85      0.87       344

    accuracy                           0.82       499
   macro avg       0.79      0.80      0.80       499
weighted avg       0.83      0.82      0.82       499

准确率: 0.8216432865731463
