In [1]:
import os
import pandas
from sklearn import model_selection
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn import naive_bayes
from sklearn import linear_model
from sklearn import svm
from sklearn import ensemble
import xgboost


In [2]:
path_data = 'F:/github-project/shiyan/shiyan5/第五章 话题追踪与检测/话题检测/tc-corpus-answer/answer/'
dir_names = os.listdir(path_data)
labels,texts=[], []
for dir_name in dir_names:
    file_names =os.listdir(path_data + dir_name +'/')
    for file_name in file_names:
        f = open(path_data + dir_name + '/'+ file_name, encoding='gb18030', errors='ignore')
        content = f.read()
        f.close()
        labels.append(dir_name)
        texts.append(content)

In [3]:
trainDF = pandas.DataFrame()
trainDF["text"] = texts
trainDF["label"] = labels
train_x,valid_x,train_y,valid_y = model_selection.train_test_split(trainDF["text"],trainDF["label"])
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)
count_vect = CountVectorizer(analyzer="word", token_pattern=r"\w{1,}")
count_vect.fit(trainDF["text"])
xtrain_count = count_vect.transform(train_x)
xvalid_count = count_vect.transform(valid_x)

In [4]:
# 词语级tf-idf
tfidf_vect = TfidfVectorizer(
    analyzer="word", token_pattern=r"\w{1,}", max_features=5000
)
tfidf_vect.fit(trainDF['text'])
xtrain_tfidf = tfidf_vect.transform(train_x)
xvalid_tfidf = tfidf_vect.transform(valid_x)
# ngram 级tf-idf
tfidf_vect_ngram = TfidfVectorizer(
    analyzer="word", token_pattern=r"\w{1,}", ngram_range=(2, 3), max_features=5000
)
tfidf_vect_ngram.fit(trainDF['text'])
xtrain_tfidf_ngram = tfidf_vect_ngram.transform(train_x)
xvalid_tfidf_ngram = tfidf_vect_ngram.transform(valid_x)
# 词性级tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(
    analyzer="char", token_pattern=r"\w{1,}", ngram_range=(2, 3), max_features=5000
)
tfidf_vect_ngram_chars.fit(trainDF['text'])
xtrain_tfidf_ngram_chars = tfidf_vect_ngram_chars.transform(train_x)
xvalid_tfidf_ngram_chars = tfidf_vect_ngram_chars.transform(valid_x)



In [5]:
def train_model(classifier, feature_vector_train,label, feature_vector_valid, is_neural_net=False):
    # fit the training dataset on the classifien
    classifier.fit(feature_vector_train, label)
    # predict the labels on validation dataset
    predictions =classifier.predict(feature_vector_valid)
    if is_neural_net:
        predictions =predictions.argmax(axis=-1)
        return metrics.accuracy_score(predictions,valid_y)

In [6]:
# 特征为计数向量的朴素贝叶斯
accuracy = train_model(naive_bayes.MultinomialNB(),xtrain_count, train_y, xvalid_count)
print("NB,Count Vectors:",accuracy)
# 特征为词语级别TF-IDF向量的朴素贝叶斯
accuracy = train_model(naive_bayes.MultinomialNB(),xtrain_tfidf, train_y, xvalid_tfidf)
print("NB,WordLevel TF-IDF:",accuracy)
#特征为多个词语级别TF-IDF向量的朴素贝叶斯
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print("NB,N-Gram Vectors:",accuracy)
# 特征为词性级别TF-IDF向量的朴素贝叶斯
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print("NB,CharLevel Vectors:", accuracy)

NB,Count Vectors: None
NB,WordLevel TF-IDF: None
NB,N-Gram Vectors: None
NB,CharLevel Vectors: None


In [7]:
#特征为计数向量的线性分类器
accuracy = train_model(linear_model.LogisticRegression(),xtrain_count, train_y, xvalid_count)
print("LR,Count Vectors:",accuracy)
#特征为词语级别TF-IDF向量的线性分类器
accuracy = train_model(linear_model.LogisticRegression(),xtrain_tfidf, train_y, xvalid_tfidf)
print("LR,WordLevel TF-IDF:",accuracy)
#特征为多个词语级别TF-IDF向量的线性分类器
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print("LR,N-Gram Vectors:",accuracy)
# 特征为词性级别TF-IDF向量的线性分类器
accuracy = train_model(linear_model.LogisticRegression(),xtrain_tfidf_ngram_chars, train_y,xvalid_tfidf_ngram_chars)
print("LR,CharLevel Vectors:",accuracy)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LR,Count Vectors: None
LR,WordLevel TF-IDF: None
LR,N-Gram Vectors: None
LR,CharLevel Vectors: None


In [10]:
# 特征为多个词语级别TF-IDF向量的SVM
accuracy = train_model(svm.SVC(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print("SVM,N-Gram Vectors:", accuracy)
# 特征为计数向量的RF
accuracy = train_model(
    ensemble.RandomForestClassifier(), xtrain_count, train_y, xvalid_count
)
print("RF,Count Vectors:", accuracy)
# 特征为词语级别TF-IDF向量的RF
accuracy = train_model(
    ensemble.RandomForestClassifier(), xtrain_tfidf, train_y, xvalid_tfidf
)
print("RF,WordLevel TF-IDF:", accuracy)

SVM,N-Gram Vectors: None
RF,Count Vectors: None
RF,WordLevel TF-IDF: None


In [None]:
# 特征为计数问量的Xgboost
accuracy = train_model(
    xgboost.XGBClassifier(), xtrain_count.tocsc(), train_y, xvalid_count.tocsc()
)
print("Xgb,Count Vectors:",accuracy)
# 特征为词语级别TF-IDF向量的Xgboost
accuracy = train_model(
    xgboost.XGBClassifier(), xtrain_tfidf.tocsc(), train_y, xvalid_tfidf.tocsc()
)
print("Xgb,WordLevel TF-IDF:",accuracy)
# 特征为词性级别TF-IDF向量的Xgboost
accuracy = train_model(
    xgboost.XGBClassifier(),
    xtrain_tfidf_ngram_chars.tocsc(),
    train_y,
    xvalid_tfidf_ngram_chars.tocsc(),
)
print("Xgb, CharLevel Vectors:", accuracy)

KeyboardInterrupt: 

In [None]:
def create_model_architecture(input_size):
    # create input layer
    input_layer = layers.Input((input_size,),sparse=True)
    # create hidden layer
    hidden_layer = layers.Dense(100, activation="relu")(input_layer)
    # create output layer
    output_layer = layers.Dense(1, activation="sigmoid")(hidden_layer)
    classifien = models.Model(inputs=input_layer, outputs=output_layer)
    classifier.compile(optimizer=optimizers.Adam(),loss='binary_crossentropy')
    return classifier
#浅层神经网络
classifier = create_model_architecture(xtrain_tfidf_ngram.shape[1])
accuracy = train_model(classifier, xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram, is_neural_net=True)
print("NN,Ngram Level TF IF Vectors", accuracy)