In [1]:

import os
# Filesystem locations used by the notebook. All of them are absolute paths on
# the author's machine and must be adjusted before running elsewhere.

# Directory that the training cell creates before persisting the model.
# NOTE(review): there is no trailing separator, and the training cell builds the
# output file name with plain string concatenation (model_save_path + model_name).
model_save_path = "/home/sys1/Dong/A_Stage_Two/ABSA-QUAD/data/clean/uausal/desiontree"  # name
# JSON file read by load_data() for training.
train_data_path = "/home/sys1/Dong/A_Stage_Two/ABSA-QUAD/data/clean/uausal/train1.txt"
# JSON file read by load_data() for validation.
validate_data_path = "/home/sys1/Dong/A_Stage_Two/ABSA-QUAD/data/clean/uausal/dev1.txt"
# Presumably the held-out test set; not referenced in the visible cells -- TODO confirm.
test_data_path = "/home/sys1/Dong/A_Stage_Two/ABSA-QUAD/data/clean/uausal/test1.txt"
# Presumably the directory for test-set predictions; not referenced in the visible cells -- TODO confirm.
test_data_predict_out_path = "/home/sys1/Dong/A_Stage_Two/ABSA-QUAD/data/clean/uausal/" 

In [2]:
import pandas as pd
import jieba
import json

# Maps each emotion label string used in the data files to its integer class id.
label2id = {'happy': 0, 'angry': 1, 'sad': 2, 'fear': 3, 'surprise': 4, 'neutral': 5}


def load_data(filename):
    """Load a labelled dataset from a JSON file.

    The file must hold a JSON array of objects, each providing a ``'label'``
    key (one of the keys of ``label2id``) and a ``'content'`` key.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        A list of ``(label_id, content)`` tuples, in file order.

    Raises:
        KeyError: If a record lacks ``'label'`` or ``'content'``, or if its
            label is not a key of ``label2id``.
    """
    # The context manager closes the handle deterministically (the previous
    # bare open() leaked it). The encoding is pinned to UTF-8 so non-ASCII
    # content decodes the same way regardless of the machine's locale.
    with open(filename, encoding="utf-8") as fh:
        records = json.load(fh)
    return [(label2id[record['label']], record['content']) for record in records]

# Word segmentation
def seg_words(contents):
    """Tokenize the text of every record with jieba.

    Args:
        contents: Iterable of records whose element at index 1 is the text to
            segment (for example the ``(label, content)`` tuples returned by
            ``load_data``).

    Returns:
        A list with one string per record: the tokens produced by
        ``jieba.lcut`` joined by single spaces.
    """
    return [" ".join(jieba.lcut(record[1])) for record in contents]
# content_train = load_data(train_data_path)
# s = seg_words(content_train)
# # print(s[1])

In [3]:
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score,accuracy_score
from sklearn.tree import DecisionTreeClassifier
import logging

# Configure the root logger at INFO level; every line carries the timestamp,
# level, process name and thread name ahead of the message.
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] <%(processName)s> (%(threadName)s) %(message)s')
# Logger named after the current module, used for progress messages.
logger = logging.getLogger(__name__)


class TextClassifier():
    """Couple a fitted text vectorizer with a scikit-learn style classifier.

    Raw texts are turned into features with ``vectorizer.transform`` before
    every call into the wrapped classifier, so callers always pass raw
    (already segmented) strings.
    """

    def __init__(self, vectorizer, classifier=None):
        """Store the vectorizer and the classifier to train.

        Args:
            vectorizer: Object exposing ``transform(texts)``; it must already
                be fitted, because this class only ever calls ``transform``.
            classifier: Estimator exposing ``fit``, ``predict`` and ``score``.
                When omitted, ``SVC(kernel="rbf")`` is used. Alternatives such
                as ``KNeighborsClassifier(6, weights='uniform')``,
                ``DecisionTreeClassifier()`` or ``SVC(kernel="linear")`` can
                be passed in explicitly.
        """
        # A None sentinel replaces the old default instance, which was created
        # once at definition time and then unconditionally overwritten. The
        # fallback keeps the model that was effectively always used before.
        if classifier is None:
            classifier = SVC(kernel="rbf")
        self.classifier = classifier
        self.vectorizer = vectorizer

    def features(self, x):
        """Return the vectorizer's feature representation of the texts ``x``."""
        return self.vectorizer.transform(x)

    def fit(self, x, y):
        """Train the classifier on texts ``x`` with labels ``y``."""
        self.classifier.fit(self.features(x), y)

    def predict(self, x):
        """Return the classifier's predictions for the texts ``x``."""
        return self.classifier.predict(self.features(x))

    def score(self, x, y):
        """Return the classifier's own ``score`` for texts ``x`` and labels ``y``."""
        return self.classifier.score(self.features(x), y)

    def get_f1_score(self, x, y):
        """Return the macro-averaged F1 of the predictions for ``x`` against ``y``."""
        return f1_score(y, self.predict(x), average='macro')

    def get_acc_score(self, x, y):
        """Return the accuracy of the predictions for ``x`` against ``y``."""
        return accuracy_score(y, self.predict(x))

In [4]:
#train.py


from sklearn.feature_extraction.text import TfidfVectorizer

import logging
import numpy as np
import joblib
import os
import argparse
from sklearn.preprocessing import OneHotEncoder

# Same logging setup as the classifier cell (INFO level; timestamp, level,
# process and thread names before the message).
# NOTE(review): this repeats the earlier configuration; presumably kept so the
# training cell can be run on its own -- confirm before removing.
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] <%(processName)s> (%(threadName)s) %(message)s')
# Logger named after the current module, used for the progress messages below.
logger = logging.getLogger(__name__)


if __name__ == '__main__':
    # Training pipeline: load -> segment -> TF-IDF -> fit -> validate -> save.

    model_name = "model_dict.pkl"

    # load train data
    logger.info("start load data")
    content_train = load_data(train_data_path)
    content_validate = load_data(validate_data_path)

    # Capture the labels first: content_train is replaced by the segmented
    # texts a few lines below and the (label, content) tuples are lost.
    labels_train = [record[0] for record in content_train]

    logger.info("start seg train data")
    content_train = seg_words(content_train)
    logger.info("complete seg train data")

    # Fit word-level TF-IDF over 1- to 5-grams, keeping only terms that occur
    # in at least 5 training documents.
    logger.info("start train feature extraction")
    vectorizer_tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 5), min_df=5, norm='l2')
    vectorizer_tfidf.fit(content_train)
    logger.info("complete train feature extraction models")

    # model train
    logger.info("start train model")
    text_classifier = TextClassifier(vectorizer=vectorizer_tfidf)
    text_classifier.fit(content_train, labels_train)
    logger.info("complete train model")

    # validate model
    labels_validate = [record[0] for record in content_validate]
    logger.info("start seg validate data")
    content_validate = seg_words(content_validate)
    logger.info("complete seg validate data")

    logger.info("start validate model")
    # These results must not be named f1_score / acc_score: this code runs at
    # module scope, so such an assignment would rebind the f1_score function
    # imported from sklearn.metrics and make every later call to
    # TextClassifier.get_f1_score fail with "'float' object is not callable".
    validate_f1 = text_classifier.get_f1_score(content_validate, labels_validate)
    validate_acc = text_classifier.get_acc_score(content_validate, labels_validate)
    score_dict = {'f1': validate_f1, 'acc': validate_acc}

    logger.info("f1_scores: %s\n", score_dict['f1'])
    logger.info("acc_score: %s", score_dict['acc'])
    logger.info("complete validate model")

    # save model
    logger.info("start save model")
    os.makedirs(model_save_path, exist_ok=True)

    # NOTE(review): plain concatenation inserts no path separator, so when
    # model_save_path has no trailing "/" the file is written next to the
    # directory created above rather than inside it. Left unchanged because
    # other cells may load the model from the same concatenated path; switch
    # both sides to os.path.join(model_save_path, model_name) together.
    joblib.dump(text_classifier, model_save_path + model_name)
    logger.info("complete save model")

2023-03-27 17:26:10,220 [INFO] <MainProcess> (MainThread) start load data
2023-03-27 17:26:10,317 [INFO] <MainProcess> (MainThread) start seg train data
Building prefix dict from the default dictionary ...
2023-03-27 17:26:10,319 [DEBUG] <MainProcess> (MainThread) Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
2023-03-27 17:26:10,320 [DEBUG] <MainProcess> (MainThread) Loading model from cache /tmp/jieba.cache
Loading model cost 0.656 seconds.
2023-03-27 17:26:10,976 [DEBUG] <MainProcess> (MainThread) Loading model cost 0.656 seconds.
Prefix dict has been built successfully.
2023-03-27 17:26:10,982 [DEBUG] <MainProcess> (MainThread) Prefix dict has been built successfully.
2023-03-27 17:26:16,102 [INFO] <MainProcess> (MainThread) complete seg train data
2023-03-27 17:26:16,105 [INFO] <MainProcess> (MainThread) start train feature extraction
2023-03-27 17:26:18,664 [INFO] <MainProcess> (MainThread) complete train feature extraction models
2