### 1.安装依赖包并导入

In [2]:
# !pip install nltk==3.5 -i https://pypi.tuna.tsinghua.edu.cn/simple
# !pip install tqdm==4.62.3 -i https://pypi.tuna.tsinghua.edu.cn/simple
# !pip install openpyxl==3.0.7 -i https://pypi.tuna.tsinghua.edu.cn/simple
# !pip install scikit_learn==0.23.2 -i https://pypi.tuna.tsinghua.edu.cn/simple
# !pip install pandas==1.1.5 -i https://pypi.tuna.tsinghua.edu.cn/simple
# !pip install numpy==1.19.5 -i https://pypi.tuna.tsinghua.edu.cn/simple

In [3]:
import os
import pickle
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import preprocessing
import pandas as pd
from nltk.corpus import stopwords
from nltk import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.metrics import make_scorer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from string import punctuation
from tqdm.notebook import tqdm
from collections import Counter
import warnings
from sklearn.metrics import confusion_matrix

warnings.filterwarnings('ignore')

### 2. 读取停用词

In [4]:
stop_words = set(stopwords.words('english'))


### 3. 移除标点符号

In [5]:
def remove_punc(text):
    # 移除标点符号
    punctuation_zh = r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~“”？，！【】（）、。：；’‘……￥·"""
    dicts = {i: '' for i in punctuation + punctuation_zh}
    punc_table = str.maketrans(dicts)
    new_text = text.translate(punc_table)
    return new_text

In [6]:
pd.read_excel('data-标签.xlsx')

Unnamed: 0,text,必选关键词,正向关键词,负向关键词,文件名,页数,标签,Unnamed: 7
0,"Considering the regular pore size, stabili...",optimize,parameter,reaction,139.pdf,5,no,
1,"conditions, After several rounds of parameter ...",optimization,parameter,,171.pdf,5,no,
2,Supplementary Figure 78: Localization of the L...,optimize,cell,LUMO,15-SI.pdf,52,no,
3,Supplementary Figure 77: Localization of the H...,optimize,cell,HOMO,15-SI.pdf,51,no,
4,Materials and characterization Optimization...,optimization,cell,adsorption,153-SI.pdf,2,no,
...,...,...,...,...,...,...,...,...
1785,the three COFs adopt AA eclipsed stacking in t...,optimize,,,520.pdf,2,,
1786,Density functional theory (DFT) calculation wa...,optimize,,,520.pdf,3,,
1787,"Figure 3, The optimized binding sites and bind...",optimize,,,520.pdf,3,,
1788,which is because the 6FDA-ODA matrix is still ...,optimize,,,520.pdf,5,,


### 4. 读取数据

In [7]:
def read_data(path):
    texts = []
    labels = []
    max_len = 0
    data = pd.read_excel(path).dropna(subset=['标签'])
    for idx, row in data.iterrows():
        temp = []
        line = str(row['text']).strip()
        if not line:
            continue
        line = remove_punc(line)
        text = [j for j in word_tokenize(line) if j not in stop_words]
        if len(text)==0:
            continue
        label = row['标签']
        texts.append(' '.join(text))
        max_len = max(max_len, len(text))
        labels.append(label)
    assert len(texts) == len(labels)
    print(max_len)
    return texts, labels

In [8]:
texts, labels = read_data("data-标签.xlsx")

96


In [9]:
len(set(labels)),set(labels)

(2, {'no', 'yes'})

In [10]:
len(texts)

500

### 5. 提取BOW特征

In [11]:
def bow_feature(data, flag='rf'):
    if os.path.exists(flag + '_tokenizer.pkl'):
        print('Tokenizer founded. Loading...')
        with open(flag + '_tokenizer.pkl', 'rb') as f:
            transformer = pickle.load(f)
    else:
        print('No Tokenizer founded. Creating...')
        transformer = CountVectorizer(max_features=2000)
        transformer.fit(data)
        # 保存装换器
        with open(flag + '_tokenizer.pkl', 'wb') as f:
            pickle.dump(transformer, f)
    tfidf = transformer.transform(data)
    return tfidf.toarray()

### 6. 计算评估指标

In [12]:
def compute_metrics(labels, preds):
    # accuracy
    accuracy = accuracy_score(labels, preds)
    # precision
    precision = precision_score(labels, preds, average='macro')
    # recall
    recall = recall_score(labels, preds, average='macro')
    # f1_score
    f1 = f1_score(labels, preds, average='macro')
    print(f'accuracy: {accuracy}')
    print(f'precision: {precision}')
    print(f'recall: {recall}')
    print(f'f1: {f1}')
    print(f'classification_report: ')
    print(classification_report(labels, preds, digits=4))

### 7. 寻找模型最优超参

In [13]:
def get_best_hyperparam(train_x, train_y):
    f1_scorer = make_scorer(f1_score, average='weighted')
    acc_scorer = make_scorer(accuracy_score)
    scoring = {'F1': f1_scorer, 'Accuracy': acc_scorer}
    params = {
        'n_estimators': [100, 200, 300],
        "criterion": ["gini", "entropy", "log_loss"]
    }
    model = RandomForestClassifier()
    # 交叉验证，GridSearch搜索最优参数
    model = GridSearchCV(model,
                         param_grid=params,
                         scoring=scoring,
                         refit="Accuracy",
                         return_train_score=True,
                         cv=5,
                         verbose=1,
                         n_jobs=-1)
    model.fit(train_x, train_y)
    # performance metrics
    print(model.best_score_, model.best_params_)
    return model

### 8. 训练和测试

In [14]:
# 训练和测试
def train_and_eval(train_x, train_y, test_x, test_y, flag, need_recover=True):
    # 训练模型
    if need_recover:
        if os.path.exists(flag + '_model.pkl'):
            print('Model founded. Loading...')
            with open(flag + '_model.pkl', 'rb') as f:
                clf = pickle.load(f)
        else:
            print('No model founded. Creating...')
            clf = get_best_hyperparam(train_x, train_y)
            with open(flag + '_model.pkl', 'wb') as f:
                pickle.dump(clf, f)
    else:
        print('Not to recover, Creating...')
        clf = get_best_hyperparam(train_x, train_y)
        with open(flag + '_model.pkl', 'wb') as f:
            pickle.dump(clf, f)
    # 测试
    print('Train:')
    pred = clf.predict(train_x)
    compute_metrics(train_y, pred)

    print('Eval:')
    pred = clf.predict(test_x)
    compute_metrics(test_y, pred)

### 9.预测

In [15]:
# 用于预测
def predict(test_text, label_encoder, flag='rf'):
    with open(flag + '_model.pkl', 'rb') as f:
        model = pickle.load(f)
    with open(flag + '_tokenizer.pkl', 'rb') as f:
        transformer = pickle.load(f)
    tests = []
    for data in test_text:
        text = remove_punc(data)
        text = [item for item in word_tokenize(text) if item not in stop_words]
        tests.append(' '.join(text))
    test_x = transformer.transform(tests).toarray()
    preds = model.predict(test_x)
    pred_labels = []
    for pred in preds:
        pred_label = label_encoder.inverse_transform([pred])[0]
        pred_labels.append(pred_label)
    return pred_labels

## Main

In [16]:
data = bow_feature(texts)

Tokenizer founded. Loading...


In [17]:
data.shape

(500, 2000)

### Label转换为数字

In [18]:
le = preprocessing.LabelEncoder()
label = le.fit_transform(labels)

In [19]:
label

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

### 训练集验证集划分

In [20]:
train_x, test_x, train_y, test_y = train_test_split(data,
                                                    label,
                                                    test_size=0.1,
                                                    random_state=1)

### 训练以及验证

In [21]:
def compute_confusion_matrix(labels, preds):
    cm = confusion_matrix(labels, preds)
    tp = cm[1, 1]
    tn = cm[0, 0]
    fp = cm[0, 1]
    fn = cm[1, 0]
    return tp, tn, fp, fn

def train_and_eval(train_x, train_y, test_x, test_y, flag, need_recover=True):
    if need_recover:
        if os.path.exists(flag + '_model.pkl'):
            print('Model founded. Loading...')
            with open(flag + '_model.pkl', 'rb') as f:
                clf = pickle.load(f)
        else:
            print('No model founded. Creating...')
            clf = get_best_hyperparam(train_x, train_y)
            with open(flag + '_model.pkl', 'wb') as f:
                pickle.dump(clf, f)
    else:
        print('Not to recover, Creating...')
        clf = get_best_hyperparam(train_x, train_y)
        with open(flag + '_model.pkl', 'wb') as f:
            pickle.dump(clf, f)

    print('Train:')
    pred_train = clf.predict(train_x)
    compute_metrics(train_y, pred_train)
    tp, tn, fp, fn = compute_confusion_matrix(train_y, pred_train)
    print('Train - TP: {}, TN: {}, FP: {}, FN: {}'.format(tp, tn, fp, fn))

    print('Eval:')
    pred_test = clf.predict(test_x)
    compute_metrics(test_y, pred_test)
    tp, tn, fp, fn = compute_confusion_matrix(test_y, pred_test)
    print('Eval - TP: {}, TN: {}, FP: {}, FN: {}'.format(tp, tn, fp, fn))



In [22]:
def compute_confusion_matrix(labels, preds):
    cm = confusion_matrix(labels, preds)
    tp = cm[1, 1]
    tn = cm[0, 0]
    fp = cm[0, 1]
    fn = cm[1, 0]
    return tp, tn, fp, fn

train_and_eval(train_x, train_y, test_x, test_y, flag='rf', need_recover=False)

Not to recover, Creating...
Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:    6.7s finished


0.9400000000000001 {'criterion': 'entropy', 'n_estimators': 100}
Train:
accuracy: 1.0
precision: 1.0
recall: 1.0
f1: 1.0
classification_report: 
              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000       229
           1     1.0000    1.0000    1.0000       221

    accuracy                         1.0000       450
   macro avg     1.0000    1.0000    1.0000       450
weighted avg     1.0000    1.0000    1.0000       450

Train - TP: 221, TN: 229, FP: 0, FN: 0
Eval:
accuracy: 0.94
precision: 0.9416666666666667
recall: 0.9351395730706076
f1: 0.9379909053327822
classification_report: 
              precision    recall  f1-score   support

           0     0.9500    0.9048    0.9268        21
           1     0.9333    0.9655    0.9492        29

    accuracy                         0.9400        50
   macro avg     0.9417    0.9351    0.9380        50
weighted avg     0.9403    0.9400    0.9398        50

Eval - TP: 28, TN: 19, FP: 2, FN: 1


### 预测

In [None]:
data = pd.read_excel('data-标签.xlsx')
result = []
for idx, row in data.iterrows():
    temp = []
    line = str(row['text']).strip()
    true_label = row['标签']  # 将变量名从label更改为true_label
    if pd.isna(true_label):
        predicted_label = predict([line], le)[0]  # 将变量名从label更改为predicted_label
        result.append(predicted_label)
    else:
        result.append(true_label)

In [None]:
data['pred_label'] = result

In [None]:
data.to_excel('result.xlsx', index=False, encoding='utf-8-sig')