In [None]:
!pip install torch
!pip install gensim
!pip install nltk
!pip install numpy
!pip install pandas
!pip install scikit-learn
!pip install transformers

!pip install keras
!pip install tensorflow
import keras
import tensorflow



import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
# from torch.optim import AdamW
from transformers import BertModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, f1_score
from sklearn.neighbors import KDTree
import pandas as pd
import numpy as np
import json
import gensim
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re




In [None]:
# Load the dataset. Missing abstracts are replaced with an empty string before
# the cast: `astype(str)` on its own would turn NaN into the literal text
# "nan", which would then be tokenized as if it were a real word.
df1 = pd.read_csv('./df1.csv')
df1['abstract'] = df1['abstract'].fillna('').astype(str)
df1.head(10)

In [None]:

# Load the pre-trained Word2Vec vectors (binary word2vec format).
model_path = "./GoogleNews-vectors-negative300.bin"
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=True)

# Download the NLTK resources used by the preprocessing cell below.
# 'punkt_tab' is the tokenizer data that recent NLTK releases load for
# word_tokenize (older releases use 'punkt'); 'omw-1.4' is required by some
# NLTK releases when WordNet is first loaded. Requesting both keeps the
# notebook runnable across NLTK versions.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource)


In [None]:

# Tokenization and preprocessing.

# Cache for the NLTK helpers so they are built once instead of once per row
# (this function is applied to every abstract in the DataFrame).
_NLP_RESOURCES = {}


def _get_nlp_resources():
    """Return (stop_words, lemmatizer), creating them on first use only."""
    if not _NLP_RESOURCES:
        _NLP_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _NLP_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _NLP_RESOURCES['stop_words'], _NLP_RESOURCES['lemmatizer']


def tokenize_and_process(text):
    """Turn raw text into a list of cleaned, lemmatized tokens.

    Steps, in order: tokenize with ``word_tokenize``; keep purely alphabetic
    tokens; lower-case; drop English stop words; lemmatize with
    ``WordNetLemmatizer``.

    Args:
        text: The string to process.

    Returns:
        A list of processed tokens (possibly empty).
    """
    stop_words, lemmatizer = _get_nlp_resources()
    # Filter on isalpha() before lower-casing, same order as the original steps.
    lowered = (word.lower() for word in word_tokenize(text) if word.isalpha())
    return [lemmatizer.lemmatize(word) for word in lowered if word not in stop_words]


# Tokenize and preprocess every entry of the 'abstract' column.
abstract_texts = df1['abstract']
df1['tokenized_abstract'] = abstract_texts.apply(lambda text: tokenize_and_process(text))

# Map a single word to its Word2Vec vector; words missing from the model's
# vocabulary are mapped to the vector stored under the 'UNK' key.
def word_to_vec(word, model):
    """Return ``model[word]`` if the word is known, otherwise ``model['UNK']``."""
    if word not in model.key_to_index:
        return model['UNK']
    return model[word]

# Convert a list of tokens into the corresponding list of Word2Vec vectors.
def tokens_to_vectors(tokens, model):
    """Return one vector per token, in order, using ``word_to_vec`` for each lookup."""
    vectors = []
    for token in tokens:
        vectors.append(word_to_vec(token, model))
    return vectors

# Convert each row's token list in 'tokenized_abstract' into a list of Word2Vec vectors.
df1['word_vectors'] = df1['tokenized_abstract'].apply(
    tokens_to_vectors, args=(word2vec_model,))


In [None]:


# Length of the longest vector list in 'word_vectors'; used below as the
# target length for padding/truncation.
sequence_lengths = df1['word_vectors'].apply(len)
MAX_SEQUENCE_LENGTH = sequence_lengths.max()
# Vector stored under the 'UNK' key of the Word2Vec model.
UNK_VECTOR = word2vec_model['UNK']
# Pad or truncate a vector list to a fixed length.
def pad_or_truncate_vectors(word_vectors, max_length, unk_vector):
    """Return a copy of ``word_vectors`` with exactly ``max_length`` entries.

    Shorter inputs are padded at the end with ``unk_vector``; longer inputs
    are truncated. The input list is never modified: padding in place would
    also lengthen the lists stored in the source column the caller passed in.

    Args:
        word_vectors: List of per-token vectors.
        max_length: Target number of entries.
        unk_vector: Value appended as padding.

    Returns:
        A new list of length ``max_length``.
    """
    # Slicing copies, so the caller's list stays untouched.
    result = list(word_vectors[:max_length])
    # A non-positive repeat count yields an empty list, i.e. no padding.
    result.extend([unk_vector] * (max_length - len(result)))
    return result

# Pad/truncate every row of 'word_vectors' to MAX_SEQUENCE_LENGTH.
df1['padded_word_vectors'] = df1['word_vectors'].apply(
    lambda x: pad_or_truncate_vectors(x, MAX_SEQUENCE_LENGTH, UNK_VECTOR))

# Sanity check: every padded row must have exactly MAX_SEQUENCE_LENGTH entries.
# (A bare expression in the middle of a cell is discarded, so assert instead.)
padded_lengths = df1['padded_word_vectors'].apply(len)
assert (padded_lengths == MAX_SEQUENCE_LENGTH).all(), (
    'every padded_word_vectors row must have length MAX_SEQUENCE_LENGTH')

from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

# Features: stack the per-row vector lists into a single NumPy array.
X = np.array(df1['padded_word_vectors'].tolist())

# Labels: every column except the text/feature columns listed here.
# NOTE(review): any other non-label column present in df1.csv (e.g. an id or
# index column) would also be picked up as a label - confirm the CSV schema.
label_columns = df1.columns.difference(['abstract', 'tokenized_abstract', 'word_vectors', 'padded_word_vectors'])
y = df1[label_columns].values

# Hold out 20% of the rows as the test set (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the split shapes for confirmation.
print(f'Training set shape: {X_train.shape, y_train.shape}')
print(f'Test set shape: {X_test.shape, y_test.shape}')


In [None]:
from keras.models import Model
from keras.layers import Input, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from keras import backend as K
from sklearn.neighbors import NearestNeighbors  # 这里导入NearestNeighbors

def micro_f1(y_true, y_pred):
    """Micro-averaged F1 built from Keras backend ops.

    Predictions strictly greater than 0.3 are treated as positive.
    """
    # Binarize the predictions: > 0.3 becomes 1.0, everything else 0.0.
    y_pred = K.cast(K.greater(y_pred, 0.3), K.floatx())

    def _count_per_column(values):
        # Clip to [0, 1], round, then sum along axis 0.
        return K.sum(K.round(K.clip(values, 0, 1)), axis=0)

    tp_per_column = _count_per_column(y_true * y_pred)
    predicted_per_column = _count_per_column(y_pred)
    actual_per_column = _count_per_column(y_true)

    # Pool the counts over all columns; epsilon guards against division by zero.
    tp_total = K.sum(tp_per_column)
    precision = tp_total / (K.sum(predicted_per_column) + K.epsilon())
    recall = tp_total / (K.sum(actual_per_column) + K.epsilon())

    # Harmonic mean of precision and recall.
    return 2 * (precision * recall) / (precision + recall + K.epsilon())


# Model dimensions, taken from the prepared arrays so the network always
# matches the data instead of relying on hand-copied constants.
max_sequence_length = X_train.shape[1]  # entries per padded sequence
embedding_dim = X_train.shape[2]  # size of each word vector
num_labels = y_train.shape[1]  # number of label columns

# Model input: one (sequence, embedding) matrix per sample.
input_ = Input(shape=(max_sequence_length, embedding_dim))

# Convolution layer extracting local features.
# `filters` and `kernel_size` can be tuned via cross-validation.
conv = Conv1D(filters=128, kernel_size=5, activation='relu')(input_)

# Global max pooling keeps the strongest response of each filter. This layer
# is also exposed as a second model output (see Model(...) below).
gmp = GlobalMaxPooling1D(name='gmp')(conv)

# Dropout to reduce overfitting.
dropout = Dropout(0.5)(gmp)

# Fully connected layer; `units` can be tuned via cross-validation.
dense = Dense(128, activation='relu')(dropout)

# Output layer: sigmoid activations, suitable for multi-label classification.
output = Dense(num_labels, activation='sigmoid', name='output')(dense)

# Build the model with two outputs: the label probabilities and the pooled features.
model = Model(inputs=input_, outputs=[output, gmp])

# Compile: binary cross-entropy on 'output'; the 'gmp' output gets no loss.
model.compile(optimizer='adam',    loss={'output': 'binary_crossentropy', 'gmp': None}, metrics={'output': micro_f1})

# Show the model structure. summary() prints by itself and returns None, so
# wrapping it in print() would add a stray "None" line.
model.summary()

# Train the model, holding out 10% of the training data for validation.
# `batch_size` and `epochs` can be tuned via cross-validation.
model.fit(X_train, {'output': y_train}, batch_size=32, epochs=20, validation_split=0.1)

# Evaluate on the test set. Targets are keyed by output name, exactly as in
# fit() above, so they are explicitly tied to the 'output' head only.
evaluation_results = model.evaluate(X_test, {'output': y_test})

# Expected order of evaluation_results (NOTE(review): confirm against
# model.metrics_names for the installed Keras version):
#   [0] overall loss
#   [1] loss of the main output layer
#   [2] micro-F1 of the main output layer

# Print the test-set performance.
print(f'Test loss (overall): {evaluation_results[0]}')
print(f'Test loss (main output): {evaluation_results[1]}')
print(f'Test micro-F1 score (main output): {evaluation_results[2]}')


# Run the trained model on the training set and keep its second output
# (the 'gmp' features); these become the k-NN datastore entries.
_, train_features = model.predict(X_train)

# Index the training features for nearest-neighbour lookup. Row i of the
# index corresponds to row i of the training labels.
datastore = NearestNeighbors(n_neighbors=5)
datastore.fit(train_features)

# Define k-NN inference function
def knn_inference(model, datastore, x_test, k=5, temperature=1.0, lambda_factor=0.5,
                  train_labels=None):
    """Blend the model's predictions with a k-NN estimate over training features.

    Args:
        model: Object whose ``predict(x_test)`` returns ``(predictions, features)``.
        datastore: Fitted neighbour index exposing ``kneighbors(X, n_neighbors=k)``
            and returning ``(distances, indices)``.
        x_test: Inputs to run inference on.
        k: Number of neighbours retrieved per sample.
        temperature: Softmax temperature applied to the negative distances
            (assumed positive).
        lambda_factor: Weight of the model's predictions; the k-NN estimate
            gets ``1 - lambda_factor``.
        train_labels: Label matrix aligned row-by-row with the datastore.
            Defaults to the notebook-level ``y_train`` for backward compatibility.

    Returns:
        Array with the same shape as the model's predictions.
    """
    if train_labels is None:
        # Backward-compatible fallback to the global used by earlier callers.
        train_labels = y_train
    train_labels = np.asarray(train_labels, dtype=float)

    cnn_test_predictions, test_features = model.predict(x_test)
    cnn_test_predictions = np.asarray(cnn_test_predictions)

    # Query all test samples in one call instead of one call per row.
    distances, indices = datastore.kneighbors(test_features, n_neighbors=k)
    distances = np.asarray(distances, dtype=float)
    indices = np.asarray(indices)

    # Softmax over negative distances. Subtracting each row's smallest distance
    # leaves the normalized weights unchanged but prevents every exp() from
    # underflowing to 0 (which would produce 0/0 = NaN for far-away neighbours).
    shifted = distances - distances.min(axis=1, keepdims=True)
    weights = np.exp(-shifted / temperature)
    weights = weights / weights.sum(axis=1, keepdims=True)

    # Weighted sum of neighbour labels:
    # (n_test, k, 1) * (n_test, k, n_labels) summed over k.
    knn_test_predictions = (weights[:, :, None] * train_labels[indices]).sum(axis=1)

    # Rescale each row by its maximum. Rows whose maximum is 0 (no neighbour
    # carries any label) are left as zeros instead of becoming NaN.
    row_max = knn_test_predictions.max(axis=1, keepdims=True)
    knn_test_predictions = knn_test_predictions / np.where(row_max == 0, 1.0, row_max)

    # Combine CNN model's output with k-NN's prediction to get the final prediction
    final_predictions = lambda_factor * cnn_test_predictions + (1 - lambda_factor) * knn_test_predictions
    return final_predictions

# Blend the CNN and k-NN predictions for the test set, using the function's
# default k, temperature and lambda_factor.
final_predictions = knn_inference(model=model, datastore=datastore, x_test=X_test)


In [None]:
import numpy as np
from sklearn.metrics import f1_score, mean_squared_error

# Decision threshold applied to the blended scores.
threshold = 0.3

# Convert scores to binary labels: 1 where score >= threshold, else 0.
y_pred_binary = np.greater_equal(final_predictions, threshold).astype(int)

# Micro-averaged F1 of the binarized predictions.
micro_f1_score = f1_score(y_test, y_pred_binary, average='micro')
print(f'Micro-F1 score for final predictions: {micro_f1_score}')

# Test error: mean squared error between the labels and the raw (unthresholded) scores.
test_error = mean_squared_error(y_test, final_predictions)
print(f'Test mean squared error: {test_error}')
