In [35]:
import warnings

warnings.filterwarnings("ignore")

import os
import jieba
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Input
from tensorflow.keras.models import Model
from sklearn.metrics.pairwise import cosine_similarity


# Remove stop words
def drop_stopwords(context, stop_words_list):
    """Return the tokens of ``context`` that are not stop words.

    Args:
        context: Iterable of tokens, e.g. the list produced by ``jieba.lcut``.
        stop_words_list: Collection of stop words to discard.

    Returns:
        A new list holding the surviving tokens in their original order.
        Tokens equal to a single ASCII space are always dropped.
    """
    # Build the lookup set once per call so every membership test is a hash
    # lookup instead of a linear scan over the whole stop-word list.
    if isinstance(stop_words_list, (set, frozenset)):
        stop_words = stop_words_list
    else:
        stop_words = set(stop_words_list)
    return [word for word in context if word != ' ' and word not in stop_words]


# Load the stop-word list
def text_stop(file_path):
    """Load the stop words stored in ``cn_stopwords.txt``.

    Args:
        file_path: Directory that contains ``cn_stopwords.txt``. A trailing
            path separator is optional.

    Returns:
        List with one entry per non-empty line of the file, followed by the
        ideographic space character (U+3000).
    """
    # os.path.join works whether or not file_path ends with a separator.
    stop_words_path = os.path.join(file_path, 'cn_stopwords.txt')
    # 'utf-8-sig' strips a leading BOM if present, so the first stop word is
    # not silently prefixed with U+FEFF; files without a BOM read the same.
    with open(stop_words_path, encoding='utf-8-sig') as stop:
        # Skip empty entries such as the one produced by a trailing newline.
        stop_words_list = [line for line in stop.read().split("\n") if line]
    stop_words_list.append("\u3000")
    return stop_words_list


# Text preprocessing
def process_text_file(file_path, text_name, stop_words_list):
    """Read one text file and turn it into a list of tokenised lines.

    The file ``<file_path>/<text_name>.txt`` is decoded as GB18030. A fixed
    set of advertisement strings, single-character pronouns and the
    ideographic space is deleted from the raw text, then every non-blank line
    is segmented with ``jieba.lcut`` and filtered through ``drop_stopwords``.

    Args:
        file_path: Directory holding the text file.
        text_name: File name without the ``.txt`` extension.
        stop_words_list: Stop words forwarded to ``drop_stopwords``.

    Returns:
        List of token lists, one for each line that still has at least one
        token after stop-word removal.
    """
    print(f"Processing {text_name}")

    source_path = file_path + "/" + text_name + ".txt"
    with open(source_path, "r", encoding='gb18030') as handle:
        raw_text = handle.read()

    # Substrings stripped from the raw text before segmentation.
    removable = (
        '本书来自www.cr173.com免费txt小说下载站',
        '更多更新免费电子书请关注www.cr173.com',
        '她', '他', '你', '我', '它', '这', '\u3000',
    )
    for fragment in removable:
        raw_text = raw_text.replace(fragment, '')

    tokenised_lines = []
    for line in raw_text.split("\n"):
        if not line.strip():
            # Nothing but whitespace on this line.
            continue
        kept_tokens = drop_stopwords(jieba.lcut(line), stop_words_list)
        if kept_tokens:
            tokenised_lines.append(kept_tokens)
    return tokenised_lines


def train_model(text_name, text_data, window_size=2, embedding_dim=80,
                epochs=15, batch_size=512):
    """Train a context-window model on one text and return its word embeddings.

    Each training sample uses the ``window_size`` tokens before and after a
    position as input and the token at that position as the label.

    Args:
        text_name: Name of the text; used only in error messages.
        text_data: List of token lists (one per paragraph).
        window_size: Number of context tokens taken on each side of the target.
        embedding_dim: Size of the learned word vectors.
        epochs: Number of training epochs.
        batch_size: Mini-batch size passed to ``model.fit``.

    Returns:
        Tuple ``(tokenizer, embeddings, word_index)`` where ``embeddings`` is
        the weight matrix of the model's layer at index 1 and ``word_index``
        is the tokenizer's word-to-index mapping.

    Raises:
        ValueError: If no paragraph is long enough to yield a training sample.
    """
    # Map every token to an integer id.
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(text_data)
    sequences = tokenizer.texts_to_sequences(text_data)
    word_index = tokenizer.word_index
    # +1 because Keras Tokenizer ids start at 1 (0 is reserved).
    vocab_size = len(word_index) + 1

    # Build (context, target) pairs directly from the unpadded sequences; the
    # samples all have length 2 * window_size, so no padding is required.
    inputs = []
    labels = []
    for sequence in sequences:
        for i in range(window_size, len(sequence) - window_size):
            context_words = sequence[i - window_size:i] + sequence[i + 1:i + window_size + 1]
            inputs.append(context_words)
            labels.append(sequence[i])

    if not inputs:
        raise ValueError(
            f"No training samples for '{text_name}': every paragraph has "
            f"fewer than {2 * window_size + 1} tokens."
        )

    inputs = np.array(inputs)
    labels = np.array(labels)

    model = create_lstm_model(input_length=2 * window_size,
                              vocab_size=vocab_size,
                              embedding_dim=embedding_dim)
    model.fit(inputs, labels, epochs=epochs, batch_size=batch_size)

    # Layer 1 is expected to be the Embedding layer that directly follows the
    # Input layer in create_lstm_model.
    embeddings = model.layers[1].get_weights()[0]

    return tokenizer, embeddings, word_index

In [36]:
def create_lstm_model(input_length, vocab_size, embedding_dim):
    """Build and compile the Input -> Embedding -> LSTM -> softmax model.

    Args:
        input_length: Number of token ids in each input sample.
        vocab_size: Number of distinct token ids; sets both the embedding
            table height and the width of the softmax output.
        embedding_dim: Size of each embedding vector.

    Returns:
        A compiled Keras ``Model`` using the Adam optimizer and
        sparse categorical cross-entropy loss.
    """
    # Input layer: a fixed-length sequence of token ids.
    input_layer = Input(shape=(input_length,))
    # Embedding layer. The sequence length is already fixed by the Input
    # layer, so the deprecated `input_length` argument is not passed.
    embedding_layer = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(input_layer)
    # LSTM layer with 128 units.
    lstm_layer = LSTM(128)(embedding_layer)
    # Output layer: one softmax probability per token id.
    output_layer = Dense(vocab_size, activation='softmax')(lstm_layer)

    # Assemble the model.
    model = Model(inputs=input_layer, outputs=output_layer)
    # Labels are integer ids, hence the sparse variant of cross-entropy.
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

    return model

In [37]:
file_path = 'data/'
text_names = ['倚天屠龙记', '鹿鼎记', '射雕英雄传', '神雕侠侣', '笑傲江湖', '碧血剑']

# Load the stop-word list
stop_words_list = text_stop(file_path)

# Make sure the output directory exists. exist_ok=True avoids the
# check-then-create race and makes re-running the cell safe.
output_dir = "output"
os.makedirs(output_dir, exist_ok=True)

# Train one model per text and keep its tokenizer, embeddings and vocabulary
models_data = []
for name in text_names:
    text_data = process_text_file(file_path, name, stop_words_list)
    tokenizer, embeddings, word_index = train_model(name, text_data)
    models_data.append((name, tokenizer, embeddings, word_index))

Processing 倚天屠龙记
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Processing 鹿鼎记
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Processing 射雕英雄传
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Processing 神雕侠侣
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Processing 笑傲江湖
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Processing 碧血剑
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch

In [38]:
# Query word used to probe the embeddings of each text
test_name_mapping = {
    '倚天屠龙记': '张无忌',
    '鹿鼎记': '韦小宝',
    '天龙八部': '乔峰',
    '射雕英雄传': '郭靖',
    '神雕侠侣': '杨过',
    '笑傲江湖': '令狐冲',
    '碧血剑': '袁承志',
}

top_k = 10

for name, tokenizer, embeddings, word_index in models_data:
    test_word = test_name_mapping[name]
    if test_word not in word_index:
        # The query word does not occur in this text's vocabulary.
        continue

    print(f"\nTop {top_k} words similar to '{test_word}' in '{name}':")
    test_word_index = word_index[test_word]
    test_word_vector = embeddings[test_word_index].reshape(1, -1)
    similarities = cosine_similarity(test_word_vector, embeddings)[0]

    # Rank all rows from most to least similar, then explicitly skip the query
    # word itself and any row without a vocabulary entry (e.g. row 0), rather
    # than assuming the query always occupies the last argsort position.
    ranked_indices = similarities.argsort()[::-1]
    similar_indices = [
        idx for idx in ranked_indices
        if idx != test_word_index and idx in tokenizer.index_word
    ][:top_k]

    for idx in similar_indices:
        print(tokenizer.index_word[idx], similarities[idx])


Top 10 words similar to '张无忌' in '倚天屠龙记':
周芷若 0.8084123
殷素素 0.805586
张翠山 0.78017986
赵敏 0.7621716
都大锦 0.74168056
俞岱岩 0.7367599
宋青书 0.72280306
蛛儿 0.71801317
卫璧 0.7154796
金花婆婆 0.70907915

Top 10 words similar to '韦小宝' in '鹿鼎记':
康熙 0.8837671
吴之荣 0.8103085
陈近南 0.80448043
老者 0.798382
女尼 0.78054684
茅十八 0.77979743
张康年 0.7717519
图尔布青 0.76906645
海老公 0.7667867
陶红英 0.76284593

Top 10 words similar to '郭靖' in '射雕英雄传':
黄蓉 0.7820635
众人 0.7283165
六子 0.714378
欧阳克 0.6903978
黄药师 0.6585986
洪七公 0.65272653
穆易 0.64841056
韩宝驹 0.6455806
丘处机 0.6448091
陆冠英 0.6433101

Top 10 words similar to '杨过' in '神雕侠侣':
小龙女 0.8335143
李莫愁 0.8033289
陆无双 0.798977
法王 0.7841845
周伯通 0.77406096
赵志敬 0.76541376
柯镇恶 0.7649498
裘千尺 0.75431204
完颜萍 0.7445578
郭襄 0.743217

Top 10 words similar to '令狐冲' in '笑傲江湖':
岳不群 0.80036384
桃花仙 0.79262125
盈盈 0.7918447
任行 0.78677994
向问天 0.7818441
岳夫人 0.75931877
玉玑子 0.7569629
林平之 0.75383526
定静师太 0.75357324
岳灵珊 0.7523141

Top 10 words similar to '袁承志' in '碧血剑':
崔秋山 0.8582802
青青 0.7778281
焦公礼 0.7749132
温青 0

In [None]:
0.