以下模塊需要照順序往下執行
<hr/>
1. 載入已經pretrained過的word2vec模型，這邊是用GoogleNews-300<br>
2. 爬arxiv上的冷氣空調規格相關的文章<br>
3. 計算arxiv文章裡面詞彙的TF-IDF值(詞頻－逆文件頻率)<br>
4. 用arxiv文章對pretrained的word2vec模型做fine-tune<br>
5. 根據fine-tuned完的word2vec模型，將原始文檔抓到的關鍵詞群進行向量分配<br>
<hr/>

In [None]:
import requests
from gensim.models import KeyedVectors
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from numpy.linalg import norm
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from tqdm import tqdm

# Cosine similarity between two vectors
def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of ``vec1`` and ``vec2``.

    The value is the dot product of the two vectors divided by the product
    of their Euclidean norms. If either vector has a norm of zero the
    function returns 0 so that no division by zero takes place.
    """
    numerator = np.dot(vec1, vec2)
    length1, length2 = norm(vec1), norm(vec2)
    # Guard clause: a zero-length vector has no direction to compare.
    if length1 == 0 or length2 == 0:
        return 0
    return numerator / (length1 * length2)

# Load the pretrained Word2Vec vectors from the gzip-compressed binary
# word2vec file (the GoogleNews vectors, per the file name).
word_vectors = KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin.gz", binary=True)

<hr/>
爬arxiv上的冷氣空調規格相關的文章摘要(需事先指定要抓的關鍵詞)，另外也要注意API呼叫太頻繁或太多會直接被shut down<br>
最後存成arxiv_documents.txt，裡面都是摘要段落<br>
*這邊我已經有一份抓了76萬個段落的arxiv_documents.txt檔案可以直接用，如果沒要重抓可以直接跳到下一個區塊*<br>
<hr/>

In [None]:
import requests
import time
import xml.etree.ElementTree as ET
from sklearn.feature_extraction.text import TfidfVectorizer

# Search keywords and crawl limits
keywords = ["air conditioning", "HVAC", "compressor", "condenser", "pump", "conditioner", "cooling", "heating", "fan", "fin"]
max_results_per_query = 1000
total_documents_needed = 1000000
max_api_calls = 1000  # maximum number of API calls for the whole crawl

# Download one page of arXiv search results for the given query
def get_arxiv_documents(query, start_index, max_results, timeout=30):
    """Fetch one page of arXiv results as lower-cased "title summary" strings.

    Args:
        query: arXiv ``search_query`` expression, e.g. ``"all:compressor"``.
        start_index: Offset of the first result to request.
        max_results: Maximum number of results to request for this page.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A set of documents (a set is used to drop duplicates). The set is
        empty when the request fails, the server does not answer with
        status 200, or the response is not well-formed XML; in each of
        these cases a message is printed instead of raising.
    """
    atom = "{http://www.w3.org/2005/Atom}"
    documents = set()

    # Let requests build and escape the query string; a timeout keeps a
    # stalled connection from hanging the whole crawl.
    try:
        response = requests.get(
            "http://export.arxiv.org/api/query",
            params={
                "search_query": query,
                "start": start_index,
                "max_results": max_results,
            },
            timeout=timeout,
        )
    except requests.RequestException as e:
        print("Error requesting arXiv API:", e)
        return documents

    if response.status_code != 200:
        print("Unexpected HTTP status from arXiv API:", response.status_code)
        return documents

    try:
        root = ET.fromstring(response.text)
    except ET.ParseError as e:
        print("Error parsing XML:", e)
        return documents

    for entry in root.findall(f"{atom}entry"):
        # findtext() yields None for a missing element and "" for an empty
        # one; normalise both to "" so the concatenation cannot fail.
        title = entry.findtext(f"{atom}title") or ""
        summary = entry.findtext(f"{atom}summary") or ""
        if not title and not summary:
            continue
        documents.add((title + " " + summary).lower())
    return documents


# Collect documents for every keyword, page by page
def get_all_documents(keywords, total_documents_needed, max_results_per_query, max_api_calls):
    """Gather unique documents for ``keywords`` until a limit is reached.

    Each keyword is queried as ``all:<keyword>`` through
    ``get_arxiv_documents``, advancing the start offset by
    ``max_results_per_query`` per page and sleeping 3 seconds after each
    non-empty page. Paging for a keyword stops on the first empty page.
    The crawl stops once ``total_documents_needed`` unique documents have
    been collected or ``max_api_calls`` requests have been made.

    Returns:
        A list with at most ``total_documents_needed`` documents.
    """
    collected = set()
    calls_made = 0

    for keyword in keywords:
        offset = 0
        while calls_made < max_api_calls and len(collected) < total_documents_needed:
            page = get_arxiv_documents("all:{}".format(keyword), offset, max_results_per_query)
            calls_made += 1
            if not page:
                # Nothing (more) to fetch for this keyword.
                break
            collected.update(page)
            offset += max_results_per_query
            # Pause between requests so the API does not block us.
            time.sleep(3)

        if calls_made >= max_api_calls:
            print("Reached maximum API call limit.")
            break

    return list(collected)[:total_documents_needed]

# Fetch the air-conditioning-related documents (title + abstract) from arXiv
corpus = get_all_documents(keywords, total_documents_needed, max_results_per_query, max_api_calls)

def save_documents_to_file(documents, filename):
    """Write ``documents`` to ``filename`` as UTF-8 text.

    Every document is followed by a blank line ("\n\n"), which serves as
    the separator between documents. An existing file is overwritten.
    """
    with open(filename, 'w', encoding='utf-8') as out:
        out.writelines(text + "\n\n" for text in documents)

# Save the documents to a file
save_documents_to_file(corpus, "arxiv_documents.txt")

# Print the number of documents
print(f"Number of documents: {len(corpus)}")

# Fit the TF-IDF vectorizer on the corpus (learns the vocabulary and IDF weights)
vectorizer = TfidfVectorizer()
vectorizer.fit(corpus)


<hr/>
使用已經抓好的arxiv文檔(arxiv_documents.txt)計算裡面所有有出現詞彙的TF-IDF值(詞頻－逆文件頻率)<br>
<hr/>

In [None]:
#### 不重抓arxiv文章計算TF-IDF，直接用之前存的來算
import os
from sklearn.feature_extraction.text import TfidfVectorizer

# Load documents from a file
def load_documents_from_file(filename):
    """Read the documents stored in ``filename``.

    The file is expected to hold UTF-8 text in which documents are
    separated by a blank line ("\n\n"). Because a separator also follows
    the last document, a plain split yields a trailing empty string;
    fragments that are empty or whitespace-only are therefore dropped so
    they are not counted as documents.

    Returns:
        A list of non-blank document strings, in file order.
    """
    with open(filename, 'r', encoding='utf-8') as file:
        fragments = file.read().split("\n\n")
    return [fragment for fragment in fragments if fragment.strip()]

# Load the previously saved documents
corpus = load_documents_from_file("arxiv_documents.txt")

# Print the number of documents
print(f"Number of documents: {len(corpus)}")

# Fit the TF-IDF vectorizer and compute the document-term TF-IDF matrix
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(corpus)

# Print the number of features (vocabulary size)
print(f"Number of features: {len(vectorizer.get_feature_names_out())}")


<hr/>
Pretrained model用arxiv段落摘要做fine-tune，並存成English_fine_tuned_word2vec_model<br>
<hr/>

In [None]:
# Training hyper-parameters
total_epochs = 50
learning_rate = 0.01  # initial learning rate, passed to gensim as ``alpha``

# Tokenise each document on whitespace; every document is one training "sentence".
sentences = [line.split() for line in corpus]

# Build a Word2Vec model whose vocabulary comes from the arXiv corpus.
# The previous version added uniform random noise to the pretrained vectors
# for every token occurrence, which perturbs the vectors without learning
# anything from the corpus; here the model is actually trained.
# NOTE(review): the saved model only contains words that occur in the corpus.
# Pretrained words never seen in the corpus are not carried over - confirm
# this coverage is acceptable for the downstream vocabulary lookup.
model = Word2Vec(vector_size=word_vectors.vector_size, min_count=1, alpha=learning_rate)
model.build_vocab(sentences)

# Seed every corpus word that also exists in the pretrained vectors with its
# pretrained vector, so training starts from (and fine-tunes) those values.
shared_words = [word for word in model.wv.index_to_key if word in word_vectors.key_to_index]
if shared_words:
    model_rows = [model.wv.key_to_index[word] for word in shared_words]
    pretrained_rows = [word_vectors.key_to_index[word] for word in shared_words]
    model.wv.vectors[model_rows] = word_vectors.vectors[pretrained_rows]

# Train on the corpus.
model.train(sentences, total_examples=model.corpus_count, epochs=total_epochs)

# Save the fine-tuned model
model.save("English_fine_tuned_word2vec_model")

# Release the large objects; the model is reloaded from disk when needed.
del model
del sentences

<hr/>
計算從原始文件中抓到的詞(English_nouns.txt)的向量值，若該詞在fine-tuned模型裡面本來就有，就直接取該值<br>
若該詞為複合詞彙，則拆解成數個單詞，然後以TF-IDF值為權重做向量組合<br>
若該詞不存在模型內，則賦予-0.5~0.5的隨機值填滿其向量<br>
最後處理好的word2vec檔案以english_word_vectors.txt存回<br>
*模型的複合詞彙都會用"_"進行串接，索引的時候要記得根據這點進行處理*
<hr/>

In [None]:
# Reload the fine-tuned model that was saved to disk
model_fine_tune = Word2Vec.load("English_fine_tuned_word2vec_model")

def get_weighted_phrase_vector(phrase, model, vectorizer, vector_size):
    """Return a vector for ``phrase`` built from the word vectors in ``model``.

    The phrase is lower-cased and split into words on whitespace and on
    underscores, so "Wooden Base" and "Wooden_Base" are treated alike.

    Lookup order:
      1. A multi-word phrase that exists in ``model.wv`` as one
         "_"-joined key is returned as-is (as a float copy).
      2. Otherwise the vectors of the words found in ``model.wv`` are
         averaged, each weighted by its IDF value from ``vectorizer``
         (weight 1.0 when the word is not in the TF-IDF vocabulary).
         Words missing from the model do not contribute.
      3. If no word is found in the model, a random vector drawn
         uniformly from [-0.5, 0.5) is returned.

    Args:
        phrase: Word or multi-word phrase, separated by spaces and/or "_".
        model: Object exposing word vectors through ``model.wv``.
        vectorizer: Fitted TF-IDF vectorizer providing ``vocabulary_`` and
            ``idf_``.
        vector_size: Length of the random fallback vector.

    Returns:
        A 1-D numpy array.
    """
    # Underscores are the compound-word separator used by callers and by the
    # model, so split on them as well as on whitespace.
    tokens = phrase.lower().replace("_", " ").split()

    # Prefer the vector of the whole compound when the model stores one.
    joined = "_".join(tokens)
    if len(tokens) > 1 and joined in model.wv:
        return np.array(model.wv[joined], dtype=float)

    found_vectors = []
    weights = []
    for token in tokens:
        if token not in model.wv:
            # Unknown words carry no weight in the average.
            continue
        found_vectors.append(model.wv[token])
        term_index = vectorizer.vocabulary_.get(token)
        weights.append(1.0 if term_index is None else vectorizer.idf_[term_index])

    if not found_vectors:
        return np.random.uniform(-0.5, 0.5, size=(vector_size,))
    return np.average(found_vectors, axis=0, weights=weights)

# Read the extra vocabulary (one term per line) whose vectors will be exported
English_vocab_file = "English_nouns.txt"
with open(English_vocab_file, 'r', encoding='utf-8') as file:
    # Skip blank lines: an empty term would produce a malformed row in the
    # output file.
    English_vocab = [line.strip() for line in file if line.strip()]

# Dimension written to the output header and used for fallback vectors
vector_size = model_fine_tune.vector_size

# Maps each exported key to its vector
english_word_vectors = {}

for term in English_vocab:
    parts = term.split()
    # Compound words are stored with "_" between their parts; joining the
    # split parts also keeps stray tabs or repeated spaces out of the key,
    # which would break the space-separated output format.
    word = "_".join(parts)
    # Pass the space-separated form so the helper can split the compound
    # into its individual words.
    word_vector = get_weighted_phrase_vector(" ".join(parts), model_fine_tune, vectorizer, vector_size)
    english_word_vectors[word] = word_vector

# Path of the exported vector file
output_file = "english_word_vectors.txt"

# Write the vectors as text: a "<count> <dimension>" header line followed by
# one "<word> <v1> <v2> ..." line per word.
with open(output_file, "w", encoding="utf-8") as file:
    file.write(f"{len(english_word_vectors)} {vector_size}\n")
    for word, vector in english_word_vectors.items():
        vector_str = " ".join(str(val) for val in vector)
        file.write(f"{word} {vector_str}\n")


# 示例使用
# phrase = "Wooden Base"
# phrase_vector = get_weighted_phrase_vector(phrase, model_fine_tune, vectorizer, 300) # 你的模型的向量维度

# 计算给定短语与所有词语的余弦相似度，并打印出最相似的前5个词语
# similarities = [(word, cosine_similarity(phrase_vector, word_vectors[word])) for word in word_vectors.index_to_key]
# similarities.sort(key=lambda x: x[1], reverse=True)
# top_similar_words = similarities[:5]

# print(f"Most similar words to '{phrase}':")
# for word, similarity in top_similar_words:
#     print(f"{word}: {similarity}")

<hr/>
用來檢視單詞的IDF值(逆文件頻率)，值越高代表該詞在語料中越少見<br>
<hr/>

In [None]:
# Print the IDF weight of a few domain terms. A term that is absent from the
# fitted vocabulary is reported instead of raising KeyError, so the remaining
# terms are still shown.
for term in ("evaporator", "blower", "fin", "compressor", "electric", "motor", "air", "automatic"):
    term_index = vectorizer.vocabulary_.get(term)
    if term_index is None:
        print(f"{term}: not in the TF-IDF vocabulary")
    else:
        print(f"{term}: {vectorizer.idf_[term_index]}")