## 生成第一段文本

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

In [None]:
# Local directory holding the Phi-3-mini-4k-instruct checkpoint. It is defined once
# so the model and the tokenizer are always loaded from the same location.
# The previous model path ("D:models/...") lacked the slash after the drive letter,
# which on Windows is a drive-relative path and not the absolute path used for
# the tokenizer.
MODEL_PATH = "D:/models/microsoft-Phi-3-mini-4k-instruct"

# Load the causal language model onto the CPU.
# NOTE: trust_remote_code=True allows model code shipped with the checkpoint to run;
# only keep it enabled for checkpoints you trust.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    device_map="cpu",
    torch_dtype="auto",
    trust_remote_code=True
)
# Load the tokenizer that belongs to the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

In [3]:
from transformers import pipeline

In [4]:
# Wrap the already-loaded model and tokenizer in a text-generation pipeline.
# Generation is capped at 500 new tokens, sampling is switched off, and only the
# newly generated text (not the prompt) is returned.
generator = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=500,
    do_sample=False,
    return_full_text=False,
)

In [None]:
# Chat-style input: a list of messages, each a dict with a "role" and its "content".
messages = [
    {"role": "user", "content": "Create a funny joke about chickens."}
]

# Run the pipeline on the conversation and print the "generated_text" field
# of the first returned item.
output = generator(messages)
print(output[0]["generated_text"])

In [None]:
prompt1 = "Create a funny joke about chickens."

# Tokenize the prompt into a PyTorch tensor of ids and place that tensor on the CPU
input_ids1 = tokenizer(prompt1, return_tensors="pt").input_ids.to("cpu")

# Run only the model body, i.e. everything that comes before the lm_head
model_output1 = model.model(input_ids1)

# Pass the first element of that output through the lm_head
lm_head_output1 = model.lm_head(model_output1[0])

In [None]:
import torch

In [None]:
# 获取每一时间步的最高分数对应的词的索引
predicted_token_ids = torch.argmax(lm_head_output1, dim=-1)

# 使用 tokenizer 将 token ids 转换为文本
# 注意这里的 tokenizer 应该与你之前用于编码的 tokenizer 相同
decoded_output = tokenizer.decode(predicted_token_ids[0], skip_special_tokens=True)

print(decoded_output)

## 查看 lm_head 的输出并预测下一个词元

In [18]:
prompt = "The capital of France is"

# Tokenize the prompt into a PyTorch tensor of ids and place that tensor on the CPU
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to("cpu")

# Run only the model body, i.e. everything that comes before the lm_head
model_output = model.model(input_ids)

# Pass the first element of that output through the lm_head
lm_head_output = model.lm_head(model_output[0])

You are not running the flash-attention implementation, expect numerical differences.


In [19]:
# Highest-scoring id at the last position of the first sequence, decoded back to text
token_id = lm_head_output[0, -1, :].argmax(dim=-1)
tokenizer.decode(token_id)

'Paris'

In [20]:
# Shape of the first element of the model-body output (the tensor fed to the lm_head)
model_output[0].shape

torch.Size([1, 5, 3072])

In [21]:
# Shape of the lm_head output
lm_head_output.shape

torch.Size([1, 5, 32064])

## 词元和嵌入

In [None]:
prompt = "Write an email apologizing to Sarah for the tragic gardening mishap. Explain how it happened. <|assistant|>"

# Tokenize the prompt and keep the ids on the CPU, the device the model was loaded on
# (the device string was previously misspelled as "cup").
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to("cpu")
# Generate at most 20 new tokens
generation_output = model.generate(
    input_ids=input_ids,
    max_new_tokens=20
)
# Decode the generated ids of the first sequence back to text and print them
print(tokenizer.decode(generation_output[0]))

In [None]:
# Display the token ids the tokenizer produced for the prompt
input_ids

In [None]:
# Display the ids returned by model.generate
generation_output

In [None]:
# Decode each id of the prompt on its own, one per line.
# The loop variable is deliberately not named `id`: at notebook scope that would
# shadow Python's built-in id() for every later cell.
for tok_id in input_ids[0]:
    print(tokenizer.decode(tok_id))

In [None]:
# Decode a few ids individually, and one pair of ids together, one result per line
print(
    *(tokenizer.decode(ids) for ids in (3323, 622, [3323, 622], 29901)),
    sep="\n",
)

In [None]:
# Background colours as "R;G;B" triples; show_tokens cycles through them token by token.
colors_list = [
    "102;194;165","252;141;98","141;160;203",
    "231;138;195","166;216;84","255;217;47"
]
def show_tokens(sentence, tokenizer_name):
    """Print `sentence` token by token, each token on a coloured background.

    Args:
        sentence: Text to tokenize.
        tokenizer_name: Name or path handed to `AutoTokenizer.from_pretrained`.

    Returns:
        None. The decoded tokens are written to stdout, separated by spaces.
    """
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    token_ids = tokenizer(sentence).input_ids
    for idx, t in enumerate(token_ids):
        print(
            # ANSI sequence ESC[0;30;48;2;R;G;Bm: black text on an RGB background.
            # The escape byte is "\x1b" (hex 1B); the former "\xlb" (letter l) is
            # not a valid Python escape and made the whole cell a SyntaxError.
            f"\x1b[0;30;48;2;{colors_list[idx % len(colors_list)]}m" +
            tokenizer.decode(t) +
            "\x1b[0m",  # reset all attributes after the token
            end=" "
        )

## 使用语言模型创建与上下文相关的词嵌入

In [None]:
from transformers import AutoModel, AutoTokenizer
# Tokenizer and model are loaded from the same checkpoint so that the token ids
# produced by the tokenizer index the vocabulary this model uses. Previously the
# tokenizer came from "microsoft/deberta-base" while the model was
# "microsoft/deberta-v3-xsmall".
DEBERTA_CHECKPOINT = "microsoft/deberta-v3-xsmall"
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(DEBERTA_CHECKPOINT)
# Load the language model
model = AutoModel.from_pretrained(DEBERTA_CHECKPOINT)
# Tokenize the sentence
tokens = tokenizer("Hello world", return_tensors="pt")
# Run the tokens through the model and keep the first element of its output
output = model(**tokens)[0]

In [None]:
# Shape of the model output kept in the previous cell
output.shape

In [None]:
# Decode every id of the first (and only) sequence separately to see the individual tokens
for token in tokens["input_ids"][0]:
    print(tokenizer.decode(token))

## 文本嵌入（用于句子和整篇文档）

In [None]:
from sentence_transformers import SentenceTransformer
# Load the model
model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
# Convert the text into a text embedding
vector = model.encode("Best movie ever!")

In [None]:
# Shape of the embedding returned by model.encode
vector.shape

## LLM之外的词嵌入

In [32]:
import gensim.downloader as api
# Load the "glove-wiki-gigaword-50" embeddings through gensim's downloader API.
# Requires the gensim package to be installed in the kernel's environment.
model = api.load("glove-wiki-gigaword-50")

ModuleNotFoundError: No module named 'gensim'

In [None]:
# Query by vector: look up the 11 entries most similar to the embedding of "king".
# NOTE(review): topn=11 rather than 10 is presumably because "king" itself comes
# back as the top hit — confirm.
model.most_similar([model["king"]], topn=11)

In [35]:
import pandas as pd
from urllib import request
# Download the playlist dataset. The context manager closes the HTTP response once
# the body has been read (the responses were previously left open).
with request.urlopen("https://storage.googleapis.com/maps-premium/dataset/yes_complete/train.txt") as data:
    # Parse the playlist dataset file. Skip the first two lines because they only contain metadata
    lines = data.read().decode("utf-8").split("\n")[2:]
# Drop playlists that contain only one song
playlists = [s.rstrip().split() for s in lines if len(s.split()) > 1]
# Load the song metadata; the response gets its own name so `songs_file` only
# ever refers to the list of text lines.
with request.urlopen("https://storage.googleapis.com/maps-premium/dataset/yes_complete/song_hash.txt") as response:
    songs_file = response.read().decode("utf-8").split("\n")
# One row per line, split on tabs into id / title / artist
songs = [s.rstrip().split("\t") for s in songs_file]
songs_df = pd.DataFrame(data=songs, columns=["id","title","artist"])
songs_df = songs_df.set_index("id")

In [36]:
# Show the first two playlists; each one is a list of song-id strings
print(f"Playlist #1:\n {playlists[0]}, \n")
print(f"Playlist #2:\n {playlists[1]}")

Playlist #1:
 ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '2', '42', '43', '44', '45', '46', '47', '48', '20', '49', '8', '50', '51', '52', '53', '54', '55', '56', '57', '25', '58', '59', '60', '61', '62', '3', '63', '64', '65', '66', '46', '47', '67', '2', '48', '68', '69', '70', '57', '50', '71', '72', '53', '73', '25', '74', '59', '20', '46', '75', '76', '77', '59', '20', '43'], 

Playlist #2:
 ['78', '79', '80', '3', '62', '81', '14', '82', '48', '83', '84', '17', '85', '86', '87', '88', '74', '89', '90', '91', '4', '73', '62', '92', '17', '53', '59', '93', '94', '51', '50', '27', '95', '48', '96', '97', '98', '99', '100', '57', '101', '102', '25', '103', '3', '104', '105', '106', '107', '47', '108', '109', '110', '111', '112', '113', '25', '63', '62', '114', '115', '84', '116', '117', 

In [37]:
from gensim.models import Word2Vec
# Train our word2vec model: each playlist (a list of song-id strings) is passed in
# as one training sequence.
v_model = Word2Vec(playlists, vector_size=32, window=20, negative=50, min_count=1, workers=4)

In [39]:
song_id = 2172
# Ask the model for the songs most similar to song 2172.
# The playlists hold song ids as strings, hence the str() conversion.
v_model.wv.most_similar(positive=str(song_id))

[('1922', 0.9977813959121704),
 ('2014', 0.9971864819526672),
 ('11473', 0.996427059173584),
 ('2849', 0.99642413854599),
 ('5586', 0.9960500001907349),
 ('5634', 0.9957127571105957),
 ('3116', 0.9955424666404724),
 ('10084', 0.9953478574752808),
 ('2640', 0.9949705600738525),
 ('2104', 0.9948686361312866)]

In [40]:
# Show the metadata row for the song. `.iloc` selects by row position, not by the
# "id" index label — assumes row position equals song id; TODO confirm.
print(songs_df.iloc[2172])

title     Fade To Black
artist        Metallica
Name: 2172 , dtype: object


In [42]:
import numpy as np
def print_recommendations(song_id):
    """Return the metadata rows of the five songs most similar to `song_id`.

    Args:
        song_id: Id of the query song; it is converted to `str` before being
            looked up in the word2vec model `v_model`.

    Returns:
        The rows of `songs_df` for the five most similar songs, ordered from
        most to least similar.
    """
    neighbours = v_model.wv.most_similar(positive=str(song_id), topn=5)
    # most_similar yields (song id as string, similarity) pairs. `.iloc` is a
    # positional indexer, so the ids are converted to int explicitly instead of
    # handing it a NumPy array of strings and relying on implicit coercion.
    positions = [int(similar_id) for similar_id, _similarity in neighbours]
    return songs_df.iloc[positions]

# Recommend songs similar to song 2172
print_recommendations(2172)

Unnamed: 0_level_0,title,artist
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1922,One,Metallica
2014,Youth Gone Wild,Skid Row
11473,Little Guitars,Van Halen
2849,Run To The Hills,Iron Maiden
5586,The Last In Line,Dio
