In [31]:
import os
from transformers import pipeline
import torch

# Optional: uncomment to route Hugging Face downloads through a mirror.
# NOTE(review): this presumably has to run before `transformers` is imported
# to take effect — confirm against the huggingface_hub documentation.
# os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"

# Report which endpoint is configured. `.get` is used because the assignment
# above is commented out, so the variable may be absent on a fresh kernel;
# indexing os.environ directly would raise KeyError in that case.
print(os.environ.get("HF_ENDPOINT", "HF_ENDPOINT is not set"))

# Expose only the first GPU to CUDA.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Summarize the CUDA environment so later GPU cells can be sanity-checked.
if torch.cuda.is_available():
    print(f"Number of GPUs available: {torch.cuda.device_count()}")
    print("CUDA version:", torch.version.cuda)
    print("Device name:", torch.cuda.get_device_name(0))
else:
    print("CUDA is not available. Check your installation and try again.")


https://hf-mirror.com
Number of GPUs available: 1
CUDA version: 12.1
Device name: NVIDIA RTX A6000


In [15]:
# Build a sentiment-analysis pipeline from an explicitly named checkpoint.
classifier = pipeline(
    task="sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)

# The last expression is the cell's output: label and score for one sentence.
classifier(
    "I've been waiting for a HuggingFace course my whole life."
)

[{'label': 'POSITIVE', 'score': 0.9598048329353333}]

In [16]:
# Passing a list scores every sentence in one call; one result per input.
classifier(
    [
        "I've been waiting for a HuggingFace course my whole life.",
        "I hate this so much!",
    ]
)

[{'label': 'POSITIVE', 'score': 0.9598048329353333},
 {'label': 'NEGATIVE', 'score': 0.9994558691978455}]

In [17]:
# Score a single short sentence with the current `classifier` pipeline.
classifier("I am lingtao master")

[{'label': 'POSITIVE', 'score': 0.9988415837287903}]

In [18]:
# Probe the classifier with a misspelled word ("sading") to see how it copes.
classifier("I am sading")

[{'label': 'NEGATIVE', 'score': 0.9993966817855835}]

In [19]:
# Same sentence frame as the next cell, differing only in where the apple
# comes from, to compare the predicted labels.
classifier("I get a apple from toilet")

[{'label': 'NEGATIVE', 'score': 0.9995057582855225}]

In [20]:
# Counterpart of the previous cell: only the source of the apple changes.
classifier("I get a apple from a little boy")

[{'label': 'POSITIVE', 'score': 0.9947863817214966}]

In [21]:
# Feed a Chinese sentence to the classifier.
# NOTE(review): the checkpoint name used for this pipeline ends in "-english",
# so the score for non-English input is presumably not meaningful — confirm.
classifier("我是孔令涛")

[{'label': 'NEGATIVE', 'score': 0.8768509030342102}]

In [22]:
# Zero-shot text classification

In [23]:
# Zero-shot classification: score a text against arbitrary candidate labels.
# The checkpoint is named explicitly (it is the one the library reported
# falling back to) so results stay stable if the library default changes and
# the "no model was supplied" warning goes away.
# NOTE: this rebinds `classifier`, which earlier held the sentiment pipeline.
classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
)

# Displayed result: the input sequence plus labels sorted by descending score.
classifier(
    "This is a course about the Transformers library",
    candidate_labels=["education", "politics", "business"],
)

No model was supplied, defaulted to facebook/bart-large-mnli and revision c626438 (https://hf-mirror.com/facebook/bart-large-mnli).
Using a pipeline without specifying a model name and revision in production is not recommended.


{'sequence': 'This is a course about the Transformers library',
 'labels': ['education', 'business', 'politics'],
 'scores': [0.8445960283279419, 0.11197628080844879, 0.043427709490060806]}

In [24]:
# Zero-shot classification of a Chinese sentence against three topic labels.
classifier("我是西安交通大学的孔令涛硕士", candidate_labels=["education", "politics", "business"])

{'sequence': '我是西安交通大学的孔令涛硕士',
 'labels': ['education', 'business', 'politics'],
 'scores': [0.858680009841919, 0.11062318831682205, 0.030696818605065346]}

In [25]:
# Shorter variant of the previous sentence (university name removed) to see
# how the label ranking shifts.
classifier("我是令涛硕士。", candidate_labels=["education", "politics", "business"])

{'sequence': '我是令涛硕士。',
 'labels': ['business', 'education', 'politics'],
 'scores': [0.4734025001525879, 0.39843112230300903, 0.1281663328409195]}

# Text generation

In [26]:
# Text-generation pipeline. The checkpoint is pinned to the one the library
# reported falling back to, so the notebook does not depend on an implicit
# default and the "no model was supplied" warning is avoided.
generator = pipeline(
    "text-generation",
    model="openai-community/gpt2",
)

# Displayed result: a list with one dict holding the generated continuation.
generator("In this course, we will teach you how to")

No model was supplied, defaulted to openai-community/gpt2 and revision 6c0e608 (https://hf-mirror.com/openai-community/gpt2).
Using a pipeline without specifying a model name and revision in production is not recommended.


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'In this course, we will teach you how to control your own health, including taking effective steps to maintain it. This includes personal growth and self-growth, self-care to better understand your health issues, and improving your quality of life.'}]

In [27]:
# Prompt the current text-generation pipeline with a Chinese sentence.
generator(
    "我是令涛硕士，我就读于西安交大。"
)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': '我是令涛硕士，我就读于西安交大。我师,在帉的其中宣亡�'}]

In [28]:
# Prompt with a short English fragment (note the trailing space in the prompt).
generator(
    "I like eat apple "
)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'I like eat apple "\n\nSkipping off in that way is one of Lee\'s favorite things to do, which he calls "eating in a hurry." Here the two sit down and listen to the soundtrack of "Blindspot."'}]

In [32]:
# Switch to the smaller distilgpt2 checkpoint (rebinds `generator`).
generator = pipeline("text-generation", model="distilgpt2")

# Generate two continuations capped at 30 tokens.
# - truncation=True makes explicit the truncation the tokenizer otherwise
#   falls back to with a warning when only max_length is given.
# - pad_token_id is set to the EOS id explicitly, which is what generation
#   would otherwise do implicitly (with a log message) on every call.
generator(
    "In this course, we will teach you how to",
    max_length=30,
    num_return_sequences=2,
    truncation=True,
    pad_token_id=generator.tokenizer.eos_token_id,
)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'In this course, we will teach you how to learn how to read or write language using the language available, such as English, and how to understand'},
 {'generated_text': "In this course, we will teach you how to code a Python application. If you don't know how to code Python, then you need to start"}]

In [34]:
# Rebuild the distilgpt2 pipeline on the first GPU when CUDA is available and
# fall back to CPU (-1) otherwise, so the cell still runs on machines where
# the setup cell reported that CUDA is not available.
generator = pipeline(
    "text-generation",
    model="distilgpt2",
    device=0 if torch.cuda.is_available() else -1,
)

# Generate 20 continuations of up to 300 tokens each; the displayed output of
# this cell is correspondingly large.
# truncation / pad_token_id are passed explicitly to avoid the tokenizer and
# generation warnings emitted when they are left implicit.
generator(
    "In this course, we will teach you how to",
    max_length=300,
    num_return_sequences=20,
    truncation=True,
    pad_token_id=generator.tokenizer.eos_token_id,
)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'In this course, we will teach you how to get around the problems of climate science. If you think you can get around these problems, read on to see if they can improve your approach.'},
 {'generated_text': 'In this course, we will teach you how to use the language by developing a vocabulary which is not available. Each week, we will focus on how to use the language by developing a vocabulary and using the other languages.\n\n\nIn previous courses, I have used a grammar that is not used by most people, except for the French, Dutch, French, German, Spanish, French, German, Romanian, Portuguese, Italian and Italian.\nThe first language to use is French, an Arabic dialect (see: Arabic & Persian by Elsai, who use French only and use English alone).\nIn this course, we\'ll show you how to use a different language in a language. We will introduce French and Portuguese as one of several languages that use the same vocabulary and with different meanings for each other.\nIn 

In [36]:
 

# Fill-mask pipeline. The checkpoint is pinned to the one the library reported
# falling back to, and the device falls back to CPU (-1) when CUDA is missing
# instead of failing on a hard-coded GPU index.
unmasker = pipeline(
    "fill-mask",
    model="distilbert/distilroberta-base",
    device=0 if torch.cuda.is_available() else -1,
)

# Displayed result: the two highest-scoring replacements for <mask>.
unmasker("This course will teach you all about <mask> models.", top_k=2)

No model was supplied, defaulted to distilbert/distilroberta-base and revision ec58a5b (https://hf-mirror.com/distilbert/distilroberta-base).
Using a pipeline without specifying a model name and revision in production is not recommended.


model.safetensors:   0%|          | 0.00/331M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert/distilroberta-base were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

[{'score': 0.19619712233543396,
  'token': 30412,
  'token_str': ' mathematical',
  'sequence': 'This course will teach you all about mathematical models.'},
 {'score': 0.04052708297967911,
  'token': 38163,
  'token_str': ' computational',
  'sequence': 'This course will teach you all about computational models.'}]

In [38]:
# Ask for the two best fillers for the mask in a Chinese sentence.
unmasker(
    "我早上喜欢吃苹果和<mask>。",
    top_k=2,
)

[{'score': 0.14672788977622986,
  'token': 48827,
  'token_str': '上',
  'sequence': '我早上喜欢吃苹果和上。'},
 {'score': 0.12378136068582535,
  'token': 47643,
  'token_str': '中',
  'sequence': '我早上喜欢吃苹果和中。'}]

In [None]:
# Named-entity-recognition pipeline with sub-tokens merged into whole entities.
# aggregation_strategy="simple" is the documented replacement for the
# deprecated grouped_entities=True flag and yields the same grouping.
# The device falls back to CPU (-1) when CUDA is not available.
ner = pipeline(
    "ner",
    aggregation_strategy="simple",
    device=0 if torch.cuda.is_available() else -1,
)

# Displayed result: one dict per detected entity group.
ner("My name is Sylvain and I work at Hugging Face in Brooklyn.")

In [42]:
# Run the current `ner` pipeline on a short English sentence with a name.
ner(
    "My name is LingTao"
)

[{'entity_group': 'PER',
  'score': 0.8838808,
  'word': 'LingTao',
  'start': 11,
  'end': 18}]

In [43]:
# Question-answering pipeline. The checkpoint is pinned to the one the library
# reported falling back to, and the device falls back to CPU (-1) when CUDA is
# not available instead of failing on a hard-coded GPU index.
question_answerer = pipeline(
    "question-answering",
    model="distilbert/distilbert-base-cased-distilled-squad",
    device=0 if torch.cuda.is_available() else -1,
)

No model was supplied, defaulted to distilbert/distilbert-base-cased-distilled-squad and revision 626af31 (https://hf-mirror.com/distilbert/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


model.safetensors:   0%|          | 0.00/261M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

In [44]:
# Summarization pipeline. The checkpoint is pinned to the one the library
# reported falling back to, so the notebook does not rely on an implicit
# default and the "no model was supplied" warning is avoided.
summarizer = pipeline(
    "summarization",
    model="sshleifer/distilbart-cnn-12-6",
)


No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://hf-mirror.com/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/692 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

# Named entity recognition with shibing624/bert4ner-base-chinese

In [45]:
# NER pipeline backed by a Chinese checkpoint (rebinds `ner`).
# The device falls back to CPU (-1) when CUDA is not available instead of
# failing on a hard-coded GPU index.
ner = pipeline(
    "ner",
    model="shibing624/bert4ner-base-chinese",
    device=0 if torch.cuda.is_available() else -1,
)

config.json:   0%|          | 0.00/532 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/407M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/370 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [46]:
# Tag a Chinese sentence with the current `ner` pipeline; the displayed
# result has one entry per tagged token.
ner(
    "我是孔令涛硕士，来自西安交大。"
)

[{'entity': 'B-PER',
  'score': 0.99982256,
  'index': 3,
  'word': '孔',
  'start': 2,
  'end': 3},
 {'entity': 'I-PER',
  'score': 0.9998381,
  'index': 4,
  'word': '令',
  'start': 3,
  'end': 4},
 {'entity': 'I-PER',
  'score': 0.99990463,
  'index': 5,
  'word': '涛',
  'start': 4,
  'end': 5},
 {'entity': 'B-ORG',
  'score': 0.99617827,
  'index': 11,
  'word': '西',
  'start': 10,
  'end': 11},
 {'entity': 'I-ORG',
  'score': 0.99722326,
  'index': 12,
  'word': '安',
  'start': 11,
  'end': 12},
 {'entity': 'I-ORG',
  'score': 0.9936413,
  'index': 13,
  'word': '交',
  'start': 12,
  'end': 13},
 {'entity': 'I-ORG',
  'score': 0.9962859,
  'index': 14,
  'word': '大',
  'start': 13,
  'end': 14}]