### Sentiment analysis

In [None]:
!pip install transformers 

Collecting transformers
  Downloading transformers-4.9.2-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 6.2 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 72.3 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 53.3 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 69.7 MB/s 
Collecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully u

In [None]:
from transformers import pipeline

nlp = pipeline("sentiment-analysis", model="nlptown/bert-base-multilingual-uncased-sentiment")

resultat = nlp("trés bien reçu")[0]
print(resultat)
print(f"label: {resultat['label']}, avec un score de: {round(resultat['score']*100, 2)}%")

resultat = nlp("une trés mauvaise remarque")[0]
print(f"label: {resultat['label']}, avec un score de: {round(resultat['score']*100, 2)}%")

{'label': '5 stars', 'score': 0.6836311221122742}
label: 5 stars, avec un score de: 68.36%
label: 1 star, avec un score de: 68.42%


## Text generation

In [None]:
from transformers import pipeline

In [None]:
# French
Fr_Text = pipeline('text-generation', model='dbddv01/gpt2-french-small')

print(Fr_Text("je travaille avec", max_length=50, do_sample=False))

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=842.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=510406637.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=611.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=858358.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=516682.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=387.0, style=ProgressStyle(description_…




Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'je travaille avec le producteur de la série, Mike Reiss. Celui-ci a déjà travaillé avec le scénariste de la série, Mike Reiss, sur la série de films de la série, et a travaillé avec le scénariste de la série, Mike'}]


In [None]:
# Arabic
Ar_Text = pipeline('text-generation', model='akhooli/gpt2-small-arabic')
print(Ar_Text("انها معلمة تاريخية", max_length=50, do_sample=False))


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=666.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=510378732.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=30.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1548817.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1207232.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=120.0, style=ProgressStyle(description_…




Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'انها معلمة تاريخية في مدرسة القرية، وهي مدرسة ابتدائية للبنين، ومدرسة ابتدائية للبنات، ومدرسة ابتدائية للبنات، ومدرسة ابتدائية للبنات، ومدرسة ابتدائية للبنات، ومدرسة ابتدائية للبنات، ومدرسة ابتدائية للبنات، ومدرسة ابتدائية للبنات، ومدرسة ابتدائية للبنات، ومدرسة ابتدائية للبنات، ومدرسة ابتدائية'}]


## Name entity recognition (NER)

In [None]:

from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

ENG = pipeline("ner", model="dslim/bert-base-NER", tokenizer="dslim/bert-base-NER")
AR = pipeline("ner", model="hatmimoha/arabic-ner", tokenizer="hatmimoha/arabic-ner")
FR = pipeline("ner", model="gilf/french-postag-model", tokenizer="gilf/french-postag-model")

print(AR("انها معلمة تاريخية"))
 
print(ENG("well done"))

print(FR("je travaille avec"))

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=829.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433316646.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=59.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1300.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440251983.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=86.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=333770.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1618.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=711553514.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=49.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=995526.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[{'entity': 'LABEL_12', 'score': 0.9995168, 'index': 1, 'word': 'انها', 'start': 0, 'end': 4}, {'entity': 'LABEL_12', 'score': 0.9998653, 'index': 2, 'word': 'معلم', 'start': 5, 'end': 9}, {'entity': 'LABEL_12', 'score': 0.9994434, 'index': 3, 'word': '##ة', 'start': 9, 'end': 10}, {'entity': 'LABEL_12', 'score': 0.99991083, 'index': 4, 'word': 'تاريخية', 'start': 11, 'end': 18}]
[]
[{'entity': 'CLS', 'score': 0.9998509, 'index': 1, 'word': 'je', 'start': 0, 'end': 2}, {'entity': 'V', 'score': 0.99989057, 'index': 2, 'word': 'travaille', 'start': 3, 'end': 12}, {'entity': 'P', 'score': 0.99991506, 'index': 3, 'word': 'avec', 'start': 13, 'end': 17}]


## Question answering

In [None]:
from transformers import pipeline
Answer = pipeline("question-answering")
context = """
Lionel Messi, parfois surnommé Leo Messi, né le 24 juin 1987 à Rosario en Argentine, est un footballeur international argentin évoluant au poste d'attaquant au Paris Saint-Germain, après avoir joué au FC Barcelone..
"""
qst = "Qui est Lionel Messi?"
resultat = Answer(question=qst, context=context)
print("Reponse:", resultat['answer'])




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=473.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=260793700.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=29.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435797.0, style=ProgressStyle(descripti…


Reponse: né le 24 juin 1987 à Rosario en Argentine


## Filling masked text

In [None]:
from transformers import pipeline

NLP = pipeline("fill-mask")

from pprint import pprint
pprint(NLP(f"Mohammed VI, né le 21 août 1963 à Rabat, est  {NLP.tokenizer.mask_token} et le troisième à porter le titre de roi du Maroc."))



HBox(children=(FloatProgress(value=0.0, description='Downloading', max=480.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=331070498.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1355863.0, style=ProgressStyle(descript…


[{'score': 0.3971644937992096,
  'sequence': 'Mohammed VI, né le 21 août 1963 à Rabat, est ét et le troisième '
              'à porter le titre de roi du Maroc.',
  'token': 10221,
  'token_str': 'ét'},
 {'score': 0.08196265250444412,
  'sequence': 'Mohammed VI, né le 21 août 1963 à Rabat, est iced et le '
              'troisième à porter le titre de roi du Maroc.',
  'token': 12646,
  'token_str': 'iced'},
 {'score': 0.060506634414196014,
  'sequence': 'Mohammed VI, né le 21 août 1963 à Rabat, est és et le troisième '
              'à porter le titre de roi du Maroc.',
  'token': 5739,
  'token_str': 'és'},
 {'score': 0.056976623833179474,
  'sequence': 'Mohammed VI, né le 21 août 1963 à Rabat, est ident et le '
              'troisième à porter le titre de roi du Maroc.',
  'token': 8009,
  'token_str': 'ident'},
 {'score': 0.03867478668689728,
  'sequence': 'Mohammed VI, né le 21 août 1963 à Rabat, est ieri et le '
              'troisième à porter le titre de roi du Maroc.',
  '

In [5]:
#Arabic
from transformers import pipeline
arabic_fill_mask = pipeline('fill-mask', model='CAMeL-Lab/bert-base-camelbert-ca')
pprint(arabic_fill_mask(" ‏بسم [MASK]‬ الرحمن الرحيم ."))

Some weights of the model checkpoint at CAMeL-Lab/bert-base-camelbert-ca were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[{'score': 0.999610960483551,
  'sequence': 'بسم الله الرحمن الرحيم.',
  'token': 1953,
  'token_str': 'الله'},
 {'score': 0.00019928190158680081,
  'sequence': 'بسم الرحمن الرحمن الرحيم.',
  'token': 4289,
  'token_str': 'الرحمن'},
 {'score': 3.6494100640993565e-05,
  'sequence': 'بسم لله الرحمن الرحيم.',
  'token': 2784,
  'token_str': 'لله'},
 {'score': 2.092821705446113e-05,
  'sequence': 'بسم اله الرحمن الرحيم.',
  'token': 2090,
  'token_str': 'اله'},
 {'score': 9.44888870435534e-06,
  'sequence': 'بسم اللهم الرحمن الرحيم.',
  'token': 2168,
  'token_str': 'اللهم'}]


## Summarization

In [None]:
from transformers import pipeline

debrief = pipeline("summarization")

TEXTE = """ 
Mohammed VI (en arabe marocain : محمد السادس, en berbère marocain : ⵎⵓⵃⵎⵎⴷ ⵡⵉⵙⵙ ⵚⴹⵉⵚ), né le 21 août 1963 à Rabat (Maroc), est le vingt-troisième monarque de la dynastie alaouite, et le troisième à porter le titre de roi du Maroc, depuis le 23 juillet 1999.
"""
print(debrief(TEXTE, max_length=130, min_length=30, do_sample=False))

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1222317369.0, style=ProgressStyle(descr…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=26.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898822.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  /pytorch/aten/src/ATen/native/BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)


[{'summary_text': ' Mohammed VI, né le 21 août 1963 à Rabat (Maroc), est le vingt-troisième monarque de la dynastie alaouite . He is the title de roi du Maroc, depuis le 23 juillet 1999 .'}]


## Translation

In [None]:
from transformers import pipeline
# English to french
trans = pipeline("translation_en_to_fr")
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

pprint(trans("The king initially introduced reforms to grant women more power. Leaked diplomatic cables from WikiLeaks have alleged extensive corruption in the court of King Mohammed VI, implicating the king and his closest advisors.", max_length=40))


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=891691430.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=791656.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1389353.0, style=ProgressStyle(descript…




Your input_length: 53 is bigger than 0.9 * max_length: 40. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)


[{'translation_text': 'Le roi a initialement introduit des réformes pour '
                      'conférer plus de pouvoir aux femmes. Des câbles '
                      'diplomatiques divulgués par WikiLeak'}]


In [1]:
# english to Arabic

from transformers import MarianTokenizer, MarianMTModel

tokenizer = MarianTokenizer.from_pretrained("marefa-nlp/marefa-mt-en-ar")
model = MarianMTModel.from_pretrained("marefa-nlp/marefa-mt-en-ar")

text = "The city  origins derives from the the ancient kingdom of Mouwahidine and Alaouis."

translated_tokens = model.generate(**tokenizer.prepare_seq2seq_batch(text, return_tensors="pt"))
Output_text = [tokenizer.decode(t, skip_special_tokens=True) for t in translated_tokens]

print(Output_text)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=801074.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=916890.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2243581.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=237.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=65.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1320.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=305511975.0, style=ProgressStyle(descri…




`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    labels = tokenizer(tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.

To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  /pytorch/aten/src/ATen/native/BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)


['ترجع أصول المدينة إلى مملكة المواهدين والعلويين القديمة.']


In [3]:
# Arabic to English
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
from pprint import pprint

text_ar = "في عام 2015 قدر عدد المسلمين في العالم بنحو 1.8 مليار نسمة أو 24٪ من سكان العالم"

model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
tokenizer.src_lang = "ar_AR"
encoded_ar = tokenizer(text_ar, return_tensors="pt")
generated_tokens = model.generate(**encoded_ar, forced_bos_token_id=tokenizer.lang_code_to_id["en_XX"])
pprint(tokenizer.batch_decode(generated_tokens, skip_special_tokens=True))


['In 2015, the number of Muslims in the world was estimated at 1.8 billion, or '
 "24 percent of the world's population."]


## Feature extraction

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

MOTS = [
    "it is very soft and kind",
    "you should see it ",
    "You are a very handsome guy and good teacher, teacher.",
    "You can be a good father in the future",
    "You are smart as ELON MUSK",]

vectorizer = CountVectorizer(stop_words='english')

vectorizer.fit(MOTS)

vectorizer.get_feature_names()

['elon',
 'father',
 'future',
 'good',
 'guy',
 'handsome',
 'kind',
 'musk',
 'smart',
 'soft',
 'teacher']