In [34]:
!python -m spacy download en_core_web_md

Collecting en_core_web_md==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.2.5/en_core_web_md-2.2.5.tar.gz (96.4 MB)
[K     |████████████████████████████████| 96.4 MB 1.3 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_md')


In [35]:
!python -m spacy download es_core_news_md

Collecting es_core_news_md==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_md-2.2.5/es_core_news_md-2.2.5.tar.gz (78.4 MB)
[K     |████████████████████████████████| 78.4 MB 1.4 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('es_core_news_md')


In [36]:
import torch
import torch.nn as nn
import torchtext

import spacy
import nltk
from nltk.stem import *

import gensim
import gensim.downloader
from gensim.models import Word2Vec

import json
import matplotlib.pyplot as plt
import numpy as np
import pickle

# Tokenizing

In [4]:
sp_en = spacy.load("en_core_web_md")

In [37]:
sp_es = spacy.load("es_core_news_md")

In [38]:
for w in sp_es("Bienvenidos a la clase de Aprendizaje Automatico"):
  print(w,w.pos_,w.dep_)

Bienvenidos ADJ ROOT
a ADP case
la DET det
clase NOUN obj
de ADP case
Aprendizaje PROPN nmod
Automatico PROPN flat


In [39]:
for w in sp_en("Welcome to the Machine Learning class"):
  print(w,w.pos_,w.dep_)

Welcome VERB ROOT
to ADP prep
the DET det
Machine PROPN compound
Learning PROPN compound
class NOUN pobj


# Word Vectorization

In [40]:
word_to_vec = torchtext.vocab.GloVe()

In [41]:
word_to_vec.get_vecs_by_tokens("hello how are you".split(" "))

tensor([[ 0.2523,  0.1018, -0.6748,  ...,  0.1787, -0.5192,  0.3359],
        [-0.2321,  0.4747, -0.3826,  ...,  0.3318,  0.3155,  0.3797],
        [-0.1986, -0.0628, -0.3661,  ..., -0.5845,  0.2788, -0.2621],
        [-0.1108,  0.3079, -0.5198,  ..., -0.0591,  0.4760,  0.0566]])

In [42]:
cosine_simalirty = torch.nn.CosineSimilarity(dim=0)

In [43]:
cosine_simalirty(word_to_vec.get_vecs_by_tokens("hello"),word_to_vec.get_vecs_by_tokens("potato"))

tensor(0.1982)

In [44]:
cosine_simalirty(word_to_vec.get_vecs_by_tokens("hello"),word_to_vec.get_vecs_by_tokens("hi"))

tensor(0.7629)

In [45]:
cosine_simalirty(word_to_vec.get_vecs_by_tokens("university"),word_to_vec.get_vecs_by_tokens("school"))

tensor(0.6848)

In [46]:
cosine_simalirty(word_to_vec.get_vecs_by_tokens("university"),word_to_vec.get_vecs_by_tokens("bar"))

tensor(0.2076)

# Simple Chat Bot

In [48]:
!mkdir .kaggle
!mv kaggle.json .kaggle/
!mv .kaggle ~

mkdir: cannot create directory ‘.kaggle’: File exists
mv: cannot move '.kaggle' to '/root/.kaggle': Directory not empty


In [49]:
!kaggle datasets download elvinagammed/chatbots-intent-recognition-dataset
!unzip chatbots-intent-recognition-dataset.zip

chatbots-intent-recognition-dataset.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  chatbots-intent-recognition-dataset.zip
replace Intent.json? [y]es, [n]o, [A]ll, [N]one, [r]ename: n


In [50]:
data_ = json.load(open("Intent.json"))

In [51]:
intents_ = data_["intents"]
num_intent = len(intents_)

In [52]:
words = set()
for intent_ in intents_:
  for sentence_ in intent_["text"]:
    for w in sp_en(sentence_):
      words.add(str(w).lower())
  for sentence_ in intent_["responses"]:
    for w in sp_en(sentence_):
      words.add(str(w).lower())

In [53]:
bag_of_words = {k: v for v,k in enumerate(words)}
num_words = len(bag_of_words)

In [54]:
dev = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [55]:
model = nn.Sequential(
    nn.Linear(num_words,256),
    nn.ReLU(inplace=True),
    nn.Linear(256,num_intent)
).to(dev)

In [56]:
class MyDataset(torch.utils.data.Dataset):
  def __init__(self):
    self.intents_text = []
    self.intents_target = []
    for target, intent_ in enumerate(intents_):
      self.intents_text += intent_["text"]
      self.intents_target += [target]*len(intent_["text"])
  
  def __getitem__(self,idx):
    x = torch.zeros(num_words)
    for w in sp_en(self.intents_text[idx]):
      w = str(w).lower()
      x[bag_of_words[w]] = 1
    
    return x, self.intents_target[idx]
  def __len__(self):
    return len(self.intents_text)

In [57]:
train_ds = MyDataset()
train_dl = torch.utils.data.DataLoader(train_ds,batch_size=32,shuffle=True)

In [58]:
def evaluate(model, loader, crit):
  model.eval()
  total = 0
  corrects = 0
  avg_loss = 0
  for x, y in loader:
    x = x.to(dev)
    y = y.to(dev)
    o = model(x)
    loss = crit(o,y)
    avg_loss += loss.item()
    corrects += torch.sum(torch.argmax(o,axis=1) == y).item()
    total += len(y)
  acc = 100* corrects / total
  avg_loss /= len(loader)  
  return avg_loss, acc

def train_one_epoch(model, train_loader, crit, optim):
  model.train()
  total = 0
  corrects = 0
  avg_loss = 0
  for x, y in train_loader:
    optim.zero_grad()
    x = x.to(dev)
    y = y.to(dev)
    o = model(x)
    loss = crit(o,y)
    avg_loss += loss.item()
    loss.backward()
    optim.step()
    corrects += torch.sum(torch.argmax(o,axis=1) == y).item()
    total += len(y)
  acc = 100 * corrects / total
  avg_loss /= len(train_loader)
  return avg_loss, acc

def train(model, train_loader, test_loader, crit, optim, epochs = 20):
  for epoch in range(epochs):
    train_loss, train_acc = train_one_epoch(model, train_loader,crit, optim)
    test_loss, test_acc = evaluate(model, test_loader, crit)
    print(f"epoch: {epoch}, train loss: {train_loss}, train acc: {train_acc}%, test loss: {test_loss}, test acc: {test_acc}%")

In [59]:
crit = nn.CrossEntropyLoss()
optim = torch.optim.Adam(model.parameters(),lr=0.1)
train(model,train_dl,train_dl, crit, optim,epochs=10)

epoch: 0, train loss: 2.581163740158081, train acc: 30.76923076923077%, test loss: 0.8848951935768128, test acc: 78.32167832167832%
epoch: 1, train loss: 0.6092755734920502, train acc: 80.41958041958041%, test loss: 0.280326846241951, test acc: 93.00699300699301%
epoch: 2, train loss: 0.15987884066998959, train acc: 94.4055944055944%, test loss: 0.048370074667036536, test acc: 97.2027972027972%
epoch: 3, train loss: 0.1032136266672751, train acc: 97.2027972027972%, test loss: 0.004532892489805818, test acc: 100.0%
epoch: 4, train loss: 0.09504019755986519, train acc: 99.3006993006993%, test loss: 0.0009294453797338065, test acc: 100.0%
epoch: 5, train loss: 0.0006214900777194998, train acc: 100.0%, test loss: 0.00013553579274230287, test acc: 100.0%
epoch: 6, train loss: 0.00014146582102512185, train acc: 100.0%, test loss: 0.0008331643863812132, test acc: 100.0%
epoch: 7, train loss: 0.01492447194395936, train acc: 99.3006993006993%, test loss: 8.020959754162504e-06, test acc: 100.0%


In [60]:
torch.save(model.state_dict(),"chatbot.ckpt")

In [61]:
def get_response(model, text_):
  model.eval()
  x = torch.zeros(1,num_words)
  for w in sp_en(text_):
    w = str(w).lower()
    if w in bag_of_words:
      x[0,bag_of_words[w]] = 1
  pred_scores = torch.softmax(model(x.to(dev)),1).cpu()
  pred_intent_idx = torch.argmax(pred_scores).item()
  confidence = pred_scores[0][pred_intent_idx].item() * 100
  print("confidence: ", confidence, "%")
  responses_ = intents_[pred_intent_idx]["responses"]
  if confidence > 50:
    rand_idx = np.random.randint(0,len(responses_))
    return responses_[rand_idx]
  return "I don't understand what you are saying"

In [62]:
get_response(model,"who are you?")

confidence:  99.99997615814209 %


'Call me Geni'

# Hugging Face

# Traductor

In [63]:
!pip3 install transformers



In [65]:
!pip3 install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[?25l[K     |▎                               | 10 kB 22.8 MB/s eta 0:00:01[K     |▌                               | 20 kB 31.0 MB/s eta 0:00:01[K     |▉                               | 30 kB 23.6 MB/s eta 0:00:01[K     |█                               | 40 kB 12.1 MB/s eta 0:00:01[K     |█▍                              | 51 kB 11.3 MB/s eta 0:00:01[K     |█▋                              | 61 kB 12.7 MB/s eta 0:00:01[K     |██                              | 71 kB 13.2 MB/s eta 0:00:01[K     |██▏                             | 81 kB 13.2 MB/s eta 0:00:01[K     |██▍                             | 92 kB 14.5 MB/s eta 0:00:01[K     |██▊                             | 102 kB 11.8 MB/s eta 0:00:01[K     |███                             | 112 kB 11.8 MB/s eta 0:00:01[K     |███▎                            | 122 kB 11.8 MB/s eta 0:00:01[K     |██

In [1]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-es")

model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-es")

Downloading:   0%|          | 0.00/783k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/807k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.52M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/298M [00:00<?, ?B/s]

In [13]:
original_phrase = "I would like to visit other latinamerican countries"

In [14]:
tokenizer(original_phrase,return_tensors="pt")

{'input_ids': tensor([[   33,   134,   172,    13,  2080,   114, 42483, 29893,   240,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [15]:
model.generate(**tokenizer(original_phrase,return_tensors="pt"))

tensor([[65000,   377,  3756,  4998,   262,   185,     6, 33191,     0]])

In [16]:
tokenizer.batch_decode(model.generate(**tokenizer(original_phrase,return_tensors="pt")))

['<pad> Me gustaría visitar otros países latinoamericanos']

# Question answering

In [160]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
model_name = "deepset/roberta-base-squad2"

nlp = pipeline('question-answering', model=model_name, tokenizer=model_name)
QA_input = {
    'question': "Who is Candace?",
    'context':  """
Phineas and Ferb is an American animated musical-comedy television series created by Dan Povenmire and Jeff "Swampy" Marsh for Disney Channel and Disney XD. Produced by Disney Television Animation, the series was originally broadcast as a one-episode preview on August 17, 2007, and again previewed on September 28, 2007, the series officially premiered on February 1, 2008, on Disney Channel, running until June 12, 2015.

The program follows Phineas Flynn and his stepbrother Ferb Fletcher,[2] who are between eight and ten years old,[3] during summer vacation. Every day, the boys embark on a grand new project, which is usually unrealistic in scale given the protagonists' ages (and are sometimes physically impossible). This annoys their controlling older sister Candace, who frequently tries to reveal their shenanigans to her and Phineas' mother, Linda Flynn-Fletcher, and less frequently to Ferb's father, Lawrence Fletcher. The series follows a standard plot system; running gags occur in every episode, and the subplot almost always features Phineas and Ferb's pet platypus Perry the Platypus working as a spy named "Agent P" for OWCA (the Organization Without a Cool Acronym) to defeat the latest scheme of Dr. Heinz Doofenshmirtz, a mad scientist driven largely by a need to assert his evilness (although he is not especially evil and has a good heart in some situations). The two plots intersect at the end to erase all traces of the boys' project just before Candace can show it to their mother, which usually leaves Candace very frustrated.

Povenmire and Marsh had previously worked together on Fox's The Simpsons and Nickelodeon's Rocko's Modern Life. The creators also voice two of the main B-plot characters, Dr. Doofenshmirtz and Major Monogram. Phineas and Ferb was conceived after Povenmire sketched a triangular boy – the prototype for Phineas – in a restaurant. Povenmire and Marsh developed the series concept together and pitched it to networks for 16 years before securing a run on Disney Channel.[2]"""
}
res = nlp(QA_input)

  tensor = as_tensor(value)
  for span_id in range(num_spans)


In [161]:
res

{'answer': 'older sister',
 'end': 768,
 'score': 0.7630390524864197,
 'start': 756}