In [1]:
import string
import re
import urllib.request
import bs4 as bs
import nltk

In [2]:
nltk.download("punkt")
nltk.download("wordnet")
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/federicotanzi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/federicotanzi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/federicotanzi/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [3]:
raw_html = urllib.request.urlopen('https://en.wikipedia.org/wiki/ChatGPT')
raw_html = raw_html.read()

# Parsear artículo, 'lxml' es el parser a utilizar
article_html = bs.BeautifulSoup(raw_html, 'lxml')

# Encontrar todos los párrafos del HTML (bajo el tag <p>)
# y tenerlos disponible como lista
article_paragraphs = article_html.find_all('p')

article_text = ''

for para in article_paragraphs:
    article_text += para.text

article_text = article_text.lower()

In [4]:
# Demos un vistazo
article_text

'\nchatgpt (chat generative pre-trained transformer) is an artificial intelligence chatbot developed by openai and launched on november 30, 2022. it is notable for enabling users to refine and steer a conversation towards a desired length, format, style, level of detail, and language used. successive prompts and replies are taken into account at each stage of the conversation as a context.[2]\nchatgpt is built upon gpt-3.5 and gpt-4, from openai\'s proprietary series of foundational gpt models. these large language models (llms)[3] have been fine-tuned for conversational applications using a combination of supervised and reinforcement learning techniques. according to llm experts, this training has made chatgpt better at handling "hallucinations" than its preceding model, gpt-3, but chatgpt is still known to confidently presently inaccurate information.[4][5] chatgpt was released as a freely available research preview, but due to its popularity, openai now operates the service on a fre

In [5]:
# substituir con regex con espacio vacío:
text = re.sub(r'\[[0-9]*\]', ' ', article_text) # substituir los números entre corchetes
# (notar que los corchetes son interpretados literalmente por los backlsash)
text = re.sub(r'\s+', ' ', text) # substituir más de un caracter de espacio, salto de línea o tabulación.'

In [6]:
corpus = nltk.sent_tokenize(text) # divide en oraciones
words = nltk.word_tokenize(text) # divide en términos

In [7]:
# Demos un vistazo
corpus[:10]

[' chatgpt (chat generative pre-trained transformer) is an artificial intelligence chatbot developed by openai and launched on november 30, 2022. it is notable for enabling users to refine and steer a conversation towards a desired length, format, style, level of detail, and language used.',
 'successive prompts and replies are taken into account at each stage of the conversation as a context.',
 "chatgpt is built upon gpt-3.5 and gpt-4, from openai's proprietary series of foundational gpt models.",
 'these large language models (llms) have been fine-tuned for conversational applications using a combination of supervised and reinforcement learning techniques.',
 'according to llm experts, this training has made chatgpt better at handling "hallucinations" than its preceding model, gpt-3, but chatgpt is still known to confidently presently inaccurate information.',
 'chatgpt was released as a freely available research preview, but due to its popularity, openai now operates the service on

In [8]:
# Demos un vistazo
words[:20]

['chatgpt',
 '(',
 'chat',
 'generative',
 'pre-trained',
 'transformer',
 ')',
 'is',
 'an',
 'artificial',
 'intelligence',
 'chatbot',
 'developed',
 'by',
 'openai',
 'and',
 'launched',
 'on',
 'november',
 '30']

In [9]:
print("Vocabulario:", len(words))

Vocabulario: 6779


In [10]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def perform_lemmatization(tokens):
    return [lemmatizer.lemmatize(token) for token in tokens]

# ord() nos da el código Unicode para un caracter dado
punctuation_removal = dict((ord(punctuation), None) for punctuation in string.punctuation)

def get_processed_text(document):
    # 1 - reduce el texto a mínuscula (string.lower())
    # 2 - quitar los simbolos de puntuacion (string.translate())
    # 3 - realiza la tokenización (nltk.word_tokenize)
    # 4 - realiza la lematización (nuestra función perform_lemmatization)
    return perform_lemmatization(nltk.word_tokenize(document.lower().translate(punctuation_removal)))

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def generate_response(user_input, corpus):
    response = ''
    # Sumar al corpus la pregunta del usuario para calcular
    # su cercania con otros documentos/sentencias
    # la entrada del usuario se usa para tokenizar y vectorizar
    corpus.append(user_input)

    # Crear un vectorizar TFIDF que quite las "stop words" del ingles y utilice
    # nuestra funcion para obtener los tokens lematizados "get_processed_text"
    word_vectorizer = TfidfVectorizer(tokenizer=get_processed_text, stop_words='english')

    # Crear los vectores a partir del corpus
    all_word_vectors = word_vectorizer.fit_transform(corpus)

    # Calcular la similitud coseno entre todas los documentos excepto el agregado (el útlimo "-1")
    # NOTA: con los word embedings veremos más en detalle esta matriz de similitud
    similar_vector_values = cosine_similarity(all_word_vectors[-1], all_word_vectors)

    # Obtener el índice del vector más cercano a nuestra oración
    # --> descartando la similitud contra nuestor vector propio
    similar_sentence_number = similar_vector_values.argsort()[0][-2]
    matched_vector = similar_vector_values.flatten()
    matched_vector.sort()
    vector_matched = matched_vector[-2]

    if vector_matched == 0: # si la similaridad coseno fue nula (ningún término en común)
        response = "I am sorry, I could not understand you"
    else:
        response = corpus[similar_sentence_number] # obtener el documento del corpus más similar
    
    corpus.remove(user_input)
    return response

In [12]:
def bot_response(human_text):
    print("Q:", human_text)    
    resp = generate_response(human_text.lower(), corpus)
    print("A:", resp)
    return resp

bot_response('chatgpt is an artificial intelligence chatbot developed by?')
bot_response('what is The fine-tuning?')
bot_response('chatGPT has limited knowledge of events that occurred after?')

Q: chatgpt is an artificial intelligence chatbot developed by?




A:  chatgpt (chat generative pre-trained transformer) is an artificial intelligence chatbot developed by openai and launched on november 30, 2022. it is notable for enabling users to refine and steer a conversation towards a desired length, format, style, level of detail, and language used.
Q: what is The fine-tuning?
A: the fine-tuning was accomplished using human trainers to improve the model's performance and, in the case of supervised learning, the trainers played both sides: the user and the ai assistant.
Q: chatGPT has limited knowledge of events that occurred after?
A: chatgpt has limited knowledge of events that occurred after september 2021. in training chatgpt, human reviewers preferred longer answers, regardless of actual comprehension or factual content.




'chatgpt has limited knowledge of events that occurred after september 2021. in training chatgpt, human reviewers preferred longer answers, regardless of actual comprehension or factual content.'