In [0]:
# ! sudo apt install openjdk-8-jdk
# ! sudo update-alternatives --set java /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java 
# ! pip install language-check -qq
# ! pip install pycontractions -qq

In [0]:
! pip install chatterbot 
! pip install chatterbot_corpus

In [0]:
! pip install --upgrade chatterbot 
! pip install --upgrade chatterbot_corpus

In [0]:
import warnings
warnings.filterwarnings('ignore')

# Import statements
import pandas as pd
import numpy as np
import pprint as pp
import json
from pandas.io.json import json_normalize
import re
from timeit import default_timer

# Preprocessing
# from pycontractions import Contractions

# Tokenization imports
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

# Puncuation & lower case
import string #punctuation removal

# Stop words
from nltk.corpus import stopwords

# Stemming
from nltk.stem.snowball import SnowballStemmer

# Lemmatizer
from nltk.stem.wordnet import WordNetLemmatizer

# POS tagging
from nltk.corpus import wordnet

# NER
import nltk, nltk.tag, nltk.chunk 
import spacy
import pprint as pprint
from gensim.summarization import summarize 
from collections import Counter 
import en_core_web_sm # CNN gets loaded in, sees what words depends on each other, POS tagging, entity recognition 
from spacy import displacy # Visualize NER

# Chatterbot
from chatterbot import ChatBot
from chatterbot.trainers import ListTrainer

# Data Preprocessing

In [0]:
# This is testing with part 1 - 10% of data
with open('/content/drive/My Drive/contraction_data_parts/expand_convo_text_1.txt', 'r') as file:
    convo_all = file.read()

In [5]:
# Let's see the text data, seems messy 
# When we extracted conversations from the frames dataset, we split every statement with a new line and every conversations with *
# Through the use of pycontractions, every new line (\n) was added with an extra \
pp.pprint(convo_all[0:1000]) 

('["I would like to book a trip to Atlantis from Caprica on Saturday, August '
 '13, 2016 for 8 adults. I have a tight budget of 1700.\\nHi...I checked a few '
 'options for you, and unfortunately, we do not currently have any trips that '
 'meet this criteria.  Would you like to book an alternate travel '
 'option?\\nYes, how about going to Neverland from Caprica on August 13, 2016 '
 'for 5 adults. For this trip, my budget would be 1900.\\nI checked the '
 'availability for this date and there were no trips available.  Would you '
 'like to select some alternate dates?\\nI have no flexibility for dates... '
 'but I can leave from Atlantis rather than Caprica. How about that?\\nI '
 'checked the availability for that date and there were no trips available.  '
 'Would you like to select some alternate dates?\\nI suppose I will speak with '
 'my husband to see if we can choose other dates, and then I will come back to '
 'you.Thanks for your help\\n**************************************

In [0]:
# Splits text data into separate lists for each conversation
def text_to_convo(text):
  list_convos = [convo.split('\n') for convo in text.split('*') if text]         # Conversation delimited by *
  list_convos = [convo for convo in list_convos if convo != ['']]                # Remove empty conversation
  return list_convos

In [0]:
# Splits all the statements from a conversation into their own string
def convo_to_statement(list_convos):
  sep_convos = []
  for items in list_convos: 
    sep_convos.append([])                     # Creates list for each conversation
    for item in items:
      item = item.split('\\n')                # Splits conversations into statements 
      sep_convos[-1].append(item)             # Adds all statements to corresponding list
  return sep_convos

In [0]:
# Flattens the list as there is a list in another list
# - Handles sentence tokenization, as all statements are separated into their own string in this list 
def flatten_list(sep_convos): 
  flat_list = []
  for convo in sep_convos:
    for sublist in convo:
      for item in sublist:
            flat_list.append(item)
  return flat_list

## Tokenization

In [50]:
nltk.download('punkt') # Punkt sentence tokenizer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [0]:
# Splits sentences into words 
def word_token(flat_list):
  tokenized_words=[]
  tokenized_words.extend(word for word in word_tokenize(str(flat_list))) # Extends list by appending elements from the iterable
  return tokenized_words

## Punctuation & Lower Case

In [69]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [0]:
def punc_removal(tokenized_words):
  translator = str.maketrans('', '', string.punctuation) 
  no_punct = [word.translate(translator) for word in tokenized_words]
  no_punct = [word for word in no_punct if word != '']   
  return no_punct             # Remove empty strings

In [0]:
# Main code to run with functions
list_of_convo = text_to_convo(convo_all)
list_of_statement = convo_to_statement(list_of_convo)
flat_convo = flatten_list(list_of_statement)
list_of_word = word_token(flat_convos)
no_punct_words = punc_removal(list_of_word)

In [106]:
no_punct_words

['I',
 'would',
 'like',
 'to',
 'book',
 'a',
 'trip',
 'to',
 'Atlantis',
 'from',
 'Caprica',
 'on',
 'Saturday',
 'August',
 '13',
 '2016',
 'for',
 '8',
 'adults',
 'I',
 'have',
 'a',
 'tight',
 'budget',
 'of',
 '1700',
 'Hi',
 'I',
 'checked',
 'a',
 'few',
 'options',
 'for',
 'you',
 'and',
 'unfortunately',
 'we',
 'do',
 'not',
 'currently',
 'have',
 'any',
 'trips',
 'that',
 'meet',
 'this',
 'criteria',
 'Would',
 'you',
 'like',
 'to',
 'book',
 'an',
 'alternate',
 'travel',
 'option',
 'Yes',
 'how',
 'about',
 'going',
 'to',
 'Neverland',
 'from',
 'Caprica',
 'on',
 'August',
 '13',
 '2016',
 'for',
 '5',
 'adults',
 'For',
 'this',
 'trip',
 'my',
 'budget',
 'would',
 'be',
 '1900',
 'I',
 'checked',
 'the',
 'availability',
 'for',
 'this',
 'date',
 'and',
 'there',
 'were',
 'no',
 'trips',
 'available',
 'Would',
 'you',
 'like',
 'to',
 'select',
 'some',
 'alternate',
 'dates',
 'I',
 'have',
 'no',
 'flexibility',
 'for',
 'dates',
 'but',
 'I',
 'can',
 

In [0]:
no_punct = [word.strip('\'') for word in no_punct] # Pesky ' attatched to word
no_punct = [word.replace('.',' ') for word in no_punct] # Pesky . attatched to word

In [0]:
# Lowercase text
data_lower = [word.lower() for word in no_punct]
print(data_lower[-10:])

['i', 'will', 'come', 'back', 'to', 'you thanks', 'for', 'your', 'help', '']


## Stop Words

In [0]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [0]:
stop_words = stopwords.words('english')
add_stop_words = ['would','could']
stop_words += add_stop_words
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [0]:
filtered_word = []
for word in data_lower:
  if word not in stop_words:
    filtered_word.append(word)

In [0]:
filtered_word

['like',
 'book',
 'trip',
 'atlantis',
 'caprica',
 'saturday',
 'august',
 '13',
 '2016',
 '8',
 'adults',
 'tight',
 'budget',
 '1700',
 'hi',
 'checked',
 'options',
 'unfortunately',
 'currently',
 'trips',
 'meet',
 'criteria',
 'like',
 'book',
 'alternate',
 'travel',
 'option',
 'yes',
 'going',
 'neverland',
 'caprica',
 'august',
 '13',
 '2016',
 '5',
 'adults',
 'trip',
 'budget',
 '1900',
 'checked',
 'availability',
 'date',
 'trips',
 'available',
 'like',
 'select',
 'alternate',
 'dates',
 'flexibility',
 'dates',
 'leave',
 'atlantis',
 'rather',
 'caprica',
 'checked',
 'availability',
 'date',
 'trips',
 'available',
 'like',
 'select',
 'alternate',
 'dates',
 'suppose',
 'speak',
 'husband',
 'see',
 'choose',
 'dates',
 'come',
 'back',
 'you thanks',
 'help',
 '']

### Emoji removal

In [0]:
for word in filtered_word[:]:       #makes a copy of the list words and then iterates over that copy. Then, modifies the original list.
    if word.endswith('_face'): 
        filtered_word.remove(word) 

## Stemming

In [0]:
stemmer = SnowballStemmer('english', ignore_stopwords=True) # Already removed stopwords

In [0]:
stemmed_words = []

for words in filtered_sent:
  
  stemmed_words.append(stemmer.stem(words)) 

  print('Words '+words+' - stemmer:'+stemmer.stem(words))

In [0]:
filtered_words = []
for word in stemmed_words:
  if word not in stop_words:
    filtered_words.append(word)

In [0]:
filtered_words

## Lemmatizing with appropriate POS tag

In [0]:
nltk.download('wordnet')                        # Lemmatization
nltk.download('averaged_perceptron_tagger')     # POS tagging

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [0]:
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

In [0]:
lem_words = []
lem = WordNetLemmatizer()
lem_words = [lem.lemmatize(word, get_wordnet_pos(word)) for word in filtered_word if word]

In [0]:
lem_words[-10:]

['suppose',
 'speak',
 'husband',
 'see',
 'choose',
 'date',
 'come',
 'back',
 'you thanks',
 'help']

## NER

In [0]:
nlp = en_core_web_sm.load()

In [0]:
text = ' '.join(lem_words)

In [0]:
convo_nlp = nlp(text)

In [0]:
labels = [ent.label_ for ent in convo_nlp.ents]
Counter(labels)

Counter({'CARDINAL': 1, 'DATE': 4, 'PRODUCT': 2})

In [0]:
#Print a random sentence from these conversations 
sentences = [x for x in convo_nlp.sents]
print(sentences[0])

like book trip atlantis caprica saturday august 13 2016 8 adult tight budget 1700 hi checked option unfortunately currently trip meet criterion like book alternate travel option yes go neverland caprica august 13 2016 5 adult trip budget 1900 checked


In [0]:
displacy.render(convo_nlp, jupyter=True, style='ent', page=True)

# Chatterbot

In [0]:
# Create a new chat bot named Charlie
chatbot = ChatBot('Charlie')
trainer = ListTrainer(chatbot)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [0]:
trainer.train([
    'How are you?',
    'I am good.',
    'That is good to hear.',
    'Thank you',
    'You are welcome.',
])

List Trainer: [####################] 100%


In [0]:
trainer.train(flat_list)

List Trainer: [####################] 100%


In [0]:
print(chatbot.get_response('Hi'))

To start, just give me some information on where you would like to travel, your budget, your point of departure, or any other travel info you can think of.


In [0]:
print(chatbot.get_response('I would like to travel from Toronto to Japan'))

Did you have any specific dates in mind?


In [0]:
print(chatbot.get_response('June to August'))

How much would that be?


In [0]:
print(chatbot.get_response('under 1200'))

Perfect. Thank you.
