In [1]:
! sudo apt install openjdk-8-jdk
! sudo update-alternatives --set java /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java
! pip install language-check -qq
! pip install pycontractions -qq

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following additional packages will be installed:
  fonts-dejavu-core fonts-dejavu-extra libatk-wrapper-java
  libatk-wrapper-java-jni libxxf86dga1 openjdk-8-jre x11-utils
Suggested packages:
  openjdk-8-demo openjdk-8-source visualvm icedtea-8-plugin mesa-utils
The following NEW packages will be installed:
  fonts-dejavu-core fonts-dejavu-extra libatk-wrapper-java
  libatk-wrapper-java-jni libxxf86dga1 openjdk-8-jdk openjdk-8-jre x11-utils
0 upgraded, 8 newly installed, 0 to remove and 25 not upgraded.
Need to get 4,942 kB of archives.
After this operation, 13.3 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/main amd64 libxxf86dga1 amd64 2:1.1.4-1 [13.7 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic/main amd64 fonts-dejavu-core all 2.37-1 [1,041 kB]
Get:3 http://archive.ubuntu.com/ubuntu bionic/main amd64 fonts-dejavu-extra all 2.37-1 [1,953 kB

In [0]:
# Import statements
import pandas as pd
import numpy as np
import pprint as pp
import json
from pandas.io.json import json_normalize
import re
from timeit import default_timer

# Preprocessing
from pycontractions import Contractions

# Tokenization imports
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

# Puncuation & lower case
import string #punctuation removal

# Stop words
from nltk.corpus import stopwords

# Stemming
from nltk.stem.snowball import SnowballStemmer

# Lemmatizer
from nltk.stem.wordnet import WordNetLemmatizer

# POS tagging
from nltk.corpus import wordnet

# NER
import nltk, nltk.tag, nltk.chunk 
import spacy
import pprint as pprint
from gensim.summarization import summarize 
from collections import Counter 
import en_core_web_sm # CNN gets loaded in, sees what words depends on each other, POS tagging, entity recognition 
from spacy import displacy # Visualize NER

# Data extraction

In [0]:
# Load JSON file into dataframe
data = pd.read_json('/content/drive/My Drive/frames.json')
df = pd.DataFrame(data)

In [4]:
df.head() # Looks like turns and labels are nested dictionaries

Unnamed: 0,user_id,turns,wizard_id,id,labels
0,U22HTHYNP,[{'text': 'I'd like to book a trip to Atlantis...,U21DKG18C,e2c0fc6c-2134-4891-8353-ef16d8412c9a,"{'userSurveyRating': 4.0, 'wizardSurveyTaskSuc..."
1,U21E41CQP,"[{'text': 'Hello, I am looking to book a vacat...",U21DMV0KA,4a3bfa39-2c22-42c8-8694-32b4e34415e9,"{'userSurveyRating': 3.0, 'wizardSurveyTaskSuc..."
2,U21RP4FCY,[{'text': 'Hello there i am looking to go on a...,U21E0179B,6e67ed28-e94c-4fab-96b6-68569a92682f,"{'userSurveyRating': 2.0, 'wizardSurveyTaskSuc..."
3,U22HTHYNP,[{'text': 'Hi I'd like to go to Caprica from B...,U21DKG18C,5ae76e50-5b48-4166-9f6d-67aaabd7bcaa,"{'userSurveyRating': 5.0, 'wizardSurveyTaskSuc..."
4,U21E41CQP,"[{'text': 'Hello, I am looking to book a trip ...",U21DMV0KA,24603086-bb53-431e-a0d8-1dcc63518ba9,"{'userSurveyRating': 5.0, 'wizardSurveyTaskSuc..."


In [5]:
df['id'].nunique() # There are 1369 unique ID for the dialogue

1369

In [0]:
# labels is a nested dictionary
labels_df = json_normalize(df['labels'])

In [7]:
labels_df.head()

Unnamed: 0,userSurveyRating,wizardSurveyTaskSuccessful
0,4.0,True
1,3.0,True
2,2.0,False
3,5.0,True
4,5.0,True


`userSurveyRating` - value that represents the user’s satisfaction with the Wizard’s service, ranging from 1 – complete dissatisfaction to 5 – complete satisfaction

`wizardSurveyTaskSuccessful` - boolean which is true if the wizard thinks at the end of the dialogue that the user’s goal was achieved

In [0]:
# Attempted json_normalize but kept getting the error: 'list' object has no attribute 'values'
# This happens because of list enclosing each of the dictionary.
#   - Multiple dictionaries with matching keys enclosed by a list.
#   - A dictionary with keys as columns and values in the form of list.

# Thus we try another approach
text_list = []
for item in df['turns']:
  text_list.append(item)
turns_df = pd.DataFrame(text_list)

In [9]:
turns_df.head(2) # Every row is a dialogue 

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47
0,{'text': 'I'd like to book a trip to Atlantis ...,{'db': {'result': [[{'trip': {'returning': {'d...,"{'text': 'Yes, how about going to Neverland fr...","{'db': {'result': [[], [], [], [], [], []], 's...",{'text': 'I have no flexibility for dates... b...,"{'db': {'result': [[]], 'search': [{'ORIGIN_CI...",{'text': 'I suppose I'll speak with my husband...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,"{'text': 'Hello, I am looking to book a vacati...",{'db': {'result': [[{'trip': {'returning': {'d...,{'text': 'What about a trip from Gotham City t...,{'db': {'result': [[{'trip': {'returning': {'d...,{'text': 'Would any packages to Mos Eisley be ...,{'db': {'result': [[{'trip': {'returning': {'d...,"{'text': 'You know what, I'd like to try and v...",{'db': {'result': [[{'trip': {'returning': {'d...,{'text': 'Do you have any trips from Gotham Ci...,{'db': {'result': [[{'trip': {'returning': {'d...,"{'text': 'No, that's too far for me. I need a ...",{'db': {'result': [[{'trip': {'returning': {'d...,"{'text': 'How many days would I be in Kobe?', ...","{'db': {'result': [], 'search': []}, 'text': '...",{'text': 'What would the price be if I shorten...,{'db': {'result': [[{'trip': {'returning': {'d...,"{'text': 'Ok, then I would like to purchase th...",{'db': {'result': [[{'trip': {'returning': {'d...,"{'text': 'Yes, I would like to book this packa...","{'db': {'result': [], 'search': []}, 'text': '...",,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [0]:
turns_df.replace(np.nan,'',inplace=True) # Replace all null values

In [0]:
turns_df_T = turns_df.transpose() # Transposed the dataframe to have proper conversation turns

In [0]:
# Iterates through the data frame and gets all values with a text key
convo_text = ''
for i in turns_df_T:                                 # 1369 columns
  for j in range(0,48):                              # 48 rows
    try:
      get_text = turns_df_T[i][j].get('text')        # Gets the value from the corresponding key
      convo_text += get_text + ' '                   # Concatenates all text
    except AttributeError:
      convo_text += ''                               # Concatenates '' if there is no text key - used for switching conversations

# Data Preprocessing

In [13]:
# Conversation sample
pp.pprint(convo_text[0:888]) 

("I'd like to book a trip to Atlantis from Caprica on Saturday, August 13, "
 '2016 for 8 adults. I have a tight budget of 1700. Hi...I checked a few '
 'options for you, and unfortunately, we do not currently have any trips that '
 'meet this criteria.  Would you like to book an alternate travel option? Yes, '
 'how about going to Neverland from Caprica on August 13, 2016 for 5 adults. '
 'For this trip, my budget would be 1900. I checked the availability for this '
 'date and there were no trips available.  Would you like to select some '
 'alternate dates? I have no flexibility for dates... but I can leave from '
 'Atlantis rather than Caprica. How about that? I checked the availability for '
 'that date and there were no trips available.  Would you like to select some '
 "alternate dates? I suppose I'll speak with my husband to see if we can "
 "choose other dates, and then I'll come back to you.Thanks for your help ")


## Contractions

In [14]:
!wget -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"

--2020-03-25 17:28:21--  https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.200.181
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.200.181|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1647046227 (1.5G) [application/x-gzip]
Saving to: ‘GoogleNews-vectors-negative300.bin.gz’


2020-03-25 17:28:41 (82.1 MB/s) - ‘GoogleNews-vectors-negative300.bin.gz’ saved [1647046227/1647046227]



In [15]:
len(convo_text[0:134175])/len(convo_text) # Approx. 10% of dataset

0.10124420774473858

In [16]:
pp.pprint(convo_text[134000:134175]) # Logical end of conversation

('Would you like me to book this package? Hold your horse! It sounds like a '
 'great deal but I need to check with my bride first. I will get back to you.  '
 'Thank you for your help.')


In [0]:
convo_text_sample = convo_text[0:134175]
#convo_text_sample = convo_text[0:round(len(convo_text) * 0.10)]

In [0]:
# Expand contractions
# - Load semantic vector model in gensim keyedvectors format from disk
cont = Contractions('GoogleNews-vectors-negative300.bin.gz')

In [19]:
start = default_timer()
expand_convo_text_sample_false = str(list(cont.expand_texts([convo_text_sample], precise=False))) #this part takes forever
end = default_timer()
print(f"elapsed time: {end-start}s")

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


elapsed time: 811.843762671s


In [0]:
with open('/content/drive/My Drive/expand_convo_text_sample.txt', "w") as text_file:
    text_file.write(expand_convo_text_sample_false)

In [0]:
# start = default_timer()
# expand_convo_text_full = str(list(cont.expand_texts([convo_text], precise=False))) #this part takes forever
# end = default_timer()
# print(f"elapsed time: {end-start/60} min")

In [0]:
# with open('expand_convo_text_full.txt', "w") as text_file:
#     text_file.write(expand_convo_text_full)

## Tokenization

In [20]:
nltk.download('punkt') # Punkt sentence tokenizer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [21]:
#split into sentences
tokenized_sent = sent_tokenize(expand_convo_text_sample_false)
pp.pprint(tokenized_sent[0:13])

['["I would like to book a trip to Atlantis from Caprica on Saturday, August '
 '13, 2016 for 8 adults.',
 'I have a tight budget of 1700.',
 'Hi...I checked a few options for you, and unfortunately, we do not currently '
 'have any trips that meet this criteria.',
 'Would you like to book an alternate travel option?',
 'Yes, how about going to Neverland from Caprica on August 13, 2016 for 5 '
 'adults.',
 'For this trip, my budget would be 1900.',
 'I checked the availability for this date and there were no trips available.',
 'Would you like to select some alternate dates?',
 'I have no flexibility for dates... but I can leave from Atlantis rather than '
 'Caprica.',
 'How about that?',
 'I checked the availability for that date and there were no trips available.',
 'Would you like to select some alternate dates?',
 'I suppose I will speak with my husband to see if we can choose other dates, '
 'and then I will come back to you.Thanks for your help Hello, I am looking to '
 'book a v

In [22]:
len(tokenized_sent)

2517

In [23]:
#split into words
tokenized_words=[]
tokenized_words.extend(word for word in word_tokenize(str(tokenized_sent)))
pp.pprint(tokenized_words[0:40])

['[',
 "'",
 '[',
 '``',
 'I',
 'would',
 'like',
 'to',
 'book',
 'a',
 'trip',
 'to',
 'Atlantis',
 'from',
 'Caprica',
 'on',
 'Saturday',
 ',',
 'August',
 '13',
 ',',
 '2016',
 'for',
 '8',
 'adults',
 '.',
 "'",
 ',',
 "'I",
 'have',
 'a',
 'tight',
 'budget',
 'of',
 '1700',
 '.',
 "'",
 ',',
 "'Hi",
 '...']


## Punctuation & Lower Case

In [0]:
string.punctuation = string.punctuation + '``' + '...'

In [25]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~``...'

In [26]:
# Removes punctuation
no_punct = [token for token in tokenized_words if token not in string.punctuation] 
print(no_punct[0:20])

['I', 'would', 'like', 'to', 'book', 'a', 'trip', 'to', 'Atlantis', 'from', 'Caprica', 'on', 'Saturday', 'August', '13', '2016', 'for', '8', 'adults', "'I"]


In [27]:
# Lowercase text
data_lower = [word.lower() for word in no_punct]
print(data_lower[0:20])

['i', 'would', 'like', 'to', 'book', 'a', 'trip', 'to', 'atlantis', 'from', 'caprica', 'on', 'saturday', 'august', '13', '2016', 'for', '8', 'adults', "'i"]


## Stop Words

In [28]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [41]:
stop_words = stopwords.words('english')
add_stop_words = ['\'i','would','could']
stop_words += add_stop_words
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [0]:
filtered_word = []
for word in data_lower:
  if word not in stop_words:
    filtered_word.append(word)

In [43]:
filtered_word[0:20]

['like',
 'book',
 'trip',
 'atlantis',
 'caprica',
 'saturday',
 'august',
 '13',
 '2016',
 '8',
 'adults',
 'tight',
 'budget',
 '1700',
 "'hi",
 'checked',
 'options',
 'unfortunately',
 'currently',
 'trips']

## Stemming

In [0]:
stemmer = SnowballStemmer('english', ignore_stopwords=True) # Already removed stopwords

In [0]:
stemmed_words = []

for words in filtered_sent:
  
  stemmed_words.append(stemmer.stem(words)) 

  print('Words '+words+' - stemmer:'+stemmer.stem(words))

In [0]:
filtered_words = []
for word in stemmed_words:
  if word not in stop_words:
    filtered_words.append(word)

In [0]:
filtered_words

## Lemmatizing with appropriate POS tag

In [32]:
nltk.download('wordnet')                        # Lemmatization
nltk.download('averaged_perceptron_tagger')     # POS tagging

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [0]:
filtered_word = [word.strip('\'') for word in filtered_word] # Pesky ' attatched to word

In [0]:
filtered_word_2 = []
for word in filtered_word:
  if word not in stop_words:
    filtered_word_2.append(word)

In [46]:
pp.pprint(filtered_word_2[-10:])

['deal', 'need', 'check', 'bride', 'first', 'get', 'back', 'thank', 'help', '']


In [0]:
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

In [0]:
lem_words = []
lem = WordNetLemmatizer()
lem_words = [lem.lemmatize(word, get_wordnet_pos(word)) for word in filtered_word_2 if word]

## NER

In [0]:
nlp = en_core_web_sm.load()

In [0]:
text = ' '.join(lem_words)

In [0]:
convo_1 = nlp(text)

In [88]:
labels = [x.label_ for x in convo_1.ents]
Counter(labels)

Counter({'CARDINAL': 552,
         'DATE': 660,
         'EVENT': 15,
         'FAC': 37,
         'GPE': 470,
         'LAW': 1,
         'LOC': 8,
         'MONEY': 5,
         'NORP': 2,
         'ORDINAL': 66,
         'ORG': 84,
         'PERSON': 82,
         'PRODUCT': 25,
         'QUANTITY': 3,
         'TIME': 68,
         'WORK_OF_ART': 1})

In [106]:
#Print a random sentence from these conversations 
sentences = [x for x in convo_1.sents]
print(sentences[12])

neverland find trip available neverland trip gotham city kobe original budget 2100


In [112]:
displacy.render(convo_1, jupyter=True, style='ent', page=True)