In [3]:
! sudo apt install openjdk-8-jdk
! sudo update-alternatives --set java /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java
! pip install language-check -qq
! pip install pycontractions -qq

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following additional packages will be installed:
  fonts-dejavu-core fonts-dejavu-extra libatk-wrapper-java
  libatk-wrapper-java-jni libxxf86dga1 openjdk-8-jre x11-utils
Suggested packages:
  openjdk-8-demo openjdk-8-source visualvm icedtea-8-plugin mesa-utils
The following NEW packages will be installed:
  fonts-dejavu-core fonts-dejavu-extra libatk-wrapper-java
  libatk-wrapper-java-jni libxxf86dga1 openjdk-8-jdk openjdk-8-jre x11-utils
0 upgraded, 8 newly installed, 0 to remove and 25 not upgraded.
Need to get 4,942 kB of archives.
After this operation, 13.3 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/main amd64 libxxf86dga1 amd64 2:1.1.4-1 [13.7 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic/main amd64 fonts-dejavu-core all 2.37-1 [1,041 kB]
Get:3 http://archive.ubuntu.com/ubuntu bionic/main amd64 fonts-dejavu-extra all 2.37-1 [1,953 kB

In [0]:
# Import statements
import pandas as pd
import numpy as np
import pprint as pp
import json
from pandas.io.json import json_normalize
import re
from timeit import default_timer

# Preprocessing
from pycontractions import Contractions

In [0]:
# Load JSON file into dataframe
data = pd.read_json('/content/drive/My Drive/frames.json')
df = pd.DataFrame(data)

In [0]:
# Attempted json_normalize but kept getting the error: 'list' object has no attribute 'values'
# This happens because of list enclosing each of the dictionary.
#   - Multiple dictionaries with matching keys enclosed by a list.
#   - A dictionary with keys as columns and values in the form of list.

# Thus we try another approach
text_list = []
for item in df['turns']:
  text_list.append(item)
turns_df = pd.DataFrame(text_list)

In [0]:
turns_df.replace(np.nan,'',inplace=True) # Replace all null values

In [0]:
turns_df_T = turns_df.transpose() # Transposed the dataframe to have proper conversation turns

In [0]:
# Iterates through the data frame and gets all values with a text key
convo_text = ''
for i in turns_df_T:                                 # 1369 columns
  for j in range(0,48):                              # 48 rows
    try:
      get_text = turns_df_T[i][j].get('text')        # Gets the value from the corresponding key
      convo_text += get_text + ' '                   # Concatenates all text
    except AttributeError:
      convo_text += ''                               # Concatenates '' if there is no text key - used for switching conversations

In [10]:
!wget -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"

--2020-03-25 17:11:19--  https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.136.125
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.136.125|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1647046227 (1.5G) [application/x-gzip]
Saving to: ‘GoogleNews-vectors-negative300.bin.gz’


2020-03-25 17:11:45 (60.6 MB/s) - ‘GoogleNews-vectors-negative300.bin.gz’ saved [1647046227/1647046227]



In [0]:
# Expand contractions
# - Load semantic vector model in gensim keyedvectors format from disk
cont = Contractions('GoogleNews-vectors-negative300.bin.gz')

## 1st Quarter

In [12]:
len(convo_text)/4

331315.25

In [25]:
len(convo_text[0:331251])/len(convo_text) # Approx. 25% of dataset

0.2499515189838077

In [24]:
pp.pprint(convo_text[331100:331251])

(' 30th and return on September 7th. ok Shall I go ahead and book that for '
 'you? ok Wonderful, all booked :slightly_smiling_face: Enjoy your stay. '
 'thanks ')


In [0]:
convo_text_qtr_1 = convo_text[0:331251]

In [28]:
len(convo_text_qtr_1)

331251

In [29]:
start = default_timer()
expand_convo_text_qtr_1 = str(list(cont.expand_texts([convo_text_qtr_1], precise=False))) #this part takes forever
end = default_timer()

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


elapsed time: 3995.5319152054167 min


In [30]:
print(f"elapsed time: {(end-start)/60} min")

elapsed time: 55.325283718950004 min


In [0]:
with open('expand_convo_text_qtr_1.txt', "w") as text_file:
    text_file.write(expand_convo_text_qtr_1)

## 2nd Quarter

In [52]:
len(convo_text[331251:662139])/len(convo_text) # Approx. 25% of dataset

0.24967761067442565

In [51]:
pp.pprint(convo_text[662038:662139])

('it please This trip had been reserved for you.  Please contact us should you '
 'have any further needs. ')


In [0]:
convo_text_qtr_2 = convo_text[331251:662139]

In [55]:
start = default_timer()
expand_convo_text_qtr_2 = str(list(cont.expand_texts([convo_text_qtr_2], precise=False))) #this part takes forever
end = default_timer()
print(f'elapsed time: {(end-start)/60} min')

elapsed time: 63.01775751359998 min


In [0]:
with open('expand_convo_text_qtr_2.txt', "w") as text_file:
    text_file.write(expand_convo_text_qtr_2)

## 3rd Quarter

In [79]:
len(convo_text[662139:994130])/len(convo_text) # Approx. 25% of dataset

0.25050989955940756

In [82]:
pp.pprint(convo_text[994030:994130])

('y departing on Sept 12 and returning on Sept 19 at 3480.49USD. Correct? '
 'Wonderful! thank you kindly ')


In [0]:
convo_text_qtr_3 = convo_text[662139:994130]

In [84]:
start = default_timer()
expand_convo_text_qtr_3 = str(list(cont.expand_texts([convo_text_qtr_3], precise=False))) #this part takes forever
end = default_timer()
print(f'elapsed time: {(end-start)/60} min')

elapsed time: 90.31492734950001 min


In [0]:
with open('expand_convo_text_qtr_3.txt', "w") as text_file:
    text_file.write(expand_convo_text_qtr_3)

## 4th Quarter

In [96]:
len(convo_text)

1325261

In [87]:
len(convo_text[994130:1325261])/len(convo_text) # Approx. 25% of dataset

0.2498609707823591

In [91]:
pp.pprint(convo_text[1325161:1325260])

('k perfect, book me! Consider it done! Have a good trip '
 ':slightly_smiling_face: Thanks! My pleasure!')


In [0]:
convo_text_qtr_4 = convo_text[994130:1325260]

In [93]:
start = default_timer()
expand_convo_text_qtr_4 = str(list(cont.expand_texts([convo_text_qtr_4], precise=False))) #this part takes forever
end = default_timer()
print(f'elapsed time: {(end-start)/60} min')

elapsed time: 91.1644320107833 min


In [0]:
with open('expand_convo_text_qtr_4.txt', "w") as text_file:
    text_file.write(expand_convo_text_qtr_4)

## Concatenate all parts


In [100]:
print(expand_convo_text_qtr_1[0:10])

['I would 


In [0]:
expand_convo_text_full = expand_convo_text_qtr_1 + expand_convo_text_qtr_2 + expand_convo_text_qtr_3 + expand_convo_text_qtr_4

In [102]:
len(expand_convo_text_full) # Expanded contractions

1332551

In [103]:
len(convo_text) # Original text

1325261

In [0]:
with open('expand_convo_text_full.txt', "w") as text_file:
    text_file.write(expand_convo_text_full)

Total time was approximately 5 hours to fully expand all contractions in text.