In [0]:
! sudo apt install openjdk-8-jdk
! sudo update-alternatives --set java /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java 
! pip install language-check -qq
! pip install pycontractions -qq

In [0]:
# Import statements
import pandas as pd
import numpy as np
import pprint as pp
import json
from timeit import default_timer
import glob
import shutil
# Preprocessing
from pycontractions import Contractions

In [0]:
# Read frames.json from the Drive path into pandas.
# `data` holds the result of read_json; `df` is a DataFrame built from it.
data = pd.read_json(
    '/content/drive/My Drive/frames.json',
)
df = pd.DataFrame(data=data)

In [0]:
# json_normalize was tried first but kept failing with:
#   'list' object has no attribute 'values'
# The error comes from a list enclosing each of the dictionaries:
#   - multiple dictionaries with matching keys enclosed by a list, or
#   - a dictionary with keys as columns and values in the form of a list.

# Alternative approach: gather every entry of the 'turns' column into a plain
# list and let the DataFrame constructor lay the entries out.
text_list = list(df['turns'])
turns_df = pd.DataFrame(text_list)

In [0]:
# Convert the column labels to strings so later cells can address columns by
# string name (the next cell uses the label '48').
turns_df.columns = turns_df.columns.astype(str)

In [0]:
# Set column '48' to NaN for every row.
# NOTE(review): presumably this adds one guaranteed-empty trailing slot so each
# conversation ends with a separator in the extracted text - confirm.
turns_df['48'] = np.nan

In [0]:
# Replace every NaN with an empty string. This mutates turns_df in place, so
# re-running earlier display cells will show the modified frame.
turns_df.replace(np.nan,'',inplace=True) # Replace all null values

In [0]:
# Flip rows and columns so that each column is iterated as one unit
# by the text-extraction loop.
turns_df_T = turns_df.T

In [0]:
# Build one long string from every cell of turns_df_T, column by column.
#   - A cell that has a .get method (a turn dictionary) contributes the value
#     stored under its 'text' key followed by '\n'.
#   - A cell without .get (the '' padding that replaced missing values)
#     contributes a single '*'. Runs of '*' therefore separate conversations.
# Cells are visited by iterating each column directly instead of indexing with
# integers 0..48: the row labels are strings, so integer keys only worked via
# pandas' deprecated positional fallback, and the row count was hard-coded.
# Pieces are collected in a list and joined once at the end.
convo_parts = []
for _, convo_column in turns_df_T.items():   # one column at a time
  for turn in convo_column:                  # every row of that column, in order
    try:
      convo_parts.append(turn.get('text') + '\n')
    except AttributeError:
      # Not a dictionary (empty padding cell): emit the separator marker.
      convo_parts.append('*')
convo_text = ''.join(convo_parts)

## 1/10

In [14]:
len(convo_text[0:135938])/len(convo_text) # Approx. 10% of dataset

0.09905447274613875

In [15]:
pp.pprint(convo_text[135838:135938])

('ook this trip\n'
 'Great! Booking is complete. Enjoy your vacation!\n'
 'Thank you!\n'
 '**************************')


In [0]:
# Chunk 1 of 10: characters 0 up to (not including) 135938 of convo_text.
convo_text_1 = convo_text[0:135938]

In [0]:
len(convo_text_1)

135938

In [0]:
# Expand contractions in chunk 1 and print the elapsed wall-clock time in minutes.
# NOTE(review): `cont` is not created in any cell visible here - presumably a
# pycontractions `Contractions` instance; it must exist before this cell runs.
# NOTE(review): str(list(...)) stores the printed form of the result list
# (brackets, quotes, escaped newlines), not the raw text - confirm this is intended.
start = default_timer()
expand_convo_text_1 = str(list(cont.expand_texts([convo_text_1], precise=False))) # slow step (minutes per chunk)
end = default_timer()
print(f"elapsed time: {(end-start)/60} min")

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


elapsed time: 10.945355405350002 min


In [0]:
len(expand_convo_text_1)

138388

In [0]:
# Save expanded chunk 1 to Drive. The encoding is set explicitly because the
# dialogue text contains non-ASCII characters and open()'s default encoding
# depends on the platform.
with open('/content/drive/My Drive/expand_convo_text_1.txt', "w", encoding="utf-8") as text_file:
    text_file.write(expand_convo_text_1)

## 2/10

In [0]:
len(convo_text[135938:271715])/len(convo_text) # Approx. 10% of dataset

0.09893715624808723

In [0]:
pp.pprint(convo_text[271566:271715])

('e! Your package has been booked. Enjoy your stay!\n'
 'Thank you for your assistance in finding this killer deal.\n'
 '****************************************')


In [0]:
# Chunk 2 of 10: characters 135938 up to (not including) 271715 of convo_text.
convo_text_2 = convo_text[135938:271715]

In [0]:
len(convo_text_2)

135777

In [0]:
# Expand contractions in chunk 2 and print the elapsed wall-clock time in minutes.
# The result is kept as the printed form of a list (str(list(...))), not raw text.
start = default_timer()
expand_convo_text_2 = str(list(cont.expand_texts([convo_text_2], precise=False))) # slow step (minutes per chunk)
end = default_timer()
print(f"elapsed time: {(end-start)/60} min")

elapsed time: 20.695872300650006 min


In [0]:
# Save expanded chunk 2 to Drive. The encoding is set explicitly because the
# dialogue text contains non-ASCII characters and open()'s default encoding
# depends on the platform.
with open('/content/drive/My Drive/expand_convo_text_2.txt', "w", encoding="utf-8") as text_file:
    text_file.write(expand_convo_text_2)

## 3/10

In [0]:
len(convo_text[271715:409340])/len(convo_text) # Approx. 10% of dataset

0.10028374561702648

In [0]:
pp.pprint(convo_text[409300:409340])

'ling_face:\n*****************************'


In [0]:
# Chunk 3 of 10: characters 271715 up to (not including) 409340 of convo_text.
convo_text_3 = convo_text[271715:409340]

In [0]:
len(convo_text_3)

137625

In [0]:
# Expand contractions in chunk 3 and print the elapsed wall-clock time in minutes.
# The result is kept as the printed form of a list (str(list(...))), not raw text.
start = default_timer()
expand_convo_text_3 = str(list(cont.expand_texts([convo_text_3], precise=False))) # slow step (minutes per chunk)
end = default_timer()
print(f"elapsed time: {(end-start)/60} min")

elapsed time: 15.276559974483341 min


In [0]:
# Save expanded chunk 3 to Drive. The encoding is set explicitly because the
# dialogue text contains non-ASCII characters and open()'s default encoding
# depends on the platform.
with open('/content/drive/My Drive/expand_convo_text_3.txt', "w", encoding="utf-8") as text_file:
    text_file.write(expand_convo_text_3)

## 4/10

In [0]:
len(convo_text[409340:549529])/len(convo_text) # Approx. 10% of dataset

0.1021520654990396

In [0]:
pp.pprint(convo_text[549429:549529])

('ail within the hour! Have a great day and safe travels!\n'
 'thanks!\n'
 '************************************')


In [0]:
# Chunk 4 of 10: characters 409340 up to (not including) 549529 of convo_text.
convo_text_4 = convo_text[409340:549529]

In [0]:
len(convo_text_4)

140189

In [0]:
# Expand contractions in chunk 4 and print the elapsed wall-clock time in minutes.
# The result is kept as the printed form of a list (str(list(...))), not raw text.
start = default_timer()
expand_convo_text_4 = str(list(cont.expand_texts([convo_text_4], precise=False))) # slow step (minutes per chunk)
end = default_timer()
print(f"elapsed time: {(end-start)/60} min")

elapsed time: 22.01403841460001 min


In [0]:
# Save expanded chunk 4 to Drive. The encoding is set explicitly because the
# dialogue text contains non-ASCII characters and open()'s default encoding
# depends on the platform.
with open('/content/drive/My Drive/expand_convo_text_4.txt', "w", encoding="utf-8") as text_file:
    text_file.write(expand_convo_text_4)

## 5/10

In [0]:
len(convo_text[549529:699055])/len(convo_text) # Approx. 10% of dataset

0.10895569371212717

In [0]:
pp.pprint(convo_text[698900:699055])

('Washington is also unavailable\n'
 'Is anywhere available…?\n'
 'For a group of 22 out of Tel Aviv, we have nothing available\n'
 '***************************************')


In [0]:
# Chunk 5 of 10: characters 549529 up to (not including) 699055 of convo_text.
convo_text_5 = convo_text[549529:699055]

In [0]:
len(convo_text_5)

149526

In [0]:
# Expand contractions in chunk 5 and print the elapsed wall-clock time in minutes.
# The result is kept as the printed form of a list (str(list(...))), not raw text.
start = default_timer()
expand_convo_text_5 = str(list(cont.expand_texts([convo_text_5], precise=False))) # slow step (minutes per chunk)
end = default_timer()
print(f"elapsed time: {(end-start)/60} min")

elapsed time: 48.47355010049999 min


In [0]:
# Save expanded chunk 5 to Drive. The encoding is set explicitly because the
# dialogue text contains non-ASCII characters and open()'s default encoding
# depends on the platform.
with open('/content/drive/My Drive/expand_convo_text_5.txt', "w", encoding="utf-8") as text_file:
    text_file.write(expand_convo_text_5)

## 6/10

In [0]:
len(convo_text[699055:834263])/len(convo_text) # Approx. 10% of dataset

0.09852254079845171

In [0]:
pp.pprint(convo_text[834213:834263])

" let's book\n**************************************"


In [0]:
# Chunk 6 of 10: characters 699055 up to (not including) 834263 of convo_text.
convo_text_6 = convo_text[699055:834263]

In [0]:
len(convo_text_6)

135208

In [0]:
# Expand contractions in chunk 6 and print the elapsed wall-clock time in minutes.
# The result is kept as the printed form of a list (str(list(...))), not raw text.
start = default_timer()
expand_convo_text_6 = str(list(cont.expand_texts([convo_text_6], precise=False))) # slow step (minutes per chunk)
end = default_timer()
print(f"elapsed time: {(end-start)/60} min")

elapsed time: 24.63230284571667 min


In [0]:
# Save expanded chunk 6 to Drive. The encoding is set explicitly because the
# dialogue text contains non-ASCII characters and open()'s default encoding
# depends on the platform.
with open('/content/drive/My Drive/expand_convo_text_6.txt', "w", encoding="utf-8") as text_file:
    text_file.write(expand_convo_text_6)

## 7/10

In [0]:
len(convo_text[834263:970112])/len(convo_text) # Approx. 10% of dataset

0.098989620768955

In [0]:
pp.pprint(convo_text[970012:970112])

('we leave?\n'
 'September 5th and you return on the 14th\n'
 'ok - let’s book it\n'
 '******************************')


In [0]:
# Chunk 7 of 10: characters 834263 up to (not including) 970112 of convo_text.
convo_text_7 = convo_text[834263:970112]

In [0]:
len(convo_text_7)

135849

In [0]:
# Expand contractions in chunk 7 and print the elapsed wall-clock time in minutes.
# The result is kept as the printed form of a list (str(list(...))), not raw text.
start = default_timer()
expand_convo_text_7 = str(list(cont.expand_texts([convo_text_7], precise=False))) # slow step (minutes per chunk)
end = default_timer()
print(f"elapsed time: {(end-start)/60} min")

elapsed time: 25.621747870666663 min


In [0]:
# Save expanded chunk 7 to Drive. The encoding is set explicitly because the
# dialogue text contains non-ASCII characters and open()'s default encoding
# depends on the platform.
with open('/content/drive/My Drive/expand_convo_text_7.txt', "w", encoding="utf-8") as text_file:
    text_file.write(expand_convo_text_7)

## 8/10

In [0]:
len(convo_text[970112:1113299])/len(convo_text) # Approx. 10% of dataset

0.1043366298540612

In [0]:
pp.pprint(convo_text[1113100:1113299])

('at the departure and return dates are?\n'
 'Booked!\n'
 'Okay but when do my friend and I leave and come back?\n'
 'You will travel from September 14th until the 27th\n'
 'Okay cool. Thanks... friend.\n'
 '******************')


In [0]:
# Chunk 8 of 10: characters 970112 up to (not including) 1113299 of convo_text.
convo_text_8 = convo_text[970112:1113299]

In [0]:
len(convo_text_8)

143187

In [0]:
# Expand contractions in chunk 8 and print the elapsed wall-clock time in minutes.
# The result is kept as the printed form of a list (str(list(...))), not raw text.
start = default_timer()
expand_convo_text_8 = str(list(cont.expand_texts([convo_text_8], precise=False))) # slow step (minutes per chunk)
end = default_timer()
print(f"elapsed time: {(end-start)/60} min")

elapsed time: 21.475513338850003 min


In [0]:
# Save expanded chunk 8 to Drive. The encoding is set explicitly because the
# dialogue text contains non-ASCII characters and open()'s default encoding
# depends on the platform.
with open('/content/drive/My Drive/expand_convo_text_8.txt', "w", encoding="utf-8") as text_file:
    text_file.write(expand_convo_text_8)

## 9/10

In [0]:
len(convo_text[1113299:1239696])/len(convo_text) # Approx. 10% of dataset

0.0921021950572592

In [0]:
len(convo_text)

1372356

In [0]:
pp.pprint(convo_text[1239596:1239696])

('eptember 24th 2016. Thank you and have a nice day!\n'
 'thanks you too\n'
 '**********************************')


In [0]:
# Chunk 9 of 10: characters 1113299 up to (not including) 1239696 of convo_text.
convo_text_9 = convo_text[1113299:1239696]

In [0]:
len(convo_text_9)

126397

In [0]:
# Expand contractions in chunk 9 and print the elapsed wall-clock time in minutes.
# The result is kept as the printed form of a list (str(list(...))), not raw text.
start = default_timer()
expand_convo_text_9 = str(list(cont.expand_texts([convo_text_9], precise=False))) # slow step (minutes per chunk)
end = default_timer()
print(f"elapsed time: {(end-start)/60} min")

elapsed time: 25.030293950266664 min


In [0]:
# Save expanded chunk 9 to Drive. The encoding is set explicitly because the
# dialogue text contains non-ASCII characters and open()'s default encoding
# depends on the platform.
with open('/content/drive/My Drive/expand_convo_text_9.txt', "w", encoding="utf-8") as text_file:
    text_file.write(expand_convo_text_9)

## 10/10

In [0]:
len(convo_text[1239696:1372356])/len(convo_text) # Approx. 10% of dataset

0.09905447274613875

In [0]:
pp.pprint(convo_text[1372256:1372356])

(' Have a good trip :slightly_smiling_face:\n'
 'Thanks!\n'
 'My pleasure!\n'
 '*************************************')


In [0]:
# Chunk 10 of 10: characters 1239696 up to (not including) 1372356 of convo_text.
convo_text_10 = convo_text[1239696:1372356]

In [0]:
len(convo_text_10)

132660

In [0]:
# Expand contractions in chunk 10 and print the elapsed wall-clock time in minutes.
# The result is kept as the printed form of a list (str(list(...))), not raw text.
start = default_timer()
expand_convo_text_10 = str(list(cont.expand_texts([convo_text_10], precise=False))) # slow step (minutes per chunk)
end = default_timer()
print(f"elapsed time: {(end-start)/60} min")

elapsed time: 27.381515698066657 min


In [0]:
# Save expanded chunk 10 to Drive. The encoding is set explicitly because the
# dialogue text contains non-ASCII characters and open()'s default encoding
# depends on the platform.
with open('/content/drive/My Drive/expand_convo_text_10.txt', "w", encoding="utf-8") as text_file:
    text_file.write(expand_convo_text_10)

## Concatenate all parts


In [0]:
# Stitch every file in the parts folder into a single output file, in chunk order.
# NOTE(review): the chunk files are written to '/content/drive/My Drive/' by the
# cells that create them, but are read from 'contraction_data_parts/' here -
# presumably they were moved by hand; confirm or align the paths.
outfile_name = '/content/drive/My Drive/contraction_data_parts/expand_convo_text_full.txt'

# glob.glob() returns paths in arbitrary order, and a plain alphabetical sort
# would place "..._10.txt" before "..._2.txt". Sorting by (length, name) gives
# numeric order for names that differ only in their trailing number.
# The output file itself is excluded so it is never copied into itself.
part_files = sorted(
    (
        path
        for path in glob.glob('/content/drive/My Drive/contraction_data_parts/*')
        if path != outfile_name
    ),
    key=lambda path: (len(path), path),
)

with open(outfile_name, 'wb') as outfile:
    for file_name in part_files:
        # Byte-for-byte copy; no decoding is performed.
        with open(file_name, 'rb') as readfile:
            shutil.copyfileobj(readfile, outfile)

Expanding all contractions in the text took approximately 4 hours in total (about 241 minutes summed over the ten chunks).