# Pre-processing steps
## Konstantina Andronikou 

In [1]:
#Importing all relevant packages
import ijson
import codecs
import collections
import string 
import nltk
import pycld2 as cld2
import cleantext
from cleantext import clean
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from collections import Counter

In [2]:
# Load the data: stream every `_source.text` value out of the JSON dump.
filepath = 'file_path'  # placeholder - replace with the path to the JSON export
# ijson parses bytes, so the file is opened in binary mode; text mode is
# deprecated by ijson and would make decoding depend on the platform's locale.
with open(filepath, "rb") as infile:
    # multiple_values=True is passed so that more than one top-level JSON value can be read from the file
    objects = ijson.items(infile, '_source.text', multiple_values=True)
    # Materialise the lazy iterator while the file is still open
    column = list(objects)

In [3]:
# Use a language detection library (cld2) to keep only the English conversations.
english_conversations = []
x = 0  # number of conversations skipped because detection or inspection failed
for convo in column:
    try:
        # With returnVectors=True the fourth returned element holds the detected language spans
        _, _, _, detected_language = cld2.detect(convo, returnVectors=True)
        # Conditions: the language code of the first span is 'en' and only one language is detected
        is_english = detected_language[0][3] == 'en' and len(detected_language) == 1
    except Exception:
        # Best-effort filter: undetectable input is counted and skipped, but
        # KeyboardInterrupt / SystemExit are no longer swallowed as with a bare except.
        x += 1
        continue
    if is_english:
        english_conversations.append(convo)

In [4]:
# Remove non-readable characters with an encode/decode round trip.
decoding = []
for i in english_conversations:
    # Repair the frequently seen mis-encoded apostrophe first: the ASCII step
    # below deletes every non-ASCII character, so the sequence could no longer
    # be matched (and the apostrophe would be lost) once encoding has run.
    i = i.replace('‚Äô', "'")
    # For this project 'ascii' was chosen but any other encoder can be used;
    # "ignore" drops every character the encoder cannot represent.
    string_encode = i.encode("ascii", "ignore")
    decoding.append(string_encode.decode())

In [5]:
# Manual cleaning of the data: the mis-encoded sequence below was frequently
# seen in the data, so it is swapped back to the plain apostrophe it stands for.
# NOTE(review): the input was already reduced to ASCII in the previous step -
# confirm this sequence can still occur at this point.
clean_convos = [text.replace('‚Äô', "'") for text in decoding]

## Automatically Generated Responses 
### The following cell is responsible for removing the company's automatically generated responses. It can be adapted depending on the company.

In [6]:
# Phrases that mark an automatically generated company response (adapt per company).
automatic_responses = ["Insert the automatic generated responses here"]
# Keep only the conversations that contain none of those phrases.
filtered_text = [
    convo
    for convo in clean_convos
    if not any(response in convo for response in automatic_responses)
]

In [7]:
# Strip emojis from every conversation with cleantext's `clean`.
# NOTE(review): `clean` is called with only no_emoji set, so its other default
# normalisations presumably apply as well - confirm against the cleantext docs.
without_emojis = [clean(convo, no_emoji=True) for convo in filtered_text]

## Frequency Check - Optional 

In [12]:
# Check the most frequent words in the data.
resulting_count = collections.Counter()
for convo in without_emojis:
    # Accumulate with update(); rebinding a word list on every iteration would
    # leave only the last conversation's words to be counted.
    resulting_count.update(convo.split())
# Last expression of the cell, so a notebook shows it as the cell output (adjust the cut-off as needed)
resulting_count.most_common(50)

### The output of the previous cell is the most frequent words within the data. Based on human judgment the words that are not relevant to the project are manually stored in an external list for exclusion purposes.

In [8]:
# Load the manually created list of frequent words that cannot be a topic (one entry per line).
with open('data/External_List.txt', 'r', encoding='utf-8') as f:
    frequent_words = f.readlines()
# Remove the line break from every entry. No "contains a newline" guard is used,
# so a final line without a trailing newline is kept instead of being dropped.
extra_word_list = [line.replace('\n', '') for line in frequent_words]

## Anonymization - Optional 
### If the data is not anonymized and contains sensitive information such as names and surnames, the following cell can be used. The file it loads contains a list of the most frequent names and surnames around the world. 
### This database was retrieved from https://github.com/smashew/NameDatabases 

In [10]:
# Load the external file that contains frequently used names (one entry per line).
with open('data/Names.txt', 'r', encoding='utf-8') as f:
    names = f.readlines()
# Remove the line break and lower-case every name. No "contains a newline" guard
# is used, so a final line without a trailing newline is kept instead of dropped.
all_names = [line.replace('\n', '').lower() for line in names]

In [11]:
# Load NLTK's English stop-word list; the text-mining cell below filters tokens against it.
stop_words = stopwords.words('english')
# Optional extensions - uncomment the line(s) that apply before running the next cell:
#stop_words += extra_word_list # optional - if the external list is created
#stop_words += all_names # optional - if the previous cell is implemented

In [12]:
# Apply the text-mining steps to every conversation: tokenization, POS tagging,
# lemmatization and token filtering.
# This step might take a while to run depending on the size of the data.
final_convos = []
lemmatizer = WordNetLemmatizer()
# Built once outside the loop: set membership is O(1) per token, whereas testing
# against the list rescans it for every token (costly once names are added).
stop_word_set = set(stop_words)
for convo in without_emojis:
    tagged_tokens = nltk.pos_tag(word_tokenize(convo))  # tokenization + POS tags
    lemmas = [
        # Tokens whose tag contains 'V' are lemmatized as verbs, the rest with the default setting
        lemmatizer.lemmatize(w, 'v') if 'V' in t else lemmatizer.lemmatize(w)
        for w, t in tagged_tokens
    ]
    # Keep alphabetic tokens that are not stop words. isalpha() already rejects
    # anything containing '_' or punctuation, so 'pii_' placeholders and
    # punctuation tokens are removed here without separate passes.
    filtered_sample_text = [w for w in lemmas if w.isalpha() and w not in stop_word_set]

    final_convos.append(' '.join(filtered_sample_text))

In [13]:
# Store the normalised text in a file, one conversation per row.
# NOTE(review): csv.writer is used with its default dialect although the file is
# named .tsv; each row has a single field, so no delimiter is ever written.
import csv
with open('data/Input_for_topic_model.tsv', 'w', newline='') as tsv_file:
    csv.writer(tsv_file).writerows([convo] for convo in final_convos)

## End of Notebook