In [2]:
import pandas as pd
files_path = 'C:/Users/agurm/OneDrive/Desktop/Bot/' 
dataset = pd.read_csv(files_path + 'chat.csv', usecols=['Question', 'Answer'])

In [3]:
print(len(dataset))

2271


In [4]:
dataset.head()

Unnamed: 0,Question,Answer
0,Can I change my feeling of being worthless to ...,"If everyone thinks you're worthless, then mayb..."
1,Can I change my feeling of being worthless to ...,"Hello, and thank you for your question and see..."
2,Can I change my feeling of being worthless to ...,First thing I'd suggest is getting the sleep y...
3,Can I change my feeling of being worthless to ...,Therapy is essential for those that are feelin...
4,Can I change my feeling of being worthless to ...,I first want to let you know that you are not ...


In [5]:
print(f"Number of question-answer pairs in the dataset: {len(dataset)}")

Number of question-answer pairs in the dataset: 2271


# Text Preprocessing

It turns out that not all cells are of type string. So, we can just apply the str function to make sure that all of them are of the same desired type.

In [6]:
dataset = dataset.applymap(str)

In [7]:
def distinct_chars(data, cols):
    """
    This method takes in a pandas dataframe and prints all distinct characters.
    data: a pandas dataframe.
    cols: a Python list, representing names of columns for questions and answers. First item of the list should be the name 
    of the questions column and the second item should be the name of the column corresponding to answers.
    """
    
    if cols is None:
        cols = list(data.columns)
    
    # join all questions into one string
    questions = ' '.join(data[cols[0]])
    # join all answers into one string
    answers = ' '.join(data[cols[1]])
    
    # get distinct characters used in the data (all questions and answers)
    dis_chars = set(questions+answers)
    
    # print the distinct characters that are used in the data
    print(f"Number of distinct characters used in the dataset: {len(dis_chars)}")
    # print(dis_chars)    
    dis_chars = list(dis_chars)
    
    # Now let's print those characters in an organized way
    digits = [char for char in dis_chars if char.isdigit()]
    alphabets = [char for char in dis_chars if char.isalpha()]
    special = [char for char in dis_chars if not (char.isdigit() | char.isalpha())]
    # sort them to make them easier to read
    digits = sorted(digits)
    alphabets = sorted(alphabets)
    special = sorted(special)
    
    print(f"Digits: {digits}")
    print(f"Alphabets: {alphabets}")
    print(f"Special characters: {special}")

In [8]:
distinct_chars(dataset, ['Question', 'Answer'])

Number of distinct characters used in the dataset: 117
Digits: ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
Alphabets: ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'Ú', 'á', 'é', 'í', 'ñ', 'ó', 'ú', 'ü', '明', '朝', 'Ｍ', 'Ｓ']
Special characters: ['\t', '\n', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', ']', '_', '{', '}', '~', '\xa0', '¡', '·', '¿', '\u200b', '–', '—', '‘', '’', '“', '”', '…']


The following function replaces some characters with others, removes unwanted characters and gets rid of extra whitespaces from the data.

In [9]:
def clean_text(text):
    """
    This method takes a string, applies different text preprocessing (characters replacement, removal of unwanted characters, 
    removal of extra whitespaces) operations and returns a string.
    text: a string.
    """
    import re
    
    text = str(text)
    
    # REPLACEMENT
    # replace " with ' (because they basically mean the same thing)
    # text = text.replace('\"','\'')
    text = re.sub('\"', '\'', text)
    # replace “ and ” with '
    # text = text.replace("“",'\'').replace("”",'\'')
    text = re.sub("“", '\'', text)
    text = re.sub("”", '\'', text)
     # replace ’ with '
    # text = text.replace('’','\'')
    text = re.sub('’', '\'', text)
    # replace [] and {} with ()
    #text = text.replace('[','(').replace(']',')').replace('{','(').replace('}',')')
    text = re.sub('\[','(', text)
    text = re.sub('\]',')', text)
    text = re.sub('\{','(', text)
    text = re.sub('\}',')', text)
    # replace ? with itself and a whitespace preceding it
    # ex. what's your name? (we want the word name and question mark to be separate tokens)
    # text = re.sub('\?', ' ?', text)
    # creating a space between a word and the punctuation following it
    # punctuation we're using: . , : ; ' ? ! + - * / = % $ @ & ( )
    text = re.sub("([?.!,:;'?!+\-*/=%$@&()])", r" \1 ", text)
    
    
    # REMOVAL OF UNWANTED CHARACTERS
    # accept only alphanumeric and some special characters and remove all others
    # a-zA-Z0-9 : matches any alphanumeric character and the underscore.
    # \. : matches .
    # \, : matches ,
    # \: : matches :
    # \; : matches ;
    # \' : matches '
    # \? : matches ?
    # \! : matches !
    # \+ : matches +
    # \- : matches -
    # \* : matches *
    # \/ : matches /
    # \= : matches =
    # \% : matches %
    # \$ : matches $
    # \@ : matches @
    # \& : matches &
    # ^ is added to the beginning of the set to express that we want the regex to recognize all other characters except
    # these that are explicitly specified, so that we can omit them.
    # define the pattern
    pattern = re.compile('[^a-zA-Z0-9_\.\,\:\;\'\?\!\+\-\*\/\=\%\$\@\&\(\)]')
    # remove unwanted characters
    text = re.sub(pattern, ' ', text)
    
    # lower case the characters in the string
    text = text.lower()
    
    # REMOVAL OF EXTRA WHITESPACES
    # remove duplicated spaces
    text = re.sub(' +', ' ', text)
    # remove leading and trailing spaces
    text = text.strip()
    
    return text

Let's try it out:

In [10]:
clean_text("A nice quote I read    today: “Everything that you are going through is preparing you for what you asked for”. @hi % & =+-*/")

"a nice quote i read today : ' everything that you are going through is preparing you for what you asked for ' . @ hi % & = + - * /"

The following method prints a question-answer pair from the dataset, it will be helpful to give us a sense of what the clean_text function results in:

In [11]:
def print_question_answer(df, index, cols):
    print(f"Question: ({index})")
    print(df.loc[index][cols[0]])
    print(f"Answer: ({index})")
    print(df.loc[index][cols[1]])

In [12]:
print("Before applying text preprocessing:")
print_question_answer(dataset, 102, ['Question', 'Answer'])
print_question_answer(dataset, 200, ['Question', 'Answer'])
print_question_answer(dataset, 886, ['Question', 'Answer'])
print_question_answer(dataset, 1951, ['Question', 'Answer'])

Before applying text preprocessing:
Question: (102)
My apartment manager won't let me keep an emotional support dog
Answer: (102)
At present, the American Disability Association (ADA) only allows protection and guiltiness for Service Animals, which is fall under a separate distinction from Emotional Support Animals. Emotional support animals are untrained animals (typically pets or other domestic animals) that provide wonderful services to their owners, and there are many benefits to having one.Service animals who are trained to provide specific services for an individual with limitations that make them fall under what the ADA defines as "disabled" are protected under specific laws, and must be accommodated in public places where the human they serve is present. While it may be beneficial to request that the therapist who "prescribed" the dog to you write a letter to the apartment manager, it sounds like the manager is aware of ADA guidelines, and the simplest course of action may be t

Apply text preprocessing (characters replacement, removal of unwanted characters, removal of extra whitespaces):

In [13]:
dataset = dataset.applymap(clean_text)

In [14]:
print("Before applying text preprocessing:")
print_question_answer(dataset, 102, ['Question', 'Answer'])
print_question_answer(dataset, 200, ['Question', 'Answer'])
print_question_answer(dataset, 886, ['Question', 'Answer'])
print_question_answer(dataset, 1951, ['Question', 'Answer'])

Before applying text preprocessing:
Question: (102)
my apartment manager won ' t let me keep an emotional support dog
Answer: (102)
at present , the american disability association ( ada ) only allows protection and guiltiness for service animals , which is fall under a separate distinction from emotional support animals . emotional support animals are untrained animals ( typically pets or other domestic animals ) that provide wonderful services to their owners , and there are many benefits to having one . service animals who are trained to provide specific services for an individual with limitations that make them fall under what the ada defines as ' disabled ' are protected under specific laws , and must be accommodated in public places where the human they serve is present . while it may be beneficial to request that the therapist who ' prescribed ' the dog to you write a letter to the apartment manager , it sounds like the manager is aware of ada guidelines , and the simplest cours

The following function applies some preprocessing operations on the data, concretely:
Drops unecessary duplicate pairs (rows) but keep only one instance of all duplicates. (For example, if the dataset contains three duplicates of the same question-answer pair, then two of them would be removed and one kept.) Drops rows with empty question/answer. (These may appear because of the previous step or because they happen to be empty in the original dataset) Drops rows with more than 30 words in either the question or the answer or if the answer has less than two characters. (Note: this is a hyperparameter and you can try other values.)

In [15]:
def preprocess_data(data, cols):
    """
    This method preprocess data and does the following:
    1. drops unecessary duplicate pairs.
    2. drops rows with empty strings.
    3. drops rows with more than 30 words in either the question or the answer, 
    or if the an answer has less than two characters.
    Arguments:
        data: a pandas dataframe.
        cols: a Python list, representing names of columns for questions and answers. First item of the list should be the name 
        of the questions column and the second item should be the name of the column corresponding to answers.
    Returns:
        a pandas dataframe.
    """
    
    
    # (1) Remove unecessary duplicate pairs but keep only one instance of all duplicates.
    print('Removing unecessary duplicate pairs:')
    data_len_before = len(data) # len of data before removing duplicates
    print(f"# of examples before removing duplicates: {data_len_before}")
    # drop duplicates
    data = data.drop_duplicates(keep='first')
    data_len_after = len(data) # len of data after removing duplicates
    print(f"# of examples after removing duplicates: {data_len_after}")
    print(f"# of removed duplicates: {data_len_before-data_len_after}")
    
    
    # (2) Drop rows with empty strings.
    print('Removing empty string rows:')
    if cols is None:
        cols = list(data.columns)
        data_len_before = len(data) # len of data before removing empty strings
    print(f"# of examples before removing rows with empty question/answers: {data_len_before}")
    # I am going to use boolean masking to filter out rows with an empty question or answer
    data = data[(data[cols[0]] != '') & (data[cols[1]] != '')]
    # also, the following row results in the same as the above.
    # data = data.query('Answer != "" and Question != ""')
    data_len_after = len(data) # len of data after removing empty strings
    print(f"# of examples after removing with empty question/answers: {data_len_after}")
    print(f"# of removed empty string rows: {data_len_before-data_len_after}")
    
    
    # (3) Drop rows with more than 30 words in either the question or the answer
    # or if the an answer has less than two characters.
    def accepted_length(qa_pair):
        q_len = len(qa_pair[0].split(' '))
        a_len = len(qa_pair[1].split(' '))
        if (q_len <= 30) & ((a_len <= 30) & (len(qa_pair[1]) > 1)):
            return True
        return False
    
    print('Removing rows with more than 30 words in either the question or the answer:')
    data_len_before = len(data) # len of data before dropping those rows (30+ words)
    print(f"# of examples before removing rows with more than 30 words: {data_len_before}")
    # filter out rows with more than 30 words
    accepted_mask = data.apply(accepted_length, axis=1)
    data = data[accepted_mask]
    data_len_after = len(data) # len of data after dropping those rows (50+ words)
    print(f"# of examples after removing rows with more than 30 words: {data_len_after}")
    print(f"# of removed empty rows with more than 30 words: {data_len_before-data_len_after}")
    
    print("Data preprocessing is done.")
    
    return data

In [16]:
dataset = preprocess_data(dataset, ['Question', 'Answer'])

Removing unecessary duplicate pairs:
# of examples before removing duplicates: 2271
# of examples after removing duplicates: 2149
# of removed duplicates: 122
Removing empty string rows:
# of examples before removing rows with empty question/answers: 2271
# of examples after removing with empty question/answers: 2149
# of removed empty string rows: 122
Removing rows with more than 30 words in either the question or the answer:
# of examples before removing rows with more than 30 words: 2149
# of examples after removing rows with more than 30 words: 45
# of removed empty rows with more than 30 words: 2104
Data preprocessing is done.


In [17]:
print(f"# of question-answer pairs we have left in the  dataset: {len(dataset)}")

# of question-answer pairs we have left in the  dataset: 45


In [18]:
dataset.head()

Unnamed: 0,Question,Answer
33,do i have too many issues for counseling ?,there is no such thing as too many issues for ...
126,how can i best fight the winter blues ?,
132,how can i find myself again ?,check this blog out : four - ways - add - self...
136,i ' m seriously unhappy with everything in my ...,
156,how do i deal with depression ?,


In [19]:
data_len_before = len(dataset) # len of data before removing duplicates
print(f"# of examples before removing duplicates: {data_len_before}")
# drop duplicates
dataset = dataset.drop_duplicates(keep='first')
data_len_after = len(dataset) # len of data after removing duplicates
print(f"# of examples after removing duplicates: {data_len_after}")
print(f"# of removed duplicates: {data_len_before-data_len_after}")

# of examples before removing duplicates: 45
# of examples after removing duplicates: 45
# of removed duplicates: 0


Let's drop rows with NaN values if there's any:

In [20]:
dataset.dropna(inplace=True)

In [21]:
dataset

Unnamed: 0,Question,Answer
33,do i have too many issues for counseling ?,there is no such thing as too many issues for ...
126,how can i best fight the winter blues ?,
132,how can i find myself again ?,check this blog out : four - ways - add - self...
136,i ' m seriously unhappy with everything in my ...,
156,how do i deal with depression ?,
345,i feel really uncomfortable around other people,
398,is it normal to go into therapy feeling nervous ?,certainly .
430,i ' m worried about my new job,
448,i have difficulty with communication,i can offer you hypnosis for confidence in pre...
601,i think my daughter is stressing too much,


In [22]:
dataset.to_csv(files_path + '/data.csv')

In [23]:
dafile = pd.read_csv(files_path + 'data1.csv', usecols=['Question', 'Answer'])

ValueError: Usecols do not match columns, columns expected but not found: ['Question', 'Answer']

In [24]:
dafile = pd.read_csv(files_path + 'data1.csv', usecols=['Question', 'Answer'])

In [25]:
dafile

Unnamed: 0,Question,Answer
0,do i have too many issues for counseling ?,there is no such thing as too many issues for ...
1,how can i find myself again ?,check this blog out : four - ways - add - self...
2,is it normal to go into therapy feeling nervous ?,certainly .
3,i have difficulty with communication,i can offer you hypnosis for confidence in pre...
4,how do i become more self - confident in gener...,check out my latest blog : four - ways - add -...
5,why do i have low self - esteem and lack confi...,check out my latest blog on : four - ways - ad...
6,i hate the way i look,check out my latest blog : four - ways - add -...
7,i want to be a boy but i can ' t because of my...,"chances are your family already knows , they a..."
8,"i cheated on my partner , and i don ' t know h...",my questions to you would be : why did you get...
9,what do you do if your partner isn ' t satisfy...,depends : what do would you like to see happen ?


In [26]:
dafile = dafile.applymap(str)

In [27]:
print(f"Number of question-answer pairs in the dafile: {len(dafile)}")

Number of question-answer pairs in the dafile: 25


In [28]:
distinct_chars(dafile, ['Question', 'Answer'])

Number of distinct characters used in the dataset: 36
Digits: ['0', '4']
Alphabets: ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y']
Special characters: [' ', '!', "'", ',', '-', '.', '/', ':', '?']


In [29]:
dafile.to_csv(files_path + '/bot.csv')