In [131]:
# importing python libraries
import codecs
import re
import string
from html import unescape
from xml.dom import minidom

import en_core_web_sm
import nltk
import pandas as pd
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from rake_nltk import Rake

In [132]:
# initializing the shared objects used by the functions and cells below
nlp = en_core_web_sm.load()  # spaCy pipeline; its `.sents` is used to split texts into sentences
root = minidom.Document()  # XML document from which the DOM helper functions create their nodes
stop_words = set(stopwords.words("english"))  # English stop words skipped by text_summarization
r = Rake()  # RAKE keyword extractor used by extract_keywords

In [133]:
# loading the datasets
# "ANSI" is an alias of the Windows-only "mbcs" codec, so it cannot be resolved
# on Linux/macOS. When it is missing we fall back to cp1252, which is assumed
# to be the code page the CSV files were saved with (TODO confirm); on Windows
# the behaviour is unchanged.
try:
    codecs.lookup("ANSI")
    CSV_ENCODING = "ANSI"
except LookupError:
    CSV_ENCODING = "cp1252"

questions = pd.read_csv("data/questions.csv", encoding=CSV_ENCODING)
answers = pd.read_csv("data/answers.csv", encoding=CSV_ENCODING)

In [134]:
# printing the first rows of both datasets as a quick sanity check
print(questions.head())
print(answers.head())

         Id                                              Title  \
0  33071859  What is the difference between an array of cha...   
1  24855711     How can I create a 2D array of 2D char arrays?   
2  49313051                       Java Generic Matrix creation   
3    663632  Converting a Bidimensional Array (Numbers) Int...   
4    705288    Java array: direct access to component in array   

                                                Body  
0  <p>I'm new to java and I need help learning no...  
1  <p>I want to create a 2D array [3][3] ; each e...  
2  <p>I want to create a generic type matrix in j...  
3  <p>I'm in need of help right now. I need to co...  
4  <p>Am I able to access an array component dire...  
                                                Body  Score  ParentId
0  <p>Use the <code>toArray()</code> method of th...     23   1291704
1  <p>If you don't want duplicates in a <code>Col...   1011    203984
2  <p>If you want the array of items to expand (i...      0   1

In [135]:
# joining the two datasets: the questions (indexed by "Id") are joined with the
# answers (indexed by "ParentId"); column names present in both tables get the
# "_question" / "_answers" suffix
questions_and_answers = questions.set_index("Id").join(answers.set_index("ParentId"), lsuffix="_question", rsuffix="_answers", on="Id")

# listing the columns of the joined table on a single line
print("The columns of the new table are: ", end="| ")
for column in questions_and_answers.columns:
    print(column, end=" | ")

The columns of the new table are: | Title | Body_question | Body_answers | Score | 

In [136]:
# grouping together all the answers to the same question:
# every column of the grouped table holds one list per question, e.g. a list
# with the answers and a list with the scores of those answers
# the initial implementation used only the answer with the highest score, but
# later I decided to use all the answers
questions_and_answers = questions_and_answers.groupby("Id").agg(list)
print(questions_and_answers.head())

                                                    Title  \
Id                                                          
203984  [How do I remove repeated elements from ArrayL...   
286161  [How can I see if an element in an int array i...   
362367  [Java Arrays & Generics : Java Equivalent to C...   
374339  [Is there a Java array/list which is staticall...   
440430  [Java sort String array of file names by their...   

                                            Body_question  \
Id                                                          
203984  [<p>I have an <code>ArrayList&lt;String&gt;</c...   
286161  [<p>example:</p>\n\n<p>I want to see if <code>...   
362367  [<p>So in C#, I can treat a <code>string[]</co...   
374339  [<p>This would be very handy as typecasting ge...   
440430  [<p>I have an array of filenames and need to s...   

                                             Body_answers  \
Id                                                          
203984  [<p>If you don

In [137]:
# counter used to number the placeholders that stand in for the code snippets in the phrases
code_index = 0

# function that removes the lists from the columns that hold a single element per list
def reformat_columns():
    """Replace every list in the "Title" and "Body_question" columns by its first element.

    Operates in place on the notebook-level ``questions_and_answers`` frame.
    """
    for column in ("Title", "Body_question"):
        unwrapped = questions_and_answers[column].apply(get_list_first_elem)
        questions_and_answers[column] = unwrapped

# function that takes the first element of the list passed as parameter
def get_list_first_elem(param_list):
    """Return ``param_list[0]`` for a list; any other value is handed back unchanged."""
    return param_list[0] if isinstance(param_list, list) else param_list

# function that removes the html tags from the questions and answers, and also removes the extra
# characters like repeated new lines from the answers
def strip_html():
    """Clean the text columns of the notebook-level ``questions_and_answers`` frame in place.

    The HTML tags are stripped from "Title", "Body_question" and "Body_answers";
    afterwards the line-break runs in "Body_answers" are collapsed.
    """
    for column in ("Title", "Body_question", "Body_answers"):
        questions_and_answers[column] = questions_and_answers[column].apply(remove_html_tags)

    collapsed_answers = questions_and_answers["Body_answers"].apply(remove_extra_chars)
    questions_and_answers["Body_answers"] = collapsed_answers
    
# function that searches the html tags in the string (or list of strings) passed as parameter
# and removes them
def remove_html_tags(text):
    """Strip every tag except ``<code>`` / ``</code>`` from ``text``.

    A plain string yields a new string. A list is cleaned item by item *in
    place* and the very same list object is returned.
    """
    if not isinstance(text, list):
        return re.sub(r'<(?!/code|code).*?>', '', text)

    for position, fragment in enumerate(text):
        text[position] = re.sub(r'<(?!/code|code).*?>', '', fragment)
    return text

# function that searches for extra chars (e.g. newlines) and removes them from the strings
# passed as parameter
def remove_extra_chars(elements):
    """Collapse each run of carriage returns / line feeds to a single ``"\\n"``.

    The items of ``elements`` are rewritten in place and the same object is
    returned, so callers holding a reference to it see the cleaned strings.
    """
    line_break_runs = re.compile(r'([\r\n]+|\r+|\n+)')
    for position, fragment in enumerate(elements):
        elements[position] = line_break_runs.sub('\n', fragment)
    return elements

# function that creates an xml element with the content passed as parameter, and appends it to
# the node also passed as parameter
def create_dom_element(parent, element, content = ""):
    """Append a new ``<element>`` holding one text node to ``parent`` and return it.

    parent  -- DOM node that receives the new element as its last child
    element -- tag name of the element to create
    content -- text placed inside the element (empty by default)

    The nodes are created through the notebook-level ``root`` document.
    """
    node = root.createElement(element)
    node.appendChild(root.createTextNode(content))
    parent.appendChild(node)
    return node

# function that extracts the keywords from the string passed as parameter
def extract_keywords(text):
    """Return the RAKE phrases of ``text`` ordered by where they occur in it.

    The phrases come from the notebook-level Rake instance ``r``. Each phrase is
    searched as a whole-word match in the lower-cased text; phrases that are not
    found are dropped. The result is a list of ``(phrase, start_offset)`` tuples
    sorted by ``start_offset``.
    """
    r.extract_keywords_from_text(text)
    lowered_text = text.lower()

    located = []
    for phrase in r.get_ranked_phrases():
        hit = re.search(r"\b({})\b".format(re.escape(phrase)), lowered_text)
        if hit is not None:
            located.append((phrase, hit.start()))

    located.sort(key=lambda pair: pair[1])
    return located

# function that creates the structure of the question passed as parameter
def create_question(full_question, keywords, simple = False):
    """Build the pattern text for a question.

    full_question -- original question, embedded as an XML comment in the full form
    keywords      -- sequence whose items carry the keyword at index 0
    simple        -- when true only the wildcard pattern is returned

    Every keyword is emitted as ``"^ <keyword> "``. The full form additionally
    wraps the pattern with the comment and the indentation whitespace.
    """
    wildcard_pattern = "".join("^ " + keyword[0] + " " for keyword in keywords)
    if simple:
        return wildcard_pattern
    return "\n\t\t\t<!-- " + full_question + " -->\n\t\t\t" + wildcard_pattern + "\n\t\t"

# function that removes the punctuation from the string passed as parameter
def remove_punctuation(text):
    """Return ``text`` without any of the characters in ``string.punctuation``."""
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(deletion_table)

# function that extracts the code snippets present in the string passed as parameter
def get_code_snippet_from_text(text):
    """Return the contents of every ``<code>...</code>`` span of ``text``, in order.

    Inside each snippet every ``"\\n"`` is replaced by the AIML ``<break/>``
    construct. An empty list is returned when ``text`` holds no code span.
    """
    # A single lazy group with DOTALL matches the same spans as the former
    # `((.|\n)*?)` pattern, without the per-character alternation and without
    # the extra capture group that made findall return tuples.
    snippets = re.findall(r'<code>(.*?)</code>', text, flags=re.DOTALL)
    return [snippet.replace("\n", "<break/>") for snippet in snippets]

# function that creates the answer structure from the phrases passed as parameter
def create_answer(phrases, li_element, codes):
    """Fill ``li_element`` with the sentences of an answer and return it.

    phrases    -- object exposing the sentences through ``.sents`` (each with a ``.text``)
    li_element -- DOM element that receives the generated nodes
    codes      -- code snippets put back in place of their placeholders

    Sentences that hold at most one character once their line breaks are removed
    are skipped. Every kept sentence is appended as a text node followed by a
    ``<delay>2</delay>`` element.
    """
    for sentence in phrases.sents:
        visible_length = len(re.sub(r'\r?\n', "", sentence.text))
        if visible_length <= 1:
            continue

        sentence_with_code = insert_code(sentence.text, codes)
        li_element.appendChild(root.createTextNode(sentence_with_code))
        create_dom_element(li_element, "delay", "2")

    return li_element

# function that inserts the code snippets passed as parameter in the string also
# passed as parameter
def insert_code(text, codes):
    """Replace each ``$code_placeholder_<n>`` in ``text`` by ``codes[n - 1]``.

    text  -- string that may contain numbered placeholders (numbering starts at 1)
    codes -- snippets, where position ``n - 1`` belongs to placeholder ``n``

    Returns the text with all known placeholders substituted.
    """
    # Highest number first: "$code_placeholder_1" is a prefix of
    # "$code_placeholder_10".."$code_placeholder_19", so replacing in ascending
    # order would corrupt those placeholders when there are ten or more snippets.
    for number in range(len(codes), 0, -1):
        text = text.replace("$code_placeholder_" + str(number), codes[number - 1])
    return text

# function that substitutes the newline symbols in the code passed as parameter
# with the AIML break construct
def format_code(code):
    """Return ``code[0][0]`` with every line break followed-up as ``"<break/>\\n"``.

    Only the first item of the first entry of ``code`` is used -- presumably the
    list of tuples produced by ``re.findall`` with groups (TODO confirm with callers).
    Both ``"\\n"`` and ``"\\r\\n"`` are replaced by ``"<break/>\\n"``.
    """
    first_snippet = code[0][0]
    return re.sub(r'\r?\n', "<break/>\n", first_snippet)

# function that increments by one the global counter `code_index` and returns its new value
# (python has no ++ operator that could be used inside an expression, so it is simulated here)
def increment_by_one():
    """Add one to the notebook-level ``code_index`` and return the updated value."""
    global code_index
    code_index += 1
    return code_index

# function that creates and fills the aiml file with the questions and the answers
def create_aiml_file(questions_and_answers, file_path):
    """Generate the AIML document for all questions and write it to ``file_path``.

    questions_and_answers -- table with a "Title" column (question text) and a
                             "Body_answers" column (list of answers per question),
                             addressed by the positions ``0 .. len - 1``
    file_path             -- destination of the generated file

    For every question one ``<category>`` is produced: its ``<pattern>`` is built
    from the keywords of the title, its ``<template>`` holds one ``<li>`` per
    answer inside a ``<random>`` element, followed by the footer that asks
    whether another answer is wanted.
    """
    global code_index

    # `root` is shared by the whole notebook. A document accepts only a single
    # top-level element, so a previous run's <aiml> tree is dropped first;
    # otherwise calling this function twice in one session fails.
    stale_document_element = root.documentElement
    if stale_document_element is not None:
        root.removeChild(stale_document_element)

    aiml_element = root.createElement("aiml")
    root.appendChild(aiml_element)

    # looping through the questions
    for i in range(len(questions_and_answers)):
        title = questions_and_answers["Title"][i]
        answers_of_question = questions_and_answers["Body_answers"][i]

        category_element = create_dom_element(aiml_element, "category")
        keywords = extract_keywords(title)

        create_dom_element(category_element, "pattern", create_question(title, keywords))

        template_element = create_dom_element(category_element, "template")
        random_element = create_dom_element(template_element, "random")

        # looping through the answers of this particular question
        for answer_body in answers_of_question:
            # the placeholders are numbered from 1 again for every answer
            code_index = 0
            answer_code = get_code_snippet_from_text(answer_body)

            # every <code>...</code> span is swapped for a numbered placeholder
            answer_without_code = re.sub(
                r'<code>.*?</code>',
                lambda exp: '$code_placeholder_' + str(increment_by_one()),
                answer_body,
                flags=re.DOTALL,
            )

            # creating the phrases from the given answer
            answer_phrases = nlp(answer_without_code)

            li_element = create_dom_element(random_element, "li")
            create_answer(answer_phrases, li_element, answer_code)

        # The postback pattern depends only on the title, so it is built once per
        # question instead of inside the answers loop (where it stayed unbound or
        # stale for a question without answers).
        simple_question = create_question(title, keywords, True)
        create_footer(template_element, simple_question)

    xml_string = root.toprettyxml(indent="\t")

    # "ANSI" is an alias of the Windows-only "mbcs" codec; elsewhere fall back to
    # cp1252 (assumed equivalent for this data -- TODO confirm).
    try:
        codecs.lookup("ANSI")
        output_encoding = "ANSI"
    except LookupError:
        output_encoding = "cp1252"

    # saving the data to file
    # The text nodes carry literal markup (e.g. "<break/>" and the "<!-- -->"
    # comment of the pattern) that the serializer escapes; unescape restores it.
    # NOTE(review): this also unescapes entities that come from the original
    # texts, so the output is not guaranteed to be well-formed XML.
    with open(file_path, 'w', encoding=output_encoding) as f:
        f.write(unescape(xml_string))

# function that creates the footer which asks the user if he or she understood the answer
def create_footer(parent, question):
    """Append the "do you want another answer?" footer to ``parent``.

    parent   -- DOM element that receives the footer nodes
    question -- postback text sent when the user replies "Yes"

    The footer consists of a ``<delay>``, a ``<random>`` element with the
    alternative follow-up prompts, and a "Yes" and a "No" ``<reply>``; the "No"
    reply posts back ``"goodanswer"``.
    """
    create_dom_element(parent, "delay", "2")

    # spelling of the user-facing prompts fixed ("like", "another", "understand")
    follow_up_prompts = (
        "What do you say, did you understand the answer? Do you want me to give you another one?",
        "If it isn't clear I could provide you another answer? Do you want it?",
        "I could try and give you another answer if you didn't understand. Would you like that?",
        "If you didn't understand I could provide you another answer. Do you want it?",
    )
    random_element = create_dom_element(parent, "random")
    for prompt in follow_up_prompts:
        create_dom_element(random_element, "li", prompt)

    for label, postback in (("Yes", question), ("No", "goodanswer")):
        reply_element = create_dom_element(parent, "reply")
        create_dom_element(reply_element, "text", label)
        create_dom_element(reply_element, "postback", postback)

# function that takes the best answer to a given question (the answer with the highest score)
def use_best_answer(question_index, category_element=None):
    """Build the ``<template>`` of a question from its highest-scored answer.

    question_index   -- position of the question in the notebook-level
                        ``questions_and_answers`` table
    category_element -- DOM element that receives the template; when omitted a
                        detached ``<category>`` element is created for it

    Returns the created ``<template>`` element. On equal scores the first answer
    with the maximum score is used.
    """
    global code_index

    # The scores of *this* question are used; the former code indexed them with
    # an undefined `i`, which in a notebook silently picked up a leaked loop variable.
    scores = questions_and_answers["Score"][question_index]
    best_answer_index = scores.index(max(scores))
    best_answer = questions_and_answers["Body_answers"][question_index][best_answer_index]

    # placeholders must start at 1 so that they line up with the snippet list
    code_index = 0
    best_answer_code = get_code_snippet_from_text(best_answer)
    best_answer_without_code = re.sub(
        r'<code>.*?</code>',
        lambda exp: '$code_placeholder_' + str(increment_by_one()),
        best_answer,
        flags=re.DOTALL,
    )

    answer_phrases = nlp(best_answer_without_code)

    # `category_element` was never defined in this scope before; it is now an
    # optional argument with a detached element as fallback.
    if category_element is None:
        category_element = root.createElement("category")

    template_element = create_dom_element(category_element, "template")
    create_answer(answer_phrases, template_element, best_answer_code)
    return template_element

# function that applies POS tagging to the question
def pos_tagging(question):
    """Tokenize ``question`` with ``word_tokenize`` and return the result of ``nltk.pos_tag``."""
    return nltk.pos_tag(word_tokenize(question))

# function that summarizes the string passed as parameter
def text_summarization(text):
    """Return an extractive summary of ``text``.

    Every sentence is scored by adding the text-wide frequency of each
    non-stop-word that occurs (as a substring) in the sentence. The sentences
    whose score reaches 1.1 times the truncated average score are concatenated,
    each preceded by a space, in their original order.

    Returns an empty string when no sentence could be scored (for instance an
    empty text or a text made of stop words only).
    """
    # computing the frequency of every lower-cased word that is not a stop word
    frequency_table = dict()
    for word in word_tokenize(remove_punctuation(text)):
        word = word.lower()
        if word in stop_words:
            continue
        frequency_table[word] = frequency_table.get(word, 0) + 1

    # the sentences are needed twice, so they are collected once
    sentence_texts = [sentence.text for sentence in nlp(text).sents]

    # scoring the sentences; the punctuation-free sentence text is the key
    sentences_values = dict()
    for sentence_text in sentence_texts:
        no_punctuation_text = remove_punctuation(sentence_text)
        lowered_sentence = no_punctuation_text.lower()
        for word, frequency in frequency_table.items():
            if word in lowered_sentence:
                sentences_values[no_punctuation_text] = sentences_values.get(no_punctuation_text, 0) + frequency

    # nothing scored: there is no average to compare against (the division
    # below used to raise ZeroDivisionError in this case)
    if not sentences_values:
        return ""

    average_value = int(sum(sentences_values.values()) / len(sentences_values))

    # keeping the sentences that score at least 10% above the average
    summary = ""
    for sentence_text in sentence_texts:
        no_punctuation_text = remove_punctuation(sentence_text)
        if (no_punctuation_text in sentences_values) and (sentences_values[no_punctuation_text] >= (1.1 * average_value)):
            summary += " " + sentence_text

    return summary

# function that substitutes some parts of speech with the AIML wild card (^)
def remove_pos_tagging():
    """Return one pattern string per title of the notebook-level ``questions_and_answers``.

    Every token of a title is followed by a space; tokens whose tag is selected
    by ``is_pos_to_be_removed`` are replaced by the ``^`` wildcard.
    """
    patterns = []
    for row in range(len(questions_and_answers)):
        tagged_title = pos_tagging(questions_and_answers["Title"][row])
        pieces = [
            "^ " if is_pos_to_be_removed(tagged_token) else tagged_token[0] + " "
            for tagged_token in tagged_title
        ]
        patterns.append("".join(pieces))
    return patterns
 
# tells whether a tagged token belongs to the parts of speech to be removed from a given text
def is_pos_to_be_removed(pos_list):
    """Return True when the tag at ``pos_list[1]`` is one of the tags to be replaced.

    ``pos_list`` is a ``(token, tag)`` pair; only the tag is inspected.
    """
    removable_tags = ("JJR", "JJS", "CC", "DT", "IN", "WRB", "PRP", "VBP", "TO", "VBZ", "EX")
    return pos_list[1] in removable_tags

In [138]:
# removing the list wrapper from the columns that hold a single element per row
# ("Title" and "Body_question")
reformat_columns()
print(questions_and_answers.head())

                                                    Title  \
Id                                                          
203984  How do I remove repeated elements from ArrayList?   
286161  How can I see if an element in an int array is...   
362367  Java Arrays & Generics : Java Equivalent to C#...   
374339  Is there a Java array/list which is statically...   
440430  Java sort String array of file names by their ...   

                                            Body_question  \
Id                                                          
203984  <p>I have an <code>ArrayList&lt;String&gt;</co...   
286161  <p>example:</p>\n\n<p>I want to see if <code>a...   
362367  <p>So in C#, I can treat a <code>string[]</cod...   
374339  <p>This would be very handy as typecasting get...   
440430  <p>I have an array of filenames and need to so...   

                                             Body_answers  \
Id                                                          
203984  [<p>If you don

In [139]:
# resetting the dataset index, so that "Id" becomes a regular column again
questions_and_answers = questions_and_answers.reset_index()
print(questions_and_answers.head())

       Id                                              Title  \
0  203984  How do I remove repeated elements from ArrayList?   
1  286161  How can I see if an element in an int array is...   
2  362367  Java Arrays & Generics : Java Equivalent to C#...   
3  374339  Is there a Java array/list which is statically...   
4  440430  Java sort String array of file names by their ...   

                                       Body_question  \
0  <p>I have an <code>ArrayList&lt;String&gt;</co...   
1  <p>example:</p>\n\n<p>I want to see if <code>a...   
2  <p>So in C#, I can treat a <code>string[]</cod...   
3  <p>This would be very handy as typecasting get...   
4  <p>I have an array of filenames and need to so...   

                                        Body_answers  \
0  [<p>If you don't want duplicates in a <code>Co...   
1  [<p>There is no such thing as an "empty" eleme...   
2  [<p><code>Iterable &lt;T&gt;</code></p>\n, <p>...   
3  [<p>If by "variable length" you mean that the ...  

In [140]:
# removing the html tags, but keeping the <code> tags
strip_html()
print(questions_and_answers)

           Id                                              Title  \
0      203984  How do I remove repeated elements from ArrayList?   
1      286161  How can I see if an element in an int array is...   
2      362367  Java Arrays & Generics : Java Equivalent to C#...   
3      374339  Is there a Java array/list which is statically...   
4      440430  Java sort String array of file names by their ...   
..        ...                                                ...   
334  63514197  What happens when there are less elements in A...   
335  63936646         Convertion array ['xx=yy'] to map of xx=yy   
336  63944448  Storing multiple objects in a single array ele...   
337  63990703  Is there a difference between new Class[]{} an...   
338  64586481     Can you create a code in array form with this?   

                                         Body_question  \
0    I have an <code>ArrayList&lt;String&gt;</code>...   
1    example:\n\nI want to see if <code>array[5]</c...   
2    So i

In [141]:
# generating the AIML file from the cleaned questions and answers
create_aiml_file(questions_and_answers, "stackbot.aiml")

In [142]:
# here are the questions obtained by using POS tagging
# the duplicated ^ wildcards still have to be removed
print(remove_pos_tagging())

['^ ^ ^ remove repeated elements ^ ArrayList ? ', '^ can ^ see ^ ^ element ^ ^ int array ^ empty ? ', 'Java Arrays ^ Generics : Java Equivalent ^ C # IEnumerable ', '^ ^ ^ Java array/list which ^ statically typed AND variable length ', 'Java sort String array ^ file names ^ their extension ', 'Do 2D arrays ^ ^ resources ^ 1D arrays ^ Java ? ', 'Removing ^ element ^ ^ Array ( Java ) ', 'Converting ^ Bidimensional Array ( Numbers ) ^ ^ Dimensional Array ^ Viceversa ^ Java ', '^ ^ get ^ last value ^ ^ ArrayList ', 'Java array : direct access ^ component ^ array ', '^ ^ check ^ ^ elements ^ ^ ArrayList ^ ^ contained ^ ^ ArrayList ', 'Java Iterate Bits ^ Byte Array ', '^ ^ more efficient ^ remove elements ^ ^ ArrayList ^ ^ LinkedList ? ', 'double type array use ^ J2ME ', 'Java : ^ ^ convert HashMap ^ array ', 'Using generics ^ arrays ', '^ ^ ^ declare ^ initialize ^ array ^ Java ? ', 'Convert InputStream ^ byte array ^ Java ', '^ ^ ^ ArrayList preferable ^ ^ array ^ Java ? ', '^ ^ ^ populat

In [146]:
# as can be seen from the result, for short texts the extractive summarization
# does not work in the majority of the cases
for i in range(len(questions_and_answers)):
    print("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@")
    print(questions_and_answers["Title"][i])
    print("------------------------------------------------------------------")
    print(text_summarization(questions_and_answers["Title"][i]))

@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
How do I remove repeated elements from ArrayList?
------------------------------------------------------------------

@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
How can I see if an element in an int array is empty?
------------------------------------------------------------------

@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
Java Arrays & Generics : Java Equivalent to C# IEnumerable
------------------------------------------------------------------

@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
Is there a Java array/list which is statically typed AND variable length
------------------------------------------------------------------

@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
Java sort String array of file names by their extension
------------------------------------------------------------------

@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@

In [145]:
# if I also add the body of the question, the result is better but still can't be used
# for the purpose I had in mind
for i in range(len(questions_and_answers)):
    title_body = [" " + questions_and_answers["Title"][i] + " " + questions_and_answers["Body_question"][i]]
    print("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@")
    # remove_extra_chars rewrites the items of title_body in place, so the
    # summarization below also works on the text with collapsed line breaks
    print(remove_extra_chars(title_body)[0])
    print("------------------------------------------------------------------")
    print(text_summarization(title_body[0]))

@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
 How do I remove repeated elements from ArrayList? I have an <code>ArrayList&lt;String&gt;</code>, and I want to remove repeated strings from it. How can I do this?

------------------------------------------------------------------
 I have an <code>ArrayList&lt;String&gt;</code>, and I want to remove repeated strings from it.
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
 How can I see if an element in an int array is empty? example:
I want to see if <code>array[5]</code> holds a value or is empty.

------------------------------------------------------------------
  How can I see if an element in an int array is empty? I want to see if <code>array[5]</code> holds a value or is empty.

@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
 Java Arrays & Generics : Java Equivalent to C# IEnumerable So in C#, I can treat a <code>string[]</code> as an <code>IEnumerable&lt;string&gt;</code

In [147]:
# this is just to see the result of the text summarization on a longer text
print(text_summarization('Contrary to popular belief, Lorem Ipsum is not simply random text. It has roots in a piece of classical Latin literature from 45 BC, making it over 2000 years old. Richard McClintock, a Latin professor at Hampden-Sydney College in Virginia, looked up one of the more obscure Latin words, consectetur, from a Lorem Ipsum passage, and going through the cites of the word in classical literature, discovered the undoubtable source. Lorem Ipsum comes from sections 1.10.32 and 1.10.33 of "de Finibus Bonorum et Malorum" (The Extremes of Good and Evil) by Cicero, written in 45 BC. This book is a treatise on the theory of ethics, very popular during the Renaissance. The first line of Lorem Ipsum, "Lorem ipsum dolor sit amet..", comes from a line in section 1.10.32.'))

 Richard McClintock, a Latin professor at Hampden-Sydney College in Virginia, looked up one of the more obscure Latin words, consectetur, from a Lorem Ipsum passage, and going through the cites of the word in classical literature, discovered the undoubtable source. Lorem Ipsum comes from sections 1.10.32 and 1.10.33 of "de Finibus Bonorum et Malorum" (The Extremes of Good and Evil) by Cicero, written in 45 BC.
