In [711]:
# importing python libraries
import pandas as pd
import re
import nltk
import en_core_web_sm
import string
import spacy

from nltk.tokenize import word_tokenize
from xml.dom import minidom
from rake_nltk import Rake
from html import unescape

In [712]:
# initializing the required variables
nlp = en_core_web_sm.load()
root = minidom.Document()

In [713]:
# loading the datasets
questions = pd.read_csv("data/questions.csv", encoding="ANSI")
answers = pd.read_csv("data/answers.csv", encoding="ANSI")

In [714]:
# printing a sample of the data
print(questions.head())
print(answers.head())

         Id                                              Title  \
0  33071859  What is the difference between an array of cha...   
1  24855711     How can I create a 2D array of 2D char arrays?   
2  49313051                       Java Generic Matrix creation   
3    663632  Converting a Bidimensional Array (Numbers) Int...   
4    705288    Java array: direct access to component in array   

                                                Body  
0  <p>I'm new to java and I need help learning no...  
1  <p>I want to create a 2D array [3][3] ; each e...  
2  <p>I want to create a generic type matrix in j...  
3  <p>I'm in need of help right now. I need to co...  
4  <p>Am I able to access an array component dire...  
                                                Body  Score  ParentId
0  <p>Use the <code>toArray()</code> method of th...     23   1291704
1  <p>If you don't want duplicates in a <code>Col...   1011    203984
2  <p>If you want the array of items to expand (i...      0   1

In [715]:
# joining the two database
questions_and_answers = questions.set_index("Id").join(answers.set_index("ParentId"), lsuffix="_question", rsuffix="_answers", on="Id")

print("The columns of the new table are: ", end="| ")
for column in questions_and_answers.columns:
    print(column, end=" | ")

The columns of the new table are: | Title | Body_question | Body_answers | Score | 

In [716]:
# grouping together all the answers to the same question
# creating for each question a list with the answers and a list with the scores
# to those answers
questions_and_answers = questions_and_answers.groupby("Id").agg(list)
print(questions_and_answers.head())

                                                    Title  \
Id                                                          
203984  [How do I remove repeated elements from ArrayL...   
286161  [How can I see if an element in an int array i...   
362367  [Java Arrays & Generics : Java Equivalent to C...   
374339  [Is there a Java array/list which is staticall...   
440430  [Java sort String array of file names by their...   

                                            Body_question  \
Id                                                          
203984  [<p>I have an <code>ArrayList&lt;String&gt;</c...   
286161  [<p>example:</p>\n\n<p>I want to see if <code>...   
362367  [<p>So in C#, I can treat a <code>string[]</co...   
374339  [<p>This would be very handy as typecasting ge...   
440430  [<p>I have an array of filenames and need to s...   

                                             Body_answers  \
Id                                                          
203984  [<p>If you don

In [717]:
# removing the list format for the columns that have only one element
reformat_columns()
print(questions_and_answers.head())

                                                    Title  \
Id                                                          
203984  How do I remove repeated elements from ArrayList?   
286161  How can I see if an element in an int array is...   
362367  Java Arrays & Generics : Java Equivalent to C#...   
374339  Is there a Java array/list which is statically...   
440430  Java sort String array of file names by their ...   

                                            Body_question  \
Id                                                          
203984  <p>I have an <code>ArrayList&lt;String&gt;</co...   
286161  <p>example:</p>\n\n<p>I want to see if <code>a...   
362367  <p>So in C#, I can treat a <code>string[]</cod...   
374339  <p>This would be very handy as typecasting get...   
440430  <p>I have an array of filenames and need to so...   

                                             Body_answers  \
Id                                                          
203984  [<p>If you don

In [718]:
# removing the html tags, but keeping the <code> tag
strip_html()
print(questions_and_answers)

                                                      Title  \
Id                                                            
203984    How do I remove repeated elements from ArrayList?   
286161    How can I see if an element in an int array is...   
362367    Java Arrays & Generics : Java Equivalent to C#...   
374339    Is there a Java array/list which is statically...   
440430    Java sort String array of file names by their ...   
...                                                     ...   
63514197  What happens when there are less elements in A...   
63936646         Convertion array ['xx=yy'] to map of xx=yy   
63944448  Storing multiple objects in a single array ele...   
63990703  Is there a difference between new Class[]{} an...   
64586481     Can you create a code in array form with this?   

                                              Body_question  \
Id                                                            
203984    I have an <code>ArrayList&lt;String&gt;</cod

In [719]:
# reseting the dataset index
questions_and_answers = questions_and_answers.reset_index()
print(questions_and_answers.head())

       Id                                              Title  \
0  203984  How do I remove repeated elements from ArrayList?   
1  286161  How can I see if an element in an int array is...   
2  362367  Java Arrays & Generics : Java Equivalent to C#...   
3  374339  Is there a Java array/list which is statically...   
4  440430  Java sort String array of file names by their ...   

                                       Body_question  \
0  I have an <code>ArrayList&lt;String&gt;</code>...   
1  example:\n\nI want to see if <code>array[5]</c...   
2  So in C#, I can treat a <code>string[]</code> ...   
3  This would be very handy as typecasting gets b...   
4  I have an array of filenames and need to sort ...   

                                        Body_answers  \
0  [If you don't want duplicates in a <code>Colle...   
1  [There is no such thing as an "empty" element ...   
2  [<code>Iterable &lt;T&gt;</code>\n, Are you lo...   
3  [If by "variable length" you mean that the siz...  

In [720]:
code_index = 0

def reformat_columns():
    questions_and_answers["Title"] = questions_and_answers["Title"].apply(get_list_first_elem)
    questions_and_answers["Body_question"] = questions_and_answers["Body_question"].apply(get_list_first_elem)

def get_list_first_elem(param_list):
    if(isinstance(param_list, list)):
        return param_list[0]
    return param_list

def strip_html():
    questions_and_answers["Title"] = questions_and_answers["Title"].apply(remove_html_tags)
    questions_and_answers["Body_question"] = questions_and_answers["Body_question"].apply(remove_html_tags)
    questions_and_answers["Body_answers"] = questions_and_answers["Body_answers"].apply(remove_html_tags)
    
def remove_html_tags(text):
    if(isinstance(text, list)):
        for i in range(len(text)):
            text[i] = re.sub(r'<(?!/code|code).*?>', '', text[i]) 
        return text
    else:
        return re.sub(r'<(?!/code|code).*?>', '', text)

def create_dom_element(parent, element, content = ""):
    elem = root.createElement(element)
    elemText = root.createTextNode(content)
    elem.appendChild(elemText)
    parent.appendChild(elem)
    return elem

def extract_keywords(text):
    r = Rake()

    r.extract_keywords_from_text(text)
    return r.get_ranked_phrases()

def create_question(keywords):
    question = ""

    for i in range(len(keywords)):
        question += "^ " + keywords[i] + " "
    
    return question

def remove_punctuation(text):
    return text.translate(str.maketrans("", "", string.punctuation))

def get_code_snippet_from_text(text):
    return re.findall(r'<code>((.|\n)*?)</code>', text)

def create_answer(phrases, template, codes):
    for phrase in phrases.sents:
        phrase_without_endlines = re.sub(r'\r?\n', "", phrase.text)
        if (len(phrase_without_endlines) > 1):
            text = insert_code(phrase.text, codes)
            text_element = root.createTextNode(text)
            template.appendChild(text_element)
            delay = create_dom_element(template, "delay", "2")
            template.appendChild(delay)

    return template

def insert_code(text, codes):
    print(text)
    for i in range(len(codes)):
        print("$code_placeholder_" + str(i + 1))
        text = text.replace("$code_placeholder_" + str(i + 1), codes[i][0])
    print(text)
    return text

def format_code(code):
    return re.sub(r'\r?\n', "<break/>\n", code[0][0])

def increment_by_one():
    global code_index
    code_index = code_index + 1
    return code_index

def create_aiml_file(questions_and_answers, file_path):
    aiml_element = root.createElement("aiml")
    root.appendChild(aiml_element)

    for i in range(len(questions_and_answers)):
        global code_index
        code_index = 0
        category_element = create_dom_element(aiml_element, "category")
        keywords = extract_keywords(questions_and_answers["Title"][i])
        question = create_question(keywords)
        pattern_element = create_dom_element(category_element, "pattern", question)

        best_answer_index = questions_and_answers["Score"][i].index(max(questions_and_answers["Score"][i]))
        best_answer = questions_and_answers["Body_answers"][i][best_answer_index]

        best_answer_code = get_code_snippet_from_text(best_answer)
        best_answer_without_code = re.sub(r'<code>(.|\n)*?</code>', lambda exp: '$code_placeholder_' + str(increment_by_one()), best_answer)

        answer_phrases = nlp(best_answer_without_code)

        template_element = create_dom_element(category_element, "template")
        answer = create_answer(answer_phrases, template_element, best_answer_code)

        xml_string = root.toprettyxml(indent="\t")

        with open(file_path, 'w') as f:
            aiml_string = unescape(xml_string)
            f.write(aiml_string)

In [721]:
create_aiml_file(questions_and_answers.head(), "stackbot.aiml")

If you don't want duplicates in a $code_placeholder_1, you should consider why you're using a $code_placeholder_2 that allows duplicates.
$code_placeholder_1
$code_placeholder_2
$code_placeholder_3
$code_placeholder_4
$code_placeholder_5
$code_placeholder_6
$code_placeholder_7
If you don't want duplicates in a Collection, you should consider why you're using a Collection that allows duplicates.
The easiest way to remove repeated elements is to add the contents to a $code_placeholder_3 (which will not allow duplicates) and then add the $code_placeholder_4 back to the $code_placeholder_5:

$code_placeholder_6


$code_placeholder_1
$code_placeholder_2
$code_placeholder_3
$code_placeholder_4
$code_placeholder_5
$code_placeholder_6
$code_placeholder_7
The easiest way to remove repeated elements is to add the contents to a Set (which will not allow duplicates) and then add the Set back to the ArrayList:

Set&lt;String&gt; set = new HashSet&lt;&gt;(yourList);
yourList.clear();
yourList.addAll

In [722]:
segmented = nlp(u"I Love Coding. Geeks for Geeks helped me in this regard very much. I Love Geeks for Geeks.")
print(segmented)
for sent in segmented.sents:
    print(sent)

I Love Coding. Geeks for Geeks helped me in this regard very much. I Love Geeks for Geeks.
I Love Coding.
Geeks for Geeks helped me in this regard very much.
I Love Geeks for Geeks.
