In [106]:
# importing python libraries
import pandas as pd
import re

In [107]:
# loading the datasets
# "ANSI" is not a portable codec name: Python registers it only on Windows
# (as an alias of "mbcs", the machine's active code page), so read_csv raises
# LookupError on Linux/macOS. cp1252 is the Western-European code page that
# "ANSI" usually resolves to.
# NOTE(review): confirm the CSV files were exported with that code page.
CSV_ENCODING = "cp1252"
questions = pd.read_csv("data/questions.csv", encoding=CSV_ENCODING)
answers = pd.read_csv("data/answers.csv", encoding=CSV_ENCODING)

In [115]:
# previewing the first rows of both tables (questions first, then answers)
print(questions.head(), answers.head(), sep="\n")

         Id                                              Title  \
0  33071859  What is the difference between an array of cha...   
1  24855711     How can I create a 2D array of 2D char arrays?   
2  49313051                       Java Generic Matrix creation   
3    663632  Converting a Bidimensional Array (Numbers) Int...   
4    705288    Java array: direct access to component in array   

                                                Body  
0  <p>I'm new to java and I need help learning no...  
1  <p>I want to create a 2D array [3][3] ; each e...  
2  <p>I want to create a generic type matrix in j...  
3  <p>I'm in need of help right now. I need to co...  
4  <p>Am I able to access an array component dire...  
0       <p>Use the <code>toArray()</code> method of th...
1       <p>If you don't want duplicates in a <code>Col...
2       <p>If you want the array of items to expand (i...
3       <p>If you don't want duplicates, use a <a href...
4       <p>Pretty much always prefer a li

In [109]:
# joining the two datasets: each answer's ParentId is matched against the
# question Id (DataFrame.join keeps every question row by default); the
# "Body" column exists in both tables, hence the two suffixes
questions_and_answers = (
    questions
    .set_index("Id")
    .join(
        answers.set_index("ParentId"),
        on="Id",
        lsuffix="_question",
        rsuffix="_answers",
    )
)

# listing the column names of the joined table on a single line
print("The columns of the new table are: ", end="| ")
print("".join(str(column) + " | " for column in questions_and_answers.columns), end="")

The columns of the new table are: | Title | Body_question | Body_answers | Score | 

In [110]:
# collapsing the joined rows so that every question Id appears only once:
# each column of a group is gathered into a Python list, which yields one
# list of answers and one list of scores per question
# (the question title and body end up wrapped in lists as well)
questions_and_answers = (
    questions_and_answers
    .groupby("Id")
    .agg(list)
)
print(questions_and_answers.head())

                                                    Title  \
Id                                                          
203984  [How do I remove repeated elements from ArrayL...   
286161  [How can I see if an element in an int array i...   
362367  [Java Arrays & Generics : Java Equivalent to C...   
374339  [Is there a Java array/list which is staticall...   
440430  [Java sort String array of file names by their...   

                                            Body_question  \
Id                                                          
203984  [<p>I have an <code>ArrayList&lt;String&gt;</c...   
286161  [<p>example:</p>\n\n<p>I want to see if <code>...   
362367  [<p>So in C#, I can treat a <code>string[]</co...   
374339  [<p>This would be very handy as typecasting ge...   
440430  [<p>I have an array of filenames and need to s...   

                                             Body_answers  \
Id                                                          
203984  [<p>If you don

In [111]:
# removing the list format for the columns that have only one element
# (reformat_columns rewrites the "Title" and "Body_question" columns of the
# global questions_and_answers frame and returns nothing)
# NOTE: reformat_columns is defined in a cell further down (In [113]);
# that cell has to be executed before this one on a fresh kernel
reformat_columns()
print(questions_and_answers.head())

                                                    Title  \
Id                                                          
203984  How do I remove repeated elements from ArrayList?   
286161  How can I see if an element in an int array is...   
362367  Java Arrays & Generics : Java Equivalent to C#...   
374339  Is there a Java array/list which is statically...   
440430  Java sort String array of file names by their ...   

                                            Body_question  \
Id                                                          
203984  <p>I have an <code>ArrayList&lt;String&gt;</co...   
286161  <p>example:</p>\n\n<p>I want to see if <code>a...   
362367  <p>So in C#, I can treat a <code>string[]</cod...   
374339  <p>This would be very handy as typecasting get...   
440430  <p>I have an array of filenames and need to so...   

                                             Body_answers  \
Id                                                          
203984  [<p>If you don

In [116]:
# removing the html tags, but keeping the <code> tag
# (strip_html rewrites the "Title", "Body_question" and "Body_answers"
# columns of the global questions_and_answers frame and returns nothing)
# NOTE: strip_html is defined in a cell further down (In [113]);
# that cell has to be executed before this one on a fresh kernel
strip_html()
print(questions_and_answers)

                                                      Title  \
Id                                                            
203984    How do I remove repeated elements from ArrayList?   
286161    How can I see if an element in an int array is...   
362367    Java Arrays & Generics : Java Equivalent to C#...   
374339    Is there a Java array/list which is statically...   
440430    Java sort String array of file names by their ...   
...                                                     ...   
63514197  What happens when there are less elements in A...   
63936646         Convertion array ['xx=yy'] to map of xx=yy   
63944448  Storing multiple objects in a single array ele...   
63990703  Is there a difference between new Class[]{} an...   
64586481     Can you create a code in array form with this?   

                                              Body_question  \
Id                                                            
203984    I have an <code>ArrayList&lt;String&gt;</cod

In [113]:
def reformat_columns():
    """Unwrap the list cells of the "Title" and "Body_question" columns.

    Each cell of those two columns is passed through ``get_list_first_elem``:
    a list is replaced by its first element, any other value is kept as is.
    The module-level ``questions_and_answers`` frame is updated directly and
    nothing is returned.
    """
    for column_name in ("Title", "Body_question"):
        unwrapped = questions_and_answers[column_name].apply(get_list_first_elem)
        questions_and_answers[column_name] = unwrapped

def get_list_first_elem(param_list):
    """Return the first item of ``param_list`` when it is a ``list``.

    Values that are not ``list`` instances (strings, tuples, None, ...) are
    handed back untouched. An empty list raises ``IndexError``.
    """
    return param_list[0] if isinstance(param_list, list) else param_list
    
def strip_html():
    """Run ``remove_html_tags`` over the three text columns.

    The "Title", "Body_question" and "Body_answers" columns of the
    module-level ``questions_and_answers`` frame are replaced, in that order,
    by their tag-stripped versions. Nothing is returned.

    NOTE(review): in the sample output "Title" looks like plain text rather
    than HTML, so any ``<...>`` sequence inside a title would be removed as
    if it were a tag -- confirm this is intended.
    """
    for column_name in ("Title", "Body_question", "Body_answers"):
        cleaned = questions_and_answers[column_name].apply(remove_html_tags)
        questions_and_answers[column_name] = cleaned
    
# Matches every "<...>" sequence except those starting with "<code" or
# "</code". "." does not match a newline, so a tag has to open and close on
# the same line to be removed.
_NON_CODE_TAG = re.compile(r'<(?!/code|code).*?>')


def remove_html_tags(text):
    """Strip HTML tags from ``text`` while keeping ``<code>`` / ``</code>``.

    Parameters
    ----------
    text : str, list, or any other value
        A single string, or a list of strings (for example all the answers to
        one question). Lists are processed element by element.

    Returns
    -------
    str, list, or the input value
        * for a string: the string with the tags removed;
        * for a list: a NEW list with every element cleaned -- the list that
          was passed in is left untouched;
        * for anything else (e.g. the NaN left by the join for a question
          without answers): the value itself, unchanged, instead of the
          ``TypeError`` that ``re.sub`` raises on non-string input.
    """
    if isinstance(text, list):
        # Build a fresh list rather than overwriting the caller's elements.
        return [remove_html_tags(item) for item in text]
    if not isinstance(text, str):
        # Missing values and other non-text cells have no tags to strip.
        return text
    return _NON_CODE_TAG.sub('', text)