# Library Imports

In [None]:
# importing libraries
import ast
import re
import warnings

import numpy as np
import pandas as pd

from google.colab import files

# Lift every pandas display cap (columns, rows, cell width, total width) so that
# long question texts are shown in full instead of being truncated.
for display_option in (
    'display.max_columns',
    'display.max_rows',
    'display.max_colwidth',
    'display.width',
):
    pd.set_option(display_option, None)

# Data Preparation

In [None]:
# Upload the questions file, taken from https://www.cs.washington.edu/nlp/arithmetic
# The questions cell further down opens it as 'q.txt', so it must be uploaded under that name.

uploaded_questions = files.upload()

Saving q.txt to q.txt


In [None]:
# Upload the answers file, taken from https://www.cs.washington.edu/nlp/arithmetic
# The answers cell further down opens it as 'ans.txt', so it must be uploaded under that name.

uploaded_answers = files.upload()

Saving ans.txt to ans.txt


In [None]:
# Upload the equations file, taken from https://www.cs.washington.edu/nlp/arithmetic
# The equations cell further down opens it as 'eq.txt', so it must be uploaded under that name.

uploaded_equations = files.upload()

Saving eq.txt to eq.txt


In [None]:
# Build the questions dataframe from q.txt.
# Grammar/spacing errors in the raw text are normalised here; their effect on a small
# language model is unknown, but applying the same fixes to every question keeps it controlled.

questions_file = 'q.txt'

with open(questions_file, 'r', encoding = 'utf-8') as qfile:
  questions_content = qfile.read()


def clean_question(text):
    """Normalise spacing, commas and capitalisation of one question.

    Args:
        text: One question with surrounding whitespace and the final '?' already removed.

    Returns:
        The cleaned question, terminated with a single '?'.
    """
    # Drop whitespace in front of ? . ! and apostrophes ("Sara 's" -> "Sara's").
    text = re.sub(r"\s+([?.!'])", r"\1", text)

    # The '?' was consumed by the split on '?', so put it back.
    text = text.strip() + '?'

    # Commas: no space before a comma, and no comma before the word "and".
    # The \b keeps the rule from firing on words that merely start with "and".
    text = re.sub(r'\s+,', ',', text)
    text = re.sub(r',\s+and\b', ' and', text)

    # Capitalise the first letter that follows sentence-ending punctuation and one space.
    return re.sub(r'(?<=[\.\?\!]\s)(\w)', lambda match: match.group(1).upper(), text)


# One chunk per '?'; whitespace-only chunks (e.g. the newline after the last '?') are dropped.
# NOTE(review): this assumes every question ends with '?'. A question without one would be
# merged into the next chunk and shift every later row - confirm against q.txt.
questions = [chunk.strip() for chunk in questions_content.split('?') if chunk.strip()]

q_df = pd.DataFrame(questions, columns = ['Question'])
q_df['Question'] = q_df['Question'].apply(clean_question)

q_df.head(2)

Unnamed: 0,Question
0,Joan found 70 seashells on the beach. She gave Sam some of her seashells. She has 27 seashell. How many seashells did she give to Sam?
1,There were 28 bales of hay in the barn. Tim stacked bales in the barn today. There are now 54 bales of hay in the barn. How many bales did he store in the barn?


In [None]:
# Build the answers dataframe from ans.txt (one answer per line).

answers_file = 'ans.txt'

with open(answers_file, 'r', encoding = 'utf-8') as ansfile:
  answers_content = ansfile.read()

# Strip every line and skip blank ones, mirroring how the questions are read.
# A blank line would otherwise become an empty answer row and misalign the
# position-based concat with the questions and equations.
answers = [line.strip() for line in answers_content.splitlines() if line.strip()]

a_df = pd.DataFrame(answers, columns = ['Answer'])

# Remove a trailing ".0" so whole numbers read "43" rather than "43.0";
# any other decimal part is left untouched.
a_df['Answer'] = a_df['Answer'].apply(lambda x: re.sub(r'(\d+)\.0$', r'\1', x))

a_df.head(2)

Unnamed: 0,Answer
0,43
1,26


In [None]:
# Build the equations dataframe from eq.txt (one equation per line).

equations_file = 'eq.txt'

with open(equations_file, 'r', encoding = 'utf-8') as eqfile:
  equations_content = eqfile.read()

# Strip every line and skip blank ones, mirroring how the questions are read.
# A blank line would otherwise become an empty equation row and misalign the
# position-based concat with the questions and answers.
equations = [line.strip() for line in equations_content.splitlines() if line.strip()]

eq_df = pd.DataFrame(equations, columns = ['Equation'])

eq_df.head(2)

Unnamed: 0,Equation
0,X = 70 - 27
1,X = 54 - 28


In [None]:
# Combine questions, equations and answers side by side into one dataframe.
# pd.concat(axis=1) pairs rows by index position, so the three frames must hold the
# same problems in the same order. When their lengths differ, the shorter frames are
# padded with NaN without any error, so the mismatch is reported here.

row_counts = {'questions': len(q_df), 'equations': len(eq_df), 'answers': len(a_df)}
if len(set(row_counts.values())) > 1:
    warnings.warn(
        f'Row counts differ {row_counts}: rows will be padded with NaN and '
        'questions may be paired with the wrong equation/answer.'
    )

qa_df = pd.concat([q_df, eq_df, a_df], axis=1)

qa_df.head(5)

Unnamed: 0,Question,Equation,Answer
0,Joan found 70 seashells on the beach. She gave Sam some of her seashells. She has 27 seashell. How many seashells did she give to Sam?,X = 70 - 27,43
1,There were 28 bales of hay in the barn. Tim stacked bales in the barn today. There are now 54 bales of hay in the barn. How many bales did he store in the barn?,X = 54 - 28,26
2,Mary is baking a cake. The recipe wants 8 cups of flour. She already put in 2 cups. How many cups does she need to add?,X = 8 - 2,6
3,Sara's high school played 12 basketball games this year. The team won most of their games. They were defeated during 4 games. How many games did they win?,X = 12 - 4,8
4,There are 22 walnut trees currently in the park. Park workers will plant walnut trees today. When the workers are finished there will be 55 walnut trees in the park. How many walnut trees did the workers plant today?,X = 55 - 22,33


In [None]:
# Digit-complexity feature: size, in digits, of the longest number in a question.

def max_digits(q_text):
    """Return the digit count of the longest number found in ``q_text``.

    The input is converted with ``str()`` first. A number is a run of digits with an
    optional decimal part; the decimal point itself is not counted as a digit.

    Args:
        q_text: Question text (any value; it is stringified).

    Returns:
        The largest digit count among the numbers found, or 0 when there are none.
    """
    digit_counts = [
        len(token.replace('.', ''))
        for token in re.findall(r'\d+\.?\d*', str(q_text))
    ]
    return max(digit_counts, default=0)

# Score every question, then store the result as the Question_Digits column.
question_digits = qa_df['Question'].apply(max_digits)
qa_df['Question_Digits'] = question_digits

qa_df.head(2)

Unnamed: 0,Question,Equation,Answer,Question_Digits
0,Joan found 70 seashells on the beach. She gave Sam some of her seashells. She has 27 seashell. How many seashells did she give to Sam?,X = 70 - 27,43,2
1,There were 28 bales of hay in the barn. Tim stacked bales in the barn today. There are now 54 bales of hay in the barn. How many bales did he store in the barn?,X = 54 - 28,26,2


In [None]:
# Flag questions that mention at least one decimal number.

def contains_decimal(q_text):
    """Return 1 if ``q_text`` contains a decimal number, otherwise 0.

    The input is converted with ``str()`` first. A decimal number needs digits on both
    sides of the point, so a sentence-ending "27." does not count.

    Args:
        q_text: Question text (any value; it is stringified).

    Returns:
        1 when a decimal number is present, 0 otherwise.
    """
    has_decimal = re.search(r'\d+\.\d+', str(q_text)) is not None
    return int(has_decimal)

# Flag every question, then store the result as the Decimal_Question column.
decimal_flags = qa_df['Question'].apply(contains_decimal)
qa_df['Decimal_Question'] = decimal_flags

qa_df.head(2)

Unnamed: 0,Question,Equation,Answer,Question_Digits,Decimal_Question
0,Joan found 70 seashells on the beach. She gave Sam some of her seashells. She has 27 seashell. How many seashells did she give to Sam?,X = 70 - 27,43,2,0
1,There were 28 bales of hay in the barn. Tim stacked bales in the barn today. There are now 54 bales of hay in the barn. How many bales did he store in the barn?,X = 54 - 28,26,2,0


In [None]:
# Number of arithmetic steps in an equation, e.g. "2 + 2" is one step (a single addition).

def question_steps(equation):
    """Return the number of operations implied by the numbers in ``equation``.

    The input is converted with ``str()`` first. Numbers are counted and one is
    subtracted, because n numbers are joined by n - 1 operators (+ or -).

    Args:
        equation: Equation text such as "X = 70 - 27" (any value; it is stringified).

    Returns:
        Count of numbers minus one, never below 0.
    """
    operands = re.findall(r'\d+\.?\d*', str(equation))
    return len(operands) - 1 if operands else 0

# Count the steps of every equation, then store the result as the Question_Steps column.
step_counts = qa_df['Equation'].apply(question_steps)
qa_df['Question_Steps'] = step_counts

qa_df.head(2)

Unnamed: 0,Question,Equation,Answer,Question_Digits,Decimal_Question,Question_Steps
0,Joan found 70 seashells on the beach. She gave Sam some of her seashells. She has 27 seashell. How many seashells did she give to Sam?,X = 70 - 27,43,2,0,1
1,There were 28 bales of hay in the barn. Tim stacked bales in the barn today. There are now 54 bales of hay in the barn. How many bales did he store in the barn?,X = 54 - 28,26,2,0,1


In [None]:
# Character length of every question.
# The lengths are displayed as floats (134.0) at this stage of the notebook.

question_lengths = qa_df['Question'].str.len()
qa_df['Question_Length'] = question_lengths

qa_df.head(2)

Unnamed: 0,Question,Equation,Answer,Question_Digits,Decimal_Question,Question_Steps,Question_Length
0,Joan found 70 seashells on the beach. She gave Sam some of her seashells. She has 27 seashell. How many seashells did she give to Sam?,X = 70 - 27,43,2,0,1,134.0
1,There were 28 bales of hay in the barn. Tim stacked bales in the barn today. There are now 54 bales of hay in the barn. How many bales did he store in the barn?,X = 54 - 28,26,2,0,1,160.0


In [None]:
# Export the dataframe for manual annotation (fixing questions and rating how much
# irrelevant information each one carries). Information irrelevance is marked as:
#   0 - one subject and a direct question, no extra context
#   1 - extra context or multiple subjects
#   2 - contains numbers that can be confused with the relevant ones

annotation_csv = 'questions_and_answers.csv'

qa_df.to_csv(annotation_csv, index=False)
files.download(annotation_csv)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

The code above was cleaned up to this point, and the resulting dataframe was downloaded for manual annotation. The annotated file is loaded back in below to add the final columns.

In [None]:
# Upload the manually annotated question/answer file.
# The next cell reads it as 'qa_data_with_irrelevance.csv', so it must be uploaded under that name.

qa_data_with_irrelevance = files.upload()

Saving qa_data_with_irrelevance.csv to qa_data_with_irrelevance.csv


In [None]:
# Load the annotated CSV into the dataframe used for the remaining feature columns.

annotated_csv = 'qa_data_with_irrelevance.csv'
qa_df_final = pd.read_csv(annotated_csv)

qa_df_final.head(2)

Unnamed: 0,Question,Fixed_Question,Equation,Answer,Question_Digits,Decimal_Question,Question_Steps,Question_Information_Irrelevance,Question_Length
0,Joan found 70 seashells on the beach. She gave Sam some of her seashells. She has 27 seashell. How many seashells did she give to Sam?,Joan found 70 seashells on the beach. She gave Sam some of her seashells. She has 27 seashells left. How many seashells did she give to Sam?,X = 70 - 27,43,2,0,1,1,134
1,There were 28 bales of hay in the barn. Tim stacked bales in the barn today. There are now 54 bales of hay in the barn. How many bales did he store in the barn?,There were 28 bales of hay in the barn. Tim stacked bales in the barn today. There are now 54 bales of hay in the barn. How many bales did he store in the barn?,X = 54 - 28,26,2,0,1,1,160


In [None]:
# Character length of the manually fixed questions.
fixed_question_lengths = qa_df_final['Fixed_Question'].str.len()
qa_df_final['Fixed_Question_Length'] = fixed_question_lengths

# Answers are stringified first and then lose a trailing ".0" ("43.0" -> "43").
def drop_trailing_point_zero(answer):
    """Return ``answer`` as text without a trailing ".0"."""
    return re.sub(r'(\d+)\.0$', r'\1', str(answer))

qa_df_final['Answer'] = qa_df_final['Answer'].apply(drop_trailing_point_zero)

qa_df_final.head(2)

Unnamed: 0,Question,Fixed_Question,Equation,Answer,Question_Digits,Decimal_Question,Question_Steps,Question_Information_Irrelevance,Question_Length,Fixed_Question_Length
0,Joan found 70 seashells on the beach. She gave Sam some of her seashells. She has 27 seashell. How many seashells did she give to Sam?,Joan found 70 seashells on the beach. She gave Sam some of her seashells. She has 27 seashells left. How many seashells did she give to Sam?,X = 70 - 27,43,2,0,1,1,134,140
1,There were 28 bales of hay in the barn. Tim stacked bales in the barn today. There are now 54 bales of hay in the barn. How many bales did he store in the barn?,There were 28 bales of hay in the barn. Tim stacked bales in the barn today. There are now 54 bales of hay in the barn. How many bales did he store in the barn?,X = 54 - 28,26,2,0,1,1,160,160


In [None]:
# Classify each equation by whether the unknown X is already isolated on the left.

def question_type(equation):
    """Return 'Direct' when the equation starts with "X =", otherwise 'Indirect'.

    The value is converted with ``str()`` before stripping, as the other feature
    functions in this notebook do, so non-string cells no longer raise
    ``AttributeError``. Such values (e.g. a missing equation read as NaN) do not
    start with "X =" and are therefore classified as 'Indirect'.

    Args:
        equation: Equation text such as "X = 70 - 27" (any value; it is stringified).

    Returns:
        'Direct' if the stripped text begins with X followed by optional whitespace
        and '=', else 'Indirect'.
    """
    equation = str(equation).strip()

    if re.match(r'X\s*=', equation):
        return 'Direct'
    return 'Indirect'

# Classify every equation, then store the result as the Question_Type column.
question_types = qa_df_final['Equation'].apply(question_type)
qa_df_final['Question_Type'] = question_types

qa_df_final.head(3)

Unnamed: 0,Question,Fixed_Question,Equation,Answer,Question_Digits,Decimal_Question,Question_Steps,Question_Information_Irrelevance,Question_Length,Fixed_Question_Length,Question_Type
0,Joan found 70 seashells on the beach. She gave Sam some of her seashells. She has 27 seashell. How many seashells did she give to Sam?,Joan found 70 seashells on the beach. She gave Sam some of her seashells. She has 27 seashells left. How many seashells did she give to Sam?,X = 70 - 27,43,2,0,1,1,134,140,Direct
1,There were 28 bales of hay in the barn. Tim stacked bales in the barn today. There are now 54 bales of hay in the barn. How many bales did he store in the barn?,There were 28 bales of hay in the barn. Tim stacked bales in the barn today. There are now 54 bales of hay in the barn. How many bales did he store in the barn?,X = 54 - 28,26,2,0,1,1,160,160,Direct
2,Mary is baking a cake. The recipe wants 8 cups of flour. She already put in 2 cups. How many cups does she need to add?,Mary is baking a cake. The recipe wants 8 cups of flour. She already put in 2 cups. How many cups does she need to add?,X = 8 - 2,6,1,0,1,1,119,119,Direct


In [None]:
# Write the finished dataset to CSV (without the index) and download it.
final_csv = 'final_qa_data.csv'

qa_df_final.to_csv(final_csv, index=False)
files.download(final_csv)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>