In [None]:
import os
import textract 
import pickle
import pandas as pd

In [None]:
def find_questions(blank, q):
    """Pull the text of questions 1..q out of a blank lab document.

    Parameters
    ----------
    blank : str
        Path handed to ``textract.process``.
    q : int
        Number of questions to look up.

    Returns
    -------
    dict
        Maps each question number to the text found between its ``Q<n>:``
        label and the next ``**A:**`` marker, with asterisks removed and
        anything from the first code fence onward dropped.
    """
    answer_marker = '**A:**'
    code_fence = "```"

    # The extractor output is stringified and literal backslash-n sequences
    # are dropped (presumably the output is bytes, so str() yields its repr
    # -- TODO confirm against textract).
    document = str(textract.process(blank)).replace("\\n", "")

    questions = {}
    for number in range(1, q + 1):
        # Question 30 is searched for without the trailing colon.
        label = 'Q%d' % number if number == 30 else 'Q%d:' % number
        begin = document.find(label) + len(label)
        stop = document.find(answer_marker, begin)

        prompt = document[begin:stop].strip(' *')
        prompt = prompt.replace('*', '')
        prompt = prompt.replace("\\'", "'")

        # Keep only what precedes the first code fence, if there is one.
        fence_at = prompt.find(code_fence)
        if fence_at != -1:
            prompt = prompt[:fence_at]

        questions[number] = prompt
    return questions

In [None]:
def find_end_flags(blank, q):
    """Work out, for each question, the text that marks the end of its answer.

    For question ``n`` the 30 characters following its ``**A:**`` marker in
    the blank document are inspected:

    * if they contain the heading ``**Q<n+1>:**`` the flag is ``'Q<n+1>'``;
    * otherwise the flag is the first non-empty, space-separated word in
      that window once ``*`` and ``#`` have been removed;
    * if the window holds no such word, the flag falls back to ``'Q<n+1>'``.
      (Previously this case raised ``UnboundLocalError`` for the first
      question and silently reused the previous question's flag afterwards.)

    Parameters
    ----------
    blank : str
        Path handed to ``textract.process``.
    q : int
        Number of questions; flags are produced for questions 1..q.

    Returns
    -------
    dict
        Maps question number -> end-flag string.
    """
    answer_marker = '**A:**'
    window_size = 30

    # The extractor output is stringified and literal backslash-n sequences
    # are dropped (presumably the output is bytes -- TODO confirm).
    blank = str(textract.process(blank)).replace("\\n", "")

    end_flag = {}
    for number in range(1, q + 1):
        next_label = 'Q%d' % (number + 1)

        # Question 30 is searched for without the trailing colon.
        label = 'Q%d' % number if number == 30 else 'Q%d:' % number
        start = blank.find(label)
        start = blank.find(answer_marker, start) + len(answer_marker)
        window = blank[start:start + window_size]

        if '**%s:**' % next_label in window:
            end_flag[number] = next_label
            continue

        words = window.replace("*", "").replace("#", "").split(' ')
        # First non-empty token, or the next question's label when the
        # window contains nothing usable.
        first_word = next((w for w in words if w), next_label)
        end_flag[number] = first_word.replace("\\'", "'")
    return end_flag

In [None]:
def read_submissions(directory, submission_dirs, end_flag, q):
    """Extract the answers to questions 1..q from each submission file.

    Parameters
    ----------
    directory : str
        Folder containing the submission files.
    submission_dirs : list of str
        File names inside ``directory`` to process.
    end_flag : dict
        Maps question number -> text that marks the end of that answer.
    q : int
        Number of questions.

    Returns
    -------
    list of dict
        One dict per submission that has at least one non-empty answer, with
        keys ``'name'``, ``'type'`` and the integers 1..q (raw answer text,
        starting at the ``A:`` marker).

    Notes
    -----
    * ``.pdf`` files are read through ``textract``; ``.md`` files are read as
      text. ``ipynb``/``py``/``ipynb_issues`` files are skipped silently and
      any other extension is skipped with a message (previously such a file
      hit an unbound or stale ``text`` variable).
    * A file that raises ``UnicodeDecodeError`` while being read prints
      ``fail`` and is skipped.
    """
    ignored_types = ('ipynb', 'py', 'ipynb_issues')

    submissions = []
    for file_name in submission_dirs:
        path = '%s/%s' % (directory, file_name)
        ext = file_name.split('.')[-1]
        if ext in ignored_types:
            continue

        try:
            if ext == 'pdf':
                text = str(textract.process(path))
            elif ext == 'md':
                # Context manager so the handle is closed promptly.
                with open(path) as handle:
                    text = handle.read()
            else:
                print('skipping unsupported file type: %s' % path)
                continue
        except UnicodeDecodeError:
            print('fail')
            continue

        text = text.replace("\\'", "'")
        submission = {'name': file_name, 'type': ext}

        # A dedicated loop variable: reusing the file-loop index here made
        # the progress print below refer to the wrong file.
        for number in range(1, q + 1):
            # Question 30 is searched for without the trailing colon.
            label = 'Q%d' % number if number == 30 else 'Q%d:' % number
            start = text.find(label)
            start = text.find('A:', start)

            # Simplex question 3 ends at "Let's" when the document has no
            # 'Q3.5:' section; otherwise use the configured end flag.
            if (number == 3
                    and directory == 'simplex_submissions'
                    and text.find('Q3.5:') == -1):
                end = text.find("Let's", start)
            else:
                end = text.find(end_flag[number], start)
            submission[number] = text[start:end]

        submissions.append(submission)
        print(path)

    # Remove empty submissions (every answer is the empty string).
    return [
        sub for sub in submissions
        if any(sub[number] != '' for number in range(1, q + 1))
    ]

In [None]:
# Get Simplex Questions
simplex_questions = find_questions('simplex_blank.txt', 41)
# Manual override of the automatically extracted text for question 10.
simplex_questions[10] = 'Rewrite the example LP `ALL_INTEGER_2D_LP` in dictionary form. Show your steps!'

# Get Simplex End Flags
simplex_end_flag = find_end_flags('simplex_blank.txt', 41)
# Manual override of the automatically detected end flag for question 39.
simplex_end_flag[39] = "Q40"

# Get Simplex Submissions
# Filter out the macOS metadata file instead of calling list.remove(),
# which raises ValueError when no '.DS_Store' is present.
simplex_dirs = [name for name in os.listdir('simplex_submissions') if name != '.DS_Store']
simplex_submissions = read_submissions('simplex_submissions', simplex_dirs, simplex_end_flag, 41)

In [None]:
# Get Branch & Bound Questions
bnb_questions = find_questions('bnb_blank.txt', 44)

# Get Branch & Bound End Flags
bnb_end_flag = find_end_flags('bnb_blank.txt', 44)
# Manual overrides of the automatically detected end flags: these answers
# are cut at the next question's label.
bnb_end_flag[9] = 'Q10'
bnb_end_flag[36] = 'Q37'
bnb_end_flag[39] = 'Q40'
bnb_end_flag[40] = 'Q41'
bnb_end_flag[41] = 'Q42'

# Get Branch & Bound Submissions
# Filter out the macOS metadata file instead of calling list.remove(),
# which raises ValueError when no '.DS_Store' is present.
bnb_dirs = [name for name in os.listdir('bnb_submissions') if name != '.DS_Store']
bnb_submissions = read_submissions('bnb_submissions', bnb_dirs, bnb_end_flag, 44)

In [None]:
# (pattern, replacement) pairs applied in order. Apart from the zero-width
# space, the patterns are literal backslash-escaped byte sequences left in
# the extracted text (e.g. '\\xef\\xac\\x81' is the UTF-8 "fi" ligature).
# NOTE(review): '\\xe2\\x88\\x92' is the UTF-8 minus sign but is mapped to
# '_' rather than '-'; kept as-is, confirm this is intended.
_UNDECODED_TEXT_FIXES = (
    ('\u200b', ''),
    ('\\xe2\\x80\\xa9', ' '),
    ('\\xef\\xac\\x81', 'fi'),
    ('\\xef\\xac\\x82', 'fl'),
    ('\\xef\\xac\\x83', 'ffi'),
    ('\\xe2\\x88\\x92', '_'),
    ('\\xe0\\xa3\\x98', '?'),
    ('\\xe0\\xa3\\x99', '?'),
    ('\\x01', ' '),
    ('\\xc2', ' '),
    ('\\xa0', ' '),
)

# An answer is cut off at the first occurrence of any of these phrases.
_TRUNCATION_MARKERS = (
    'localhost',
    'simplex_visual',
    'simplex_lab_update',
    'https',
    '```',
    'In [',
    'In  [',
    '(cid:166)',
)

# Markup and escape clean-up applied (in order) after truncation.
_MARKUP_FIXES = (
    ('#', ''),
    ('\n', ''),
    ('\\n', ''),
    ("\\'", "'"),
    ("**", " "),
    ("<font color='blue'>", ''),
    ("</font>", ''),
)


def _truncate_at(text, marker):
    """Return ``text`` up to (not including) the first ``marker``, if any."""
    index = text.find(marker)
    return text if index == -1 else text[:index]


def clean_submissions(submissions, q):
    """Normalise the raw answer text of every submission.

    Parameters
    ----------
    submissions : list of dict
        Dicts with keys ``'name'``, ``'type'`` and the integers 1..q mapping
        to raw answer strings (or ``None``).
    q : int
        Number of questions.

    Returns
    -------
    list of dict
        New dicts with the same keys. Each answer has undecoded byte
        sequences replaced, trailing notebook/URL noise truncated, markup
        removed, a leading ``A:`` marker dropped and surrounding ``*`` and
        spaces stripped. Answers shorter than two characters become
        ``None``. ``None`` answers are passed through unchanged, so cleaning
        already-cleaned data no longer raises.
    """
    cleaned = []
    for submission in submissions:
        clean_sub = {'name': submission['name'], 'type': submission['type']}
        for number in range(1, q + 1):
            answer = submission[number]
            if answer is None:
                clean_sub[number] = None
                continue

            # Missed unicode decoding
            for pattern, replacement in _UNDECODED_TEXT_FIXES:
                answer = answer.replace(pattern, replacement)

            # Remove anything after these phrases
            for marker in _TRUNCATION_MARKERS:
                answer = _truncate_at(answer, marker)

            # Replace
            for pattern, replacement in _MARKUP_FIXES:
                answer = answer.replace(pattern, replacement)

            # Strip exactly one leading 'A:' marker. str.lstrip('A:') treats
            # its argument as a character set and would also eat a leading
            # 'A' of the answer itself ("A:All" -> "ll").
            if answer.startswith('A:'):
                answer = answer[len('A:'):]
            answer = answer.strip('* ')

            # Remove Short
            clean_sub[number] = answer if len(answer) >= 2 else None
        cleaned.append(clean_sub)
    return cleaned

In [None]:
# Clean and export simplex submissions
simplex_submissions = clean_submissions(simplex_submissions, 41)

# One row per distinct submission, plus a reference row named 'questions'
# that carries the question text in the per-question columns.
simplex_df = pd.DataFrame(simplex_submissions).drop_duplicates()
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
simplex_df = pd.concat(
    [simplex_df, pd.DataFrame([{**simplex_questions, 'name': 'questions'}])],
    ignore_index=True,
)
simplex_df = simplex_df.sort_values('name')
simplex_df.to_csv('simplex_submissions.csv')

In [None]:
# Clean and export branch & bound submissions
bnb_submissions = clean_submissions(bnb_submissions, 44)

# One row per distinct submission, plus a reference row named 'questions'
# that carries the question text in the per-question columns.
bnb_df = pd.DataFrame(bnb_submissions).drop_duplicates()
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
bnb_df = pd.concat(
    [bnb_df, pd.DataFrame([{**bnb_questions, 'name': 'questions'}])],
    ignore_index=True,
)
bnb_df = bnb_df.sort_values('name')
bnb_df.to_csv('bnb_submissions.csv')