This notebook cleans .txt and .dta data for students performance on exams for BIO1111 and exports a csv which can be analyzed in an R script.
#### Author: Christopher Agard

In [1]:
import pandas as pd
import numpy as np
import os,glob
from soar import *

def cleanSoarExam (data\
                   , examNum\
                   , fileType='flat'\
                   , colSpec = [(0,9),(10,21),(22,28),(29,30),(30,32),(32,34),(34,36),(36,38),(39,41),(41,68)]\
                  , soarSessions=[]):
    import pandas as pd
    import numpy as np
    import os
    if fileType != 'flat':
        print("\n{} fileType is not currently supported.".format(fileType))
    else:
        try:
            df=pd.read_fwf(data,colSpec,
                           names=['tuid','last','first',
                                  'middle','unnamed1','unnamed2',
                                  'unnamed3','soar','ncorrect','item'])
        except:
            "print(\nCould not find file {} or {} was not acceptable value for colSpec)".format(data,colSpec)
        #try:
        #    df.columns = ['tuid','last','first','middle','unnamed1','unnamed2','unnamed3','soar','ncorrect','item']
        #except:
        #    print("\nColumn number != 10.\n{}".format(len(df.columns)))
        try:
            df['examNumber']=examNum
            numbers=pd.Series(list(range(28))).astype(str)
            itemNames= 'item_'+ numbers[1:]
            itemData=df.item.apply(lambda i: pd.Series(list(i)))
            itemData.columns=itemNames
            df=df.merge(itemData,'outer',left_index=True,right_index=True).drop('item',axis=1)
        except:
            print("\nUnhandled exception encountered.")
        if soarSessions ==np.nan:
            df.loc['soarType']='other'
        else:
            df.loc[df.soar.isin(soarSessions),'soarType']='mine'
            df.loc[~df.soar.isin(soarSessions),'soarType']='other'
            df.loc[df.tuid=='NNNNNNNNN','soarType']='key'
    return df
        

# Setting source and output folders

In [2]:
worksource = "C:/Users/tuh27554/Documents/Grading/2019/FALL EX1/Input Data"
workoutput = "C:/Users/tuh27554/Documents/Grading/2019/FALL EX1/Output Data"

# Getting and Cleaning Exam Data

In [3]:
os.chdir(worksource)
files = glob.glob('*.dta')
files

['marcexams_2019_2019-A0153.dta', 'marcexams_2019_2019-A0154.dta']

In [4]:
os.chdir(worksource)
df1=cleanSoarExam(files[0],examNum=1,soarSessions=[81])
df1['version'] = 1
df2=cleanSoarExam(files[1],examNum=1,soarSessions=[81])
df2['version'] = 2
df = df1.append(df2)
df.head()

Unnamed: 0,tuid,last,first,middle,unnamed1,unnamed2,unnamed3,soar,ncorrect,examNumber,...,item_20,item_21,item_22,item_23,item_24,item_25,item_26,item_27,soarType,version
0,500 - BLU,EXAM - KEY,NNNNNN,N,,,,,27,1,...,2,1,3,3,1,3,1,4,other,1
1,915565870,CHIUMENTO,JOSEPH,,11.0,11.0,48.0,75.0,22,1,...,2,1,3,1,1,3,1,1,other,1
2,915221763,SOILIS,NICHOL,G,11.0,11.0,33.0,72.0,23,1,...,2,1,3,3,1,3,1,3,other,1
3,915641754,HOWARD,LEANNA,C,11.0,10.0,23.0,81.0,24,1,...,2,1,2,3,1,3,1,1,mine,1
4,915645190,KIFAIEH,YAZAN,,11.0,10.0,12.0,87.0,20,1,...,2,1,3,3,1,3,1,3,other,1


Now we change the dir to the outputfolder and save the cleaned, merged file.

In [5]:
os.chdir(workoutput)
df.to_csv("Fall19Exam 1_cleaned.csv")

# Analyzing Exam Data

In [6]:
df.head()

Unnamed: 0,tuid,last,first,middle,unnamed1,unnamed2,unnamed3,soar,ncorrect,examNumber,...,item_20,item_21,item_22,item_23,item_24,item_25,item_26,item_27,soarType,version
0,500 - BLU,EXAM - KEY,NNNNNN,N,,,,,27,1,...,2,1,3,3,1,3,1,4,other,1
1,915565870,CHIUMENTO,JOSEPH,,11.0,11.0,48.0,75.0,22,1,...,2,1,3,1,1,3,1,1,other,1
2,915221763,SOILIS,NICHOL,G,11.0,11.0,33.0,72.0,23,1,...,2,1,3,3,1,3,1,3,other,1
3,915641754,HOWARD,LEANNA,C,11.0,10.0,23.0,81.0,24,1,...,2,1,2,3,1,3,1,1,mine,1
4,915645190,KIFAIEH,YAZAN,,11.0,10.0,12.0,87.0,20,1,...,2,1,3,3,1,3,1,3,other,1


# Most Frequently Wrong Answers

Here we identify the columns containing information about responses.

In [7]:
itemcols = df.columns[df.columns.str.contains('item')]
itemcols

Index(['item_1', 'item_2', 'item_3', 'item_4', 'item_5', 'item_6', 'item_7',
       'item_8', 'item_9', 'item_10', 'item_11', 'item_12', 'item_13',
       'item_14', 'item_15', 'item_16', 'item_17', 'item_18', 'item_19',
       'item_20', 'item_21', 'item_22', 'item_23', 'item_24', 'item_25',
       'item_26', 'item_27'],
      dtype='object')

In [8]:
df.soarType.value_counts()

other    234
mine      33
Name: soarType, dtype: int64

In [9]:
from soar import countwrong,countwrong_allcol

def countwrong (df,col,ver,truthcol = 'soarType',truthind = 'key',compind = ['mine'],vercol = 'version'):
    truth = df.loc[(df[truthcol]==truthind)&(df[vercol] == ver),col]
    comp = df.loc[(df[truthcol].isin(compind))&(df[vercol] == ver),col]
    res = pd.DataFrame(data ={'item':[col],'number':[(comp != truth[0]).sum()]})
    return(res)


def countwrong_allcol(df,ver,itemcols,sortfields = ['number','item'],
                      truthcol = 'soarType',truthind = 'key',compind = 'mine',vercol = 'version'):
    tmp = pd.DataFrame()
    for col in itemcols:
        tmp = tmp.append(countwrong(df= df, col = col, ver = ver))
    tmp = tmp.sort_values(sortfields,ascending=False).reset_index(drop = True)
    return tmp

In [10]:
v1wrong = countwrong_allcol(df,1,itemcols = df.columns[df.columns.str.contains('item')])
v2wrong = countwrong_allcol(df,2,itemcols = df.columns[df.columns.str.contains('item')])

IndexError: index out of bounds

Now we apply this function to create a ranked list of items by order of the number of my students getting them wrong.

In [None]:
wrongcount_v1 = pd.DataFrame()
for col in itemcols:
    wrongcount_v1 = wrongcount_v1.append(countwrong(df= df, col = col, ver = 1,compind=['mine','other']))
wrongcount_v1 = wrongcount_v1.sort_values(['number','item'],ascending=False).reset_index(drop = True)
wrongcount_v1

In [None]:
wrongcount_v2 = pd.DataFrame()
for col in itemcols:
    wrongcount_v2 = wrongcount_v2.append(countwrong(df= df, col = col, ver = 2,compind=['other','mine']))
wrongcount_v2 = wrongcount_v2.sort_values(['number','item'],ascending=False).reset_index(drop = True)
wrongcount_v2

## Sec 73

## Sec 74

## Sec 75

# Printing the Keys

In [None]:
v1keys = df.loc[(df.tuid=='NNNNNNNNN')&(df.version==1),itemcols]
v1keys

In [None]:
v2keys = df.loc[(df.tuid=='NNNNNNNNN')&(df.version==2),itemcols]
v2keys

In [None]:
# v1keys = v1keys.T.rename(columns={0:'version1'})
# v2keys = v2keys.T.rename(columns={0:'version2'})
pd.DataFrame(data = {'version1':v1keys,'version2':v2keys},index = itemcols)

In [None]:
letters2num_dict = {1:'D',2:'C',3:'B',4:'A'}
exkeys.version