This notebook cleans .txt and .dta data for students performance on exams for BIO1111 and exports a csv which can be analyzed in an R script.
#### Author: Christopher Agard

In [1]:
import pandas as pd
import numpy as np
import os,glob

In [2]:
def cleanSoarExam (data\
                   , examNum\
                   , fileType='flat'\
                   , colSpec = [(0,9),(10,21),(22,28),(29,30),(30,32),(32,34),(34,36),(36,38),(39,41),(41,68)]\
                  , soarSessions=[]):
    import pandas as pd
    import numpy as np
    import os
    if fileType != 'flat':
        print("\n{} fileType is not currently supported.".format(fileType))
    else:
        try:
            df=pd.read_fwf(data,colSpec,
                           names=['tuid','last','first',
                                  'middle','unnamed1','unnamed2',
                                  'unnamed3','soar','ncorrect','item'])
        except:
            "print(\nCould not find file {} or {} was not acceptable value for colSpec)".format(data,colSpec)
        #try:
        #    df.columns = ['tuid','last','first','middle','unnamed1','unnamed2','unnamed3','soar','ncorrect','item']
        #except:
        #    print("\nColumn number != 10.\n{}".format(len(df.columns)))
        try:
            df['examNumber']=examNum
            numbers=pd.Series(list(range(28))).astype(str)
            itemNames= 'item_'+ numbers[1:]
            itemData=df.item.apply(lambda i: pd.Series(list(i)))
            itemData.columns=itemNames
            df=df.merge(itemData,'outer',left_index=True,right_index=True).drop('item',axis=1)
        except:
            print("\nUnhandled exception encountered.")
        if soarSessions ==np.nan:
            df.loc['soarType']='other'
        else:
            df.loc[df.soar.isin(soarSessions),'soarType']='mine'
            df.loc[~df.soar.isin(soarSessions),'soarType']='other'
            df.loc[df.tuid=='NNNNNNNNN','soarType']='key'
    return df
        

# Setting source and output folders

In [3]:
worksource = "C:/Users/tuh27554/Documents/BIOL1111/Spr 2019/Raw Data/Exam 1"
workoutput = "C:/Users/tuh27554/Documents/BIOL1111/Spr 2019/Results/Exam 1"

# Getting and Cleaning Exam Data

In [12]:
os.chdir(worksource)
df1=cleanSoarExam('Exam 1 Version A.dta',examNum=1,soarSessions=[73,74,75])
df1['version'] = 1
df2=cleanSoarExam('Exam 1 Version B.dta',examNum=1,soarSessions=[73,74,75])
df2['version'] = 2
df = df1.append(df2)
df.head()

Unnamed: 0,tuid,last,first,middle,unnamed1,unnamed2,unnamed3,soar,ncorrect,examNumber,...,item_20,item_21,item_22,item_23,item_24,item_25,item_26,item_27,soarType,version
0,NNNNNNNNN,NNNNNNNNNNN,NNNNNN,N,,,,,27,1,...,4,1,3,2,3,4,1,1,key,1
1,915219911,MONASTRA,CARA,,11.0,,,75.0,21,1,...,3,2,3,3,3,4,1,1,mine,1
2,915672925,HUDSON,ERIN,N,11.0,,,85.0,15,1,...,4,1,4,2,1,4,1,1,other,1
3,915319790,SIMONS,MICHAE,J,11.0,,,85.0,21,1,...,2,3,3,3,3,4,1,1,other,1
4,915581890,IRISH,ABBIGA,,11.0,,,77.0,10,1,...,3,2,4,3,3,1,1,2,other,1


Now we change the dir to the outputfolder and save the cleaned, merged file.

In [13]:
os.chdir(workoutput)
df.to_csv("Spr19Exam 1_cleaned.csv")

# Analyzing Exam Data

In [14]:
df.head()

Unnamed: 0,tuid,last,first,middle,unnamed1,unnamed2,unnamed3,soar,ncorrect,examNumber,...,item_20,item_21,item_22,item_23,item_24,item_25,item_26,item_27,soarType,version
0,NNNNNNNNN,NNNNNNNNNNN,NNNNNN,N,,,,,27,1,...,4,1,3,2,3,4,1,1,key,1
1,915219911,MONASTRA,CARA,,11.0,,,75.0,21,1,...,3,2,3,3,3,4,1,1,mine,1
2,915672925,HUDSON,ERIN,N,11.0,,,85.0,15,1,...,4,1,4,2,1,4,1,1,other,1
3,915319790,SIMONS,MICHAE,J,11.0,,,85.0,21,1,...,2,3,3,3,3,4,1,1,other,1
4,915581890,IRISH,ABBIGA,,11.0,,,77.0,10,1,...,3,2,4,3,3,1,1,2,other,1


# Most Frequently Wrong Answers

Here we identify the columns containing information about responses.

In [15]:
itemcols = df.columns[df.columns.str.contains('item')]
itemcols

Index(['item_1', 'item_2', 'item_3', 'item_4', 'item_5', 'item_6', 'item_7',
       'item_8', 'item_9', 'item_10', 'item_11', 'item_12', 'item_13',
       'item_14', 'item_15', 'item_16', 'item_17', 'item_18', 'item_19',
       'item_20', 'item_21', 'item_22', 'item_23', 'item_24', 'item_25',
       'item_26', 'item_27'],
      dtype='object')

In [16]:
df.soarType.value_counts()

other    260
mine      73
key        2
Name: soarType, dtype: int64

In [71]:
def countwrong (df,col,ver,truthcol = 'soarType',truthind = 'key',compind = ['mine'],vercol = 'version'):
    truth = df.loc[(df[truthcol]==truthind)&(df[vercol] == ver),col]
    comp = df.loc[(df[truthcol].isin(compind))&(df[vercol] == ver),col]
    res = pd.DataFrame(data ={'item':[col],'number':[(comp != truth[0]).sum()]})
    return(res)


In [18]:
def countwrong_allcol(df,ver,itemcols,sortfields = ['number','item'],
                      truthcol = 'soarType',truthind = 'key',compind = 'mine',vercol = 'version'):
    tmp = pd.DataFrame()
    for col in itemcols:
        tmp = tmp.append(countwrong(df= df, col = col, ver = ver))
    tmp = tmp.sort_values(sortfields,ascending=False).reset_index(drop = True)
    return tmp

In [19]:
v1wrong = countwrong_allcol(df,1,itemcols = df.columns[df.columns.str.contains('item')])
v2wrong = countwrong_allcol(df,2,itemcols = df.columns[df.columns.str.contains('item')])

Now we apply this function to create a ranked list of items by order of the number of my students getting them wrong.

In [73]:
wrongcount_v1 = pd.DataFrame()
for col in itemcols:
    wrongcount_v1 = wrongcount_v1.append(countwrong(df= df, col = col, ver = 1,compind=['mine','other']))
wrongcount_v1 = wrongcount_v1.sort_values(['number','item'],ascending=False).reset_index(drop = True)
wrongcount_v1

Unnamed: 0,item,number
0,item_12,180
1,item_14,171
2,item_20,136
3,item_22,122
4,item_13,111
5,item_7,105
6,item_8,94
7,item_2,91
8,item_18,81
9,item_9,76


In [74]:
wrongcount_v2 = pd.DataFrame()
for col in itemcols:
    wrongcount_v2 = wrongcount_v2.append(countwrong(df= df, col = col, ver = 2,compind=['other','mine']))
wrongcount_v2 = wrongcount_v2.sort_values(['number','item'],ascending=False).reset_index(drop = True)
wrongcount_v2

Unnamed: 0,item,number
0,item_12,125
1,item_14,106
2,item_20,91
3,item_7,74
4,item_22,71
5,item_13,71
6,item_2,66
7,item_24,59
8,item_9,58
9,item_8,58


## Sec 73

## Sec 74

## Sec 75

# Printing the Keys

In [49]:
v1keys = df.loc[(df.tuid=='NNNNNNNNN')&(df.version==1),itemcols]
v1keys

Unnamed: 0,item_1,item_2,item_3,item_4,item_5,item_6,item_7,item_8,item_9,item_10,...,item_18,item_19,item_20,item_21,item_22,item_23,item_24,item_25,item_26,item_27
0,1,3,1,4,2,4,3,1,2,3,...,4,4,4,1,3,2,3,4,1,1


In [50]:
v2keys = df.loc[(df.tuid=='NNNNNNNNN')&(df.version==2),itemcols]
v2keys

Unnamed: 0,item_1,item_2,item_3,item_4,item_5,item_6,item_7,item_8,item_9,item_10,...,item_18,item_19,item_20,item_21,item_22,item_23,item_24,item_25,item_26,item_27
0,1,1,2,3,1,2,3,4,2,1,...,4,4,4,1,3,2,3,4,1,1


In [75]:
# v1keys = v1keys.T.rename(columns={0:'version1'})
# v2keys = v2keys.T.rename(columns={0:'version2'})
pd.DataFrame(data = {'version1':v1keys,'version2':v2keys},index = itemcols)

ValueError: cannot copy sequence with size 27 to array axis with dimension 1

In [None]:
letters2num_dict = {1:'D',2:'C',3:'B',4:'A'}
exkeys.version