This notebook cleans .txt data on students' exam performance for BIOL1111 and exports a CSV file that can be analyzed in an R script.
#### Author: Christopher Agard

In [1]:
import string

In [2]:
import pandas as pd
import numpy as np
import os,glob

def cleanSoarExam (data, examNum, fileType='flat',
                   colSpec = [(0,9),(10,21),(22,28),(29,30),(30,32),(32,34),(34,36),(36,38),(39,41),(41,68)],
                   soarSessions=[]):
    """Read a fixed-width exam export and return one row per answer sheet.

    :param data: path or file-like object handed to ``pd.read_fwf``.
    :param examNum: value stored in the ``examNumber`` column of every row.
    :param fileType: only ``'flat'`` (fixed-width text) is supported.
    :param colSpec: ``(start, stop)`` pairs for the ten fixed-width fields, in
        the order tuid, last, first, middle, unnamed1, unnamed2, unnamed3,
        soar, ncorrect, item.
    :param soarSessions: ``soar`` values to label ``'mine'``. A list-like, a
        single scalar, or ``None``/``np.nan``/empty for "no sessions".
    :returns: DataFrame with the ``item`` string split into ``item_1`` ..
        ``item_N`` (one character each), plus ``examNumber`` and ``soarType``
        (``'key'`` for the row whose tuid is ``'NNNNNNNNN'``, ``'mine'`` for
        rows in ``soarSessions``, ``'other'`` otherwise).
    :raises ValueError: if ``fileType`` is not ``'flat'``.

    Errors from reading the file propagate to the caller instead of being
    swallowed, so a bad path or column spec fails at the point of the problem.
    """
    if fileType != 'flat':
        # Previously this only printed and then crashed on `return df`.
        raise ValueError("\n{} fileType is not currently supported.".format(fileType))

    columnNames = ['tuid','last','first','middle','unnamed1','unnamed2','unnamed3','soar','ncorrect','item']
    # `colspecs` must be passed by keyword. tuid and item are read as text so
    # that all-digit values are not converted to numbers.
    df = pd.read_fwf(data, colspecs=colSpec, names=columnNames,
                     dtype={'tuid': str, 'item': str})

    df['examNumber'] = examNum

    # Split the answer string into one column per character. The number of
    # items is taken from the data rather than fixed at 27.
    answers = df['item'].fillna('').astype(str)
    itemData = pd.DataFrame([list(answer) for answer in answers], index=df.index)
    itemData.columns = ['item_{}'.format(n) for n in range(1, itemData.shape[1] + 1)]
    df = df.drop(columns='item').join(itemData)

    # Normalise soarSessions: `== np.nan` is never True, so test explicitly.
    if soarSessions is None or (np.isscalar(soarSessions) and pd.isna(soarSessions)):
        sessions = []
    elif np.isscalar(soarSessions):
        sessions = [soarSessions]
    else:
        sessions = list(soarSessions)

    df['soarType'] = np.where(df['soar'].isin(sessions), 'mine', 'other')
    # The answer-key row is marked last so it wins over 'mine'/'other'.
    df.loc[df['tuid'] == 'NNNNNNNNN', 'soarType'] = 'key'
    return df
        

Here we need to write a function to determine how many students get each item wrong.

In [3]:
# import pandas as pd
# import numpy as np

def nwrong (x,key):
    """
    Count the entries of a pandas Series that differ from a reference value.

    :param x: pd.Series of responses
    :param key: reference value; compared by its string form
    :return: number of entries whose string form is not equal to str(key)

    Both the series and the key are converted to strings before comparing,
    so 1 and '1' are treated as the same answer.
    """
    assert isinstance(x, pd.Series)
    responses = x.astype(str)
    target = str(key)
    mismatched = responses.loc[responses != target]
    return mismatched.count()
    

In [4]:
# Quick check: the first lowercase ASCII letter.
string.ascii_lowercase[0]

'a'

# Setting up notebook

In [5]:
import pandas as pd
import numpy as np
import os,glob

# Show up to 50 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 50)

# Getting Exam Data

Here we define paths for getting exam data and outputting results

In [6]:
# Raw-data and results folders for the "home" location.
homesource, homeoutputFolder = (
    'S:/Chris/Temple/Biol1111/Fall 2018/Raw Data/Exam 2',
    'S:/Chris/Temple/Biol1111/Fall 2018/Results/Exam 2',
)
# Raw-data and results folders for the "work" location.
worksource, workoutputFolder = (
    'C:/Users/tuh27554/Documents/BIOL1111/Fall 2018/Raw Data/Exam 2',
    'C:/Users/tuh27554/Documents/BIOL1111/Fall 2018/Results/Exam 2',
)

Here we get a list of source data files. For this notebook we will use the \*.txt files.

In [7]:
# Work from the raw-data folder so the relative glob pattern finds the exports.
os.chdir(homesource)
# glob returns names in an unspecified order; sort them so that files[0] and
# files[1] refer to the same exam versions on every run and every machine.
files = sorted(glob.glob('*.txt'))
print(files)

FileNotFoundError: [WinError 3] The system cannot find the path specified: 'C:/Users/tuh27554/Documents/BIOL1111/Fall 2018/Raw Data/Exam 2'

 And now we read in and clean those data using the *cleanSoarExam* function.

In [None]:
# Two exam versions are expected, one .txt export per version.
assert len(files) >= 2, 'Expected at least two .txt exam files, found {}.'.format(len(files))

# NOTE(review): examNum=1 is passed although the folders are named "Exam 2" - confirm.
df1 = cleanSoarExam(files[0],examNum=1,soarSessions=[81])
# One row is subtracted from each count (presumably the answer-key row - confirm).
print('Version 1 of the exam has {} students.'.format(df1.shape[0]-1))
df2 = cleanSoarExam(files[1],examNum=1,soarSessions=[81])
print('Version 2 of the exam has {} students.'.format(df2.shape[0]-1))
# Write results next to the source that was read: the data came from
# `homesource`, so the output goes to the matching home folder rather than
# the work folder, which does not exist on the same machine.
os.chdir(homeoutputFolder)

In [None]:
# df1.head() 

In [None]:
# df2.head() 

# Analyzing Exam Data

## Determining the most problematic items for the class

We need to identify the item columns over which to apply *nwrong*.

In [None]:
# Boolean masks over the column labels: True where the label contains 'item_'.
isItem1 = df1.columns.str.contains('item_')
isItem2 = df2.columns.str.contains('item_')

# Keep only the per-item answer columns of each exam version.
itemcols1 = df1.columns[isItem1]
itemcols2 = df2.columns[isItem2]

Now we apply the function to determine the number of students who answered incorrectly for version 1 and version 2 separately. We will only print one of these here for an example.

In [None]:
def _count_wrong(exam, itemcols):
    """Return, per item column, how many 'mine' rows differ from the key row.

    The key row and the student rows are selected by their soarType label
    instead of relying on the key being the first row of the frame.
    """
    key = exam.loc[exam.soarType == 'key', itemcols].iloc[0]
    mine = exam.loc[exam.soarType == 'mine', itemcols]
    return mine.apply(lambda col: nwrong(x=col, key=key[col.name]))

v1wrong = _count_wrong(df1, itemcols1)
v2wrong = _count_wrong(df2, itemcols2)
v2wrong

Adding these lists together we get the total number wrong on each item.  If we sort the resulting series in descending order, we will have the guide we need to determine the order for discussing the items in class.

In [None]:
# Item-by-item sum of the wrong-answer counts from both versions
# (aligned on the item labels).
totalwrong = v1wrong.add(v2wrong)
# Most-missed items first.
orderedwrong = totalwrong.sort_values(ascending=False)
orderedwrong

Let's also print the letters for the correct answers for each item.

In [None]:

# Take the answer-key row as a Series (one entry per item) so each answer is
# converted individually; calling int() on a one-element Series is deprecated.
v1KeyRow = df1.loc[df1.soarType=='key',itemcols1].iloc[0]
# Answers are 1-based numbers: 1 -> 'A', 2 -> 'B', ...
v1keys = v1KeyRow.map(lambda answer: string.ascii_uppercase[int(answer)-1])
v1keys
# v2wrong = df2.loc[df2.soarType.isin(['mine','key']),itemcols2].apply(lambda x: nwrong(x=x[1:],key=x[0]))

Now we do the same for version 2 of the exam.

In [None]:
# Version 2 keys must come from df2 (this cell previously read df1).
# Take the answer-key row as a Series so each answer is converted individually;
# calling int() on a one-element Series is deprecated.
v2KeyRow = df2.loc[df2.soarType=='key',itemcols2].iloc[0]
# Answers are 1-based numbers: 1 -> 'A', 2 -> 'B', ...
v2keys = v2KeyRow.map(lambda answer: string.ascii_uppercase[int(answer)-1])
v2keys

## Evaluating student performance

Now we look at scores by section.  For this we can rename *unnamed1* to *version* to keep track of the versions and append the 2 dfs. 

In [None]:
# Stack both exam versions into one frame. DataFrame.append was removed in
# pandas 2.0; pd.concat is the supported equivalent and keeps each frame's
# original row labels.
df = pd.concat([df1, df2])
print(df.shape)
# Written to the current working directory.
df.to_csv('merged exam 2 results.csv')
# df.head()

We first store the maximum possible value for ncorrect in a separate variable, *maxscore*, and then drop the "key" data.

In [None]:
# Highest ncorrect in the merged data, taken before the key row is removed.
maxscore = df['ncorrect'].max()
# Keep every row that is not the answer key.
notKey = df['soarType'] != 'key'
df = df.loc[notKey, :]
print('maxscore:{}'.format(maxscore))
# df.head()

We will use this *df* for the rest of our analysis.

In [None]:
# Summary statistics of ncorrect for each soar value, lowest median first.
summaryStats = ['min','max','median','mean','count']
scores = (
    df.groupby(['soar'])
    .ncorrect
    .agg(summaryStats)
    .sort_values('median')
)
# NOTE(review): the file name says "Exam1" while the data folders are "Exam 2" - confirm.
scores.to_csv('Exam1Score breakdown.csv')
scores

In [None]:
# 75th percentile of ncorrect within each soar group.
q75BySection = df.groupby('soar').ncorrect.quantile(.75)
print(q75BySection)

Now let's look at the distribution of individuals who scored above the 75th percentile in their class.  This gives us an idea of how the high scores are distributed within each section.

In [None]:
def _nAboveOwnQ75(sectionScores):
    """Count the scores strictly above the 75th percentile of the same group."""
    cutoff = sectionScores.quantile(.75)
    return sectionScores.loc[sectionScores > cutoff].count()

# One row per soar value with the number of scores above that group's own cutoff.
highAchieverslocal = (
    df.groupby('soar')
    .ncorrect
    .apply(_nAboveOwnQ75)
    .reset_index()
    .rename(columns = {'ncorrect':'nAbove75p'})
)
highAchieverslocal

Now let's look at the distribution of individuals who scored above the 75th percentile across the entire class.  This gives us an idea of how the high scores are distributed across sections.

In [None]:
# Cutoff computed once over all rows of df.
overallQ75 = df.ncorrect.quantile(.75)
aboveOverallQ75 = df.loc[df.ncorrect > overallQ75]

# Number of rows above the overall cutoff, per soar value.
highAchieversGlobal = (
    aboveOverallQ75
    .groupby('soar')
    .ncorrect
    .count()
    .reset_index()
    .rename(columns = {'ncorrect':'nAbove75p'})
)
print('The overall all median and 75 percentile were {} and {}, \
respectively.'.format(df.ncorrect.median(),overallQ75))
highAchieversGlobal