In [None]:
from matplotlib import pyplot as plt
from hashlib import md5
import numpy as np
import pandas as pd
%matplotlib inline

## 2017

In [None]:
# Gradebook column names of the 2017 assignments, grouped by course period.
# assignments[0] lists the early-period columns, assignments[1] the late-period ones.
assignments = [
    # Index 0: early period
    [
        'Assignment 2 (375591)',
        'Assignment 3 (380197)',
        'Assignment 4 (384309)',
        'Assignment 5 (388302)',
        'Assignment 6 (392378)',
        'Assignment 7 (399215)',
    ],
    # Index 1: late period
    [
        'Assignment 8 (403014)',
        'Assignment 9 (406816)',
        'Assignment 10 (410342)',
        'Assignment 11 (413824)',
        'Assignment 12 (416215)',
        'Assignment 13 (421191)',
    ],
]

In [None]:
# Build the preprocessed 2017 table (df_2017) from the raw gradebook export.
# Resulting columns, in order: uid, treatment, teamwork, midterm, final, year,
# early/late assignment means before the bonus correction ("_unfixed") and after it.

# Load raw data
grade_2017 = 'data/grades/2017/2019-07-12T1714_Grades-SI_301_001_FA_2017.csv'
df = pd.read_csv(grade_2017)

# Create data frame (same row index as the raw gradebook)
df_2017 = pd.DataFrame(index=df.index)

# Create uid based on hash of username (lower-cased, UTF-8 encoded, MD5 hex digest).
# The digests are assigned as a whole column so the column is created with a
# string-compatible dtype instead of a numeric placeholder filled row by row.
df_2017['uid'] = df['SIS Login ID'].map(
    lambda login: md5(str(login).lower().encode('utf-8')).hexdigest())

# Add treatment and teamwork columns.
# Group code 1.0 -> teamwork 'Early' / treatment 'experiment',
# group code 2.0 -> teamwork 'Late'  / treatment 'control'.
# Any other value (including a missing one) leaves both columns missing.
group_code = df['Early/Late Team Work Assignment (373441)']
df_2017['treatment'] = group_code.map({1.0: 'experiment', 2.0: 'control'})
df_2017['teamwork'] = group_code.map({1.0: 'Early', 2.0: 'Late'})

# Add midterm column
df_2017['midterm'] = df['Midterm (401592)']

# Add final column
df_2017['final'] = df['Final (437773)']

# Add year column
df_2017['year'] = '2017'

# Calculate assignment mean for period (before the bonus correction).
# skipna=False: a student with any missing assignment in the period gets NaN.
df_2017['early_assignments_unfixed'] = df.loc[:, assignments[0]].mean(axis=1, skipna=False)
df_2017['late_assignments_unfixed'] = df.loc[:, assignments[1]].mean(axis=1, skipna=False)

# Adjust for bonus given to only treatment group
# Assign bonus to control in same fashion:
# every assigned student gets +10 (capped at 100) on the assignments of the
# period indexed by individual_period -- the late period (1) for group code 1.0
# and the early period (0) for group code 2.0. Rows with no group are untouched.
# clip() keeps a missing grade missing; a scalar min(100, NaN) would evaluate to
# 100 and silently turn a missing assignment into a perfect score.
for individual_period, code in ((1, 1.0), (0, 2.0)):
    bonus_columns = assignments[individual_period]
    in_group = (group_code == code).to_numpy()[:, None]
    with_bonus = (df[bonus_columns] + 10).clip(upper=100)
    df[bonus_columns] = np.where(
        in_group, with_bonus.to_numpy(), df[bonus_columns].to_numpy())

# Calculate assignment mean for period (after the bonus correction)
df_2017['early_assignments'] = df.loc[:, assignments[0]].mean(axis=1, skipna=False)
df_2017['late_assignments'] = df.loc[:, assignments[1]].mean(axis=1, skipna=False)


In [None]:
# Discard rows whose treatment is missing, then write the table to disk
df_2017 = df_2017.loc[df_2017['treatment'].notna()]
df_2017.to_csv('preprocessed/grades_2017.csv')

In [None]:
# Histogram of the early-period assignment mean, one series per treatment value,
# in 10-point bins from 0 to 100
plt.hist(
    [df_2017.loc[df_2017['treatment'] == arm, 'early_assignments']
     for arm in ('control', 'experiment')],
    bins=range(0, 101, 10),
    label=['control', 'experiment'],
)
plt.legend()

## 2018

In [None]:
# Gradebook column names of the 2018 assignments, grouped by course period.
# assignments[0] lists the early-period columns, assignments[1] the late-period ones.
# The first two names have no space before the number; they are kept exactly as written.
assignments = [
    # Index 0: early period
    [
        'Assignment2 (603916)',
        'Assignment3 (611541)',
        'Assignment 4 (614794)',
        'Assignment 5 (622368)',
        'Assignment 6 (628783)',
        'Assignment 7 (632651)',
    ],
    # Index 1: late period
    [
        'Assignment 8 (636118)',
        'Assignment 9 (654039)',
        'Assignment 10 (656450)',
        'Assignment 11 (656451)',
        'Assignment 12 (656452)',
        'Assignment 13 (656453)',
    ],
]

In [None]:
# Load the raw 2018 gradebook, key it by hashed login, and start the
# preprocessed table df_2018 with the midterm, final and year columns.

# Load raw data
grade_2018 = 'data/grades/2018/2019-07-12T1704_Grades-SI_301_001_FA_2018.csv'
df = pd.read_csv(grade_2018)

# Remove rows with no uniqname.
# .copy() makes the filtered frame independent, so adding the uid column below
# does not write into a slice of the frame returned by read_csv.
df = df[df['SIS Login ID'].notna()].copy()

# Create uid based on hash of username (lower-cased, UTF-8 encoded, MD5 hex digest).
# Assigned as a whole column so it is created with a string-compatible dtype
# instead of a numeric placeholder that is then filled with strings.
df['uid'] = df['SIS Login ID'].map(
    lambda uniqname: md5(str(uniqname).lower().encode('utf-8')).hexdigest())

# Use uid as index
df = df.set_index('uid')

# Create data frame
df_2018 = pd.DataFrame(index=df.index)

# NOTE(review): the raw login ('SIS Login ID') is not copied into df_2018 --
# presumably so the exported table is keyed by the hashed uid only; confirm
# before adding a uniqname column here.

# Add midterm column
df_2018['midterm'] = df['Midterm (622350)']

# Add final column
df_2018['final'] = df['Final (664612)']

# Add year
df_2018['year'] = '2018'

In [None]:
# Attach the 2018 group assignment to df_2018 and compute the per-period
# assignment means with the bonus correction applied.

# Load treatment data
assignment_file = 'data/grades/2018/Working groups and pairings/group_assignments.csv'
df_treatment = pd.read_csv(assignment_file)

# Create uid based on hash of username (same recipe as for the gradebook:
# lower-cased, UTF-8 encoded, MD5 hex digest) and use it as index
df_treatment['uid'] = df_treatment['uniqname'].map(
    lambda uniqname: md5(str(uniqname).lower().encode('utf-8')).hexdigest())
df_treatment = df_treatment.set_index('uid')

# Report group-assignment rows that have no matching gradebook row
in_gradebook = df_treatment.index.isin(df_2018.index)
for uid, uniqname in df_treatment.loc[~in_gradebook, 'uniqname'].items():
    print('Skipping uid: {}'.format(uid), uniqname)

# Add treatment column
# 'Early' working group -> teamwork 'Early' / treatment 'experiment',
# 'Late'  working group -> teamwork 'Late'  / treatment 'control'.
# Students without a (recognised) working group keep missing values.
working_group = df_treatment.loc[in_gradebook, 'Working Group'].reindex(df_2018.index)
df_2018['treatment'] = working_group.map({'Early': 'experiment', 'Late': 'control'})
df_2018['teamwork'] = working_group.map({'Early': 'Early', 'Late': 'Late'})

# Adjust for bonus given to only treatment group
# Assign bonus to control in same fashion:
# +10 (capped at 100) on the assignments of the period indexed by
# individual_period -- the late period (1) for the 'Early' teamwork group and
# the early period (0) for the 'Late' group.
# The group is read from the 'teamwork' column: 'treatment' only ever holds
# 'experiment'/'control', so testing it against 'Early'/'Late' never matched
# and the correction was silently skipped.
# The correction is applied to a copy of the grades so that re-running this
# cell cannot stack the bonus on the raw frame, and clip() keeps missing grades
# missing (a scalar min(100, NaN) would evaluate to 100).
grades_with_bonus = df[assignments[0] + assignments[1]].copy()
for individual_period, team_period in ((1, 'Early'), (0, 'Late')):
    bonus_columns = assignments[individual_period]
    group_uids = df_2018.index[(df_2018['teamwork'] == team_period).to_numpy()]
    in_group = grades_with_bonus.index.isin(group_uids)[:, None]
    with_bonus = (grades_with_bonus[bonus_columns] + 10).clip(upper=100)
    grades_with_bonus[bonus_columns] = np.where(
        in_group, with_bonus.to_numpy(), grades_with_bonus[bonus_columns].to_numpy())

# Calculate assignment mean for period.
# skipna=False: a student with any missing assignment in the period gets NaN.
df_2018['early_assignments'] = grades_with_bonus[assignments[0]].mean(axis=1, skipna=False)
df_2018['late_assignments'] = grades_with_bonus[assignments[1]].mean(axis=1, skipna=False)


In [None]:
# Discard rows whose treatment is missing, then write the table to disk
df_2018 = df_2018.loc[df_2018['treatment'].notna()]
df_2018.to_csv('preprocessed/grades_2018.csv')

In [None]:
# Histogram of the early-period assignment mean, one series per treatment value,
# in 10-point bins from 0 to 100
plt.hist(
    [df_2018.loc[df_2018['treatment'] == arm, 'early_assignments']
     for arm in ('control', 'experiment')],
    bins=range(0, 101, 10),
    label=['control', 'experiment'],
)
plt.legend()

## 2019

In [None]:
# Gradebook column names of the 2019 assignments, grouped by course period.
# assignments[0] lists the early-period columns, assignments[1] the late-period ones.
assignments = [
    # Index 0: early period
    [
        'Assignment 2 (864086)',
        'Assignment 3 (864087)',
        'Assignment 4 (871847)',
        'Assignment 5 (871848)',
        'Assignment 6 (884293)',
        'Assignment 7 (884294)',
    ],
    # Index 1: late period
    [
        'Assignment 8 (894111)',
        'Assignment 9 (894112)',
        'Assignment 10 (894113)',
        'Assignment 11 (905611)',
        'Assignment 12 (905612)',
        'Assignment 13 (905613)',
    ],
]

In [None]:
# Load the raw 2019 gradebook, key it by hashed login, and start the
# preprocessed table df_2019 with the midterm, final and year columns.

# Load raw data
grade_2019 = 'data/grades/2019/2022-06-15T1627_Grades-SI_301_001_FA_2019.csv'
df = pd.read_csv(grade_2019)

# Remove rows with no uniqname.
# .copy() makes the filtered frame independent, so adding the uid column below
# does not write into a slice of the frame returned by read_csv.
df = df[df['SIS Login ID'].notna()].copy()

# Create uid based on hash of username (lower-cased, UTF-8 encoded, MD5 hex digest).
# Assigned as a whole column so it is created with a string-compatible dtype
# instead of a numeric placeholder that is then filled with strings.
df['uid'] = df['SIS Login ID'].map(
    lambda uniqname: md5(str(uniqname).lower().encode('utf-8')).hexdigest())

# Use uid as index
df = df.set_index('uid')

# Create data frame
df_2019 = pd.DataFrame(index=df.index)

# NOTE(review): the raw login ('SIS Login ID') is not copied into df_2019 --
# presumably so the exported table is keyed by the hashed uid only; confirm
# before adding a uniqname column here.

# Add midterm column
df_2019['midterm'] = df['Midterm (873981)']

# Add final column
df_2019['final'] = df['Final (908348)']

# Add year
df_2019['year'] = '2019'

In [None]:
# Attach the 2019 group assignment to df_2019 and compute the per-period
# assignment means with the bonus correction applied.

# Load treatment data
assignment_file = 'data/grades/2019/Working groups and pairings/group_assignments.csv'
df_treatment = pd.read_csv(assignment_file)

# Create uid based on hash of username (same recipe as for the gradebook:
# lower-cased, UTF-8 encoded, MD5 hex digest) and use it as index
df_treatment['uid'] = df_treatment['uniqname'].map(
    lambda uniqname: md5(str(uniqname).lower().encode('utf-8')).hexdigest())
df_treatment = df_treatment.set_index('uid')

# Report group-assignment rows that have no matching gradebook row
in_gradebook = df_treatment.index.isin(df_2019.index)
for uid, uniqname in df_treatment.loc[~in_gradebook, 'uniqname'].items():
    print('Skipping uid: {}'.format(uid), uniqname)

# Add treatment column
# Note the labels used for this year:
# 'Early' working group -> teamwork 'Early' / treatment 'control',
# 'Late'  working group -> teamwork 'Late'  / treatment 'experiment'.
# Students without a (recognised) working group keep missing values.
working_group = df_treatment.loc[in_gradebook, 'Working Group'].reindex(df_2019.index)
df_2019['treatment'] = working_group.map({'Early': 'control', 'Late': 'experiment'})
df_2019['teamwork'] = working_group.map({'Early': 'Early', 'Late': 'Late'})

# Adjust for bonus given to only treatment group
# Assign bonus to control in same fashion:
# +10 (capped at 100) on the assignments of the period indexed by
# individual_period -- the late period (1) for the 'Early' teamwork group and
# the early period (0) for the 'Late' group.
# The group is read from the 'teamwork' column: 'treatment' only ever holds
# 'experiment'/'control', so testing it against 'Early'/'Late' never matched
# and the correction was silently skipped.
# NOTE(review): this activates the correction for 2019 -- confirm that the
# same bonus scheme was actually used this year.
# The correction is applied to a copy of the grades so that re-running this
# cell cannot stack the bonus on the raw frame, and clip() keeps missing grades
# missing (a scalar min(100, NaN) would evaluate to 100).
grades_with_bonus = df[assignments[0] + assignments[1]].copy()
for individual_period, team_period in ((1, 'Early'), (0, 'Late')):
    bonus_columns = assignments[individual_period]
    group_uids = df_2019.index[(df_2019['teamwork'] == team_period).to_numpy()]
    in_group = grades_with_bonus.index.isin(group_uids)[:, None]
    with_bonus = (grades_with_bonus[bonus_columns] + 10).clip(upper=100)
    grades_with_bonus[bonus_columns] = np.where(
        in_group, with_bonus.to_numpy(), grades_with_bonus[bonus_columns].to_numpy())

# Calculate assignment mean for period.
# skipna=False: a student with any missing assignment in the period gets NaN.
df_2019['early_assignments'] = grades_with_bonus[assignments[0]].mean(axis=1, skipna=False)
df_2019['late_assignments'] = grades_with_bonus[assignments[1]].mean(axis=1, skipna=False)


In [None]:
# Discard rows whose treatment is missing, then write the table to disk
df_2019 = df_2019.loc[df_2019['treatment'].notna()]
df_2019.to_csv('preprocessed/grades_2019.csv')

In [None]:
# Histogram of the early-period assignment mean, one series per treatment value,
# in 10-point bins from 0 to 100
plt.hist(
    [df_2019.loc[df_2019['treatment'] == arm, 'early_assignments']
     for arm in ('control', 'experiment')],
    bins=range(0, 101, 10),
    label=['control', 'experiment'],
)
plt.title("NOTE: teamwork-control vs individual")
plt.legend()
