# TEPEI R1, R2, and R3 Individualized Report 

In [1]:
import pandas as pd 
import matplotlib.pyplot as plt 
import matplotlib.patches as mpatches
import textwrap 
from pathlib import Path 
import os 
import subprocess 

from utils import get_latex_code 

In [2]:
# Load the roster plus the per-round and combined datasets.
def _load_sorted_by_concept(parquet_path):
    """Read a parquet dataset and return its rows ordered by `concept_name`."""
    return pd.read_parquet(parquet_path).sort_values('concept_name')


# roster: maps each username to the student's display name
roster = pd.read_csv('./data/roster.csv')
username_name_map = roster.set_index('username')['name'].to_dict()

r1_df = _load_sorted_by_concept('./data/r1_dataset.parquet')
r2_df = _load_sorted_by_concept('./data/r2_dataset.parquet')
r3_df = _load_sorted_by_concept('./data/r3_dataset.parquet')
all_df = _load_sorted_by_concept('./data/all_dataset.parquet')
nlp_df = pd.read_parquet('./data/nlp_scores.parquet')

# Students with at least one submitted report. The last three characters of a
# doc_id (the round suffix, e.g. "_R1") are dropped to recover the username.
submitted_usernames = all_df['doc_id'].str[:-3]
student_universe = set(submitted_usernames.unique())

## Calculate each report's coverage per topic and the class-average coverage

In [3]:
# Total score of every R1 document, attached to each of that document's rows
r1_doc_totals = r1_df.groupby('doc_id')['score'].sum().reset_index(name='doc_sum')
r1_df = pd.merge(r1_df, r1_doc_totals, on='doc_id', how='left')

# coverage = share of the document's total score that falls on this topic
r1_df['coverage'] = r1_df['score'].div(r1_df['doc_sum'])

# class-wide mean coverage per topic for R1
r1_class_distribution = r1_df.groupby('concept_name')['coverage'].mean()

In [4]:
# Total score of every R2 document, attached to each of that document's rows
r2_doc_totals = r2_df.groupby('doc_id')['score'].sum().reset_index(name='doc_sum')
r2_df = pd.merge(r2_df, r2_doc_totals, on='doc_id', how='left')

# coverage = share of the document's total score that falls on this topic
r2_df['coverage'] = r2_df['score'].div(r2_df['doc_sum'])

# class-wide mean coverage per topic for R2
r2_class_distribution = r2_df.groupby('concept_name')['coverage'].mean()

In [5]:
# Total score of every R3 document, attached to each of that document's rows
r3_doc_totals = r3_df.groupby('doc_id')['score'].sum().reset_index(name='doc_sum')
r3_df = pd.merge(r3_df, r3_doc_totals, on='doc_id', how='left')

# coverage = share of the document's total score that falls on this topic
r3_df['coverage'] = r3_df['score'].div(r3_df['doc_sum'])

# class-wide mean coverage per topic for R3
r3_class_distribution = r3_df.groupby('concept_name')['coverage'].mean()

In [6]:
# Recover the username by dropping the three-character round suffix of doc_id
all_df['username'] = all_df['doc_id'].str[:-3]

# Total score of every document, attached to each of that document's rows
all_doc_totals = all_df.groupby(['doc_id'])['score'].sum().reset_index(name='doc_sum')
all_df = pd.merge(all_df, all_doc_totals, on='doc_id', how='left')

# coverage = share of the document's total score that falls on this topic
all_df['coverage'] = all_df['score'].div(all_df['doc_sum'])

# Average within each student first, then across students, so every student
# carries the same weight regardless of how many reports they submitted.
per_student_topic_coverage = (
    all_df.groupby(['username', 'concept_name'])['coverage'].mean().reset_index()
)
all_class_distribution = per_student_topic_coverage.groupby('concept_name')['coverage'].mean()

## Prepare the LaTeX itemize lists for the topic explanations

In [7]:
def _latex_itemize(df):
    """Build a LaTeX ``itemize`` environment describing the topics in *df*.

    Each distinct (``concept_name``, ``concept_prompt``) pair becomes one
    ``\\item`` with the name in bold followed by its prompt, in the order the
    pairs first appear in *df*.

    Args:
        df: DataFrame with ``concept_name`` and ``concept_prompt`` columns.

    Returns:
        The itemize environment as a single newline-separated string.
    """
    topics = df[['concept_name', 'concept_prompt']].drop_duplicates()
    lines = ["\\begin{itemize}"]
    # NOTE(review): values are inserted verbatim; LaTeX special characters
    # (&, %, _, ...) in a name or prompt are not escaped here — confirm the
    # data is already LaTeX-safe.
    lines.extend(
        f"    \\item \\textbf{{{row['concept_name']}:}} {row['concept_prompt']}"
        for row in topics.to_dict('records')
    )
    lines.append("\\end{itemize}")
    return "\n".join(lines)


# One topic list per round, plus one for the combined dataset
r1_topics = _latex_itemize(r1_df)
r2_topics = _latex_itemize(r2_df)
r3_topics = _latex_itemize(r3_df)
all_topics = _latex_itemize(all_df)

## Generate the individual reports 

In [8]:
def _style_coverage_axis(ax):
    """Apply the axis labels and percent formatting shared by the coverage charts."""
    ax.set_xlabel('Topic Coverage (%)')
    ax.set_ylabel('Topic')
    ax.xaxis.set_major_formatter('{x:,.0%}')


def _plot_student_vs_class(class_distribution, round_df, doc_id, out_path):
    """Save a horizontal bar chart of one report's topic coverage next to the class mean.

    Args:
        class_distribution: Series of class-average coverage indexed by concept name.
        round_df: Round dataset with ``doc_id``, ``concept_name`` and ``coverage`` columns.
        doc_id: Document id of the student's report for this round.
        out_path: Destination of the PNG file.
    """
    comparison = pd.DataFrame({
        'Class Average': class_distribution,
        'Student': round_df.loc[round_df['doc_id'] == doc_id].set_index('concept_name')['coverage'],
    })
    # wrap long topic names so the tick labels stay readable
    comparison.index = [textwrap.fill(item.strip(), width=40) for item in comparison.index]
    fig, ax = plt.subplots(figsize=(10, 8), facecolor='w')
    # legend=False: a figure-level legend is drawn below instead of the axes legend
    comparison[::-1].plot.barh(ax=ax, legend=False)
    _style_coverage_axis(ax)
    class_avg = mpatches.Patch(color='C0', label='Class Average')
    student = mpatches.Patch(color='C1', label='Student')
    fig.legend(handles=[student, class_avg], ncol=2, loc='lower center')
    fig.savefig(out_path, bbox_inches='tight')
    # close this specific figure rather than whichever one happens to be current
    plt.close(fig)


def _plot_rounds_over_time(rounds_df, username, out_path):
    """Save a horizontal bar chart comparing one student's coverage across R1, R2 and R3.

    Args:
        rounds_df: Combined dataset with ``doc_id``, ``concept_name`` and ``coverage`` columns.
        username: Student whose ``{username}_R1``/``_R2``/``_R3`` documents are plotted.
        out_path: Destination of the PNG file.
    """
    def coverage_for(round_label):
        doc_id = f'{username}_{round_label}'
        return rounds_df.loc[rounds_df['doc_id'] == doc_id].set_index('concept_name')['coverage']

    over_time = pd.DataFrame({
        'R3': coverage_for('R3'),
        'R2': coverage_for('R2'),
        'R1': coverage_for('R1'),
    })
    over_time.index = [textwrap.fill(item.strip(), width=40) for item in over_time.index]
    fig, ax = plt.subplots(figsize=(10, 8), facecolor='w')
    over_time[::-1].plot.barh(ax=ax, color=['C0', 'C1', 'C2'], legend=False)
    _style_coverage_axis(ax)
    r3 = mpatches.Patch(color='C0', label='R3')
    r2 = mpatches.Patch(color='C1', label='R2')
    r1 = mpatches.Patch(color='C2', label='R1')
    fig.legend(handles=[r1, r2, r3], ncol=3, loc='lower center')
    fig.savefig(out_path, bbox_inches='tight')
    plt.close(fig)


def _plot_nlp_score(nlp_rows_by_user, username, nlp_score, out_path):
    """Save a line chart of one NLP score per round, highlighting *username*.

    Every roster member is drawn as a thin gray line; the highlighted student
    is drawn as a thicker colored line with markers.

    Args:
        nlp_rows_by_user: Mapping of username to that student's NLP rows indexed by ``round``.
        username: Student to highlight.
        nlp_score: Name of the score column to plot.
        out_path: Destination of the PNG file.
    """
    fig, ax = plt.subplots(figsize=(4, 6))
    for roster_username, rows in nlp_rows_by_user.items():
        if rows.empty:
            # roster member without NLP rows: there is nothing to draw
            continue
        if roster_username == username:
            rows[nlp_score].plot.line(ax=ax, alpha=1, color='C0', linewidth=2, marker='o')
        else:
            rows[nlp_score].plot.line(ax=ax, alpha=0.5, color='gray', linewidth=1)
    ax.set_xticks([1, 2, 3], labels=['R1', 'R2', 'R3'])
    ax.set_xlim(0.9, 3.1)
    ax.set_xlabel('Referee Report')
    ax.set_ylabel(f"{nlp_score.capitalize()} Score")
    fig.savefig(out_path, bbox_inches='tight')
    plt.close(fig)


# Each roster member's NLP rows, computed once because they do not depend on
# which report is being generated. A literal prefix match is used: a regex
# substring match would also pick up other students whose username merely
# contains this one (e.g. "ann" inside "joann_R1").
# NOTE(review): a username that is a prefix of another up to an underscore
# ("ann" vs "ann_marie") would still collide — confirm usernames rule this out.
nlp_rows_by_user = {
    roster_username: nlp_df.loc[
        nlp_df['doc_id'].str.startswith(f"{roster_username}_", na=False)
    ].set_index('round')
    for roster_username in username_name_map
}

# figure files in the positional order expected by get_latex_code
figure_files = [
    'r1.png', 'r2.png', 'r3.png', 'over_time.png',
    'concreteness.png', 'subjectiveness.png', 'specificity.png',
]

# make sure the destination folder exists before any PDF is moved into it
output_dir = Path('./output')
output_dir.mkdir(parents=True, exist_ok=True)

for username, name in username_name_map.items():
    # only create a report for a student who has completed at least one assignment
    if username not in student_universe:
        continue

    # student vs class plot for each round
    _plot_student_vs_class(r1_class_distribution, r1_df, f'{username}_R1', './r1.png')
    _plot_student_vs_class(r2_class_distribution, r2_df, f'{username}_R2', './r2.png')
    _plot_student_vs_class(r3_class_distribution, r3_df, f'{username}_R3', './r3.png')

    # r1, r2, and r3 over time plot
    _plot_rounds_over_time(all_df, username, './over_time.png')

    # NLP scores plots
    for nlp_score in ['concreteness', 'subjectiveness', 'specificity']:
        _plot_nlp_score(nlp_rows_by_user, username, nlp_score, f"{nlp_score}.png")

    # get latex code for the document
    latex_content = get_latex_code(name, r1_topics, r2_topics, r3_topics, all_topics, *figure_files)

    # write to a temporary file; explicit encoding so non-ASCII names do not
    # depend on the locale's default encoding
    with open('./tmp.tex', 'w', encoding='utf-8') as f:
        f.write(latex_content)

    # convert to PDF
    # NOTE(review): absolute macOS-style path; this fails where pdflatex lives elsewhere
    completed = subprocess.run(['/Library/TeX/texbin/pdflatex', 'tmp.tex'], stdout=subprocess.DEVNULL)
    if not Path('./tmp.pdf').exists():
        # fail before the cleanup below so the intermediate files can still be inspected
        raise FileNotFoundError(
            f"pdflatex exited with code {completed.returncode} and produced no tmp.pdf for {name!r}"
        )

    # delete intermediate files
    for fpath in Path('.').glob('tmp.*'):
        if not str(fpath).endswith('pdf'):
            os.remove(fpath)
    for figure_file in figure_files:
        os.remove(figure_file)

    # rename file to student name and move to output folder
    Path('./tmp.pdf').rename(output_dir / f'{name}.pdf')