<h1 style="text-align: center;">Script for creating and analyzing longitudinal data from the Tec21 academic model Cohort 2019 and 2020</h1>

#### School: Engineering and Science
#### Period: AD2019-FJ2022

#### Table of contents                                                                                                                
                                            
- [1. Datasets creation](#1.-Datasets-creation)                                                                               
    - [1.1 Data subset extraction](#4.1-Data-subset-extraction)                                                                     
    - [1.2 Statistical description of the selected variables](#4.2-Statistical-description-of-the-selected-variables)               
    - [1.3 Graphical description of the selected variables](#4.3-Graphical-description-of-the-selected-variables)                   
- [2. Longitudinal analysis](#2.-Longitudinal-analysis)

In [1]:
# Importing needed libraries
import stat
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import re
from datetime import datetime
from ete3 import Tree
import warnings
import plotly.express as px
import seaborn as sbn
from ipywidgets import widgets
from scipy import stats
import statsmodels.api as sm
from statsmodels.formula.api import ols
import statsmodels.stats.anova as anova
from statsmodels.stats.multitest import multipletests
import scikit_posthocs as sp
import plotly.express as px
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides pandas/seaborn deprecation and SettingWithCopy warnings.
warnings.filterwarnings('ignore')
# Folder that holds the input and output CSV files (relative to the working directory).
path_files = r'EICData'
# Alternative data locations kept for reference (Google Drive mount / local export):
# path_files= r'/content/drive/MyDrive/Proyecto-Tec21Competences-DataAnalysis/sourcecode/assets/EIC'
# path_files = r'EIC-20230322T203344Z-001/EIC'

In [None]:
# Load the tidy dataset from the configured data folder.
# A forward slash works on every OS and avoids the invalid '\T' escape
# that the hard-coded Windows-style path contained.
full_df = pd.read_csv(f'{path_files}/Transformed_full_df_with_studentid_v1.0.csv', index_col='Unnamed: 0')
print(f'Dataframe loaded! \n The tidy dataframe has {len(full_df.index)} rows and {len(full_df.columns)} columns.')

In [None]:
#pd.set_option('display.max_column', None)
#full_df[full_df['student.id']==138480]
# full_df.columns

In [None]:
# Temporary sanity check: how many distinct IBQ students have at least one
# record whose competence type is 'Disciplinary'.
is_ibq = full_df['program.major_id'] == 'IBQ'
is_disciplinary = full_df['competence.type'] == 'Disciplinary'
full_df.loc[is_ibq & is_disciplinary, 'student.id'].nunique(dropna=False)

#### Select cohort

In [None]:
# Select the cohort(s) to analyse.
cohort_options = full_df['student.cohort.id'].unique().tolist()
# Default to every cohort so the filtering cell still runs if the button is never clicked.
cohort = list(cohort_options)

# The widget and callback use cell-specific names: the selection cells below
# rebind generic names, and a callback that looked up a shared global widget
# at click time would read the wrong widget after "Run All".
cohort_select = widgets.SelectMultiple(
    options=cohort_options,
    description='Select cohort',
    disabled=False
)
cohort_button = widgets.Button(description='Select')
display(cohort_select)
display(cohort_button)


def on_cohort_selected(_button):
    """Store the cohorts currently highlighted in the widget in the global `cohort`."""
    global cohort
    cohort = list(cohort_select.value)


cohort_button.on_click(on_cohort_selected)

#### Select programs

In [None]:
# Select the academic programs to analyse.
program_options = full_df['program.major_id'].unique().tolist()
# Default to every program so the filtering cell still runs if the button is never clicked.
programs = list(program_options)

# Cell-specific widget/callback names: a callback that reads a shared global
# widget at click time would pick up whichever selection cell ran last.
programs_select = widgets.SelectMultiple(
    options=program_options,
    description='Select programs',
    disabled=False
)
programs_button = widgets.Button(description='Select')
display(programs_select)
display(programs_button)


def on_programs_selected(_button):
    """Store the programs currently highlighted in the widget in the global `programs`."""
    global programs
    programs = list(programs_select.value)


programs_button.on_click(on_programs_selected)

#### Select region

In [None]:
# Select the campus regions to analyse.
region_options = full_df['campus.region_name'].unique().tolist()
# Default to every region so the filtering cell still runs if the button is never clicked.
regions = list(region_options)

# Cell-specific widget/callback names: a callback that reads a shared global
# widget at click time would pick up whichever selection cell ran last.
regions_select = widgets.SelectMultiple(
    options=region_options,
    description='Select regions',
    disabled=False
)
regions_button = widgets.Button(description='Select')
display(regions_select)
display(regions_button)


def on_regions_selected(_button):
    """Store the regions currently highlighted in the widget in the global `regions`."""
    global regions
    regions = list(regions_select.value)


regions_button.on_click(on_regions_selected)

#### Select sex

In [None]:
# Select the values of 'student.isWoman' to analyse.
sex_options = full_df['student.isWoman'].unique().tolist()
# Default to every value so the filtering cell still runs if the button is never clicked.
sex = list(sex_options)

# Cell-specific widget/callback names so this cell does not rebind a widget
# variable that callbacks registered by other selection cells look up at click time.
sex_select = widgets.SelectMultiple(
    options=sex_options,
    description='Select sex',
    disabled=False
)
sex_button = widgets.Button(description='Select')
display(sex_select)
display(sex_button)


def on_sex_selected(_button):
    """Store the values currently highlighted in the widget in the global `sex`."""
    global sex
    sex = list(sex_select.value)


sex_button.on_click(on_sex_selected)

#### Select grouping var

In [None]:
# Select the column used to split students into groups for the between-group tests.
grouping_options = ['campus.region_name', 'student.cohort.id','program.major_id', 'student.isWoman']
# Default to the first option so `grouping_var` exists even if the button is never clicked.
grouping_var = grouping_options[0]

cb_list = widgets.Dropdown(
    options=grouping_options,
    description='Select grouping var',
    disabled=False,
)
grouping_button = widgets.Button(description='Select')
display(cb_list)
display(grouping_button)


def on_grouping_selected(_button):
    """Store the dropdown's current value in the global `grouping_var`."""
    global grouping_var
    grouping_var = cb_list.value


grouping_button.on_click(on_grouping_selected)

In [None]:
# Keep only the rows that match every selection made with the widgets above.
selection = {
    'student.cohort.id': cohort,
    'program.major_id': programs,
    'campus.region_name': regions,
    'student.isWoman': sex,
}
row_mask = np.ones(len(full_df), dtype=bool)
for column, chosen in selection.items():
    row_mask &= full_df[column].isin(chosen).to_numpy()
cohort_df = full_df[row_mask]

# Each selected value must survive the combined filter; otherwise report which
# column is off. (The previous message referenced an undefined variable, so a
# failed check surfaced as a NameError instead of this explanation.)
for column, chosen in selection.items():
    present = set(cohort_df[column].unique())
    assert present == set(chosen), \
        f'Error! Column {column} includes {present} and the selected values are {chosen}'
print(f'Selected dataframe has {len(cohort_df.index)} rows for:\n cohort(s) {cohort},\n program(s) {programs},\n region(s) {regions},\n and sex {sex}.')

In [None]:
# Semesters every retained student must have records for.
required_semesters = {1, 2, 3, 4, 5, 6}

# One row per student holding the set of semesters that student appears in.
grouped = (
    cohort_df
    .groupby('student.id')['semesters_from.enrollment']
    .apply(set)
    .reset_index()
)

# Students whose semester set covers all required semesters.
has_all_semesters = grouped['semesters_from.enrollment'].apply(required_semesters.issubset)
filtered_students = grouped[has_all_semesters]
valid_student_ids = filtered_students['student.id']

# Rows of those students, restricted to the required semesters.
keep_student = cohort_df['student.id'].isin(valid_student_ids)
keep_semester = cohort_df['semesters_from.enrollment'].isin(sorted(required_semesters))
filtered_cohort = cohort_df[keep_student & keep_semester]

# Number of students retained.
filtered_cohort['student.id'].nunique(dropna=False)

In [None]:
# The filtered data must contain semesters 1 through 6 and nothing else.
sem_list = filtered_cohort['semesters_from.enrollment'].unique()
expected_semesters = set(range(1, 7))
assert set(sem_list) == expected_semesters, 'Error! There is at least one incorrect semester'
print(f'Semesters list {sem_list}')

##### Computing the Observed_competencies_ratio for students

In [11]:
# Observed_ratio = mean of 'subcompetence.level_assigned' per student and semester.
# NOTE(review): presumably that column is boolean, making this the share of
# True evaluations — confirm against the dataset.
ratio_keys = ['student.id', 'semesters_from.enrollment']
ratio_df = (
    filtered_cohort
    .groupby(ratio_keys)['subcompetence.level_assigned']
    .mean()
    .rename('Observed_ratio')
    .reset_index()
)

# Attach the ratio to every row of the same student and semester.
filtered_cohort = filtered_cohort.merge(ratio_df, on=ratio_keys)

In [12]:
# Every student in ratio_df must have exactly the semesters (1, 2, 3, 4, 5, 6), in that order.
tuple_df = ratio_df.groupby(['student.id'])['semesters_from.enrollment'].unique().apply(tuple)
assert tuple_df[tuple_df != (1, 2, 3, 4, 5, 6)].empty, 'Error! There are students whose semesters differ from [1, 2, 3, 4, 5, 6]'

#### Modifying academic program.

Version 1: map every program to its entry program.

In [31]:
# Entry program -> every program code that is collapsed into it.
# NOTE(review): this cell (Version 1) and the Version 2 cell look like
# alternatives; after this replacement each student has a single program code.
entry_program_members = {
    'IIT': ['IIT', 'IIS', 'IMT', 'IC', 'IE', 'IID', 'IM', 'IMD'],
    'ICI': ['ICI', 'IDM', 'INA', 'IFI'],
    'ICT': ['ICT', 'ITC', 'IRS', 'ITD'],
    'IBQ': ['IBQ', 'IDS', 'IBT', 'IQ', 'IAG', 'IAL'],
}

# Flat lookup: program code -> entry program.
program_mapping = {
    major: entry
    for entry, majors in entry_program_members.items()
    for major in majors
}
filtered_cohort['program.major_id'] = filtered_cohort['program.major_id'].replace(program_mapping)



Version 2. Transforming entry program data into specific programs.

In [13]:
# def transform_program_2(group):
#     programs = list(group['program.major_id'].unique())
#     if len(programs) > 1:
#         group['program.major_id'] = programs[1] if programs[0] in ['IIT', 'ICT', 'IBQ', 'ICI'] else programs[0]
#     return group

def transform_program_2(group):
    """Give all rows of one student a single program code.

    When the student's rows contain more than one program code, the first
    code (in order of appearance) is kept unless it is one of the entry
    programs IIT/ICT/IBQ/ICI, in which case the second code is used.
    The frame is modified in place and returned.
    """
    entry_programs = ('IIT', 'ICT', 'IBQ', 'ICI')
    seen = group['program.major_id'].unique().tolist()
    if len(seen) <= 1:
        # Zero or one program code: nothing to harmonise.
        return group
    first, second = seen[0], seen[1]
    group['program.major_id'] = second if first in entry_programs else first
    return group

# Harmonise the program code student by student.
rows_by_student = filtered_cohort.groupby('student.id', as_index=False)
filtered_cohort = rows_by_student.apply(transform_program_2)

In [112]:
#pd.set_option('display.max_columns', None)
#filtered_cohort[filtered_cohort.index == list(full_df['student.id'].unique())[3]]
#filtered_cohort['program.major_id'].value_counts()
#len(list(filtered_cohort['student.id'].unique()))

#### Filter dataframe

In [14]:
# Columns that identify one student-semester observation in the longitudinal table.
longitudinal_columns = [
    'student.id', 'student.nationality', 'student_originSchool.isITESM',
    'student.cohort.id', 'semesters_from.enrollment', 'student.isWoman',
    'campus.region_name', 'program.major_id', 'Observed_ratio',
]

# Collapse rows that are identical on those columns, keeping the first occurrence.
filtered_cohort = filtered_cohort.drop_duplicates(subset=longitudinal_columns, keep='first')

# Long-format table indexed by student.
longitudinal_df = filtered_cohort[longitudinal_columns].set_index(keys=['student.id'])

In [None]:
# Exactly one row per student and semester is expected: students x 6 semesters.
assert len(longitudinal_df.index) == len(filtered_cohort['student.id'].unique()) * 6, 'Error! Row count does not match the number of student IDs times 6'
longitudinal_df.sort_values(by='student.id')

In [None]:
#longitudinal_df[longitudinal_df.index==395]

<font color='blue'>**Up to this point, the data is filtered by student, but each student still has multiple rows (one per semester). Next, we reshape it so that the Observed competencies ratio of each semester becomes a column for each student, instead of a new row.**</font>

In [None]:
# Student attributes are taken from each student's semester-1 row.
first_semester_rows = longitudinal_df[longitudinal_df['semesters_from.enrollment'] == 1]

# Wide table: one row per student, one Observed_ratio<semester> column per
# semester, followed by the student's attributes.
pivot_df = (
    longitudinal_df
    .pivot(columns='semesters_from.enrollment', values='Observed_ratio')
    .rename(columns=lambda sem: f'Observed_ratio{sem}')
    .rename_axis(columns=None)
    .reset_index()
    .merge(first_semester_rows, on='student.id')
    .drop(columns=['Observed_ratio', 'semesters_from.enrollment'])
)

pivot_df

#### Describe numeric variables 

In [None]:
# Summary statistics of the wide (one row per student) table.
pivot_df.describe()

#### Describe categorical and boolean variables

In [None]:
# Sub-frame holding only the object, boolean and categorical columns;
# iterating over it yields the column names.
categorical_cols = pivot_df.select_dtypes(include=['object', 'bool', 'category'])

for col in categorical_cols.columns:
    values = pivot_df[col]
    separator = "-" * 40
    print(f"Column: {col}")
    print(f"Number of unique categories: {values.nunique()}")
    print(f"Mode: {values.mode().iloc[0]}")
    print(f"Value counts:\n{values.value_counts()}")
    print(separator)

In [None]:
# One bar chart per categorical column showing how many students fall in each category.
for col in categorical_cols:
    counts = pivot_df[col].value_counts().reset_index()
    # Name the columns explicitly: value_counts().reset_index() labels them
    # differently across pandas versions ('index'/<col> vs <col>/'count').
    counts.columns = [col, 'count']
    fig = px.bar(
        counts,
        x=col,
        y='count',
        title=f'Distribution of {col}',
        labels={'count': 'Count'},
        text_auto=True
    )
    fig.update_layout(
        xaxis_title=col,
        yaxis_title='Count',
        title_x=0.5,  # Center the title
        xaxis_tickangle=-45  # Rotate x-axis labels if needed
    )
    fig.show()

In [None]:
# Pie chart for every categorical column except the ones at positions 0 and 4.
# NOTE(review): the skipped positions are hard-coded; confirm which columns they are.
skipped_positions = (0, 4)
for i, col in enumerate(categorical_cols):
    if i in skipped_positions:
        continue

    # Category / count table feeding the chart
    pie_data = pivot_df[col].value_counts().reset_index()
    pie_data.columns = [col, 'count']

    fig = px.pie(
        pie_data,
        names=col,        # slice labels
        values='count',   # slice sizes
        labels={col: 'Category', 'count': 'Count'},
        hole=0,           # 0 keeps a full pie; a larger fraction gives a donut
    )

    # Labels go inside the slices, so the legend is redundant
    fig.update_traces(
        textposition='inside',
        textinfo='label+value+percent',
        showlegend=False,
        textfont_size=38
    )

    fig.update_layout(
        title_x=0.5,
        title_font_size=38,
        height=600,
        width=600,
    )

    fig.show()

In [116]:
# Save the wide table next to the input data. A forward slash keeps the path
# portable and avoids the invalid '\L' escape of the hard-coded Windows path.
pivot_df.to_csv(f'{path_files}/LongitudinalData_Cohort_{cohort}_LongPrograms_6semesters.csv')

## 1. Descriptive analysis

In [None]:
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line.
longitudinal_df.info()

In [None]:
# Summary statistics of the long-format (one row per student and semester) table.
longitudinal_df.describe()

In [None]:
# Number of missing values in each column of the long-format table.
print(longitudinal_df.isnull().sum())

In [None]:
# Boxplot of Observed_ratio split by each of the other columns.
# Observed_ratio itself is skipped: grouping the variable by its own values
# yields one degenerate box per distinct value.
grouping_columns = [col for col in longitudinal_df.columns if col != 'Observed_ratio']
for col in grouping_columns:
    longitudinal_df.boxplot(by=col, column='Observed_ratio')
    plt.suptitle('')  # drop the automatic "Boxplot grouped by ..." super-title
    plt.title(f'Boxplots of Observed_ratio by {col}')
    plt.show()

## 2. Longitudinal analysis

In [18]:
# Q-Q Plot and Histogram
def plot_samples(df):
    """Show a normal Q-Q plot and a histogram of Observed_ratio for every semester.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format data with the columns 'semesters_from.enrollment' and
        'Observed_ratio'.
    """
    for semester in df['semesters_from.enrollment'].unique():
        data = df.loc[df['semesters_from.enrollment'] == semester, 'Observed_ratio']

        # Q-Q plot. The axes is passed explicitly: without ax=, sm.qqplot opens
        # a figure of its own and the figure created here is left blank.
        fig, ax = plt.subplots(figsize=(12, 6))
        sm.qqplot(data, line='45', fit=True, ax=ax)
        ax.set_title(f'Q-Q Plot for Semester {semester}')
        # Fixed axis limits so the plots of all semesters are comparable.
        # NOTE(review): the y-range [-12, 2] is hard-coded; confirm it suits the data.
        ax.set_xlim([-4, 4])
        ax.set_ylim([-12, 2])
        plt.show()

        # Histogram with a kernel density estimate
        fig, ax = plt.subplots()
        sbn.histplot(data, kde=True, ax=ax)
        ax.set_title(f'Histogram for Semester {semester}')
        plt.show()


In [19]:
def kolmogorov_smirnov_test(longitudinal_df):
    """Run a Kolmogorov-Smirnov normality test on Observed_ratio for each semester.

    Each semester's sample is compared with a normal distribution that has the
    mean and standard deviation of that same sample. (Comparing against the
    mean/std pooled over all semesters would reject a perfectly Gaussian
    semester merely because its location differs from the pooled one.)

    NOTE(review): estimating the parameters from the sample makes the plain KS
    p-value conservative; a Lilliefors-corrected test would address that.

    Parameters
    ----------
    longitudinal_df : pandas.DataFrame
        Long-format data with the columns 'semesters_from.enrollment' and
        'Observed_ratio'.

    The results are printed; nothing is returned.
    """
    for semester in longitudinal_df['semesters_from.enrollment'].unique():
        sample = longitudinal_df.loc[
            longitudinal_df['semesters_from.enrollment'] == semester, 'Observed_ratio'
        ]
        stat, p = stats.kstest(sample, 'norm', args=(sample.mean(), sample.std()))
        print(f"\nKolmogorov-Smirnov test for semester {semester}:")
        print(f"Statistics={stat}, p-value={p}")
        if p > 0.05:
            print("Sample looks Gaussian (fail to reject H0)")
        else:
            print("Sample does not look Gaussian (reject H0)")

In [35]:
# Anderson-Darling test for normality
def anderson_darleing_test(df):
    """Run the Anderson-Darling normality test on Observed_ratio for each semester.

    For every semester the statistic is printed and compared with each
    critical value reported by scipy. Returns the last significance level and
    critical value that were examined (those of the final semester).
    """
    semester_column = df['semesters_from.enrollment']
    for semester in semester_column.unique():
        sample = df[semester_column == semester]['Observed_ratio']
        result = stats.anderson(sample, dist='norm')
        print(f"\nAnderson-Darling test for semester {semester}:")
        print(f"Statistic: {result.statistic}")
        for sl, cv in zip(result.significance_level, result.critical_values):
            looks_gaussian = result.statistic < cv
            if looks_gaussian:
                print(f"At {sl}% significance level, sample looks Gaussian (fail to reject H0)")
            else:
                print(f"At {sl}% significance level, sample does not look Gaussian (reject H0)")
    return sl, cv


In [21]:
# Turn the 'student.id' index back into a regular column; the pivots in the
# cells below reference 'student.id' by column name.
# NOTE(review): re-running this cell resets the index again and adds an extra 'index' column.
longitudinal_df = longitudinal_df.reset_index()
# Flag initialised to False here; it is reassigned in the large-sample checks cell.
normal_dist = False

In [34]:
def friedman_test(df):
    """Friedman test for differences in Observed_ratio across semesters.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format data with the columns 'student.id',
        'semesters_from.enrollment' and 'Observed_ratio' (one row per student
        and semester).

    Returns
    -------
    (stat, p) : tuple
        Friedman chi-square statistic and its p-value.

    Every semester present in the data is used (scipy requires at least three),
    rather than a fixed list of six.
    """
    # Wide format: one row per student (block), one column per semester (treatment).
    # Students missing any semester are dropped because the test needs complete blocks.
    wide = df.pivot(index='student.id', columns='semesters_from.enrollment', values='Observed_ratio').dropna()

    samples = [wide[semester].values for semester in wide.columns]
    stat, p = stats.friedmanchisquare(*samples)

    print(f"Friedman test statistic: {stat}")
    print(f"p-value: {p}")

    # Interpretation of results
    if p < 0.05:
        print("There is a significant difference in scores across semesters (reject H0)")
    else:
        print("There is no significant difference in scores across semesters (fail to reject H0)")
    return stat, p

#### For small samples

In [None]:
# Check data requirements for small samples
semester_ids = sorted(longitudinal_df['semesters_from.enrollment'].unique())

# 1. Check for missing values
print("Missing values:\n", longitudinal_df.isnull().sum())

# 2. Descriptive statistics
print("\nDescriptive statistics:\n", longitudinal_df.describe())

# 3. Check for normality using Shapiro-Wilk test
for semester in semester_ids:
    stat, p = stats.shapiro(longitudinal_df.loc[longitudinal_df['semesters_from.enrollment'] == semester, 'Observed_ratio'])
    print(f"\nShapiro-Wilk test for semester {semester}:")
    print(f"Statistics={stat}, p-value={p}")
    if p > 0.05:
        print("Sample looks Gaussian (fail to reject H0)")
    else:
        print("Sample does not look Gaussian (reject H0)")

# 4. Visual inspection
plot_samples(longitudinal_df)

# 5. Check for homogeneity of variances using Levene's test, across every
#    semester present (previously only semesters 1-4 were compared).
stat, p = stats.levene(*[
    longitudinal_df.loc[longitudinal_df['semesters_from.enrollment'] == semester, 'Observed_ratio']
    for semester in semester_ids
])
print(f"\nLevene's test for homogeneity of variances:\nStatistics={stat}, p-value={p}")
if p > 0.05:
    print("Variances are equal (fail to reject H0)")
else:
    print("Variances are not equal (reject H0)")

# 6. Repeated-measures ANOVA.
# NOTE(review): only meaningful when the checks above support normality.
# AnovaRM takes long-format data directly. The columns are renamed to plain
# identifiers because dotted names are not valid variable names in a
# patsy formula.
df_long = (
    longitudinal_df[['student.id', 'semesters_from.enrollment', 'Observed_ratio']]
    .rename(columns={'student.id': 'student_id', 'semesters_from.enrollment': 'semester'})
)
anova_results = anova.AnovaRM(df_long, depvar='Observed_ratio', subject='student_id', within=['semester']).fit()

print("\nANOVA results:\n", anova_results)

# Interpretation of results (.iloc: the ANOVA table is indexed by factor name)
p_value = anova_results.anova_table['Pr > F'].iloc[0]
print(f"\nInterpretation of ANOVA results:\nF-statistic: {anova_results.anova_table['F Value'].iloc[0]}, p-value: {p_value}")
if p_value < 0.05:
    print("There is a significant difference in scores across semesters (reject H0)")
else:
    print("There is no significant difference in scores across semesters (fail to reject H0)")

#### For large samples

In [None]:
# Check data requirements
# 1. Check for missing values
print("Missing values:\n", longitudinal_df.isnull().sum())

# 2. Descriptive statistics
print("\nDescriptive statistics:\n", longitudinal_df.describe())

# 3. Tests for normality
kolmogorov_smirnov_test(longitudinal_df)
anderson_darleing_test(longitudinal_df)

# 4. Visual inspection
plot_samples(longitudinal_df)

# 5. Check for homogeneity of variances using Levene's test, across every
#    semester present (previously only semesters 1-4 were compared).
stat, p = stats.levene(*[
    longitudinal_df.loc[longitudinal_df['semesters_from.enrollment'] == semester, 'Observed_ratio']
    for semester in sorted(longitudinal_df['semesters_from.enrollment'].unique())
])
print(f"\nLevene's test for homogeneity of variances:\nStatistics={stat}, p-value={p}")
# NOTE(review): Levene's test is about equality of variances, not normality,
# yet `normal_dist` is derived from it here (and set to False when variances
# are equal). The assignments are kept as they were; confirm the intended
# rule before relying on `normal_dist`.
if p > 0.05:
    print("Variances are equal (fail to reject H0)")
    normal_dist = False
else:
    print("Variances are not equal (reject H0)")
    normal_dist = True

#### If data fits normal distributions

In [None]:
# If data fits normal distributions, repeated-measures ANOVA is feasible.
# AnovaRM takes long-format data directly, so no wide/melt round trip is
# needed (the previous one assigned three column names to a six-semester
# pivot). The columns are renamed to plain identifiers because dotted names
# are not valid variable names in a patsy formula.
df_long = (
    longitudinal_df[['student.id', 'semesters_from.enrollment', 'Observed_ratio']]
    .rename(columns={'student.id': 'student_id', 'semesters_from.enrollment': 'semester'})
)

# ANOVA model: Observed_ratio measured repeatedly on each student across semesters
anova_results = anova.AnovaRM(df_long, depvar='Observed_ratio', subject='student_id', within=['semester']).fit()

print("\nANOVA results:\n", anova_results)

# Interpretation of results (.iloc: the ANOVA table is indexed by factor name)
p_value = anova_results.anova_table['Pr > F'].iloc[0]
print(f"\nInterpretation of ANOVA results:\nF-statistic: {anova_results.anova_table['F Value'].iloc[0]}, p-value: {p_value}")
if p_value < 0.05:
    print("There is a significant difference in scores across semesters (reject H0)")
else:
    print("There is no significant difference in scores across semesters (fail to reject H0)")

#### If data does not fit normal distributions

In [None]:

# Longitudinal analysis with Friedman test
# friedman_test(longitudinal_df)

In [86]:

def conover_posthoc(data):
    """Friedman test followed, when significant, by Conover's post-hoc test.

    Parameters
    ----------
    data : pandas.DataFrame
        Long-format data with the columns 'student.id',
        'semesters_from.enrollment' and 'Observed_ratio'.

    Returns
    -------
    pandas.DataFrame or None
        Matrix of Holm-adjusted pairwise p-values between semesters, or None
        when the Friedman test is not significant (the post-hoc test is then
        not run).
    """
    # Friedman test
    f_statistic, p_value = friedman_test(data)

    print("Friedman test:")
    print("F-statistic:", f_statistic)
    print("p-value:", p_value)

    # Defined up front so the function can return when nothing is significant.
    conover_results = None

    # Conover post-hoc test
    if p_value < 0.05:
        # The repeated-measures (blocked) variant is the post-hoc test that
        # matches a Friedman test; sp.posthoc_conover treats the semesters as
        # independent samples (Kruskal-Wallis setting). Input is a block
        # design matrix: rows = students, columns = semesters.
        wide = data.pivot(index='student.id',
                          columns='semesters_from.enrollment',
                          values='Observed_ratio').dropna()
        conover_results = sp.posthoc_conover_friedman(wide, p_adjust='holm')
        print("Conover post-hoc test:")
        print(conover_results)
    else:
        print("No significant difference found in the Friedman test.")
    return conover_results


In [89]:
# Add significance annotation
def add_stat_annotation(ax, x1, x2, y, h, p, p_corr, reject):
    """Draw a significance bracket between x1 and x2 on `ax`.

    The bracket starts at height y and rises by h; the label above it is
    '***', '**', '*' or 'ns' depending on whether p_corr is below 0.001,
    0.01, 0.05 or none of them. `p` and `reject` are accepted but not used.
    """
    label = "ns"
    for threshold, stars in ((0.001, "***"), (0.01, "**"), (0.05, "*")):
        if p_corr < threshold:
            label = stars
            break
    top = y + h
    ax.plot([x1, x1, x2, x2], [y, top, top, y], lw=1.5, c='k')
    ax.text((x1 + x2) * .5, top, label, ha='center', va='bottom', color='k')

In [None]:
# Wilcoxon signed-rank test with Bonferroni correction

# Wide format: one row per student, one column per semester.
# NOTE(review): this rebinds `pivot_df`, which earlier held the wide table with student attributes.
pivot_df = longitudinal_df.pivot(index='student.id', columns='semesters_from.enrollment', values='Observed_ratio')
# Number of semesters in the sample
semesters = 6

# Perform pairwise Wilcoxon signed-rank tests
pairwise_test_results = {}
for i in range(1, semesters):
    for j in range(i + 1, semesters + 1):
        # Drop incomplete pairs jointly so both samples stay aligned by student.
        paired = pivot_df[[i, j]].dropna()
        stat, p = stats.wilcoxon(paired[i].values, paired[j].values)
        comparison_str = f'Sem{i} vs Sem{j}'
        pairwise_test_results[f'{i}{j}'] = (stat, p, comparison_str)

# Collect p-values for multiple testing correction
p_values = [value[1] for value in pairwise_test_results.values()]

# Apply Bonferroni correction
reject, p_corrected_bonf, _, _ = multipletests(p_values, alpha=0.05, method='bonferroni')

# Print the results
for index, (key, value) in enumerate(pairwise_test_results.items()):
    print(f"\nComparison: {value[2]}")
    print(f"Wilcoxon test statistic: {value[0]}")
    print(f"p-value: {value[1]}, corrected p-value: {p_corrected_bonf[index]}")
    if reject[index]:
        print("Significant difference (reject H0)")
    else:
        print("No significant difference (fail to reject H0)")

# Conover post-hoc p-values, flattened in the same (i < j) order as the pairs above
conover_results = conover_posthoc(longitudinal_df)
if conover_results is None:
    # No post-hoc table available: NaN compares False against every threshold,
    # so each pair is annotated as 'ns'.
    p_corrected_conover = [float('nan')] * len(p_values)
else:
    p_corrected_conover = [conover_results.iloc[i,j] for i in range(conover_results.shape[0]) for j in range(i+1, conover_results.shape[0])]

# Plotting
medians = longitudinal_df[['semesters_from.enrollment','Observed_ratio']].groupby('semesters_from.enrollment').median()
means = longitudinal_df[['semesters_from.enrollment','Observed_ratio']].groupby('semesters_from.enrollment').mean()
plt.figure(figsize=(10, 10))
plot = sbn.boxplot(x='semesters_from.enrollment', y='Observed_ratio', data=longitudinal_df, palette='Set2')
for i, mean in enumerate(means['Observed_ratio']):
    plot.annotate(str(round(mean, 4)), xy = (i, mean), horizontalalignment = 'center')

# Get the max y-value for the placement of significance annotations
y_max = longitudinal_df['Observed_ratio'].max()

# Add annotations for the pairwise comparisons (0-based box positions on the x-axis)
pairs = [(i, j) for i in range(semesters) for j in range(i+1, semesters)]
step = 7
# Bracket heights above y_max, in hundredths, so successive brackets do not overlap
heights = np.array(range(4, (len(pairs) + 1) * step, step))
heights = list(heights/100)

for i, (x1, x2) in enumerate(pairs):
    add_stat_annotation(plot, x1, x2, y_max, heights[i], p_values[i], p_corrected_conover[i], reject[i])

plt.title('Boxplot of Scores by Semester with Significance Annotations')
plt.xlabel('Semester')
plt.ylabel('Observed ratio')
plt.show()


In [None]:
#TODO Check why this Conover test outputs a result different to the JASP implementation

# Per-semester medians and means of Observed_ratio
ratio_by_semester = longitudinal_df.groupby('semesters_from.enrollment')['Observed_ratio']
medians = ratio_by_semester.median().reset_index()
means = ratio_by_semester.mean().reset_index()

# Boxplot of Observed_ratio per semester
fig = px.box(longitudinal_df,
             x='semesters_from.enrollment',
             y='Observed_ratio',
             color_discrete_sequence=px.colors.qualitative.Dark24)

# Write each semester's mean at its own height
for semester_value, mean in zip(means['semesters_from.enrollment'], means['Observed_ratio']):
    fig.add_annotation(x=semester_value, y=mean,
                       text=str(round(mean, 4)),
                       showarrow=False,
                       font=dict(size=12),
                       xanchor='center')

# Significance bars are stacked above the highest observation
y_max = longitudinal_df['Observed_ratio'].max()

pairs = [(i, j) for i in range(semesters) for j in range(i+1, semesters)]
step = 0.07  # vertical spacing between bars, in y-axis units
heights = [y_max + step * (rank + 1) for rank in range(len(pairs))]

# x-axis value of each semester, addressed by 0-based position
semester_axis = medians['semesters_from.enrollment'].tolist()
star_levels = ((0.001, '***'), (0.01, '**'), (0.05, '*'))

for i, (x1, x2) in enumerate(pairs):
    bar_start, bar_end = semester_axis[x1], semester_axis[x2]
    bar_height = heights[i]
    fig.add_shape(
        type="line",
        x0=bar_start, y0=bar_height,
        x1=bar_end, y1=bar_height,
        line=dict(color="black", width=2)
    )
    # Stars for the first threshold the corrected p-value falls under, else 'ns'
    signif = next((stars for cutoff, stars in star_levels if p_corrected_conover[i] < cutoff), 'ns')
    fig.add_annotation(
        x=(bar_start + bar_end) / 2,
        y=bar_height,
        text= signif,
        showarrow=False,
        font=dict(size=10),
        xanchor="center"
    )

# Titles and axis labels
fig.update_layout(
    title='Boxplot of Scores by Semester with Significance Annotations',
    xaxis_title='Semester',
    yaxis_title='Observed ratio',
    showlegend=False
)

fig.show()

In [None]:
def below_diagonal(matrix):
    """Return the elements strictly below the main diagonal, row by row.

    Parameters
    ----------
    matrix : 2-D array-like
        NumPy array, DataFrame or nested list; converted with np.asarray.

    Returns
    -------
    list
        Elements matrix[i, j] with j < i, ordered by row and then by column.
    """
    values = np.asarray(matrix)
    rows, cols = values.shape
    # Row 0 has nothing below the diagonal; min() guards matrices with fewer columns than rows.
    return [values[i, j] for i in range(1, rows) for j in range(min(i, cols))]
    

#### Plotting means in dotplot

In [None]:

# Mean Observed_ratio per semester, computed here so the x and y values are
# guaranteed to be aligned (groupby sorts by semester) and the plotted
# statistic matches the "Means" title (medians were plotted before).
semester_means = longitudinal_df.groupby('semesters_from.enrollment')['Observed_ratio'].mean()

# Create the dotplot with a connecting line
fig = px.line(x=semester_means.index.tolist(),
              y=semester_means.tolist(),
              markers=True)

# Customize the plot
fig.update_layout(title=f'Means of Observed_competencies ratio by semester for cohort(s) {cohort}', xaxis_title='Semester', yaxis_title='Means of Observed_competencies ratio')

# Show the plot
fig.show()

In [None]:
import plotly.io as pio

# Per-semester mean, standard error of the mean and sample size of Observed_ratio.
ratio_stats = (
    longitudinal_df
    .groupby('semesters_from.enrollment')['Observed_ratio']
    .agg(['mean', 'sem', 'count'])
)

# Half-width of the 95% t confidence interval of each mean:
# t(0.975, n - 1) * SEM, i.e. what stats.t.interval(0.95, n - 1, loc=mean, scale=SEM) spans.
ci_half_width = stats.t.ppf(0.975, ratio_stats['count'] - 1) * ratio_stats['sem']
lower_bound = ratio_stats['mean'] - ci_half_width
upper_bound = ratio_stats['mean'] + ci_half_width

# Plot table: the *_bound columns hold distances from the mean, as error bars expect.
temp_df = pd.DataFrame({
    'semester': ratio_stats.index,
    'means': ratio_stats['mean'].to_numpy(),
    'lower_bound': (ratio_stats['mean'] - lower_bound).to_numpy(),
    'upper_bound': (upper_bound - ratio_stats['mean']).to_numpy(),
})

# Create the plot (error_y is the upward bar, error_y_minus the downward one)
fig = px.line(temp_df,
              x='semester',
              y='means',
              error_y='upper_bound',
              error_y_minus='lower_bound',
              markers=True)
fig.update_traces(marker=dict(color='red'))
fig.update_layout(title=f'Means of Observed_competencies ratio by semester for cohort(s) {cohort}',
                  xaxis_title='Semester',
                  yaxis_title='Means of Observed_competencies ratio')

#pio.write_image(fig, 'line_plot.svg')

fig.show()


In [None]:
# Export the current figure to 'line_plot.pdf' in the working directory.
# Relies on `pio` (plotly.io) and `fig` from the previous cell.
pio.write_image(fig, 'line_plot.pdf')

In [None]:
#lower_bound

In [None]:
# Wilcoxon signed-rank test with Bonferroni correction
# from scipy.stats import wilcoxon
# from statsmodels.stats.multitest import multipletests
# from itertools import combinations

# # Pivot the DataFrame to wide format
# pivot_df = longitudinal_df.pivot(index='student.id', columns='semesters_from.enrollment', values='Observed_ratio')

# # Calculate means
# means = longitudinal_df.groupby('semesters_from.enrollment')['Observed_ratio'].mean().reset_index()

# # Perform pairwise Wilcoxon signed-rank tests
# samples = longitudinal_df['semesters_from.enrollment'].unique()
# pairwise_combinations = list(combinations(samples, 2))
# p_values = []

# for comb in pairwise_combinations:
#     sample1 = longitudinal_df[longitudinal_df['semesters_from.enrollment'] == comb[0]]['Observed_ratio']
#     sample2 = longitudinal_df[longitudinal_df['semesters_from.enrollment'] == comb[1]]['Observed_ratio']
#     stat, p = wilcoxon(sample1, sample2)
#     p_values.append(p)

# # Apply Bonferroni correction
# bonferroni_correction = len(p_values)
# corrected_p_values = np.array(p_values) * bonferroni_correction
# corrected_p_values[corrected_p_values > 1] = 1  # Cap the maximum p-value at 1

# # Determine which pairs have significant differences
# significant_pairs = [pairwise_combinations[i] for i, p in enumerate(corrected_p_values) if p < 0.05]

# # Create plot
# fig = go.Figure()

# # Add means
# fig.add_trace(go.Scatter(
#     x=means['semesters_from.enrollment'],
#     y=means['Observed_ratio'],
#     mode='markers+text',
#     text=means['Observed_ratio'],
#     textposition='top center',
#     name='Means'
# ))

<font color='blue'>**Intersubject longitudinal study**</font>

In [None]:
from statsmodels.stats.libqsturng import qsturng
from statsmodels.sandbox.stats.multicomp import MultiComparison
from statsmodels.formula.api import ols

# Semesters present in the data, in ascending order (previously hard-coded to 1-4).
semester_ids = sorted(longitudinal_df['semesters_from.enrollment'].unique())

# Friedman Test within each level of the grouping variable
for group in longitudinal_df[grouping_var].unique().tolist():
    group_data = longitudinal_df[longitudinal_df[grouping_var] == group]
    # Wide format keeps each student's semesters aligned; incomplete students are dropped.
    group_wide = group_data.pivot(index='student.id', columns='semesters_from.enrollment', values='Observed_ratio').dropna()
    stat, p = stats.friedmanchisquare(*[group_wide[t] for t in group_wide.columns])
    print(f'Friedman Test for group {group}: statistic={stat}, p-value={p}')

# Wilcoxon Signed-Rank Test for every pair of semesters, paired by student
ratio_wide = longitudinal_df.pivot(index='student.id', columns='semesters_from.enrollment', values='Observed_ratio')
for position, t1 in enumerate(semester_ids):
    for t2 in semester_ids[position + 1:]:
        paired = ratio_wide[[t1, t2]].dropna()
        stat, p = stats.wilcoxon(paired[t1], paired[t2])
        print(f'Wilcoxon Signed-Rank Test between time {t1} and {t2}: statistic={stat}, p-value={p}')

# Kruskal-Wallis Test between cohorts at each semester
for t in semester_ids:
    rows_at_t = longitudinal_df[longitudinal_df['semesters_from.enrollment'] == t]
    cohort_samples = [rows['Observed_ratio'] for _, rows in rows_at_t.groupby('student.cohort.id')]
    if len(cohort_samples) < 2:
        # The test compares cohorts, so it needs at least two of them.
        print(f'Kruskal-Wallis Test at time {t}: skipped, fewer than two cohorts in the data')
        continue
    stat, p = stats.kruskal(*cohort_samples)
    print(f'Kruskal-Wallis Test at time {t}: statistic={stat}, p-value={p}')

# Quade Test - Using statsmodels for factorial analysis
# NOTE(review): this fits a two-way ANOVA on ranks computed within each
# semester; confirm that this is the intended "Quade" procedure.
longitudinal_df['value_ranked'] = longitudinal_df.groupby('semesters_from.enrollment')['Observed_ratio'].rank()
# Q('...') tells patsy the quoted text is a column name; a bare quoted string
# inside C() is a string literal, not a reference to the column.
model = ols("value_ranked ~ C(Q('student.cohort.id')) + C(Q('semesters_from.enrollment')) + C(Q('student.cohort.id')):C(Q('semesters_from.enrollment'))",
            data=longitudinal_df).fit()
anova_table = sm.stats.anova_lm(model, typ=2)
print(f'Quade Test:\n{anova_table}')

# Aligned Rank Transform (ART)
# We'll use the ART module from the ARTool package which is not available in Python by default.
# This step is a placeholder as the ARTool package needs to be used in R or other software.
# from art import ART
# art = ART()
# art.fit(df['value'], df['group'], df['time'])
# art_anova = art.anova_table()
# print(f'Aligned Rank Transform (ART):\n{art_anova}')

# Note: ART can be done using the ARTool package in R and then imported into Python if necessary.
