In [None]:
import pandas as pd
import glob
import numpy as np

In [None]:
# Read the activity CSV into `df`; the relative path resolves against the
# kernel's current working directory.
df = pd.read_csv('activity-study1-filtered-v2.csv')

In [None]:
# Parse the whole `datetime` column in a single vectorized call instead of
# invoking pd.to_datetime once per row through a Python lambda.
df['datetime'] = pd.to_datetime(df['datetime'])

In [None]:
# Build a "<user>-<cohort>" key so the same user id in different cohorts stays distinct.
# NOTE: `user` is overwritten, so re-running this cell appends the suffix a second time.
df['user'] = df['user'].str.cat(df['cohort'].astype(str), sep='-')

In [None]:
# Base directory that glob patterns below are appended to (note the trailing slash).
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
WD = "C:/PPSVN/Katharina/Willen Moodle Logdaten/"

In [None]:
# List the .xlsx files in the final-exam folder under WD; the result is only
# displayed as the cell output, it is not stored.
final_exam_pattern = f'{WD}Learning Outcomes/Final exam/*.xlsx'
glob.glob(final_exam_pattern)

## RQ1: Does support lead to more regular activity and less delayed completion?

In [None]:
# Independent working copy of `df` with a fresh 0..n-1 row index.
df2 = df.reset_index(drop=True).copy()

In [None]:
# Order each user's events chronologically so that neighbouring rows are
# consecutive events of the same user.
df2 = df2.sort_values(by=['user', 'datetime'])

# Gap to the same user's previous event, in seconds (NaN for each user's first event).
gap_to_previous = df2.groupby('user')['datetime'].diff()
df2['time_diff'] = gap_to_previous.dt.total_seconds()

# Cohorts 5 and 6 are labelled 'intervention'; every other cohort value is 'control'.
in_intervention_cohort = df2['cohort'].isin([5, 6])
df2['intervention_group'] = in_intervention_cohort.map({True: 'intervention', False: 'control'})

# Drop rows lacking any of the analysis columns (this removes each user's first event).
required_columns = ['intervention_group', 'user', 'time_diff']
df2 = df2.dropna(subset=required_columns).copy()

In [None]:
# log(1 + seconds) of every gap; log1p keeps zero-second gaps finite.
df2 = df2.assign(log_time_diff=np.log1p(df2['time_diff']))

In [None]:
# Histogram of the log-transformed gaps between consecutive events (all rows of df2)
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(8, 6))
df2['log_time_diff'].plot.hist(ax=ax, bins=50, alpha=0.7, color='blue', edgecolor='black')
ax.set_title('Distribution of Time Differences', fontsize=16)
ax.set_xlabel('Time Difference (log seconds)', fontsize=14)
ax.set_ylabel('Frequency', fontsize=14)
ax.grid(axis='y', alpha=0.75)
plt.show()

In [None]:
## Keep only gaps longer than 15 minutes, i.e. the pauses that separate one
## genuine session from the next.
SESSION_GAP_SECONDS = 15 * 60
is_session_gap = df2['log_time_diff'] > np.log1p(SESSION_GAP_SECONDS)
df3 = df2.loc[is_session_gap].copy()

In [None]:
# Same histogram as for df2, now restricted to the between-session gaps in df3.
fig, ax = plt.subplots(figsize=(8, 6))
df3['log_time_diff'].plot.hist(ax=ax, bins=100, alpha=0.7, color='blue', edgecolor='black')
ax.set_title('Distribution of Time Differences', fontsize=16)
ax.set_xlabel('Time Difference (log seconds)', fontsize=14)
ax.set_ylabel('Frequency', fontsize=14)
ax.grid(axis='y', alpha=0.75)
plt.show()

In [None]:
# Linear mixed model: does the (log) gap between sessions differ between groups?
from statsmodels.formula.api import mixedlm

# Fixed effect: intervention_group. Random intercept: one per user.
model = mixedlm(
    formula="log_time_diff ~ intervention_group",
    data=df3,
    groups=df3["user"],
)
result = model.fit()

print(result.summary())

In [None]:
# Relative change of the back-transformed gap for the intervention group.
# The coefficients are read from the fitted model instead of being re-typed by
# hand (the original hard-coded 9.465 and 0.322, presumably copied from the
# printed summary), so the figure cannot go stale when the data or model changes.
# NOTE: run this directly after the "log_time_diff ~ intervention_group" fit —
# later cells rebind `result` to other models.
intercept = result.params['Intercept']
intervention_coef = result.params['intervention_group[T.intervention]']

control_gap = np.expm1(intercept)                           # control group, seconds
intervention_gap = np.expm1(intercept + intervention_coef)  # intervention group, seconds

(intervention_gap - control_gap) / control_gap  # Intervention effect as a fraction (x100 for %)

In [None]:
# Number of rows (retained events) per user; one output row per user/group pair,
# with the count stored in column `n`.
df_n = (
    df2.groupby(['user', 'intervention_group'])
    .size()
    .reset_index(name='n')
    .dropna()
    .copy()
)

In [None]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Poisson regression of the per-user event count `n` on group membership.
# NOTE: this rebinds the notebook-level `model` and `result`.
poisson_family = sm.families.Poisson()
model = smf.glm(formula="n ~ intervention_group", data=df_n, family=poisson_family)
result = model.fit()

print(result.summary())

## Replication across times of day

In [None]:
# Session gaps whose time-of-day label is anything other than 'Unknown'.
has_known_period = df3['day_period'].ne('Unknown')
df4 = df3.loc[has_known_period].copy()

In [None]:
# Linear mixed model: does the group difference in (log) gap vary by time of day?
from statsmodels.formula.api import mixedlm

# Fixed effects: group, day period and their interaction. Random intercept per user.
model = mixedlm(
    formula="log_time_diff ~ intervention_group*day_period",
    data=df4,
    groups=df4["user"],
)
result = model.fit()

print(result.summary())

In [None]:
# Number of session gaps recorded in each course phase.
df3['phase'].value_counts()

In [None]:
# Linear mixed model: does the group difference in (log) gap vary by course phase?
from statsmodels.formula.api import mixedlm

# Fixed effects: group, phase and their interaction. Random intercept per user.
model = mixedlm(
    formula="log_time_diff ~ intervention_group*phase",
    data=df3,
    groups=df3["user"],
)
result = model.fit()

print(result.summary())

### Plot

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# NOTE(review): `trimmed_df` is not defined in any cell visible in this notebook,
# so this cell raises NameError on a fresh kernel. It is used here as a DataFrame
# with `time_diff` and `intervention_group` columns — restore the cell that builds it.
# NOTE(review): the x-label says minutes, but `time_diff` is computed in seconds
# where df2 is built — confirm the units of `trimmed_df`. The title also mentions
# "Phase", although the plot only splits by intervention group.

fig, ax = plt.subplots(figsize=(10, 6))

# Stacked density of the gaps, one layer per group. seaborn creates the legend
# itself, so every colour is paired with the correct group name.
sns.kdeplot(
    data=trimmed_df,
    x='time_diff',
    hue='intervention_group',
    multiple='stack',
    palette='Set2',
    ax=ax,
)

# Title and axis labels
ax.set_title('Density Plot by Cohort and Phase')
ax.set_xlabel('Time Difference (mins)')
ax.set_ylabel('Density')

# Kept in case a later cell reads it; it is no longer used to label the legend.
unique_labels = trimmed_df['intervention_group'].unique()

# Restyle seaborn's own legend. The previous plt.legend(labels=unique_labels)
# call rebuilt the legend by matching labels to artists in draw order, which
# need not match the order of .unique() and could swap the group labels.
sns.move_legend(ax, 'upper right', title='Cohort', title_fontsize=13, fontsize=11)

# Horizontal grid lines for readability
ax.grid(axis='y')

plt.show()

## RQ2: Does improved regularity in activity lead to better end-of-course grades?

In [None]:
# Every entry in the quiz ("Checks") folder; one workbook per cohort is expected.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
QUIZ_DIR = 'C:/Users/Katharina Teich/Desktop/Moodle Logdaten_for Conrad/RQ2/Checks (Quizze)'
fs = glob.glob(QUIZ_DIR + '/*')

In [None]:
# Read every quiz workbook and tag its rows with the cohort parsed from the path.
dfs = []
for f in fs:
    tmp = pd.read_excel(f)
    # Cohort id = text between the last 'Kohorte ' and the following '_' in the path.
    tmp['cohort'] = f.split('Kohorte ')[-1].split('_')[0]
    # Same "<user>-<cohort>" key format as the activity data, so both tables can be joined.
    tmp['user'] = tmp['User'] + '-' + tmp['cohort']
    dfs.append(tmp)

# ignore_index=True gives the combined frame a unique 0..n-1 index; without it
# every workbook contributes its own 0..k labels and the row index has duplicates.
df_outcome = pd.concat(dfs, ignore_index=True)

In [None]:
# Median gap (seconds) per user and phase, restricted to the two learning phases.
median_gap_by_phase = df2.groupby(['user', 'phase'])['time_diff'].median().reset_index()
in_learning_phase = median_gap_by_phase['phase'].isin(['Lernphase 1', 'Lernphase 2'])
df_valid = median_gap_by_phase.loc[in_learning_phase].copy()

In [None]:
import pandas as pd
import numpy as np

# Maximum attainable points per quiz column. Each value is the divisor used
# below to turn a raw quiz score into a 0-1 share of the available points.
max_points = {
    'Check 1': 12,
    'Check 2': 12,
    'Check 3': 27,
    'Check 4': 13,
    'Check 5': 16,
    'Check 6': 9
}

def standardize(x, max_val):
    """Return `x` as a share of `max_val`.

    `x` is first converted with ``pd.to_numeric(errors='coerce')``, so entries
    that cannot be parsed as numbers (for example a '-' placeholder) become NaN.
    The numeric values are then divided by `max_val`.
    """
    numeric_scores = pd.to_numeric(x, errors='coerce')
    return numeric_scores / max_val

# Convert every quiz column to a 0-1 share of its maximum. standardize() coerces
# non-numeric entries such as the '-' placeholder to NaN, so no separate
# replace step is required.
# NOTE: the columns are overwritten, so re-running this cell divides them again.
for quiz, max_val in max_points.items():
    df_outcome[quiz] = standardize(df_outcome[quiz], max_val)

# Mean imputation: in every column whose name contains 'Check', fill missing
# values with that column's own mean.
check_columns = [name for name in df_outcome.columns if 'Check' in name]

for check_name in check_columns:
    check_scores = df_outcome[check_name]
    df_outcome[check_name] = check_scores.fillna(check_scores.mean())

In [None]:
# Mean standardized score per learning phase:
# Checks 1-3 form 'Lernphase 1', Checks 4-6 form 'Lernphase 2'.
phase_checks = {
    'Lernphase 1': ('Check 1', 'Check 2', 'Check 3'),
    'Lernphase 2': ('Check 4', 'Check 5', 'Check 6'),
}
for phase_name, (first, second, third) in phase_checks.items():
    df_outcome[phase_name] = (
        df_outcome[first].astype(float)
        + df_outcome[second].astype(float)
        + df_outcome[third].astype(float)
    ) / 3

In [None]:
# Keep only the join key and the two per-phase scores.
df_melt = df_outcome.loc[:, ['user', 'Lernphase 1', 'Lernphase 2']]

In [None]:
import pandas as pd

# Wide -> long: one row per (user, phase), with the phase score in `outcome`.
df_join = pd.melt(
    df_melt,
    id_vars=['user'],
    value_vars=['Lernphase 1', 'Lernphase 2'],
    var_name='phase',
    value_name='outcome',
)
 

In [None]:
# Attach the quiz outcome to every (user, phase) median gap. This is a left join,
# so rows of df_valid without a matching outcome are kept with outcome = NaN.
df_valid_full = pd.merge(df_valid, df_join, how='left', on=['user', 'phase']).copy()

In [None]:
# Replace the median gap with log(1 + gap).
# NOTE: this overwrites `time_diff` in place, so re-running the cell applies the
# log transform a second time — re-run the merge cell above first.
df_valid_full['time_diff'] = np.log1p(df_valid_full['time_diff'])

In [None]:
# z-score of `time_diff` (log-transformed by the preceding cell); ddof=0 is the
# population standard deviation, which is what np.std computes.
gap_values = df_valid_full['time_diff']
df_valid_full['time_diff_scaled'] = (gap_values - gap_values.mean()) / gap_values.std(ddof=0)

In [None]:
# z-score of the quiz outcome; ddof=0 is the population standard deviation,
# which is what np.std computes. Missing outcomes are skipped in mean/std and stay NaN.
outcome_values = df_valid_full['outcome']
df_valid_full['outcome_scaled'] = (outcome_values - outcome_values.mean()) / outcome_values.std(ddof=0)

## Grade prediction model

In [None]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Treat the grouping key as a categorical variable. The assignment target must be
# the DataFrame column: the previous line assigned to the bare list literal
# ['user'], which is a SyntaxError and stopped the whole cell from running.
df_valid_full['user'] = df_valid_full['user'].astype('category')

# Linear mixed model with a random intercept per user: standardized quiz outcome
# regressed on the standardized (log) median gap between activities.
model = smf.mixedlm("outcome_scaled ~ time_diff_scaled", data=df_valid_full, groups="user")
result = model.fit()

# Print the summary of the model
print(result.summary())


## Scatter plot

In [None]:
import matplotlib.pyplot as plt

# One figure / one axes, addressed explicitly
fig, ax = plt.subplots(figsize=(8, 6))

# One marker per row of df_valid_full (a user in one learning phase):
# x = standardized log gap, y = unscaled share of quiz points.
ax.plot(df_valid_full['time_diff_scaled'], df_valid_full['outcome'], 'o', markersize=4, alpha=0.8)

# Axis labels
ax.set_xlabel('Regularity in Activity (Log Scale, Z-Score)', fontsize=12)
ax.set_ylabel('Percentage Correct on Quizzes', fontsize=12)

# y-axis runs from 0 to 1 and is labelled in percent
ax.set_ylim(0, 1)
ax.set_yticks([0, 0.2, 0.4, 0.6, 0.8, 1.0])
ax.set_yticklabels(['0%', '20%', '40%', '60%', '80%', '100%'])

# Same tick label size on both axes
ax.tick_params(axis='both', labelsize=10)

# Dashed grid for readability
ax.grid(True, linestyle='--', alpha=0.6)

ax.set_title('Relationship Between Regularity in Activity and Quiz Performance', fontsize=14)

# No legend: the figure contains a single, unlabelled series, so the previous
# plt.legend(...) call had nothing to show and only triggered matplotlib's
# "no artists with labels" warning.

fig.tight_layout()

# Save before showing, so the written file contains the finished figure
fig.savefig('regularity_vs_quiz_performance.png', dpi=300)

plt.show()
