#### Import Required Libraries 

In [1]:
# Import required libraries
import pandas as pd
import numpy as np
from collections import Counter

#### Load Dataset

In [3]:
# Load dataset
df = pd.read_excel('student_data.xlsx')

# Show all columns when displaying DataFrames
pd.set_option('display.max_columns', None)
df.head()

Unnamed: 0,student_id,course,student_cohort,academic_status,failed_subjects,study_skills(attended),referral,pp_meeting,self_assessment,readiness_assessment_results,follow_up,follow_up_type,subject_1,subject_1_assess_1,subject_1_assess_2,subject_1_assess_3,subject_1_assess_4,attendance_1,learn_jcu_issues_1,lecturer_referral_1,subject_2,subject_2_assess_1,subject_2_assess_2,subject_2_assess_3,subject_2_assess_4,attendance_2,learn_jcu_issues_2,lecturer_referral_2,subject_3,subject_3_assess_1,subject_3_assess_2,subject_3_assess_3,subject_4_assess_4,attendance_3,learn_jcu_issues_3,lecturer_referral_3,comments,identified_issues
0,1,Master of Business Administration,SRI to JCUB,At Risk,,Essential Skills,Student Counsellor,Booked,Yes,L/G:9/10 N:5/10 R:8/10,Yes,No Reply,,7.7,27.22,26.51,50.09,20,Access,Concern for Welfare,,69.33,44.44,11.39,18.78,41,Access,Attendance,,39.02,17.71,94.5,9.83,89,Access,Non Submission,Week 8. Student re-engaged with tutorials. Sub...,Late Enrollment
1,2,Master of Business Administration,Continuing,Excluded,,Referencing,Student Advocate,Not relevant,Yes,L/G:9/10 N:5/10 R:8/10,No,Phone,,31.14,54.66,81.72,1.96,9,Access,Non Submission,,95.23,48.61,14.68,44.77,66,No Access,Concern for Welfare,,3.72,38.52,25.8,11.8,100,No Access,Non Submission,booked to see a doctor. Week 5. Student contac...,Poor time management
2,3,Master of Business Administration,First year,At Risk,,Writing,Enrollment,Attended,Yes,L/G:9/10 N:5/10 R:8/10,Yes,Phone,,39.09,75.39,84.62,82.66,51,No Access,Attendance,,86.2,98.8,71.57,96.08,64,No Access,Concern for Welfare,,77.77,77.27,81.95,62.35,42,Access,Attendance,Week 8. Student re-engaged with tutorials. Sub...,Poor time management
3,4,Master of Business Administration,New,Excluded,,Essential Skills,Student Advocate,Attended,No,L/G:9/10 N:5/10 R:8/10,No,F2F,,88.59,84.36,3.79,26.05,34,Access,Concern for Welfare,,7.69,25.96,49.83,17.77,57,Access,Non Submission,,95.41,8.93,3.21,99.15,51,Access,Concern for Welfare,Week 6. Student submitted assessment late. Ext...,Death in family
4,5,Master of Business Administration,Continuing,At Risk,,Essential Skills and Reading,Student Counsellor,Not relevant,No,L/G:9/10 N:5/10 R:8/10,Yes,No Reply,,0.98,13.8,53.4,39.69,45,Access,Attendance,,60.51,57.47,15.8,33.59,3,No Access,Concern for Welfare,,90.12,52.18,36.05,20.65,19,Access,Concern for Welfare,Week 3 late enrolment. Student finding it diff...,Sickness


#### Data Cleaning and Exploration

In [4]:
# Print a concise summary of the DataFrame, including column names, data types, non-null counts, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 698 entries, 0 to 697
Data columns (total 38 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   student_id                    698 non-null    int64  
 1   course                        698 non-null    object 
 2   student_cohort                698 non-null    object 
 3   academic_status               698 non-null    object 
 4   failed_subjects               0 non-null      float64
 5   study_skills(attended)        698 non-null    object 
 6   referral                      698 non-null    object 
 7   pp_meeting                    698 non-null    object 
 8   self_assessment               698 non-null    object 
 9   readiness_assessment_results  698 non-null    object 
 10  follow_up                     698 non-null    object 
 11  follow_up_type                698 non-null    object 
 12  subject_1                     0 non-null      float64
 13  subje

In [5]:
# Rename Columns
df.rename(columns={'subject_4_assess_4':'subject_3_assess_4'}, inplace=True)
df.columns

Index(['student_id', 'course', 'student_cohort', 'academic_status',
       'failed_subjects', 'study_skills(attended)', 'referral', 'pp_meeting',
       'self_assessment', 'readiness_assessment_results', 'follow_up',
       'follow_up_type', 'subject_1', 'subject_1_assess_1',
       'subject_1_assess_2', 'subject_1_assess_3', 'subject_1_assess_4',
       'attendance_1', 'learn_jcu_issues_1', 'lecturer_referral_1',
       'subject_2', 'subject_2_assess_1', 'subject_2_assess_2',
       'subject_2_assess_3', 'subject_2_assess_4', 'attendance_2',
       'learn_jcu_issues_2', 'lecturer_referral_2', 'subject_3',
       'subject_3_assess_1', 'subject_3_assess_2', 'subject_3_assess_3',
       'subject_3_assess_4', 'attendance_3', 'learn_jcu_issues_3',
       'lecturer_referral_3', 'comments', 'identified_issues'],
      dtype='object')

##### Clean Course Data
This section prepares the Course data for analysis by handling missing values, correcting inconsistencies, and ensuring all course records are valid and comparable.

In [6]:
df['course'] = df['course'].str.strip().str.lower()
df['course'].unique()

array(['master of business administration',
       'master of education - master of business administration',
       'master of information technology', 'bachelor of business',
       'master of engineering management',
       'bachelor of information technology',
       'master of professional accounting',
       'master of international tourism and hospitality management',
       'bachelor of tourism, hospitality and events',
       'master of professional account and master of business administration',
       'bachelor of commerce',
       'master of international tourism and hospitality management - master of business administration',
       'master of data science (professional)',
       'master of information technology - master of business administration',
       'postgraduate qualifying program - business'], dtype=object)

In [7]:
# Map subject code to subject names using Dictionary
subject_dict = {
    "BU1002": "Accounting for Decision Making",
    "BU1003": "Principles of Economics",
    "BU1007": "Principles of Data Analysis for Business",
    "BU1112": "Business Law",
    "BX2011": "Foundation of Accounting Principles",
    "BX2014": "Principles of Finance",
    "CP1401": "Fundamentals of Problem Solving and Programming I",
    "CP1402": "Internet Fundamentals",
    "CP1404": "Programming II",
    "TO1008": "Introduction to Tourism, Hospitality and Events Management",
    "TO2117": "Food and Beverage Management",
    "TO3052": "Experience Design for Tourism Hospitality and Events",
    "LB5113": "Corporate Strategy",
    "LB5202": "Marketing Essentials",
    "LB5205": "People in Organisations",
    "MA5831": "Advanced Data Management and Analysis using SAS",
    "MA5840": "Data Science and Strategic Decision Making for Business",
    "MA5851": "Data Science Master Class 1",
    "ED5097": "Research Design and Proposal",
    "ED5880": "Educational Leadership",
    "EG5200": "Career Planning",
    "EG5220": "Advanced Asset Management and Reliability",
    "EG5310": "Professional Placement",
    "CP5046": "ICT Project 1: Analysis and Design",
    "CP5047": "ICT Project 2: Implementation and Commissioning",
    "CP5503": "Enterprise Database Systems - Oracle",
    "TO5101": "Tourism Systems Analysis",
    "TO5103": "Global Destinations and Competitiveness",
    "TO5104": "Tourist Management Strategies",
    "CO5117": "Introduction to Accounting",
    "CO5103": "Management Accounting",
    "CO5109": "Corporate Finance",
    "LB5203": "Sustainable Enterprise",
    "LB5212": "Accounting and Finance for Managers"
}


In [8]:
# Get the subject name from the subject code
subject_dict['BU1007'] # Output: Principles of Data Analysis for Business 

'Principles of Data Analysis for Business'

In [9]:
# Map each course to a list of three subject codes
course_subjects = {
    'Bachelor of Business': ['BU1002', 'BU1003', 'BU1007'],
    'Bachelor of Commerce': ['BU1112', 'BX2011', 'BX2014'],
    'Bachelor of Information Technology': ['CP1401', 'CP1402', 'CP1404'],
    'Bachelor of Tourism, Hospitality and Events': ['TO1008', 'TO2117', 'TO3052'],
    'Master of Business Administration': ['LB5113', 'LB5202', 'LB5205'],
    'Master of Data Science (Professional)': ['MA5831', 'MA5840', 'MA5851'],
    'Master of Education - Master of Business Administration': ['ED5097', 'ED5880', 'LB5113'],
    'Master of Engineering Management': ['EG5200', 'EG5220', 'EG5310'],
    'Master of Information Technology': ['CP5046', 'CP5047', 'CP5503'],
    'Master of Information Technology - Master of Business Administration': ['CP5046', 'LB5113', 'LB5202'],
    'Master of International Tourism and Hospitality Management': ['TO5101', 'TO5103', 'TO5104'],
    'Master of International Tourism and Hospitality Management - Master of Business Administration': ['TO5101', 'LB5113', 'LB5202'],
    'Master of Professional Accounting': ['CO5117', 'CO5103', 'CO5109'],
    'Master of Professional Accounting - Master of Business Administration': ['CO5117', 'CO5103', 'LB5113'],
    'Postgraduate Qualifying Program - Business': ['LB5202', 'LB5203', 'LB5212']
}


In [11]:
# Create a new dictionary with cleaned course names as keys
# For each key-value pair in the original course_subjects dictionary:
#   Remove any leading/trailing spaces from the course name (key) using strip()
#   Convert the course name to lowercase using lower()
#   Keep the value (list of subjects) unchanged
cleaned_course_subjects = {k.strip().lower(): v for k, v in course_subjects.items()}
cleaned_course_subjects

{'bachelor of business': ['BU1002', 'BU1003', 'BU1007'],
 'bachelor of commerce': ['BU1112', 'BX2011', 'BX2014'],
 'bachelor of information technology': ['CP1401', 'CP1402', 'CP1404'],
 'bachelor of tourism, hospitality and events': ['TO1008', 'TO2117', 'TO3052'],
 'master of business administration': ['LB5113', 'LB5202', 'LB5205'],
 'master of data science (professional)': ['MA5831', 'MA5840', 'MA5851'],
 'master of education - master of business administration': ['ED5097',
  'ED5880',
  'LB5113'],
 'master of engineering management': ['EG5200', 'EG5220', 'EG5310'],
 'master of information technology': ['CP5046', 'CP5047', 'CP5503'],
 'master of information technology - master of business administration': ['CP5046',
  'LB5113',
  'LB5202'],
 'master of international tourism and hospitality management': ['TO5101',
  'TO5103',
  'TO5104'],
 'master of international tourism and hospitality management - master of business administration': ['TO5101',
  'LB5113',
  'LB5202'],
 'master of pr

In [12]:
# Get unique course names in DataFrame
unique_courses = df['course'].unique()

# Find which course names are not keys in cleaned_course_subjects dictionary
courses_not_in_cleaned = [course for course in unique_courses if course not in cleaned_course_subjects]
courses_not_in_cleaned

['master of professional account and master of business administration']

In [13]:
# Replace invalid course name to a course name in cleaned course subjects dict
df['course'].replace('master of professional account and master of business administration', 'master of professional accounting - master of business administration', inplace=True)

In [14]:
# check if the course value has been updated
df['course'].unique()

array(['master of business administration',
       'master of education - master of business administration',
       'master of information technology', 'bachelor of business',
       'master of engineering management',
       'bachelor of information technology',
       'master of professional accounting',
       'master of international tourism and hospitality management',
       'bachelor of tourism, hospitality and events',
       'master of professional accounting - master of business administration',
       'bachelor of commerce',
       'master of international tourism and hospitality management - master of business administration',
       'master of data science (professional)',
       'master of information technology - master of business administration',
       'postgraduate qualifying program - business'], dtype=object)

In [15]:
# Assigns subject codes to each student based on their course.
# For each row, looks up the course in the cleaned_course_subjects dictionary.
# If the course is found, fills subject_1, subject_2, and subject_3 with the corresponding subject codes.
# If the course is not found, fills these columns with None.
def assign_subjects(row):
    subjects = cleaned_course_subjects.get(row['course'], [None, None, None])
    row['subject_1'], row['subject_2'], row['subject_3'] = subjects
    return row

df = df.apply(assign_subjects, axis=1)

In [16]:
# Check for null values in subject 1 to 3
df[['subject_1', 'subject_2', 'subject_3']].isnull().sum()

subject_1    0
subject_2    0
subject_3    0
dtype: int64

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 698 entries, 0 to 697
Data columns (total 38 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   student_id                    698 non-null    int64  
 1   course                        698 non-null    object 
 2   student_cohort                698 non-null    object 
 3   academic_status               698 non-null    object 
 4   failed_subjects               0 non-null      float64
 5   study_skills(attended)        698 non-null    object 
 6   referral                      698 non-null    object 
 7   pp_meeting                    698 non-null    object 
 8   self_assessment               698 non-null    object 
 9   readiness_assessment_results  698 non-null    object 
 10  follow_up                     698 non-null    object 
 11  follow_up_type                698 non-null    object 
 12  subject_1                     698 non-null    object 
 13  subje

In [18]:
df['course'].value_counts()

master of business administration                                                                 102
master of education - master of business administration                                            90
master of information technology                                                                   77
bachelor of business                                                                               71
master of engineering management                                                                   69
bachelor of information technology                                                                 56
master of professional accounting                                                                  55
master of international tourism and hospitality management                                         50
bachelor of tourism, hospitality and events                                                        30
master of professional accounting - master of business administration             

In [19]:
df.head()

Unnamed: 0,student_id,course,student_cohort,academic_status,failed_subjects,study_skills(attended),referral,pp_meeting,self_assessment,readiness_assessment_results,follow_up,follow_up_type,subject_1,subject_1_assess_1,subject_1_assess_2,subject_1_assess_3,subject_1_assess_4,attendance_1,learn_jcu_issues_1,lecturer_referral_1,subject_2,subject_2_assess_1,subject_2_assess_2,subject_2_assess_3,subject_2_assess_4,attendance_2,learn_jcu_issues_2,lecturer_referral_2,subject_3,subject_3_assess_1,subject_3_assess_2,subject_3_assess_3,subject_3_assess_4,attendance_3,learn_jcu_issues_3,lecturer_referral_3,comments,identified_issues
0,1,master of business administration,SRI to JCUB,At Risk,,Essential Skills,Student Counsellor,Booked,Yes,L/G:9/10 N:5/10 R:8/10,Yes,No Reply,LB5113,7.7,27.22,26.51,50.09,20,Access,Concern for Welfare,LB5202,69.33,44.44,11.39,18.78,41,Access,Attendance,LB5205,39.02,17.71,94.5,9.83,89,Access,Non Submission,Week 8. Student re-engaged with tutorials. Sub...,Late Enrollment
1,2,master of business administration,Continuing,Excluded,,Referencing,Student Advocate,Not relevant,Yes,L/G:9/10 N:5/10 R:8/10,No,Phone,LB5113,31.14,54.66,81.72,1.96,9,Access,Non Submission,LB5202,95.23,48.61,14.68,44.77,66,No Access,Concern for Welfare,LB5205,3.72,38.52,25.8,11.8,100,No Access,Non Submission,booked to see a doctor. Week 5. Student contac...,Poor time management
2,3,master of business administration,First year,At Risk,,Writing,Enrollment,Attended,Yes,L/G:9/10 N:5/10 R:8/10,Yes,Phone,LB5113,39.09,75.39,84.62,82.66,51,No Access,Attendance,LB5202,86.2,98.8,71.57,96.08,64,No Access,Concern for Welfare,LB5205,77.77,77.27,81.95,62.35,42,Access,Attendance,Week 8. Student re-engaged with tutorials. Sub...,Poor time management
3,4,master of business administration,New,Excluded,,Essential Skills,Student Advocate,Attended,No,L/G:9/10 N:5/10 R:8/10,No,F2F,LB5113,88.59,84.36,3.79,26.05,34,Access,Concern for Welfare,LB5202,7.69,25.96,49.83,17.77,57,Access,Non Submission,LB5205,95.41,8.93,3.21,99.15,51,Access,Concern for Welfare,Week 6. Student submitted assessment late. Ext...,Death in family
4,5,master of business administration,Continuing,At Risk,,Essential Skills and Reading,Student Counsellor,Not relevant,No,L/G:9/10 N:5/10 R:8/10,Yes,No Reply,LB5113,0.98,13.8,53.4,39.69,45,Access,Attendance,LB5202,60.51,57.47,15.8,33.59,3,No Access,Concern for Welfare,LB5205,90.12,52.18,36.05,20.65,19,Access,Concern for Welfare,Week 3 late enrolment. Student finding it diff...,Sickness


In [20]:
df['student_cohort'].value_counts()

Return to Study    101
First year          93
Transferred         92
Continuing          89
Excluded            86
LOA                 85
New                 78
SRI to JCUB         74
Name: student_cohort, dtype: int64

In [21]:
# List of courses that falls under course group 'IT'
it_courses = [ 'master of information technology',
    'bachelor of information technology',
    'master of data science (professional)',
    'master of information technology - master of business administration',
] 

In [22]:
# Create a new column 'course_group' based on whether the course is in the IT list
df['course_group'] = df['course'].apply(
    lambda x: 'IT' if x in it_courses else 'Non-IT'
)


In [23]:
# Reset academic_status
df['academic_status'] = None

# Step 1: Decide total excluded count and IT excluded count
total_excluded = 5
excluded_in_it = 3
excluded_in_non_it = total_excluded - excluded_in_it  # 2

# Step 2: Assign cohorts to all students first (clear student_cohort if needed)
df['student_cohort'] = np.nan  # optional clear

# Find eligible indices by course_group to assign exclusions
it_indices = df[df['course_group'] == 'IT'].index.to_numpy()
non_it_indices = df[df['course_group'] == 'Non-IT'].index.to_numpy()

# Shuffle indices for randomness
np.random.seed(42)
it_excluded_indices = np.random.choice(it_indices, size=excluded_in_it, replace=False)
non_it_excluded_indices = np.random.choice(non_it_indices, size=excluded_in_non_it, replace=False)

# Assign 'Excluded' cohort and status for those selected
df.loc[it_excluded_indices, 'student_cohort'] = 'Excluded'
df.loc[non_it_excluded_indices, 'student_cohort'] = 'Excluded'

# Combine indices using pandas Index union and assign 'Excluded' status
excluded_union = pd.Index(it_excluded_indices).union(pd.Index(non_it_excluded_indices))
df.loc[excluded_union, 'academic_status'] = 'Excluded'

# Step 3: Assign cohorts to the other students (non-excluded)
remaining_indices = df.index.difference(excluded_union)
valid_cohorts = ['Return to Study', 'First year', 'Transferred', 'Continuing', 'LOA', 'New', 'SRI to JCUB']
df.loc[remaining_indices, 'student_cohort'] = np.random.choice(valid_cohorts, size=len(remaining_indices), replace=True)

# Step 4: Assign 'Satisfactory' to cohorts which are only allowed 'Satisfactory'
satisfactory_only_cohorts = ['SRI to JCUB', 'Transferred', 'New']
df.loc[df['student_cohort'].isin(satisfactory_only_cohorts), 'academic_status'] = 'Satisfactory'

# Step 5: Define your target counts
target_satisfactory = 621
target_satisfactory_it = 126
target_satisfactory_non_it = 495
target_ac = 36
target_cond = 36

# Step 6: Calculate remaining 'Satisfactory' to assign (excluding already assigned)
assigned_satisfactory = df['academic_status'].eq('Satisfactory').sum()
remaining_satisfactory = max(target_satisfactory - assigned_satisfactory, 0)

# Step 7: Filter remaining unassigned students excluding excluded cohort and satisfactory-only cohorts
remaining_mask = (
    df['academic_status'].isnull() &
    (~df['student_cohort'].isin(satisfactory_only_cohorts + ['Excluded']))
)
remaining_indices = df[remaining_mask].index

# Split remaining_indices by course_group
remaining_it_indices = df.loc[remaining_indices][df.loc[remaining_indices, 'course_group'] == 'IT'].index
remaining_non_it_indices = df.loc[remaining_indices][df.loc[remaining_indices, 'course_group'] == 'Non-IT'].index

# Calculate how many satisfactory assigned already in each course_group
assigned_satisfactory_it = df[
    (df['academic_status'] == 'Satisfactory') & (df['course_group'] == 'IT')
].shape[0]
assigned_satisfactory_non_it = df[
    (df['academic_status'] == 'Satisfactory') & (df['course_group'] == 'Non-IT')
].shape[0]

# Remaining satisfactory in each course_group
remaining_satisfactory_it = max(target_satisfactory_it - assigned_satisfactory_it, 0)
remaining_satisfactory_non_it = max(target_satisfactory_non_it - assigned_satisfactory_non_it, 0)

# Proportionally split Academic Caution and Conditional statuses
total_remaining_ac_cond = target_ac + target_cond
total_remaining_students = len(remaining_it_indices) + len(remaining_non_it_indices)

prop_it = len(remaining_it_indices) / total_remaining_students if total_remaining_students > 0 else 0
prop_non_it = len(remaining_non_it_indices) / total_remaining_students if total_remaining_students > 0 else 0

ac_it = int(round(target_ac * prop_it))
ac_non_it = target_ac - ac_it

cond_it = int(round(target_cond * prop_it))
cond_non_it = target_cond - cond_it

# Build status lists
status_it = (
    ['Satisfactory'] * remaining_satisfactory_it +
    ['Academic Caution'] * ac_it +
    ['Conditional'] * cond_it
)

status_non_it = (
    ['Satisfactory'] * remaining_satisfactory_non_it +
    ['Academic Caution'] * ac_non_it +
    ['Conditional'] * cond_non_it
)

# Function to adjust length (padding/truncation)
def adjust_length(lst, desired_len, filler):
    if len(lst) > desired_len:
        return lst[:desired_len]
    elif len(lst) < desired_len:
        return lst + [filler] * (desired_len - len(lst))
    else:
        return lst

status_it = adjust_length(status_it, len(remaining_it_indices), 'Academic Caution')
status_non_it = adjust_length(status_non_it, len(remaining_non_it_indices), 'Academic Caution')

# Shuffle status lists
np.random.seed(42)
np.random.shuffle(status_it)
np.random.shuffle(status_non_it)

# Assign statuses back
df.loc[remaining_it_indices, 'academic_status'] = status_it
df.loc[remaining_non_it_indices, 'academic_status'] = status_non_it

# Final checks
#Group by course_group and academic_status and count
status_distribution = df.groupby(['course_group', 'academic_status']).size().unstack(fill_value=0)

print("Academic status distribution by course_group:\n")
print(status_distribution)

Academic status distribution by course_group:

academic_status  Academic Caution  Conditional  Excluded  Satisfactory
course_group                                                          
IT                             21            7         3           126
Non-IT                         29           15         2           495


###### Clean the academic status column


In [24]:
# Standardize the values (strip whitespace and title case) 
# df['academic_status'] = df['academic_status'].str.strip().str.title()

In [25]:
# # Shuffle the DataFrame
# df = df.sample(frac=1, random_state=42).reset_index(drop=True)
# # Define the target distribution
# target_counts = {
#     'Satisfactory': 524,
#     'Academic Caution': 70,
#     'Conditional': 99,
#     'Excluded': 5
# }

In [26]:
# # Create a new column with the desired academic status
# new_statuses = (
#     ['Satisfactory'] * target_counts['Satisfactory'] +
#     ['Academic Caution'] * target_counts['Academic Caution'] +
#     ['Conditional'] * target_counts['Conditional'] +
#     ['Excluded'] * target_counts['Excluded']
# )

In [27]:
# # Assign to the DataFrame
# df['academic_status'] = new_statuses

# # Confirm the new distribution
# df['academic_status'].value_counts()

###### Clean the student cohort column

In [28]:
df['student_cohort'].value_counts()

New                109
Return to Study    107
SRI to JCUB        104
Transferred        102
LOA                 99
First year          92
Continuing          80
Excluded             5
Name: student_cohort, dtype: int64

In [29]:
df['academic_status'].value_counts()

Satisfactory        621
Academic Caution     50
Conditional          22
Excluded              5
Name: academic_status, dtype: int64

###### Group Courses into IT and Non IT

In [30]:
# List of courses that falls under course group 'IT'
# it_courses = [ 'master of information technology',
#     'bachelor of information technology',
#     'master of data science (professional)',
#     'master of information technology - master of business administration',
# ] 

In [31]:
# Create a new column 'course_group' based on whether the course is in the IT list
# df['course_group'] = df['course'].apply(
#     lambda x: 'IT' if x in it_courses else 'Non-IT'
# )


###### Generate records of failed subjects for the course IT


In [32]:
# Masks for clarity
mask_satisfactory_it = (df['academic_status'] == 'Satisfactory') & (df['course_group'] == 'IT')
mask_it = (df['course_group'] == 'IT')

mask_satisfactory_non_it = (df['academic_status'] == 'Satisfactory') & (df['course_group'] == 'Non-IT')
mask_non_it = (df['course_group'] == 'Non-IT')

mask_excluded = (df['academic_status'] == 'Excluded')

# Excluded indices
excluded_it_indices = df[mask_excluded & mask_it].index
excluded_non_it_indices = df[mask_excluded & mask_non_it].index

# ---- Assign failed_subjects > 3 ----

# IT group
num_it_excluded = len(excluded_it_indices)
num_it_failed_gt3 = 10
num_it_remaining_gt3 = num_it_failed_gt3 - num_it_excluded

if num_it_remaining_gt3 < 0:
    raise ValueError(f"IT Excluded students ({num_it_excluded}) exceed 10 total failed_subjects>3 limit.")

eligible_it_indices = df[mask_it & ~mask_excluded].index
np.random.seed(42)
it_gt3_extra = np.random.choice(eligible_it_indices, size=num_it_remaining_gt3, replace=False)
df.loc[excluded_it_indices.union(it_gt3_extra), 'failed_subjects'] = 4

# Non-IT group
num_non_it_excluded = len(excluded_non_it_indices)
num_non_it_failed_gt3 = 7
num_non_it_remaining_gt3 = num_non_it_failed_gt3 - num_non_it_excluded

if num_non_it_remaining_gt3 < 0:
    raise ValueError(f"Non-IT Excluded students ({num_non_it_excluded}) exceed 7 total failed_subjects>3 limit.")

eligible_non_it_indices = df[mask_non_it & ~mask_excluded].index
non_it_gt3_extra = np.random.choice(eligible_non_it_indices, size=num_non_it_remaining_gt3, replace=False)
df.loc[excluded_non_it_indices.union(non_it_gt3_extra), 'failed_subjects'] = 4

# ---- IT Satisfactory: assign failed_subjects = 0 to most, leave 2-3 outliers with 1 or 2 ----

satisfactory_it_indices = df[mask_satisfactory_it].index
num_outliers_it_satisfactory = 3

np.random.seed(101)
outliers_it_satisfactory = np.random.choice(satisfactory_it_indices, size=num_outliers_it_satisfactory, replace=False)
df.loc[satisfactory_it_indices.difference(outliers_it_satisfactory), 'failed_subjects'] = 0
df.loc[outliers_it_satisfactory, 'failed_subjects'] = np.random.choice([1, 2], size=num_outliers_it_satisfactory)

# ---- IT remaining students (excluding assigned above) assign failed_subjects 1 or 2 with 2-3 zeros as outliers ----

assigned_it_failed = df.loc[mask_it, 'failed_subjects'].notna()
remaining_it = df[mask_it & ~assigned_it_failed].index
num_zeros_it_remaining = 3

np.random.seed(102)
zeros_it_remaining = np.random.choice(remaining_it, size=num_zeros_it_remaining, replace=False)
df.loc[zeros_it_remaining, 'failed_subjects'] = 0

remaining_it_other = remaining_it.difference(zeros_it_remaining)
df.loc[remaining_it_other, 'failed_subjects'] = np.random.choice([1, 2], size=len(remaining_it_other), replace=True)

# ---- Non-IT Satisfactory: assign failed_subjects = 0 to most, leave 2-3 outliers with 1 or 2 ----

satisfactory_non_it_indices = df[mask_satisfactory_non_it].index
num_outliers_non_it_satisfactory = 3

np.random.seed(103)
outliers_non_it_satisfactory = np.random.choice(satisfactory_non_it_indices, size=num_outliers_non_it_satisfactory, replace=False)
df.loc[satisfactory_non_it_indices.difference(outliers_non_it_satisfactory), 'failed_subjects'] = 0
df.loc[outliers_non_it_satisfactory, 'failed_subjects'] = np.random.choice([1, 2], size=num_outliers_non_it_satisfactory)

# ---- Non-IT remaining students (excluding assigned above) assign failed_subjects 1 or 2 with 2-3 zeros as outliers ----

assigned_non_it_failed = df.loc[mask_non_it, 'failed_subjects'].notna()
remaining_non_it = df[mask_non_it & ~assigned_non_it_failed].index
num_zeros_non_it_remaining = 3

np.random.seed(104)
zeros_non_it_remaining = np.random.choice(remaining_non_it, size=num_zeros_non_it_remaining, replace=False)
df.loc[zeros_non_it_remaining, 'failed_subjects'] = 0

remaining_non_it_other = remaining_non_it.difference(zeros_non_it_remaining)
df.loc[remaining_non_it_other, 'failed_subjects'] = np.random.choice([1, 2], size=len(remaining_non_it_other), replace=True)

# ---- Summary ----

print(f"Total students with failed_subjects > 3: {(df['failed_subjects'] > 3).sum()}")
print(f" - IT failed_subjects > 3: {df[(df['course_group'] == 'IT') & (df['failed_subjects'] > 3)].shape[0]}")
print(f" - Non-IT failed_subjects > 3: {df[(df['course_group'] == 'Non-IT') & (df['failed_subjects'] > 3)].shape[0]}")

print(f"IT Satisfactory students with failed_subjects = 0: {(df[mask_satisfactory_it & (df['failed_subjects'] == 0)]).shape[0]}")
print(f"IT Satisfactory outliers with failed_subjects 1 or 2: {(df[mask_satisfactory_it & (df['failed_subjects'] > 0)]).shape[0]}")

print(f"Non-IT Satisfactory students with failed_subjects = 0: {(df[mask_satisfactory_non_it & (df['failed_subjects'] == 0)]).shape[0]}")
print(f"Non-IT Satisfactory outliers with failed_subjects 1 or 2: {(df[mask_satisfactory_non_it & (df['failed_subjects'] > 0)]).shape[0]}")

print(f"Failed subjects distribution for IT students:\n{df[mask_it]['failed_subjects'].value_counts()}")
print(f"Failed subjects distribution for Non-IT students:\n{df[mask_non_it]['failed_subjects'].value_counts()}")


Total students with failed_subjects > 3: 8
 - IT failed_subjects > 3: 6
 - Non-IT failed_subjects > 3: 2
IT Satisfactory students with failed_subjects = 0: 123
IT Satisfactory outliers with failed_subjects 1 or 2: 3
Non-IT Satisfactory students with failed_subjects = 0: 492
Non-IT Satisfactory outliers with failed_subjects 1 or 2: 3
Failed subjects distribution for IT students:
0.0    126
2.0     15
1.0     10
4.0      6
Name: failed_subjects, dtype: int64
Failed subjects distribution for Non-IT students:
0.0    495
1.0     22
2.0     22
4.0      2
Name: failed_subjects, dtype: int64


In [33]:
# Get the row numbers (indices) for each group
# it_indices = df[df['course_group']== 'IT'].index

# # Subset of IT course
# # Check the distribution number of academic status (Course IT) 
# it_df = df.loc[it_indices].copy()
# it_df['academic_status'].value_counts()

In [34]:
# # Select 6 students from IT students with the academic status 'Satisfactory'
# satisfactory_it = it_df[it_df['academic_status'] == 'Satisfactory'].index
# # Randomly choose indices to replace
# reassign_academic_status = np.random.choice(satisfactory_it, size = 6, replace = False)

In [35]:
# # Reassign 6 Students from Satisfactory to Other Academic Statuses
# statuses_to_assign = ['Conditional', 'Academic Caution', 'Excluded']
# for i, status in enumerate(statuses_to_assign):
#     df.loc[reassign_academic_status[i*3:(i+1)*3], 'academic_status'] = status

In [36]:
# # Check updated value counts for IT academic status
# it_df = df.loc[it_indices].copy()
# it_df['academic_status'].value_counts()

In [37]:
# # Assign 0 failed subjects to Satisfactory IT students
# satisfactory_mask = it_df['academic_status'] == 'Satisfactory'
# it_df.loc[satisfactory_mask, 'failed_subjects'] = 0

# # Identify non-Satisfactory (i.e., failed) students
# failed_mask = ~satisfactory_mask
# failed_it_indices = it_df[failed_mask].index

# # Assign >3 failed subjects to 10 students (1 of them should be Excluded)

# # First, get the Excluded student
# excluded_mask = it_df['academic_status'] == 'Excluded'
# excluded_index = it_df[excluded_mask].index

# # Assign random failed_subjects = 6–8 to the Excluded student
# it_df.loc[excluded_index, 'failed_subjects'] = np.random.randint(6, 9, size=len(excluded_index))

# # Now select 9 more students (excluding Excluded) to get 4–5 failed subjects
# eligible_for_high_fail = failed_it_indices.difference(excluded_index)

# # Now select 9 more students (excluding Excluded) to get 4–5 failed subjects
# eligible_for_high_fail = failed_it_indices.difference(excluded_index)

# # Select 9 more randomly to total 10 students failing >3 subjects
# more_than_3_indices = np.random.choice(eligible_for_high_fail, size=9, replace=False)
# it_df.loc[more_than_3_indices, 'failed_subjects'] = np.random.randint(4, 6, size=9)

# # STEP 5: Remaining IT failed students — distribute 1–2 failed subjects
# remaining_failed = eligible_for_high_fail.difference(more_than_3_indices)

# # 21 failed students remaining
# # Randomly split into two groups for 1 and 2 failed subjects
# n_remain = len(remaining_failed)  # 21
# half = n_remain // 2  # 10 or 11

# remaining_failed = list(remaining_failed)
# np.random.shuffle(remaining_failed)

# low_1_fail = remaining_failed[:half]
# low_2_fail = remaining_failed[half:]

# # Assign 1s and 2s
# it_df.loc[low_1_fail, 'failed_subjects'] = 1
# it_df.loc[low_2_fail, 'failed_subjects'] = 2

In [38]:
# Update the failed_subjects column in the main data frame df
# df.loc[it_df.index, 'failed_subjects'] = it_df['failed_subjects']

In [39]:
# # Check only the IT students
# it_students = df.loc[it_indices]

# # 1. Total IT students who failed (not Satisfactory)
# it_failed = it_students[it_students['academic_status'] != 'Satisfactory']

# # 2. Total with non-zero failed_subjects
# non_zero_fails = it_failed[it_failed['failed_subjects'] > 0]
# print("Total assigned failed_subjects to IT failed students:", len(non_zero_fails))  # 31

# # 3. Number of students with >3 failed subjects
# gt3 = non_zero_fails[non_zero_fails['failed_subjects'] > 3]
# print("Number with >3 failed subjects:", len(gt3))  # 10

# # 4. Number with <=3 failed subjects
# lte3 = non_zero_fails[non_zero_fails['failed_subjects'] <= 3]
# print("Number with <=3 failed subjects:", len(lte3))  # 21

# # 5. Print failed_subjects distribution (should show a variety like 1, 2, 4, etc.)
# print("\nFailed Subjects Distribution:")
# print(non_zero_fails['failed_subjects'].value_counts().sort_index())

# # 6. Confirm no failed_subjects assigned to Satisfactory students
# sat_students = it_students[it_students['academic_status'] == 'Satisfactory']
# sats_with_fails = sat_students[sat_students['failed_subjects'] != 0]
# print("\nSatisfactory students incorrectly assigned fails:", len(sats_with_fails))  # 0


###### Generate records of failed subjects for the course Non-IT


In [40]:
# # Get non-IT indices
# nonit_indices = df[df['course_group']== 'Non-IT'].index

# # Subset of Non-IT course
# # Check the distribution number of academic status (Course Non-IT) 
# nonit_df = df.loc[nonit_indices].copy()
# nonit_df['academic_status'].value_counts()

In [41]:
# # CONTROL 'Excluded' COUNT TO 5
# current_excluded = (nonit_df['academic_status'] == 'Excluded').sum()
# desired = 5

# if current_excluded > desired:
#     # Too many: set some to 'Conditional' or similar
#     excluded = nonit_df[nonit_df['academic_status'] == 'Excluded'].index
#     to_remove = np.random.choice(excluded, current_excluded - desired, replace=False)
#     df.loc[to_remove, 'academic_status'] = 'Conditional'  # or 'Academic Caution'
# elif current_excluded < desired:
#     # Too few: set some from non-Excluded, non-Satisfactory
#     not_excluded = nonit_df[nonit_df['academic_status'] != 'Excluded'].index
#     candidates = not_excluded.intersection(nonit_df[nonit_df['academic_status'] != 'Satisfactory'].index)
#     to_add = np.random.choice(candidates, desired - current_excluded, replace=False)
#     df.loc[to_add, 'academic_status'] = 'Excluded'


In [42]:
# # Refresh
# nonit_df = df.loc[nonit_indices].copy()
# print("After setting exactly 5 'Excluded':")
# print(nonit_df['academic_status'].value_counts())

In [43]:
# # 3. NOW ASSIGN 'Satisfactory' TO NON-EXCLUDED, NON-SATISFACTORY STUDENTS
# non_satisfactory = nonit_df['academic_status'] != 'Satisfactory'
# non_excluded = nonit_df['academic_status'] != 'Excluded'
# eligible = nonit_df[non_satisfactory & non_excluded].index

# reassign_academic_status = np.random.choice(eligible, min(37, len(eligible)), replace=False)
# df.loc[reassign_academic_status, 'academic_status'] = 'Satisfactory'

# # Final counts
# nonit_df = df.loc[nonit_indices].copy()
# print("Final:")
# print(nonit_df['academic_status'].value_counts())

In [44]:
# # Assign 0 failed subjects to Satisfactory Non IT students
# satisfactory_mask = nonit_df['academic_status'] == 'Satisfactory'
# nonit_df.loc[satisfactory_mask, 'failed_subjects'] = 0

# # Identify non-Satisfactory (i.e., failed) students
# failed_mask = ~satisfactory_mask
# failed_nonit_indices = nonit_df[failed_mask].index

# # Assign >3 failed subjects to 7 students (5 of them should be Excluded)

# # First, get the Excluded student
# excluded_mask = nonit_df['academic_status'] == 'Excluded'
# excluded_index = nonit_df[excluded_mask].index

# # Assign random failed_subjects = 6–8 to the Excluded student
# nonit_df.loc[excluded_index, 'failed_subjects'] = np.random.randint(6, 9, size=len(excluded_index))

# # Now select 2 more students (excluding Excluded) to get 4–5 failed subjects
# eligible_for_high_fail = failed_nonit_indices.difference(excluded_index)

# # Select 2 more randomly to total 7 students failing >3 subjects
# more_than_3_indices = np.random.choice(eligible_for_high_fail, size=2, replace=False)
# nonit_df.loc[more_than_3_indices, 'failed_subjects'] = np.random.randint(4, 6, size=2)

# # STEP 5: Remaining Non IT failed students — distribute 1–2 failed subjects
# remaining_failed = eligible_for_high_fail.difference(more_than_3_indices)

# # 39 failed students remaining
# # Randomly split into two groups for 1 and 2 failed subjects
# n_remain = len(remaining_failed)  # 31
# half = n_remain // 2 # 15 or 16

# remaining_failed = list(remaining_failed)
# np.random.shuffle(remaining_failed)

# low_1_fail = remaining_failed[:half]
# low_2_fail = remaining_failed[half:]

# # Assign 1s and 2s
# nonit_df.loc[low_1_fail, 'failed_subjects'] = 1
# nonit_df.loc[low_2_fail, 'failed_subjects'] = 2

In [45]:
# # Update the failed_subjects column in the main data frame df
# df.loc[nonit_df.index, 'failed_subjects'] = nonit_df['failed_subjects']

In [46]:
# # Check only the Non IT students
# nonit_students = df.loc[nonit_indices]

# # 1. Total Non IT students who failed (not Satisfactory)
# nonit_failed = nonit_students[nonit_students['academic_status'] != 'Satisfactory']

# # 2. Total with non-zero failed_subjects
# non_zero_fails = nonit_failed[nonit_failed['failed_subjects'] > 0]
# print("Total assigned failed_subjects to Non IT failed students:", len(non_zero_fails))  # 46

# # 3. Number of students with >3 failed subjects
# gt3 = non_zero_fails[non_zero_fails['failed_subjects'] > 3]
# print("Number with >3 failed subjects:", len(gt3))  # 10

# # 4. Number with <=3 failed subjects
# lte3 = non_zero_fails[non_zero_fails['failed_subjects'] <= 3]
# print("Number with <=3 failed subjects:", len(lte3))  # 21

# # 5. Print failed_subjects distribution (should show a variety like 1, 2, 4, etc.)
# print("\nFailed Subjects Distribution:")
# print(non_zero_fails['failed_subjects'].value_counts().sort_index())

# # 6. Confirm no failed_subjects assigned to Satisfactory students
# sat_students = nonit_students[nonit_students['academic_status'] == 'Satisfactory']
# sats_with_fails = sat_students[sat_students['failed_subjects'] != 0]
# print("\nSatisfactory students incorrectly assigned fails:", len(sats_with_fails))  # 0


###### Balance identified issues

In [47]:
# Count how many 'New' students had 'Late Enrollment' as an identified issue
df[(df['student_cohort'] == 'New') & (df['identified_issues'] == 'Late Enrollment')].shape

(25, 39)

In [48]:
# # Reassign identified issues for New students

# new_late_enrollment_indices = df[(df['student_cohort'] == 'New') & (df['identified_issues'] == 'Late Enrollment')].index

# # Randomly choose indices to replace

# reassign_identified_issue = np.random.choice(new_late_enrollment_indices, size = len(new_late_enrollment_indices)-20, replace=False)
# df.loc[reassign_identified_issue, 'identified_issues'] = 'None'

In [49]:
# df[(df['student_cohort'] == 'New') & (df['identified_issues'] == 'Late Enrollment')].shape

In [50]:
# df[(df['student_cohort'] != 'New')  & (df['academic_status'] == 'Satisfactory')].shape

In [51]:
# # Reassign identified issues for Satisfactory Students students

# satisfactory_identified_issues_indices = df[(df['student_cohort'] != 'New')  & (df['academic_status'] == 'Satisfactory')].index

# # Randomly choose indices to replace
# # 80% of students with academic status Satisfactory (Not new)
# reassign_identified_issue = np.random.choice(satisfactory_identified_issues_indices, size = 434, replace=False)
# df.loc[reassign_identified_issue, 'identified_issues'] = 'None'

In [52]:
# df[(df['student_cohort'] != 'New')  & (df['academic_status'] == 'Satisfactory') & (df['identified_issues'] == 'None')].shape

In [53]:
# df['identified_issues'].value_counts()

###### Clean study skills column

In [54]:
df['study_skills(attended)'].value_counts()

Referencing                     140
Writing                         125
Studiocity                      117
Essential Skills                113
Essential Skills and Reading    108
4R Essential Skills              95
Name: study_skills(attended), dtype: int64

In [55]:
# # Only for first year and new students
# study_skills_index = df[~df['student_cohort'].isin(['New', 'First year'])].index
# reassign_study_skills = np.random.choice(study_skills_index, size = len(study_skills_index), replace=False)
# df.loc[reassign_study_skills, 'study_skills(attended)'] = 'None'
# df['study_skills(attended)'].value_counts()

In [56]:
# df.info()

###### Clean Attendance Data
This section prepares the attendance data for analysis by handling missing values, correcting inconsistencies, and ensuring all attendance records are valid and comparable.

In [57]:
import numpy as np
import pandas as pd

# Minimum and Maximum attendance threshold
MIN_ATTD_ASSIGN = 0
MAX_ATTD_ASSIGN = 100

def generate_attendance(row):
    """
    Generate realistic attendance for three subjects based on academic status, cohort, and failed_subjects.
    """
    base_mean = 66.5 # global research value for mean attendance
    base_std = 13

    # Academic status logic
    status = row['academic_status']
    if status == 'Satisfactory':
        mean = base_mean + 3
        std = 7
    elif status in ['Academic Caution', 'Conditional']:
        mean = base_mean - 6
        std = 13
    elif status in ['At Risk', 'Excluded']:
        mean = base_mean - 14
        std = 17
    else:
        mean = base_mean
        std = base_std

    # Cohort adjustment
    cohort = row['student_cohort'] if isinstance(row['student_cohort'], str) else ''
    if 'First year' in cohort or 'New' in cohort:
        mean -= 7
    elif 'Return' in cohort or 'Transferred' in cohort:
        mean -= 3
    elif 'Continuing' in cohort or 'LOA' in cohort:
        mean -= 1

    # Adjust for number of failed_subjects (robust to missing/NaN)
    try:
        fails = int(row.get('failed_subjects', 0))
    except:
        fails = 0
    if fails > 3:
        mean -= 12
    elif fails == 0 and status == 'Satisfactory':
        mean += 4  # these are your most regular attenders!

    # Clamp all means within sensible range
    mean = max(MIN_ATTD_ASSIGN, min(MAX_ATTD_ASSIGN, mean))
    std = min(std, 20)  # don't let stdev go excessive

    # Generate attendance for 3 subjects with some random variation
    att1 = np.clip(np.random.normal(loc=mean, scale=std), MIN_ATTD_ASSIGN, MAX_ATTD_ASSIGN)
    att2 = np.clip(np.random.normal(loc=mean + np.random.uniform(-3,3), scale=std), MIN_ATTD_ASSIGN, MAX_ATTD_ASSIGN)
    att3 = np.clip(np.random.normal(loc=mean + np.random.uniform(-3,3), scale=std), MIN_ATTD_ASSIGN, MAX_ATTD_ASSIGN)

    return pd.Series([round(att1,1), round(att2,1), round(att3,1)])

# ---- APPLY TO THE DATAFRAME ----

np.random.seed(101)  # for reproducible results

df[['attendance_1', 'attendance_2', 'attendance_3']] = df.apply(generate_attendance, axis=1)

# ---- CHECK THE RESULTING MEANS ----

print("Mean attendance_1:", df['attendance_1'].mean())
print("Mean attendance_2:", df['attendance_2'].mean())
print("Mean attendance_3:", df['attendance_3'].mean())

# Combined overall mean:
overall_attendance_mean = pd.concat([df['attendance_1'],df['attendance_2'],df['attendance_3']]).mean()
print("Overall attendance mean (all columns):", overall_attendance_mean)

# To see attendance distribution by academic status or cohort:
print("\nAttendance by Academic Status:\n", df.groupby('academic_status')[['attendance_1','attendance_2','attendance_3']].mean())
print("\nAttendance by Student Cohort:\n", df.groupby('student_cohort')[['attendance_1','attendance_2','attendance_3']].mean())


Mean attendance_1: 68.94928366762177
Mean attendance_2: 68.56375358166189
Mean attendance_3: 68.5810888252149
Overall attendance mean (all columns): 68.69804202483286

Attendance by Academic Status:
                   attendance_1  attendance_2  attendance_3
academic_status                                           
Academic Caution     56.896000     54.612000     55.014000
Conditional          61.513636     57.818182     58.213636
Excluded             48.760000     33.340000     44.720000
Satisfactory         70.345733     70.351369     70.232850

Attendance by Student Cohort:
                  attendance_1  attendance_2  attendance_3
student_cohort                                           
Continuing          69.946250     70.867500     70.945000
Excluded            48.760000     33.340000     44.720000
First year          64.881522     63.025000     63.453261
LOA                 71.006061     71.032323     71.539394
New                 66.894495     66.907339     66.609174
Return t

###### Clean Subject Assessments Score
This section prepares the subject assessments data for analysis by handling missing values, correcting inconsistencies, and ensuring all scores records are valid and comparable.

In [58]:
subject_cols = [f"subject_{i}_assess_{j}" for i in range(1,4) for j in range(1,4)]
print(df[subject_cols].describe().transpose())

                    count       mean        std   min      25%     50%  \
subject_1_assess_1  698.0  51.630702  28.580143  0.15  27.8150  52.105   
subject_1_assess_2  698.0  49.795143  29.158082  0.01  22.6075  49.780   
subject_1_assess_3  698.0  49.373854  29.734640  0.09  23.3275  47.630   
subject_2_assess_1  698.0  52.323109  28.602609  0.16  27.7725  53.340   
subject_2_assess_2  698.0  47.862307  29.092952  0.30  22.5225  46.940   
subject_2_assess_3  698.0  49.632564  28.842227  0.04  25.1425  50.740   
subject_3_assess_1  698.0  50.648266  29.296110  0.12  24.2625  52.440   
subject_3_assess_2  698.0  48.930129  27.774693  0.19  25.3175  46.675   
subject_3_assess_3  698.0  48.603209  28.930261  0.31  25.7075  48.550   

                        75%    max  
subject_1_assess_1  75.6475  99.98  
subject_1_assess_2  75.5650  99.92  
subject_1_assess_3  76.1800  99.68  
subject_2_assess_1  77.5200  99.91  
subject_2_assess_2  72.9775  99.87  
subject_2_assess_3  73.5000  99.99  


In [63]:

# Define subject assessment columns
subject_cols = [f"subject_{i}_assess_{j}"
                for i in range(1, 4)  # subjects 1 to 3
                for j in range(1, 5)]  # assessments 1 to 4

def assign_assessments(row):
    # Step 1: Immediate 0s for Excluded
    if row['academic_status'] == 'Excluded':
        return pd.Series([0] * len(subject_cols), index=subject_cols)

    # 2. Set base mean & std by prior academic_status
    status = row['academic_status']
    base = {
        'Satisfactory':     (72, 7),
        'Academic Caution': (61, 12),
        'Conditional':      (58, 13),
        'At Risk':          (52, 16)
    }
    mean, std = base.get(status, (60, 12))

    # Cohort adjustments
    cohort = str(row['student_cohort'])
    if 'First year' in cohort or 'New' in cohort:
        mean -= 4
    elif 'Return' in cohort or 'Transferred' in cohort:
        mean -= 2

    # Attendance adjustment: stronger for very low, weaker for medium/high
    attendance_cols = ['attendance_1','attendance_2','attendance_3']
    if set(attendance_cols) <= set(row.index):
        att = float(np.mean([row[col] for col in attendance_cols]))
        if att >= 80:
            mean += 2
        elif att < 60:
            mean -= 3
        elif att < 50:
            mean -= 6
        # else no adjustment
    # Clamp mean within 40-95
    mean = max(40, min(95, mean))
    std = min(std, 20)

    # Generate marks for each assessment (simulate real small ups and downs per assessment)
    scores = []
    for _ in subject_cols:
        score = np.random.normal(loc=mean + np.random.uniform(-2,2), scale=std)
        score = max(0, min(100, round(score, 1)))
        scores.append(score)
    return pd.Series(scores, index=subject_cols)

# APPLY to your df
np.random.seed(120)  # For reproducibility
df[subject_cols] = df.apply(assign_assessments, axis=1)

# Optional: quick check
print(df[subject_cols].mean().mean())   # Should be mid 60s, or to your liking
print(df.groupby('academic_status')[subject_cols].mean().mean(axis=1))  # Means per status


68.37590735434576
academic_status
Academic Caution    56.682833
Conditional         54.615152
Excluded             0.000000
Satisfactory        70.355408
dtype: float64


In [64]:
subject_cols = [f"subject_{i}_assess_{j}" for i in range(1,4) for j in range(1,4)]
print(df[subject_cols].describe().transpose())


                    count       mean        std  min     25%    50%     75%  \
subject_1_assess_1  698.0  68.599427  11.107925  0.0  64.000  69.30  74.875   
subject_1_assess_2  698.0  68.654585  10.503741  0.0  64.100  69.40  74.700   
subject_1_assess_3  698.0  68.926648  10.389201  0.0  64.925  69.80  75.000   
subject_2_assess_1  698.0  68.490401  10.873544  0.0  63.900  69.80  74.900   
subject_2_assess_2  698.0  68.488682  10.773587  0.0  64.025  69.70  74.800   
subject_2_assess_3  698.0  67.932665  10.960567  0.0  63.225  69.15  74.600   
subject_3_assess_1  698.0  68.314900  10.770884  0.0  63.500  69.60  74.700   
subject_3_assess_2  698.0  67.914900  10.705773  0.0  63.325  68.80  74.575   
subject_3_assess_3  698.0  68.148281  10.919678  0.0  63.500  69.40  74.275   

                     max  
subject_1_assess_1  93.9  
subject_1_assess_2  91.0  
subject_1_assess_3  94.5  
subject_2_assess_1  93.0  
subject_2_assess_2  90.0  
subject_2_assess_3  90.0  
subject_3_assess_1  

In [65]:
def add_outliers(
    df,
    subject_cols,
    low_pct=0.05,
    high_pct=0.05,
    filter_academic_status=None,  # e.g. ['Satisfactory', 'At Risk'], None means all
    filter_cohorts=None           # e.g. ['First year', 'New'], None means all
):
    """
    Add low and high outliers (score extremes) to subject assessment columns,
    optionally filtering by academic_status and/or cohorts.

    Params:
    - df: DataFrame containing your data
    - subject_cols: list of subject assessment column names (strings)
    - low_pct: fraction of students to make low outliers per column (e.g. 0.05 = 5%)
    - high_pct: fraction of students to make high outliers per column
    - filter_academic_status: list of academic_status to target or None for all
    - filter_cohorts: list of cohorts to target or None for all

    Returns:
    - df with modified scores in-place
    - outliers_log: dict summarizing which rows/columns were modified
    """
    np.random.seed(999)  # For reproducibility

    # Filter rows if specified
    if filter_academic_status is not None:
        mask_status = df['academic_status'].isin(filter_academic_status)
    else:
        mask_status = pd.Series(True, index=df.index)

    if filter_cohorts is not None:
        mask_cohort = df['student_cohort'].isin(filter_cohorts)
    else:
        mask_cohort = pd.Series(True, index=df.index)

    eligible_indices = df[mask_status & mask_cohort].index.tolist()
    n_students = len(eligible_indices)

    if n_students == 0:
        print("Warning: No students matching the filter criteria for outliers.")
        return df, {}

    outliers_log = {'low_outliers': {}, 'high_outliers': {}}

    for col in subject_cols:
        n_low = int(n_students * low_pct)
        n_high = int(n_students * high_pct)

        # Select random students for low outliers
        low_indices = np.random.choice(eligible_indices, size=n_low, replace=False)
        # Ensure no overlap
        remaining_for_high = list(set(eligible_indices) - set(low_indices))
        n_high = min(n_high, len(remaining_for_high))
        high_indices = np.random.choice(remaining_for_high, size=n_high, replace=False)

        # Assign low scores (uniform 0 to 40)
        low_scores = np.random.uniform(0, 40, size=n_low).round(1)
        df.loc[low_indices, col] = low_scores

        # Assign high scores (uniform 90 to 100)
        high_scores = np.random.uniform(90, 100, size=n_high).round(1)
        df.loc[high_indices, col] = high_scores

        # Log info
        outliers_log['low_outliers'][col] = list(low_indices)
        outliers_log['high_outliers'][col] = list(high_indices)

        print(f"{col}: assigned {n_low} low outliers, {n_high} high outliers")

    return df, outliers_log

# ----------------------

# List your assessment columns (adjust if needed)
subject_cols = [col for col in df.columns if col.startswith('subject_') and 'assess_' in col]

# Add outliers only for Satisfactory students (example)
df, log = add_outliers(
    df,
    subject_cols,
    low_pct=0.05,
    high_pct=0.05,
    filter_academic_status=['Satisfactory'],
    filter_cohorts=None  # Or specify cohorts like ['First year','New'] if desired
)

# Or add outliers for all students without filtering:
# df, log = add_outliers(df, subject_cols, low_pct=0.05, high_pct=0.05)

# Check summary of affected rows
total_low_outliers = sum(len(v) for v in log['low_outliers'].values())
total_high_outliers = sum(len(v) for v in log['high_outliers'].values())

print(f"Total low outliers assigned: {total_low_outliers}")
print(f"Total high outliers assigned: {total_high_outliers}")
subject_cols = [f"subject_{i}_assess_{j}"
                for i in range(1, 4)  # subjects 1 to 3
                for j in range(1, 5)]  # assessments 1 to 4

subject_1_assess_1: assigned 31 low outliers, 31 high outliers
subject_1_assess_2: assigned 31 low outliers, 31 high outliers
subject_1_assess_3: assigned 31 low outliers, 31 high outliers
subject_1_assess_4: assigned 31 low outliers, 31 high outliers
subject_2_assess_1: assigned 31 low outliers, 31 high outliers
subject_2_assess_2: assigned 31 low outliers, 31 high outliers
subject_2_assess_3: assigned 31 low outliers, 31 high outliers
subject_2_assess_4: assigned 31 low outliers, 31 high outliers
subject_3_assess_1: assigned 31 low outliers, 31 high outliers
subject_3_assess_2: assigned 31 low outliers, 31 high outliers
subject_3_assess_3: assigned 31 low outliers, 31 high outliers
subject_3_assess_4: assigned 31 low outliers, 31 high outliers
Total low outliers assigned: 372
Total high outliers assigned: 372


In [66]:

# Build dictionaries: column -> set(row indices)
low_outlier_indices = {}
high_outlier_indices = {}

for col in subject_cols:
    low_outlier_indices[col] = set(df[df[col] <= 40].index)
    high_outlier_indices[col] = set(df[df[col] >= 90].index)



In [67]:

# For low outliers:
all_low_indices = [idx for indices in low_outlier_indices.values() for idx in indices]
low_counts = Counter(all_low_indices)

# For high outliers:
all_high_indices = [idx for indices in high_outlier_indices.values() for idx in indices]
high_counts = Counter(all_high_indices)


In [68]:
# How many students are outliers in 1, 2, ... n columns?
print("Low outliers per student frequency:")
print(Counter(low_counts.values()))
print("\nHigh outliers per student frequency:")
print(Counter(high_counts.values()))


Low outliers per student frequency:
Counter({1: 242, 2: 81, 3: 16, 12: 5, 5: 2})

High outliers per student frequency:
Counter({1: 213, 2: 66, 3: 21})


In [69]:
subject_cols = [f"subject_{i}_assess_{j}" for i in range(1,4) for j in range(1,4)]
print(df[subject_cols].describe().transpose())


                    count       mean        std  min   25%    50%   75%   max
subject_1_assess_1  698.0  67.493123  16.241171  0.0  63.5  69.10  75.6  99.9
subject_1_assess_2  698.0  67.506304  15.825559  0.0  63.5  69.35  75.3  99.8
subject_1_assess_3  698.0  67.696848  15.796821  0.0  64.2  69.70  75.5  99.7
subject_2_assess_1  698.0  67.284384  16.034464  0.0  63.2  69.80  75.5  99.6
subject_2_assess_2  698.0  67.344986  15.708606  0.0  63.4  69.65  75.4  99.4
subject_2_assess_3  698.0  66.617192  16.379987  0.0  62.2  69.05  74.9  99.5
subject_3_assess_1  698.0  67.162464  15.978574  0.0  62.6  69.50  75.4  99.9
subject_3_assess_2  698.0  66.775215  15.835586  0.0  62.4  68.70  75.0  99.3
subject_3_assess_3  698.0  66.996848  16.027989  0.0  62.4  69.30  74.9  99.7


In [None]:
df.to_excel('score_attendance.xlsx', index=False)