In [1]:
# Import required libraries
import pandas as pd
import numpy as np

In [110]:
# Seed NumPy's legacy global RNG so the np.random.* sampling used later in this notebook is repeatable.
# NOTE(review): this cell's execution count (110) is out of sequence with its neighbours — confirm the
# recorded outputs were produced with the seed set before any sampling cell ran.
np.random.seed(42)

In [2]:
# Let DataFrame previews show every column instead of eliding the middle ones
pd.set_option('display.max_columns', None)

# Read the student records workbook into a DataFrame and preview the first rows
df = pd.read_excel('student_data.xlsx')
df.head()

Unnamed: 0,student_id,course,student_cohort,academic_status,failed_subjects,study_skills(attended),referral,pp_meeting,self_assessment,readiness_assessment_results,follow_up,follow_up_type,subject_1,subject_1_assess_1,subject_1_assess_2,subject_1_assess_3,subject_1_assess_4,attendance_1,learn_jcu_issues_1,lecturer_referral_1,subject_2,subject_2_assess_1,subject_2_assess_2,subject_2_assess_3,subject_2_assess_4,attendance_2,learn_jcu_issues_2,lecturer_referral_2,subject_3,subject_3_assess_1,subject_3_assess_2,subject_3_assess_3,subject_4_assess_4,attendance_3,learn_jcu_issues_3,lecturer_referral_3,comments,identified_issues
0,1,Master of Business Administration,SRI to JCUB,At Risk,,Essential Skills,Student Counsellor,Booked,Yes,L/G:9/10 N:5/10 R:8/10,Yes,No Reply,,7.7,27.22,26.51,50.09,20,Access,Concern for Welfare,,69.33,44.44,11.39,18.78,41,Access,Attendance,,39.02,17.71,94.5,9.83,89,Access,Non Submission,Week 8. Student re-engaged with tutorials. Sub...,Late Enrollment
1,2,Master of Business Administration,Continuing,Excluded,,Referencing,Student Advocate,Not relevant,Yes,L/G:9/10 N:5/10 R:8/10,No,Phone,,31.14,54.66,81.72,1.96,9,Access,Non Submission,,95.23,48.61,14.68,44.77,66,No Access,Concern for Welfare,,3.72,38.52,25.8,11.8,100,No Access,Non Submission,booked to see a doctor. Week 5. Student contac...,Poor time management
2,3,Master of Business Administration,First year,At Risk,,Writing,Enrollment,Attended,Yes,L/G:9/10 N:5/10 R:8/10,Yes,Phone,,39.09,75.39,84.62,82.66,51,No Access,Attendance,,86.2,98.8,71.57,96.08,64,No Access,Concern for Welfare,,77.77,77.27,81.95,62.35,42,Access,Attendance,Week 8. Student re-engaged with tutorials. Sub...,Poor time management
3,4,Master of Business Administration,New,Excluded,,Essential Skills,Student Advocate,Attended,No,L/G:9/10 N:5/10 R:8/10,No,F2F,,88.59,84.36,3.79,26.05,34,Access,Concern for Welfare,,7.69,25.96,49.83,17.77,57,Access,Non Submission,,95.41,8.93,3.21,99.15,51,Access,Concern for Welfare,Week 6. Student submitted assessment late. Ext...,Death in family
4,5,Master of Business Administration,Continuing,At Risk,,Essential Skills and Reading,Student Counsellor,Not relevant,No,L/G:9/10 N:5/10 R:8/10,Yes,No Reply,,0.98,13.8,53.4,39.69,45,Access,Attendance,,60.51,57.47,15.8,33.59,3,No Access,Concern for Welfare,,90.12,52.18,36.05,20.65,19,Access,Concern for Welfare,Week 3 late enrolment. Student finding it diff...,Sickness


#### Data Cleaning and Exploration

In [3]:
# Print a concise summary of the DataFrame, including column names, data types, non-null counts, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 698 entries, 0 to 697
Data columns (total 38 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   student_id                    698 non-null    int64  
 1   course                        698 non-null    object 
 2   student_cohort                698 non-null    object 
 3   academic_status               698 non-null    object 
 4   failed_subjects               0 non-null      float64
 5   study_skills(attended)        698 non-null    object 
 6   referral                      698 non-null    object 
 7   pp_meeting                    698 non-null    object 
 8   self_assessment               698 non-null    object 
 9   readiness_assessment_results  698 non-null    object 
 10  follow_up                     698 non-null    object 
 11  follow_up_type                698 non-null    object 
 12  subject_1                     0 non-null      float64
 13  subje

In [4]:
# The sheet labels the fourth assessment of subject 3 as 'subject_4_assess_4';
# rename it so it follows the subject_<n>_assess_<m> pattern of the other columns.
df = df.rename(columns={'subject_4_assess_4': 'subject_3_assess_4'})
df.columns

Index(['student_id', 'course', 'student_cohort', 'academic_status',
       'failed_subjects', 'study_skills(attended)', 'referral', 'pp_meeting',
       'self_assessment', 'readiness_assessment_results', 'follow_up',
       'follow_up_type', 'subject_1', 'subject_1_assess_1',
       'subject_1_assess_2', 'subject_1_assess_3', 'subject_1_assess_4',
       'attendance_1', 'learn_jcu_issues_1', 'lecturer_referral_1',
       'subject_2', 'subject_2_assess_1', 'subject_2_assess_2',
       'subject_2_assess_3', 'subject_2_assess_4', 'attendance_2',
       'learn_jcu_issues_2', 'lecturer_referral_2', 'subject_3',
       'subject_3_assess_1', 'subject_3_assess_2', 'subject_3_assess_3',
       'subject_3_assess_4', 'attendance_3', 'learn_jcu_issues_3',
       'lecturer_referral_3', 'comments', 'identified_issues'],
      dtype='object')

In [5]:
# Normalise course names (trim surrounding whitespace, lowercase) so they can be matched reliably
df['course'] = (
    df['course']
    .str.strip()
    .str.lower()
)
df['course'].unique()

array(['master of business administration',
       'master of education - master of business administration',
       'master of information technology', 'bachelor of business',
       'master of engineering management',
       'bachelor of information technology',
       'master of professional accounting',
       'master of international tourism and hospitality management',
       'bachelor of tourism, hospitality and events',
       'master of professional account and master of business administration',
       'bachelor of commerce',
       'master of international tourism and hospitality management - master of business administration',
       'master of data science (professional)',
       'master of information technology - master of business administration',
       'postgraduate qualifying program - business'], dtype=object)

In [6]:
# Lookup table: subject code -> subject name (entries grouped by code prefix)
subject_dict = {
    # BU / BX
    'BU1002': 'Accounting for Decision Making',
    'BU1003': 'Principles of Economics',
    'BU1007': 'Principles of Data Analysis for Business',
    'BU1112': 'Business Law',
    'BX2011': 'Foundation of Accounting Principles',
    'BX2014': 'Principles of Finance',
    # CP1xxx
    'CP1401': 'Fundamentals of Problem Solving and Programming I',
    'CP1402': 'Internet Fundamentals',
    'CP1404': 'Programming II',
    # TO1xxx - TO3xxx
    'TO1008': 'Introduction to Tourism, Hospitality and Events Management',
    'TO2117': 'Food and Beverage Management',
    'TO3052': 'Experience Design for Tourism Hospitality and Events',
    # LB51xx / LB520x
    'LB5113': 'Corporate Strategy',
    'LB5202': 'Marketing Essentials',
    'LB5205': 'People in Organisations',
    # MA
    'MA5831': 'Advanced Data Management and Analysis using SAS',
    'MA5840': 'Data Science and Strategic Decision Making for Business',
    'MA5851': 'Data Science Master Class 1',
    # ED
    'ED5097': 'Research Design and Proposal',
    'ED5880': 'Educational Leadership',
    # EG
    'EG5200': 'Career Planning',
    'EG5220': 'Advanced Asset Management and Reliability',
    'EG5310': 'Professional Placement',
    # CP5xxx
    'CP5046': 'ICT Project 1: Analysis and Design',
    'CP5047': 'ICT Project 2: Implementation and Commissioning',
    'CP5503': 'Enterprise Database Systems - Oracle',
    # TO5xxx
    'TO5101': 'Tourism Systems Analysis',
    'TO5103': 'Global Destinations and Competitiveness',
    'TO5104': 'Tourist Management Strategies',
    # CO
    'CO5117': 'Introduction to Accounting',
    'CO5103': 'Management Accounting',
    'CO5109': 'Corporate Finance',
    # LB5203 / LB5212
    'LB5203': 'Sustainable Enterprise',
    'LB5212': 'Accounting and Finance for Managers',
}


In [7]:
# Get the subject name from the subject code
subject_dict['BU1007'] # Output: Principles of Data Analysis for Business 

'Principles of Data Analysis for Business'

In [8]:
# Lookup table: course name -> the three subject codes recorded for that course
course_subjects = {
    # Bachelor degrees
    'Bachelor of Business': ['BU1002', 'BU1003', 'BU1007'],
    'Bachelor of Commerce': ['BU1112', 'BX2011', 'BX2014'],
    'Bachelor of Information Technology': ['CP1401', 'CP1402', 'CP1404'],
    'Bachelor of Tourism, Hospitality and Events': ['TO1008', 'TO2117', 'TO3052'],
    # Master degrees (single and joint) — key order kept as in the original mapping
    'Master of Business Administration': ['LB5113', 'LB5202', 'LB5205'],
    'Master of Data Science (Professional)': ['MA5831', 'MA5840', 'MA5851'],
    'Master of Education - Master of Business Administration': [
        'ED5097', 'ED5880', 'LB5113',
    ],
    'Master of Engineering Management': ['EG5200', 'EG5220', 'EG5310'],
    'Master of Information Technology': ['CP5046', 'CP5047', 'CP5503'],
    'Master of Information Technology - Master of Business Administration': [
        'CP5046', 'LB5113', 'LB5202',
    ],
    'Master of International Tourism and Hospitality Management': [
        'TO5101', 'TO5103', 'TO5104',
    ],
    'Master of International Tourism and Hospitality Management - Master of Business Administration': [
        'TO5101', 'LB5113', 'LB5202',
    ],
    'Master of Professional Accounting': ['CO5117', 'CO5103', 'CO5109'],
    'Master of Professional Accounting - Master of Business Administration': [
        'CO5117', 'CO5103', 'LB5113',
    ],
    # Postgraduate qualifying program
    'Postgraduate Qualifying Program - Business': ['LB5202', 'LB5203', 'LB5212'],
}


In [9]:
# Get the course subjects from the course
course_subjects['Bachelor of Information Technology'] # Output: ['CP1401', 'CP1402', 'CP1404']

['CP1401', 'CP1402', 'CP1404']

In [10]:
# Re-key the course -> subject-codes mapping by normalised course name:
# each key has surrounding whitespace stripped and is lowercased, while the
# list of subject codes is carried over unchanged.
cleaned_course_subjects = dict(
    (name.strip().lower(), codes)
    for name, codes in course_subjects.items()
)
cleaned_course_subjects

{'bachelor of business': ['BU1002', 'BU1003', 'BU1007'],
 'bachelor of commerce': ['BU1112', 'BX2011', 'BX2014'],
 'bachelor of information technology': ['CP1401', 'CP1402', 'CP1404'],
 'bachelor of tourism, hospitality and events': ['TO1008', 'TO2117', 'TO3052'],
 'master of business administration': ['LB5113', 'LB5202', 'LB5205'],
 'master of data science (professional)': ['MA5831', 'MA5840', 'MA5851'],
 'master of education - master of business administration': ['ED5097',
  'ED5880',
  'LB5113'],
 'master of engineering management': ['EG5200', 'EG5220', 'EG5310'],
 'master of information technology': ['CP5046', 'CP5047', 'CP5503'],
 'master of information technology - master of business administration': ['CP5046',
  'LB5113',
  'LB5202'],
 'master of international tourism and hospitality management': ['TO5101',
  'TO5103',
  'TO5104'],
 'master of international tourism and hospitality management - master of business administration': ['TO5101',
  'LB5113',
  'LB5202'],
 'master of pr

In [11]:
# Distinct course names present in the DataFrame
unique_courses = df['course'].unique()

# Course names that have no key in cleaned_course_subjects
courses_not_in_cleaned = list(
    filter(lambda name: name not in cleaned_course_subjects, unique_courses)
)
courses_not_in_cleaned

['master of professional account and master of business administration']

In [12]:
# Align the one course label that has no key in cleaned_course_subjects.
# The result is assigned back rather than calling replace(..., inplace=True) on
# df['course']: the in-place form acts on an intermediate Series, which pandas 2.x
# warns about and which leaves df unchanged once Copy-on-Write is enabled.
df['course'] = df['course'].replace(
    'master of professional account and master of business administration',
    'master of professional accounting - master of business administration',
)

In [13]:
# check if the course value has been updated
df['course'].unique()

array(['master of business administration',
       'master of education - master of business administration',
       'master of information technology', 'bachelor of business',
       'master of engineering management',
       'bachelor of information technology',
       'master of professional accounting',
       'master of international tourism and hospitality management',
       'bachelor of tourism, hospitality and events',
       'master of professional accounting - master of business administration',
       'bachelor of commerce',
       'master of international tourism and hospitality management - master of business administration',
       'master of data science (professional)',
       'master of information technology - master of business administration',
       'postgraduate qualifying program - business'], dtype=object)

In [14]:
def assign_subjects(row):
    """Fill subject_1..subject_3 on a row from the row's course.

    Looks up row['course'] in cleaned_course_subjects and writes the three
    subject codes found there into 'subject_1', 'subject_2' and 'subject_3'.
    When the course has no entry, all three fields are set to None.

    Returns the same row object after it has been modified.
    """
    no_match = [None, None, None]
    first, second, third = cleaned_course_subjects.get(row['course'], no_match)
    row['subject_1'] = first
    row['subject_2'] = second
    row['subject_3'] = third
    return row

df = df.apply(assign_subjects, axis=1)

In [15]:
# Check for null values in subject 1 to 3
df[['subject_1', 'subject_2', 'subject_3']].isnull().sum()

subject_1    0
subject_2    0
subject_3    0
dtype: int64

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 698 entries, 0 to 697
Data columns (total 38 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   student_id                    698 non-null    int64  
 1   course                        698 non-null    object 
 2   student_cohort                698 non-null    object 
 3   academic_status               698 non-null    object 
 4   failed_subjects               0 non-null      float64
 5   study_skills(attended)        698 non-null    object 
 6   referral                      698 non-null    object 
 7   pp_meeting                    698 non-null    object 
 8   self_assessment               698 non-null    object 
 9   readiness_assessment_results  698 non-null    object 
 10  follow_up                     698 non-null    object 
 11  follow_up_type                698 non-null    object 
 12  subject_1                     698 non-null    object 
 13  subje

In [17]:
df['course'].value_counts()

master of business administration                                                                 102
master of education - master of business administration                                            90
master of information technology                                                                   77
bachelor of business                                                                               71
master of engineering management                                                                   69
bachelor of information technology                                                                 56
master of professional accounting                                                                  55
master of international tourism and hospitality management                                         50
bachelor of tourism, hospitality and events                                                        30
master of professional accounting - master of business administration             

In [18]:
df.head()

Unnamed: 0,student_id,course,student_cohort,academic_status,failed_subjects,study_skills(attended),referral,pp_meeting,self_assessment,readiness_assessment_results,follow_up,follow_up_type,subject_1,subject_1_assess_1,subject_1_assess_2,subject_1_assess_3,subject_1_assess_4,attendance_1,learn_jcu_issues_1,lecturer_referral_1,subject_2,subject_2_assess_1,subject_2_assess_2,subject_2_assess_3,subject_2_assess_4,attendance_2,learn_jcu_issues_2,lecturer_referral_2,subject_3,subject_3_assess_1,subject_3_assess_2,subject_3_assess_3,subject_3_assess_4,attendance_3,learn_jcu_issues_3,lecturer_referral_3,comments,identified_issues
0,1,master of business administration,SRI to JCUB,At Risk,,Essential Skills,Student Counsellor,Booked,Yes,L/G:9/10 N:5/10 R:8/10,Yes,No Reply,LB5113,7.7,27.22,26.51,50.09,20,Access,Concern for Welfare,LB5202,69.33,44.44,11.39,18.78,41,Access,Attendance,LB5205,39.02,17.71,94.5,9.83,89,Access,Non Submission,Week 8. Student re-engaged with tutorials. Sub...,Late Enrollment
1,2,master of business administration,Continuing,Excluded,,Referencing,Student Advocate,Not relevant,Yes,L/G:9/10 N:5/10 R:8/10,No,Phone,LB5113,31.14,54.66,81.72,1.96,9,Access,Non Submission,LB5202,95.23,48.61,14.68,44.77,66,No Access,Concern for Welfare,LB5205,3.72,38.52,25.8,11.8,100,No Access,Non Submission,booked to see a doctor. Week 5. Student contac...,Poor time management
2,3,master of business administration,First year,At Risk,,Writing,Enrollment,Attended,Yes,L/G:9/10 N:5/10 R:8/10,Yes,Phone,LB5113,39.09,75.39,84.62,82.66,51,No Access,Attendance,LB5202,86.2,98.8,71.57,96.08,64,No Access,Concern for Welfare,LB5205,77.77,77.27,81.95,62.35,42,Access,Attendance,Week 8. Student re-engaged with tutorials. Sub...,Poor time management
3,4,master of business administration,New,Excluded,,Essential Skills,Student Advocate,Attended,No,L/G:9/10 N:5/10 R:8/10,No,F2F,LB5113,88.59,84.36,3.79,26.05,34,Access,Concern for Welfare,LB5202,7.69,25.96,49.83,17.77,57,Access,Non Submission,LB5205,95.41,8.93,3.21,99.15,51,Access,Concern for Welfare,Week 6. Student submitted assessment late. Ext...,Death in family
4,5,master of business administration,Continuing,At Risk,,Essential Skills and Reading,Student Counsellor,Not relevant,No,L/G:9/10 N:5/10 R:8/10,Yes,No Reply,LB5113,0.98,13.8,53.4,39.69,45,Access,Attendance,LB5202,60.51,57.47,15.8,33.59,3,No Access,Concern for Welfare,LB5205,90.12,52.18,36.05,20.65,19,Access,Concern for Welfare,Week 3 late enrolment. Student finding it diff...,Sickness


In [19]:
df['academic_status'].value_counts()

Academic Caution    156
At Risk             148
Excluded            133
Conditional         131
Satisfactory        130
Name: academic_status, dtype: int64

###### Clean the academic status column


In [20]:
# Standardize the values (strip whitespace and title case) 
df['academic_status'] = df['academic_status'].str.strip().str.title()

In [21]:
# Shuffle the rows reproducibly and renumber the index from 0
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Number of students wanted in each academic status after rebalancing
target_counts = dict([
    ('Satisfactory', 524),
    ('Academic Caution', 70),
    ('Conditional', 99),
    ('Excluded', 5),
])

In [22]:
# Expand target_counts into one flat list of status labels, one entry per
# student, in the fixed order below.
new_statuses = [
    status
    for status in ('Satisfactory', 'Academic Caution', 'Conditional', 'Excluded')
    for _ in range(target_counts[status])
]

In [23]:
# Assign to the DataFrame
df['academic_status'] = new_statuses

# Confirm the new distribution
df['academic_status'].value_counts()

Satisfactory        524
Conditional          99
Academic Caution     70
Excluded              5
Name: academic_status, dtype: int64

###### Clean the student cohort column

In [24]:
df['student_cohort'].value_counts()

Return to Study    101
First year          93
Transferred         92
Continuing          89
Excluded            86
LOA                 85
New                 78
SRI to JCUB         74
Name: student_cohort, dtype: int64

In [None]:
# Randomly pick all but `excluded_stds` of the students whose cohort is 'Excluded'
# (original note: "Distribute 86 excluded students").
# NOTE(review): this cell has no execution count, and `random_assign` is not used by
# any later cell shown here (the cohort counts further down still list 86 'Excluded').
# Running it still advances NumPy's global RNG — confirm whether the step should be
# completed or removed.
excluded_stds = 6
excluded_student_cohort_index = df.loc[df['student_cohort'] == 'Excluded'].index
random_assign = np.random.choice(
    excluded_student_cohort_index,
    size=len(excluded_student_cohort_index) - excluded_stds,
    replace=False,
)

In [25]:
# Cohorts whose students are all given the 'Satisfactory' academic status
satisfactory_cohorts = ['SRI to JCUB', 'Transferred', 'New']

# Overwrite academic_status for every student belonging to one of those cohorts
in_satisfactory_cohort = df['student_cohort'].isin(satisfactory_cohorts)
df.loc[in_satisfactory_cohort, 'academic_status'] = 'Satisfactory'

###### Group Courses into IT and Non IT

In [26]:
# Courses that make up the 'IT' course group (names in normalised, lowercase form)
it_courses = [
    'master of information technology',
    'bachelor of information technology',
    'master of data science (professional)',
    'master of information technology - master of business administration',
]

In [27]:
# Label each student 'IT' when their course is in it_courses, otherwise 'Non-IT'
df['course_group'] = [
    'IT' if course in it_courses else 'Non-IT'
    for course in df['course']
]


###### Generate failed-subject records for IT courses


In [28]:
# Index labels of the students in the IT course group
it_indices = df.loc[df['course_group'] == 'IT'].index

# Independent copy of the IT rows; show how their academic statuses are distributed
it_df = df.loc[it_indices].copy()
it_df['academic_status'].value_counts()

Satisfactory        132
Conditional          14
Academic Caution     10
Excluded              1
Name: academic_status, dtype: int64

In [29]:
# Index labels of the IT students currently marked 'Satisfactory'
satisfactory_it = it_df.loc[it_df['academic_status'] == 'Satisfactory'].index
# Draw 6 of them, without repeats, to be moved to another academic status
reassign_academic_status = np.random.choice(satisfactory_it, size=6, replace=False)

In [30]:
# Move the sampled IT students out of 'Satisfactory': the first half become
# 'Conditional' and the second half 'Academic Caution'.
# The earlier version also listed 'Excluded', but with a fixed chunk of 3 and only
# 6 sampled students its slice [6:9] was always empty, so that status was silently
# never applied. It is left out here so the code states what actually happens.
# NOTE(review): if some sampled students were meant to become 'Excluded', add the
# status back — the chunk size below adapts — and re-check the downstream counts.
statuses_to_assign = ['Conditional', 'Academic Caution']
chunk_size = len(reassign_academic_status) // len(statuses_to_assign)
for i, status in enumerate(statuses_to_assign):
    chunk = reassign_academic_status[i * chunk_size:(i + 1) * chunk_size]
    df.loc[chunk, 'academic_status'] = status

In [31]:
# Check updated value counts for IT academic status
it_df = df.loc[it_indices].copy()
it_df['academic_status'].value_counts()

Satisfactory        126
Conditional          17
Academic Caution     13
Excluded              1
Name: academic_status, dtype: int64

In [32]:
# Assign 0 failed subjects to Satisfactory IT students
satisfactory_mask = it_df['academic_status'] == 'Satisfactory'
it_df.loc[satisfactory_mask, 'failed_subjects'] = 0

# Every other IT student is treated as having failed at least one subject
failed_mask = ~satisfactory_mask
failed_it_indices = it_df[failed_mask].index

# Target: 10 students with more than 3 failed subjects — the Excluded student
# plus 9 others.

# Excluded students each get a random count of 6-8 (randint's upper bound is exclusive)
excluded_mask = it_df['academic_status'] == 'Excluded'
excluded_index = it_df[excluded_mask].index
it_df.loc[excluded_index, 'failed_subjects'] = np.random.randint(6, 9, size=len(excluded_index))

# Candidates for the other high-fail slots: failed students who are not Excluded
eligible_for_high_fail = failed_it_indices.difference(excluded_index)

# Pick 9 of them at random and give each 4 or 5 failed subjects
n_high_fail = 9
more_than_3_indices = np.random.choice(eligible_for_high_fail, size=n_high_fail, replace=False)
it_df.loc[more_than_3_indices, 'failed_subjects'] = np.random.randint(4, 6, size=n_high_fail)

# The remaining failed IT students get 1 or 2 failed subjects
remaining_failed = eligible_for_high_fail.difference(more_than_3_indices)

# Shuffle them and split roughly in half; when the count is odd the extra
# student lands in the 2-fail group
n_remain = len(remaining_failed)  # 21 in the recorded run
half = n_remain // 2  # 10 in the recorded run

remaining_failed = list(remaining_failed)
np.random.shuffle(remaining_failed)

low_1_fail = remaining_failed[:half]
low_2_fail = remaining_failed[half:]

# Assign 1s and 2s
it_df.loc[low_1_fail, 'failed_subjects'] = 1
it_df.loc[low_2_fail, 'failed_subjects'] = 2

In [33]:
# Update the failed_subjects column in the main data frame df
df.loc[it_df.index, 'failed_subjects'] = it_df['failed_subjects']

In [34]:
# Sanity checks on the failed_subjects values written back for IT students
it_students = df.loc[it_indices]

# 1. IT students who failed (any status other than Satisfactory)
it_failed = it_students.loc[it_students['academic_status'].ne('Satisfactory')]

# 2. Of those, the ones with a non-zero failed_subjects value
non_zero_fails = it_failed.loc[it_failed['failed_subjects'].gt(0)]
print("Total assigned failed_subjects to IT failed students:", len(non_zero_fails))  # 31

# 3. Students with more than 3 failed subjects
gt3 = non_zero_fails.loc[non_zero_fails['failed_subjects'].gt(3)]
print("Number with >3 failed subjects:", len(gt3))  # 10

# 4. Students with 3 or fewer failed subjects
lte3 = non_zero_fails.loc[non_zero_fails['failed_subjects'].le(3)]
print("Number with <=3 failed subjects:", len(lte3))  # 21

# 5. Distribution of failed_subjects values (expected to show a spread such as 1, 2, 4, ...)
print("\nFailed Subjects Distribution:")
print(non_zero_fails['failed_subjects'].value_counts().sort_index())

# 6. Satisfactory students must all have failed_subjects == 0
sat_students = it_students.loc[it_students['academic_status'].eq('Satisfactory')]
sats_with_fails = sat_students.loc[sat_students['failed_subjects'].ne(0)]
print("\nSatisfactory students incorrectly assigned fails:", len(sats_with_fails))  # 0


Total assigned failed_subjects to IT failed students: 31
Number with >3 failed subjects: 10
Number with <=3 failed subjects: 21

Failed Subjects Distribution:
1.0    10
2.0    11
4.0     8
5.0     1
7.0     1
Name: failed_subjects, dtype: int64

Satisfactory students incorrectly assigned fails: 0


###### Generate failed-subject records for Non-IT courses


In [35]:
# Index labels of the students in the Non-IT course group
nonit_indices = df.loc[df['course_group'] == 'Non-IT'].index

# Independent copy of the Non-IT rows; show how their academic statuses are distributed
nonit_df = df.loc[nonit_indices].copy()
nonit_df['academic_status'].value_counts()

Satisfactory        458
Conditional          48
Academic Caution     32
Excluded              3
Name: academic_status, dtype: int64

In [36]:
# Bring the number of 'Excluded' Non-IT students to exactly `desired`
desired = 5
is_excluded = nonit_df['academic_status'] == 'Excluded'
current_excluded = is_excluded.sum()

if current_excluded > desired:
    # Surplus: switch a random selection of Excluded students to 'Conditional'
    excluded = nonit_df[is_excluded].index
    to_remove = np.random.choice(excluded, current_excluded - desired, replace=False)
    df.loc[to_remove, 'academic_status'] = 'Conditional'
elif current_excluded < desired:
    # Shortfall: switch random students who are neither Excluded nor Satisfactory
    not_excluded = nonit_df[~is_excluded].index
    not_satisfactory = nonit_df[nonit_df['academic_status'] != 'Satisfactory'].index
    candidates = not_excluded.intersection(not_satisfactory)
    to_add = np.random.choice(candidates, desired - current_excluded, replace=False)
    df.loc[to_add, 'academic_status'] = 'Excluded'


In [37]:
# Refresh
nonit_df = df.loc[nonit_indices].copy()
print("After setting exactly 5 'Excluded':")
print(nonit_df['academic_status'].value_counts())

After setting exactly 5 'Excluded':
Satisfactory        458
Conditional          47
Academic Caution     31
Excluded              5
Name: academic_status, dtype: int64


In [38]:
# Move up to 37 Non-IT students who are neither Satisfactory nor Excluded
# over to 'Satisfactory', chosen at random.
non_satisfactory = nonit_df['academic_status'].ne('Satisfactory')
non_excluded = nonit_df['academic_status'].ne('Excluded')
eligible = nonit_df.loc[non_satisfactory & non_excluded].index

n_to_reassign = min(37, len(eligible))
reassign_academic_status = np.random.choice(eligible, n_to_reassign, replace=False)
df.loc[reassign_academic_status, 'academic_status'] = 'Satisfactory'

# Re-take the Non-IT copy from df and report the resulting distribution
nonit_df = df.loc[nonit_indices].copy()
print("Final:")
print(nonit_df['academic_status'].value_counts())

Final:
Satisfactory        495
Conditional          28
Academic Caution     13
Excluded              5
Name: academic_status, dtype: int64


In [39]:
# Assign 0 failed subjects to Satisfactory Non IT students
satisfactory_mask = nonit_df['academic_status'] == 'Satisfactory'
nonit_df.loc[satisfactory_mask, 'failed_subjects'] = 0

# Identify non-Satisfactory (i.e., failed) students
failed_mask = ~satisfactory_mask
failed_nonit_indices = nonit_df[failed_mask].index

# Assign >3 failed subjects to 7 students (5 of them should be Excluded)

# First, get the Excluded students
excluded_mask = nonit_df['academic_status'] == 'Excluded'
excluded_index = nonit_df[excluded_mask].index

# Assign a random failed_subjects value of 6–8 to each Excluded student
# (np.random.randint excludes the upper bound 9)
nonit_df.loc[excluded_index, 'failed_subjects'] = np.random.randint(6, 9, size=len(excluded_index))

# Pool for the other high-fail students: failed students who are not Excluded
eligible_for_high_fail = failed_nonit_indices.difference(excluded_index)

# Select 2 of them at random (7 students failing >3 subjects in total) and give each 4 or 5
more_than_3_indices = np.random.choice(eligible_for_high_fail, size=2, replace=False)
nonit_df.loc[more_than_3_indices, 'failed_subjects'] = np.random.randint(4, 6, size=2)

# Remaining Non IT failed students — distribute 1–2 failed subjects
remaining_failed = eligible_for_high_fail.difference(more_than_3_indices)

# Randomly split the remaining students into two groups for 1 and 2 failed subjects;
# when the count is odd the extra student lands in the 2-fail group
n_remain = len(remaining_failed)  # 39 in the recorded run
half = n_remain // 2 # 19 in the recorded run

# Shuffle a plain list copy in place so the split below is random
remaining_failed = list(remaining_failed)
np.random.shuffle(remaining_failed)

low_1_fail = remaining_failed[:half]
low_2_fail = remaining_failed[half:]

# Assign 1s and 2s
nonit_df.loc[low_1_fail, 'failed_subjects'] = 1
nonit_df.loc[low_2_fail, 'failed_subjects'] = 2

In [40]:
# Update the failed_subjects column in the main data frame df
df.loc[nonit_df.index, 'failed_subjects'] = nonit_df['failed_subjects']

In [41]:
# Sanity checks on the failed_subjects values written back for Non IT students
nonit_students = df.loc[nonit_indices]

# 1. Non IT students who failed (any status other than Satisfactory)
nonit_failed = nonit_students.loc[nonit_students['academic_status'].ne('Satisfactory')]

# 2. Of those, the ones with a non-zero failed_subjects value
non_zero_fails = nonit_failed.loc[nonit_failed['failed_subjects'].gt(0)]
print("Total assigned failed_subjects to Non IT failed students:", len(non_zero_fails))  # 46

# 3. Students with more than 3 failed subjects
gt3 = non_zero_fails.loc[non_zero_fails['failed_subjects'].gt(3)]
print("Number with >3 failed subjects:", len(gt3))  # 7

# 4. Students with 3 or fewer failed subjects
lte3 = non_zero_fails.loc[non_zero_fails['failed_subjects'].le(3)]
print("Number with <=3 failed subjects:", len(lte3))  # 39

# 5. Distribution of failed_subjects values (expected to show a spread such as 1, 2, 4, ...)
print("\nFailed Subjects Distribution:")
print(non_zero_fails['failed_subjects'].value_counts().sort_index())

# 6. Satisfactory students must all have failed_subjects == 0
sat_students = nonit_students.loc[nonit_students['academic_status'].eq('Satisfactory')]
sats_with_fails = sat_students.loc[sat_students['failed_subjects'].ne(0)]
print("\nSatisfactory students incorrectly assigned fails:", len(sats_with_fails))  # 0


Total assigned failed_subjects to Non IT failed students: 46
Number with >3 failed subjects: 7
Number with <=3 failed subjects: 39

Failed Subjects Distribution:
1.0    19
2.0    20
4.0     2
6.0     1
7.0     2
8.0     2
Name: failed_subjects, dtype: int64

Satisfactory students incorrectly assigned fails: 0


In [42]:
df['academic_status'].value_counts()

Satisfactory        621
Conditional          45
Academic Caution     26
Excluded              6
Name: academic_status, dtype: int64

In [43]:
df['identified_issues'].value_counts()

Late Enrollment         152
Mental health           144
Sickness                139
Poor time management    132
Death in family         131
Name: identified_issues, dtype: int64

In [44]:
df['student_cohort'].value_counts()

Return to Study    101
First year          93
Transferred         92
Continuing          89
Excluded            86
LOA                 85
New                 78
SRI to JCUB         74
Name: student_cohort, dtype: int64

###### Balance identified issues

In [45]:
# Count how many 'New' students had 'Late Enrollment' as an identified issue
df[(df['student_cohort'] == 'New') & (df['identified_issues'] == 'Late Enrollment')].shape

(23, 39)

In [46]:
# Reassign identified issues for New students

new_late_enrollment_indices = df[(df['student_cohort'] == 'New') & (df['identified_issues'] == 'Late Enrollment')].index

# Keep 'Late Enrollment' on 20 of these students and set a random selection of the
# rest to the string 'None'. max(..., 0) covers the case of fewer than 20 such
# students, where a negative size would make np.random.choice raise.
n_to_clear = max(len(new_late_enrollment_indices) - 20, 0)
reassign_identified_issue = np.random.choice(new_late_enrollment_indices, size=n_to_clear, replace=False)
df.loc[reassign_identified_issue, 'identified_issues'] = 'None'

In [47]:
df[(df['student_cohort'] == 'New') & (df['identified_issues'] == 'Late Enrollment')].shape

(20, 39)

In [48]:
df[(df['student_cohort'] != 'New')  & (df['academic_status'] == 'Satisfactory')].shape

(543, 39)

In [49]:
# Reassign identified issues for Satisfactory students (excluding the 'New' cohort)

satisfactory_identified_issues_indices = df[(df['student_cohort'] != 'New')  & (df['academic_status'] == 'Satisfactory')].index

# Set the identified issue to the string 'None' for 80% of these students, chosen
# at random. The count is derived from the group size instead of being hard-coded
# (434 for the 543 students in the recorded run), so it stays valid if the
# upstream steps change.
n_to_clear = int(len(satisfactory_identified_issues_indices) * 0.8)
reassign_identified_issue = np.random.choice(satisfactory_identified_issues_indices, size=n_to_clear, replace=False)
df.loc[reassign_identified_issue, 'identified_issues'] = 'None'

In [50]:
# Satisfactory, non-'New' students whose identified issue is now the string 'None' — (rows, columns)
df.loc[
    df['student_cohort'].ne('New')
    & df['academic_status'].eq('Satisfactory')
    & df['identified_issues'].eq('None')
].shape

(434, 39)

In [51]:
df['identified_issues'].value_counts()

None                    437
Sickness                 55
Late Enrollment          55
Mental health            53
Death in family          52
Poor time management     46
Name: identified_issues, dtype: int64

###### Clean study skills column

In [52]:
df['study_skills(attended)'].value_counts()

Referencing                     140
Writing                         125
Studiocity                      117
Essential Skills                113
Essential Skills and Reading    108
4R Essential Skills              95
Name: study_skills(attended), dtype: int64

In [53]:
# Study-skills attendance is kept only for 'New' and 'First year' students:
# the index below selects every row that is NOT in those two cohorts.
study_skills_index = df[~df['student_cohort'].isin(['New', 'First year'])].index
# NOTE(review): size equals the full index length with replace=False, so this is a
# permutation that returns every selected row; nothing is actually sub-sampled.
# The call still advances NumPy's global random state, so removing it would shift
# the random draws made by later cells.
reassign_study_skills = np.random.choice(study_skills_index, size = len(study_skills_index), replace=False)
# Overwrite the column with the string 'None' for all of those rows.
df.loc[reassign_study_skills, 'study_skills(attended)'] = 'None'
# Display the resulting distribution of study-skills values.
df['study_skills(attended)'].value_counts()

None                            527
Writing                          34
Studiocity                       33
Essential Skills                 29
Referencing                      28
Essential Skills and Reading     26
4R Essential Skills              21
Name: study_skills(attended), dtype: int64

In [54]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 698 entries, 0 to 697
Data columns (total 39 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   student_id                    698 non-null    int64  
 1   course                        698 non-null    object 
 2   student_cohort                698 non-null    object 
 3   academic_status               698 non-null    object 
 4   failed_subjects               698 non-null    float64
 5   study_skills(attended)        698 non-null    object 
 6   referral                      698 non-null    object 
 7   pp_meeting                    698 non-null    object 
 8   self_assessment               698 non-null    object 
 9   readiness_assessment_results  698 non-null    object 
 10  follow_up                     698 non-null    object 
 11  follow_up_type                698 non-null    object 
 12  subject_1                     698 non-null    object 
 13  subje

###### Clean Attendance Data
This section prepares the attendance data for analysis by handling missing values, correcting inconsistencies, and ensuring all attendance records are valid and comparable.

In [55]:
# Bounds of the range from which replacement attendance values are drawn
MIN_ATTD_ASSIGN, MAX_ATTD_ASSIGN = (
    53,   # lower bound: just above the threshold
    100,  # upper bound: 100%
)

In [56]:
df['attendance_1'].describe()

count    698.000000
mean      51.173352
std       29.311125
min        0.000000
25%       26.000000
50%       52.000000
75%       77.000000
max      100.000000
Name: attendance_1, dtype: float64

In [57]:
df['academic_status'].value_counts()

Satisfactory        621
Conditional          45
Academic Caution     26
Excluded              6
Name: academic_status, dtype: int64

In [58]:
# Students whose subject-1 attendance is at or below 52%
all_low_attendance = df.loc[df['attendance_1'].le(52)]

# Split the low-attendance group by academic status
satisfactory_low_52 = int(all_low_attendance['academic_status'].eq('Satisfactory').sum())
non_satisfactory_low_52 = int(all_low_attendance['academic_status'].ne('Satisfactory').sum())

# The two groups must add up to the whole low-attendance subset
print(satisfactory_low_52, non_satisfactory_low_52)
print(satisfactory_low_52 + non_satisfactory_low_52, "should equal", len(all_low_attendance))


314 43
357 should equal 357


In [59]:
# Number of 'First year' students in the low-attendance (<= 52%) subset
first_year_low_52 = int(all_low_attendance['student_cohort'].eq('First year').sum())
first_year_low_52



49

In [60]:
# Number of 'New' students in the low-attendance (<= 52%) subset
new_low_52 = int(all_low_attendance['student_cohort'].eq('New').sum())
new_low_52

38

In [61]:
# Low-attendance (<= 52%) students from every cohort other than 'New' and 'First year'
mask = ~all_low_attendance['student_cohort'].isin(['New', 'First year'])
std_less_52 = int(mask.sum())
std_less_52

270

In [62]:
# Satisfactory first-year students at or below 52% attendance — (rows, columns)
df.loc[
    df['student_cohort'].eq('First year')
    & df['academic_status'].eq('Satisfactory')
    & df['attendance_1'].le(52)
].shape


(43, 39)

In [63]:
# All Satisfactory first-year students — (rows, columns)
df.loc[df['student_cohort'].eq('First year') & df['academic_status'].eq('Satisfactory')].shape

(75, 39)

In [64]:
# All first-year students whose status is anything other than Satisfactory — (rows, columns)
df.loc[df['student_cohort'].eq('First year') & df['academic_status'].ne('Satisfactory')].shape

(18, 39)

In [65]:
# Satisfactory first-year students whose subject-1 attendance is at or below 52%

first_year_satisfactory = df[(df['student_cohort']== 'First year') & (df['academic_status']=='Satisfactory') & (df['attendance_1']<=52)]
first_year_index = first_year_satisfactory.index

# Lift a random subset of these students above the 52% cut-off
n_assign = 38 # Maximum number of students to assign higher attendance

# Cap the sample at the number of eligible students: sampling without
# replacement raises if more rows are requested than exist.
n_select = min(n_assign, len(first_year_index))

# Use n_select for BOTH the row sample and the generated values so the two
# always have the same length.
random_assign_attendance = np.random.choice(first_year_index, size = n_select, replace = False)
df.loc[random_assign_attendance, 'attendance_1'] = np.random.uniform(MIN_ATTD_ASSIGN, MAX_ATTD_ASSIGN, size=n_select)

In [66]:
# Satisfactory first-year students still at or below 52% after the reassignment — (rows, columns)
df.loc[
    df['student_cohort'].eq('First year')
    & df['academic_status'].eq('Satisfactory')
    & df['attendance_1'].le(52)
].shape

(5, 39)

In [67]:
# First-year students with a non-Satisfactory status and attendance at or below 52%.
# NOTE(review): the .shape on the next line is not the last expression of the cell,
# so it is never displayed, and the variable is overwritten two lines further down.
first_year_non_satisfactory= df[(df['student_cohort']== 'First year') & (df['academic_status']!='Satisfactory') & (df['attendance_1']<=52)]
first_year_non_satisfactory.shape
# Same cohort/status filter but attendance above 52%; this is the frame the name
# refers to after the cell runs, and its shape is the one shown as the cell output.
first_year_non_satisfactory= df[(df['student_cohort']== 'First year') & (df['academic_status']!='Satisfactory') & (df['attendance_1']>52)]
first_year_non_satisfactory.shape

(12, 39)

In [68]:
# Satisfactory 'New' students at or below 52% attendance — (rows, columns)
df.loc[
    df['student_cohort'].eq('New')
    & df['academic_status'].eq('Satisfactory')
    & df['attendance_1'].le(52)
].shape

(38, 39)

In [69]:
# Non-Satisfactory 'New' students at or below 52% attendance — (rows, columns)
df.loc[
    df['student_cohort'].eq('New')
    & df['academic_status'].ne('Satisfactory')
    & df['attendance_1'].le(52)
].shape

(0, 39)

In [70]:
# Academic-status breakdown restricted to the 'New' cohort
df.loc[df['student_cohort'].eq('New'), 'academic_status'].value_counts()

Satisfactory    78
Name: academic_status, dtype: int64

In [71]:
# Satisfactory 'New' students whose subject-1 attendance is at or below 52%

new_satisfactory = df[(df['student_cohort']=='New') & (df['academic_status']=='Satisfactory') & (df['attendance_1']<=52)]
new_index = new_satisfactory.index

# Lift a random subset of these students above the 52% cut-off
n_assign = 27 # Maximum number of students to assign higher attendance

# Cap the sample at the number of eligible students: sampling without
# replacement raises if more rows are requested than exist.
n_select = min(n_assign, len(new_index))

# Use n_select for BOTH the row sample and the generated values so the two
# always have the same length.
random_assign_attendance = np.random.choice(new_index, size = n_select, replace = False)
df.loc[random_assign_attendance, 'attendance_1'] = np.random.uniform(MIN_ATTD_ASSIGN, MAX_ATTD_ASSIGN, size=n_select)

In [72]:
# Satisfactory 'New' students still at or below 52% after the reassignment — (rows, columns)
df.loc[
    df['student_cohort'].eq('New')
    & df['attendance_1'].le(52)
    & df['academic_status'].eq('Satisfactory')
].shape

(11, 39)

In [73]:
# Satisfactory students outside 'First year'/'New' at or below 52% attendance — (rows, columns)
df.loc[
    ~df['student_cohort'].isin(['First year', 'New'])
    & df['academic_status'].eq('Satisfactory')
    & df['attendance_1'].le(52)
].shape

(233, 39)

In [74]:
# Satisfactory students from cohorts other than 'First year' and 'New' whose
# subject-1 attendance is at or below 52%

stds_satisfactory = df[(~df['student_cohort'].isin(['First year', 'New'])) & (df['academic_status'] == 'Satisfactory') & (df['attendance_1'] <= 52)]
stds_satisfactory_index = stds_satisfactory.index

# Lift a random subset of these students above the 52% cut-off
n_assign = 226 # Maximum number of students to assign higher attendance

# Cap the sample at the number of eligible students: sampling without
# replacement raises if more rows are requested than exist.
n_select = min(n_assign, len(stds_satisfactory_index))

# Use n_select for BOTH the row sample and the generated values so the two
# always have the same length.
random_assign_attendance = np.random.choice(stds_satisfactory_index, size = n_select, replace = False)
df.loc[random_assign_attendance, 'attendance_1'] = np.random.uniform(MIN_ATTD_ASSIGN, MAX_ATTD_ASSIGN, size=n_select)

In [75]:
# Satisfactory students outside 'First year'/'New' still at or below 52% — (rows, columns)
df.loc[
    ~df['student_cohort'].isin(['First year', 'New'])
    & df['academic_status'].eq('Satisfactory')
    & df['attendance_1'].le(52)
].shape

(7, 39)

In [76]:
# Non-Satisfactory students outside 'First year'/'New' at or below 52% attendance — (rows, columns)
df.loc[
    ~df['student_cohort'].isin(['First year', 'New'])
    & df['academic_status'].ne('Satisfactory')
    & df['attendance_1'].le(52)
].shape

(37, 39)

In [77]:
# Non-Satisfactory students from cohorts other than 'First year' and 'New' whose
# subject-1 attendance is at or below 52%

stds_non_satisfactory = df[(~df['student_cohort'].isin(['First year', 'New'])) & (df['academic_status'] != 'Satisfactory') & (df['attendance_1'] <= 52)]
stds_non_satisfactory_index = stds_non_satisfactory.index

# Lift a random subset of these students above the 52% cut-off
n_assign = 25  # Maximum number of students to assign higher attendance

# Cap the sample at the number of eligible students: sampling without
# replacement raises if more rows are requested than exist.
n_select = min(n_assign, len(stds_non_satisfactory_index))

# Use n_select for BOTH the row sample and the generated values so the two
# always have the same length.
random_assign_attendance = np.random.choice(stds_non_satisfactory_index, size = n_select, replace = False)
df.loc[random_assign_attendance, 'attendance_1'] = np.random.uniform(MIN_ATTD_ASSIGN, MAX_ATTD_ASSIGN, size=n_select)

In [78]:
# Non-Satisfactory students outside 'First year'/'New' still at or below 52% — (rows, columns)
df.loc[
    ~df['student_cohort'].isin(['First year', 'New'])
    & df['academic_status'].ne('Satisfactory')
    & df['attendance_1'].le(52)
].shape

(12, 39)

In [79]:
df['attendance_1'].describe()

count    698.000000
mean      74.213310
std       18.488261
min        0.000000
25%       62.545848
50%       77.000000
75%       88.112361
max      100.000000
Name: attendance_1, dtype: float64

In [80]:
# Lower every subject-1 attendance value by 4.76 points and bound the result to 0–100%.
# The shifted values are staged in 'attendance_adj' and then copied over 'attendance_1'.
# Note: attendance_1 is overwritten, so running this cell again applies the shift again.
df['attendance_adj'] = (df['attendance_1'] - 4.76).clip(lower=0, upper=100)
df['attendance_1'] = df['attendance_adj']

In [81]:
# Remove the temporary staging column now that attendance_1 holds the adjusted values
df = df.drop(columns=['attendance_adj'])

In [82]:
df['attendance_1'].describe()

count    698.000000
mean      69.479900
std       18.385177
min        0.000000
25%       57.785848
50%       72.240000
75%       83.352361
max       95.240000
Name: attendance_1, dtype: float64

In [83]:
df['attendance_2'].describe()

count    698.000000
mean      49.723496
std       28.779966
min        0.000000
25%       26.000000
50%       48.000000
75%       74.750000
max      100.000000
Name: attendance_2, dtype: float64

In [84]:
df['attendance_3'].describe()

count    698.000000
mean      49.939828
std       29.299558
min        0.000000
25%       26.000000
50%       49.000000
75%       75.000000
max      100.000000
Name: attendance_3, dtype: float64

In [85]:
# Settings for rebuilding the subject-2 and subject-3 attendance columns
std_extra = 8   # standard deviation of each subject's deviation from attendance_1
low = 30        # floor applied to the generated attendance
high = 100      # ceiling applied to the generated attendance

# Both columns are replaced by attendance_1 plus independent Gaussian noise,
# bounded to [low, high]. The noise for attendance_2 is drawn first.
df['attendance_2'] = (
    df['attendance_1'] + np.random.normal(loc=0, scale=std_extra, size=len(df))
).clip(lower=low, upper=high)
df['attendance_3'] = (
    df['attendance_1'] + np.random.normal(loc=0, scale=std_extra, size=len(df))
).clip(lower=low, upper=high)

In [86]:
df['attendance_2'].describe()

count    698.000000
mean      70.226146
std       17.866746
min       30.000000
25%       57.714441
50%       71.428213
75%       83.744113
max      100.000000
Name: attendance_2, dtype: float64

In [87]:
df['attendance_3'].describe()

count    698.000000
mean      69.987971
std       18.188424
min       30.000000
25%       56.340228
50%       70.844739
75%       84.459943
max      100.000000
Name: attendance_3, dtype: float64

In [88]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 698 entries, 0 to 697
Data columns (total 39 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   student_id                    698 non-null    int64  
 1   course                        698 non-null    object 
 2   student_cohort                698 non-null    object 
 3   academic_status               698 non-null    object 
 4   failed_subjects               698 non-null    float64
 5   study_skills(attended)        698 non-null    object 
 6   referral                      698 non-null    object 
 7   pp_meeting                    698 non-null    object 
 8   self_assessment               698 non-null    object 
 9   readiness_assessment_results  698 non-null    object 
 10  follow_up                     698 non-null    object 
 11  follow_up_type                698 non-null    object 
 12  subject_1                     698 non-null    object 
 13  subje

###### Clean Subject Assessment Score Data
This section ensures the subject assessment score data is ready for analysis by addressing missing values, correcting data entry errors, and validating that all scores are within the permissible academic range. 

In [89]:
df['subject_1_assess_1'].describe()

count    698.000000
mean      51.630702
std       28.580143
min        0.150000
25%       27.815000
50%       52.105000
75%       75.647500
max       99.980000
Name: subject_1_assess_1, dtype: float64

In [90]:
# Satisfactory first-year students — (rows, columns)
df.loc[df['academic_status'].eq('Satisfactory') & df['student_cohort'].eq('First year')].shape

(75, 39)

In [120]:
# Satisfactory first-year students recorded with at least one failed subject — (rows, columns)
df.loc[
    df['academic_status'].eq('Satisfactory')
    & df['student_cohort'].eq('First year')
    & df['failed_subjects'].ge(1)
].shape

(7, 39)

In [122]:
# Non-Satisfactory first-year students recorded with at least one failed subject — (rows, columns)
df.loc[
    df['academic_status'].ne('Satisfactory')
    & df['student_cohort'].eq('First year')
    & df['failed_subjects'].ge(1)
].shape

(15, 39)

In [126]:
# Non-Satisfactory first-year students scoring above 52 on subject 1, assessment 1 — (rows, columns)
df.loc[
    df['academic_status'].ne('Satisfactory')
    & df['student_cohort'].eq('First year')
    & df['subject_1_assess_1'].gt(52)
].shape

(7, 39)

In [127]:
# Non-Satisfactory first-year students scoring 52 or less on subject 1, assessment 1 — (rows, columns)
df.loc[
    df['academic_status'].ne('Satisfactory')
    & df['student_cohort'].eq('First year')
    & df['subject_1_assess_1'].le(52)
].shape

(11, 39)

In [128]:
# Satisfactory first-year students scoring above 52 on subject 1, assessment 1 — (rows, columns)
df.loc[
    df['academic_status'].eq('Satisfactory')
    & df['student_cohort'].eq('First year')
    & df['subject_1_assess_1'].gt(52)
].shape

(38, 39)

In [129]:
# Satisfactory first-year students scoring 52 or less on subject 1, assessment 1 — (rows, columns)
df.loc[
    df['academic_status'].eq('Satisfactory')
    & df['student_cohort'].eq('First year')
    & df['subject_1_assess_1'].le(52)
].shape

(37, 39)

In [99]:
df['academic_status'].value_counts()

Satisfactory        621
Conditional          43
Academic Caution     28
Excluded              6
Name: academic_status, dtype: int64

In [100]:
df['student_cohort'].value_counts()

Return to Study    101
First year          93
Transferred         92
Continuing          89
Excluded            86
LOA                 85
New                 78
SRI to JCUB         74
Name: student_cohort, dtype: int64