# Feature Engineering for Student Risk Prediction

This notebook focuses on feature engineering techniques to create meaningful features for predicting students at high and medium risk.

## 1. Import Libraries and Load Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import SelectKBest, f_classif
import warnings
warnings.filterwarnings('ignore')

plt.style.use('default')
sns.set_palette("husl")

In [2]:
# Load the refined dataset
df = pd.read_csv('../data/refined_data_for_model/Student_At_Risk_Student_Data.csv')
print(f"Dataset shape: {df.shape}")
print(f"\nDataset info:")
df.info()

Dataset shape: (698, 41)

Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 698 entries, 0 to 697
Data columns (total 41 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   student_id                    698 non-null    int64  
 1   course                        698 non-null    object 
 2   student_cohort                698 non-null    object 
 3   academic_status               698 non-null    object 
 4   failed_subjects               698 non-null    int64  
 5   study_skills(attended)        698 non-null    object 
 6   referral                      698 non-null    object 
 7   pp_meeting                    698 non-null    object 
 8   self_assessment               109 non-null    object 
 9   readiness_assessment_results  698 non-null    object 
 10  follow_up                     698 non-null    object 
 11  follow_up_type                698 non-null    object 
 12  subject_1               

## 2. Phase 1: Data Preprocessing

### Step 1: Verify Risk and Country Columns

In [3]:
# Check the 'risk' column values and verify it exists
print("Risk column found!")
print(f"Risk column values: {df['risk'].unique()}")
print(f"Risk column value counts:")
print(df['risk'].value_counts())

# Check if 'country' column exists
if 'country' in df.columns:
    print(f"\nCountry column found!")
    print(f"Country column unique count: {df['country'].nunique()}")
else:
    print("Country column not found!")

Risk column found!
Risk column values: ['High' 'Low' 'Medium']
Risk column value counts:
risk
Low       416
Medium    187
High       95
Name: count, dtype: int64

Country column found!
Country column unique count: 43


### Step 2: Reorganize Columns

In [4]:
# Reorganize columns - move 'risk' to end, 'country' to 2nd position after 'student_id'
print("Original column order:")
print(list(df.columns))

# Get current column list
columns = list(df.columns)

# Remove 'risk' and 'country' from their current positions
columns.remove('risk')
columns.remove('country')

# Create new column order: student_id, country, other columns, risk
new_columns = ['student_id', 'country'] + [col for col in columns if col != 'student_id'] + ['risk']

# Reorder the dataframe
df = df[new_columns]

print(f"\nNew column order:")
print(list(df.columns))
print(f"\nDataset shape after reordering: {df.shape}")

Original column order:
['student_id', 'course', 'student_cohort', 'academic_status', 'failed_subjects', 'study_skills(attended)', 'referral', 'pp_meeting', 'self_assessment', 'readiness_assessment_results', 'follow_up', 'follow_up_type', 'subject_1', 'subject_1_assess_1', 'subject_1_assess_2', 'subject_1_assess_3', 'subject_1_assess_4', 'attendance_1', 'learn_jcu_issues_1', 'lecturer_referral_1', 'subject_2', 'subject_2_assess_1', 'subject_2_assess_2', 'subject_2_assess_3', 'subject_2_assess_4', 'attendance_2', 'learn_jcu_issues_2', 'lecturer_referral_2', 'subject_3', 'subject_3_assess_1', 'subject_3_assess_2', 'subject_3_assess_3', 'subject_3_assess_4', 'attendance_3', 'learn_jcu_issues_3', 'lecturer_referral_3', 'comments', 'identified_issues', 'course_group', 'risk', 'country']

New column order:
['student_id', 'country', 'course', 'student_cohort', 'academic_status', 'failed_subjects', 'study_skills(attended)', 'referral', 'pp_meeting', 'self_assessment', 'readiness_assessment_resu

### Step 3: Convert Text Data to Lowercase

In [5]:
# Convert all text data to lowercase
print("Converting all text data to lowercase...")

# Identify object (text) columns
text_columns = df.select_dtypes(include=['object']).columns.tolist()
print(f"Text columns to convert: {text_columns}")

# Convert all text columns to lowercase
for col in text_columns:
    if col != 'student_id':  # Skip student_id if it's text
        df[col] = df[col].astype(str).str.lower()

print("Text data conversion completed!")

# Display sample of converted data
print(f"\nSample of converted data:")
print(df[text_columns[:5]].head())

Converting all text data to lowercase...
Text columns to convert: ['country', 'course', 'student_cohort', 'academic_status', 'study_skills(attended)', 'referral', 'pp_meeting', 'self_assessment', 'readiness_assessment_results', 'follow_up', 'follow_up_type', 'subject_1', 'learn_jcu_issues_1', 'lecturer_referral_1', 'subject_2', 'learn_jcu_issues_2', 'lecturer_referral_2', 'subject_3', 'learn_jcu_issues_3', 'lecturer_referral_3', 'comments', 'identified_issues', 'course_group', 'risk']
Text data conversion completed!

Sample of converted data:
      country course   student_cohort   academic_status  \
0   australia    mba       continuing       conditional   
1   australia    mba      transferred      satisfactory   
2  bangladesh    mba              new      satisfactory   
3      bhutan    mba      sri to jcub      satisfactory   
4      bhutan    mba  return to study  academic caution   

         study_skills(attended)  
0              essential skills  
1                   referenc

### Step 4: Filter for Medium and High Risk Students

In [6]:
# Filter for only 'medium' and 'high' risk students
print("Current risk distribution:")
print(df['risk'].value_counts())

print(f"\nOriginal dataset shape: {df.shape}")

# Filter for medium and high risk students only
df_filtered = df[df['risk'].isin(['medium', 'high'])].copy()

print(f"Filtered dataset shape: {df_filtered.shape}")
print(f"\nFiltered risk distribution:")
print(df_filtered['risk'].value_counts())

# Update the main dataframe
df = df_filtered

print(f"\nFinal dataset shape after filtering: {df.shape}")
print(f"Students removed: {698 - df.shape[0]}")

Current risk distribution:
risk
low       416
medium    187
high       95
Name: count, dtype: int64

Original dataset shape: (698, 41)
Filtered dataset shape: (282, 41)

Filtered risk distribution:
risk
medium    187
high       95
Name: count, dtype: int64

Final dataset shape after filtering: (282, 41)
Students removed: 416


### Step 5: Save Processed Data

In [7]:
# Save processed data as engineered_student_data.csv
output_path = '../data/refined_data_for_model/engineered_student_data.csv'

print(f"Saving processed data to: {output_path}")
print(f"Final dataset shape: {df.shape}")
print(f"Final columns: {list(df.columns)}")

# Save the processed dataset
df.to_csv(output_path, index=False)

print(f"\nData successfully saved!")
print(f"File location: {output_path}")

# Verify the saved file
verification_df = pd.read_csv(output_path)
print(f"\nVerification - loaded file shape: {verification_df.shape}")
print(f"Verification - risk distribution:")
print(verification_df['risk'].value_counts())

Saving processed data to: ../data/refined_data_for_model/engineered_student_data.csv
Final dataset shape: (282, 41)
Final columns: ['student_id', 'country', 'course', 'student_cohort', 'academic_status', 'failed_subjects', 'study_skills(attended)', 'referral', 'pp_meeting', 'self_assessment', 'readiness_assessment_results', 'follow_up', 'follow_up_type', 'subject_1', 'subject_1_assess_1', 'subject_1_assess_2', 'subject_1_assess_3', 'subject_1_assess_4', 'attendance_1', 'learn_jcu_issues_1', 'lecturer_referral_1', 'subject_2', 'subject_2_assess_1', 'subject_2_assess_2', 'subject_2_assess_3', 'subject_2_assess_4', 'attendance_2', 'learn_jcu_issues_2', 'lecturer_referral_2', 'subject_3', 'subject_3_assess_1', 'subject_3_assess_2', 'subject_3_assess_3', 'subject_3_assess_4', 'attendance_3', 'learn_jcu_issues_3', 'lecturer_referral_3', 'comments', 'identified_issues', 'course_group', 'risk']

Data successfully saved!
File location: ../data/refined_data_for_model/engineered_student_data.csv


### Step 6: Data Exploration for Synthetic Generation

In [8]:
# Explore data patterns for synthetic data generation preparation
print("=" * 60)
print("DATA EXPLORATION FOR SYNTHETIC DATA GENERATION")
print("=" * 60)

# Basic dataset overview
print(f"\n1. DATASET OVERVIEW:")
print(f"   Shape: {df.shape}")
print(f"   Features: {df.shape[1] - 1} (excluding target)")
print(f"   Target: 'risk' column")

# Risk distribution
print(f"\n2. TARGET DISTRIBUTION:")
risk_counts = df['risk'].value_counts()
risk_percentages = df['risk'].value_counts(normalize=True) * 100
for risk_level in risk_counts.index:
    print(f"   {risk_level}: {risk_counts[risk_level]} students ({risk_percentages[risk_level]:.1f}%)")

# Data types overview
print(f"\n3. FEATURE TYPES:")
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
text_cols = df.select_dtypes(include=['object']).columns.tolist()
print(f"   Numeric features: {len(numeric_cols)} columns")
print(f"   Text features: {len(text_cols)} columns")

# Missing data analysis
print(f"\n4. MISSING DATA ANALYSIS:")
missing_data = df.isnull().sum()
missing_data = missing_data[missing_data > 0].sort_values(ascending=False)
if len(missing_data) > 0:
    print(f"   Columns with missing data:")
    for col, count in missing_data.items():
        percentage = (count / len(df)) * 100
        print(f"     {col}: {count} missing ({percentage:.1f}%)")
else:
    print("   No missing data found")

# Key categorical features
print(f"\n5. KEY CATEGORICAL FEATURES:")
categorical_features = ['country', 'course', 'academic_status', 'student_cohort']
for feature in categorical_features:
    if feature in df.columns:
        unique_count = df[feature].nunique()
        print(f"   {feature}: {unique_count} unique values")
        if unique_count <= 10:
            print(f"     Values: {list(df[feature].unique())}")

print(f"\n6. NUMERIC FEATURES SUMMARY:")
if len(numeric_cols) > 1:  # Exclude student_id
    numeric_features = [col for col in numeric_cols if col != 'student_id']
    print(f"   Assessment scores: {len([col for col in numeric_features if 'assess' in col])} columns")
    print(f"   Attendance features: {len([col for col in numeric_features if 'attendance' in col])} columns")
    print(f"   Other numeric: {len([col for col in numeric_features if 'assess' not in col and 'attendance' not in col])} columns")

DATA EXPLORATION FOR SYNTHETIC DATA GENERATION

1. DATASET OVERVIEW:
   Shape: (282, 41)
   Features: 40 (excluding target)
   Target: 'risk' column

2. TARGET DISTRIBUTION:
   medium: 187 students (66.3%)
   high: 95 students (33.7%)

3. FEATURE TYPES:
   Numeric features: 17 columns
   Text features: 24 columns

4. MISSING DATA ANALYSIS:
   No missing data found

5. KEY CATEGORICAL FEATURES:
   country: 32 unique values
   course: 14 unique values
   academic_status: 4 unique values
     Values: ['conditional', 'satisfactory', 'academic caution', 'excluded']
   student_cohort: 8 unique values
     Values: ['continuing', 'new', 'return to study', 'loa', 'first year', 'transferred', 'excluded', 'sri to jcub']

6. NUMERIC FEATURES SUMMARY:
   Assessment scores: 12 columns
   Attendance features: 3 columns
   Other numeric: 1 columns


## 3. Phase 1 Completion Summary

In [9]:
""" print("\n" + "=" * 60)
print("PHASE 1 COMPLETION SUMMARY")
print("=" * 60)

print(f"\n✅ COMPLETED TASKS:")
print(f"   1. ✅ Loaded dataset from Student_At_Risk_Student_Data.csv")
print(f"   2. ✅ Verified 'risk' outcome variable exists")
print(f"   3. ✅ Reorganized columns: 'risk' at end, 'country' at 2nd position")
print(f"   4. ✅ Converted all text data to lowercase")
print(f"   5. ✅ Filtered for 'medium' and 'high' risk students only")
print(f"   6. ✅ Saved processed data as 'engineered_student_data.csv'")
print(f"   7. ✅ Explored data patterns for synthetic generation preparation")

print(f"\n📊 FINAL DATASET STATUS:")
print(f"   • File: data/refined_data_for_model/engineered_student_data.csv")
print(f"   • Shape: {df.shape}")
print(f"   • Risk levels: {list(df['risk'].unique())}")
print(f"   • Ready for Phase 2 feature engineering")

print(f"\n🚀 NEXT STEPS (Phase 2):")
print(f"   • Correlation analysis with outcome variable")
print(f"   • Create JSON mapping for categorical variables")
print(f"   • Assign numeric weights based on correlations")
print(f"   • Sentiment analysis for comments")
print(f"   • NLP processing for text features") """

' print("\n" + "=" * 60)\nprint("PHASE 1 COMPLETION SUMMARY")\nprint("=" * 60)\n\nprint(f"\n✅ COMPLETED TASKS:")\nprint(f"   1. ✅ Loaded dataset from Student_At_Risk_Student_Data.csv")\nprint(f"   2. ✅ Verified \'risk\' outcome variable exists")\nprint(f"   3. ✅ Reorganized columns: \'risk\' at end, \'country\' at 2nd position")\nprint(f"   4. ✅ Converted all text data to lowercase")\nprint(f"   5. ✅ Filtered for \'medium\' and \'high\' risk students only")\nprint(f"   6. ✅ Saved processed data as \'engineered_student_data.csv\'")\nprint(f"   7. ✅ Explored data patterns for synthetic generation preparation")\n\nprint(f"\n📊 FINAL DATASET STATUS:")\nprint(f"   • File: data/refined_data_for_model/engineered_student_data.csv")\nprint(f"   • Shape: {df.shape}")\nprint(f"   • Risk levels: {list(df[\'risk\'].unique())}")\nprint(f"   • Ready for Phase 2 feature engineering")\n\nprint(f"\n🚀 NEXT STEPS (Phase 2):")\nprint(f"   • Correlation analysis with outcome variable")\nprint(f"   • Create J