In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

Load the data

In [2]:
# Location of the students CSV. The original machine-specific absolute path is
# kept as the default so existing runs behave exactly as before; set the
# STUDENTS_DATA_CSV environment variable to run the notebook on another machine
# without editing this cell.
data_path = os.environ.get(
    'STUDENTS_DATA_CSV',
    '/home/lumen/Documents/school/Project/myself/Students_data.csv',
)
students_data = pd.read_csv(data_path)

Split the data into features (X) and target (y)

In [3]:
# The GRADE column is the prediction target; every other column read from the
# CSV is kept as a candidate feature.
y = students_data['GRADE']
X = students_data.drop('GRADE', axis=1)

Encode categorical variables

In [4]:
# One-hot encode the feature frame using pandas' defaults and rebind X to the
# result, so every later cell (including X.columns) sees the encoded columns.
# NOTE(review): with default arguments only object/string/category columns are
# expanded; which columns those are depends on the CSV, which is not visible here.
X = pd.get_dummies(X)

Split the data into training and testing sets

In [5]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Scale the features

In [6]:
# Fit the scaler on the training rows only, then apply that same fitted
# transform to both splits so the test rows never influence the scaling.
# Note that X_train and X_test are rebound to the scaled arrays.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

Train a RandomForestRegressor model

In [7]:
# Random forest regressor with default hyperparameters; random_state pins the
# forest's randomness so repeated runs give the same model.
model = RandomForestRegressor(random_state=42)
# Trained on the scaled training features produced by the scaling cell.
model.fit(X_train, y_train)

Evaluate model performance

In [8]:
# Predict GRADE for the held-out rows and report the mean squared error
# between those predictions and the true values (lower is better).
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 1.8931827586206895


Analyze feature importances

In [9]:
# Importance score of each feature according to the fitted forest, aligned
# positionally with the columns of X (the frame the training split came from).
importances = model.feature_importances_
feature_names = X.columns
# argsort() is ascending, so the last three positions are the three highest
# scores, ordered from the least to the most important of those three.
important_features = feature_names[importances.argsort()[-3:]]  # Top 3 most important features

Collect a new student's details for prediction (answer each prompt with one of the numeric codes it lists)

In [17]:
def _ask_for_number(prompt_text):
    """Prompt until the user types something `float()` accepts and return it.

    An unparseable answer prints a short message and asks again, instead of
    raising ValueError and throwing away every answer entered so far.
    """
    while True:
        raw_answer = input(prompt_text + " --> ")
        try:
            return float(raw_answer)
        except ValueError:
            print(f"'{raw_answer}' is not a number; please enter one of the listed codes.")


new_student_data = {}
print("Please enter the student's details:\n")

# One prompt per feature, in the same order as the feature columns of X.
# The apostrophes in the "Mothers'"/"Fathers'" prompts were previously
# mis-encoded ("â€™"); they are written as plain ASCII here.
required_fields = [
    "Student Age (1: 18-21, 2: 22-25, 3: above 26)",
    "Sex (1: female, 2: male)",
    "Graduated high-school type: (1: private, 2: state, 3: other)",
    "Scholarship type: (1: None, 2: 25%, 3: 50%, 4: 75%, 5: Full)",
    "Additional work: (1: Yes, 2: No)",
    "Regular artistic or sports activity: (1: Yes, 2: No)",
    "Do you have a partner: (1: Yes, 2: No)",
    "Total salary if available (1: USD 135-200, 2: USD 201-270, 3: USD 271-340, 4: USD 341-410, 5: above 410)",
    "Transportation to the university: (1: Bus, 2: Private car/taxi, 3: bicycle, 4: Other)",
    "Accommodation type in Cyprus: (1: rental, 2: dormitory, 3: with family, 4: Other)",
    "Mothers' education: (1: primary school, 2: secondary school, 3: high school, 4: university, 5: MSc., 6: Ph.D.)",
    "Fathers' education: (1: primary school, 2: secondary school, 3: high school, 4: university, 5: MSc., 6: Ph.D.)",
    "Number of sisters/brothers (if available): (1: 1, 2: 2, 3: 3, 4: 4, 5: 5 or above)",
    "Parental status: (1: married, 2: divorced, 3: died - one of them or both)",
    "Mothers' occupation: (1: retired, 2: housewife, 3: government officer, 4: private sector employee, 5: self-employment, 6: other)",
    "Fathers' occupation: (1: retired, 2: government officer, 3: private sector employee, 4: self-employment, 5: other)",
    "Weekly study hours: (1: None, 2: <5 hours, 3: 6-10 hours, 4: 11-20 hours, 5: more than 20 hours)",
    "Reading frequency (non-scientific books/journals): (1: None, 2: Sometimes, 3: Often)",
    "Reading frequency (scientific books/journals): (1: None, 2: Sometimes, 3: Often)",
    "Attendance to the seminars/conferences related to the department: (1: Yes, 2: No)",
    "Impact of your projects/activities on your success: (1: positive, 2: negative, 3: neutral)",
    "Attendance to classes (1: always, 2: sometimes, 3: never)",
    "Preparation to midterm exams 1: (1: alone, 2: with friends, 3: not applicable)",
    "Preparation to midterm exams 2: (1: closest date to the exam, 2: regularly during the semester, 3: never)",
    "Taking notes in classes: (1: never, 2: sometimes, 3: always)",
    "Listening in classes: (1: never, 2: sometimes, 3: always)",
    "Discussion improves my interest and success in the course: (1: never, 2: sometimes, 3: always)",
    "Flip-classroom: (1: not useful, 2: useful, 3: not applicable)",
    "Cumulative grade point average in the last semester (/4.00): (1: <2.00, 2: 2.00-2.49, 3: 2.50-2.99, 4: 3.00-3.49, 5: above 3.49)",
    "Expected Cumulative grade point average in the graduation (/4.00): (1: <2.00, 2: 2.00-2.49, 3: 2.50-2.99, 4: 3.00-3.49, 5: above 3.49)",
    "Course ID",
    # The output grade is what the model predicts, so it is not asked for.
]

# zip() below pairs prompts with columns purely by position and stops at the
# shorter of the two, so a count mismatch would otherwise pass silently and
# leave some columns without a value.
if len(required_fields) != len(X.columns):
    print(
        f"Note: there are {len(required_fields)} prompts but X has {len(X.columns)} columns; "
        f"only the first {min(len(required_fields), len(X.columns))} columns will be filled in.\n"
    )

for prompt_detail, column_detail in zip(required_fields, X.columns):
    new_student_data[column_detail] = _ask_for_number(prompt_detail)
    


Please enter the student's details:



Student Age (1: 18-21, 2: 22-25, 3: above 26) -->  1
Sex (1: female, 2: male) -->  1
Graduated high-school type: (1: private, 2: state, 3: other) -->  2
Scholarship type: (1: None, 2: 25%, 3: 50%, 4: 75%, 5: Full) -->  1
Additional work: (1: Yes, 2: No) -->  1
Regular artistic or sports activity: (1: Yes, 2: No) -->  1
Do you have a partner: (1: Yes, 2: No) -->  1
Total salary if available (1: USD 135-200, 2: USD 201-270, 3: USD 271-340, 4: USD 341-410, 5: above 410) -->  1
Transportation to the university: (1: Bus, 2: Private car/taxi, 3: bicycle, 4: Other) -->  3
Accommodation type in Cyprus: (1: rental, 2: dormitory, 3: with family, 4: Other) -->  3
Mothersâ€™ education: (1: primary school, 2: secondary school, 3: high school, 4: university, 5: MSc., 6: Ph.D.) -->  1
Fathersâ€™ education: (1: primary school, 2: secondary school, 3: high school, 4: university, 5: MSc., 6: Ph.D.) -->  4
Number of sisters/brothers (if available): (1: 1, 2: 2, 3: 3, 4: 4, 5: 5 or above) -->  1
Parental 

Predicted performance: 2.06


In [None]:
# New prediction
# Build a one-row frame from the collected answers, ordered like the training
# columns, and pass it through the scaler that was fitted on the training data.
# Any column of X that has no entry in new_student_data becomes NaN in this frame.
# Despite its name, new_student_data_encoded holds *scaled* values; no encoding
# step is applied here.

new_student_data_encoded = scaler.transform(pd.DataFrame([new_student_data], columns=X.columns))
# predict() returns one value per row; [0] takes the single student's prediction.
new_prediction = model.predict(new_student_data_encoded)[0]
print(f"Predicted performance: {new_prediction}")

Recommendations based on feature importances and the model's prediction

In [21]:
# Rank the features by the fitted forest's importance scores. The scores are
# aligned positionally with the columns of X. argsort() is ascending, so the
# last three positions are the three most important features, ordered from the
# least to the most important of those three.
# (The numpy import that used to sit in this cell was unused here and now lives
# with the other imports at the top of the notebook.)
importances = model.feature_importances_
feature_names = X.columns
important_features_indices = importances.argsort()[-3:]  # Top 3 most important features
important_features = feature_names[important_features_indices]

# Mapping feature indices to their human-readable names.
# Keys are 1-based: the caller adds 1 to each zero-based column position before
# looking it up here. The values double as lookup keys into recommendation_map
# (see get_recommendations), so each label must match the corresponding key of
# that dict character for character -- do not "tidy" a label here without
# making the identical change there.
# NOTE(review): entry 32 appears to describe the prediction target rather than
# an input feature -- confirm whether it should be reachable at all.
feature_mapping = {
    1: 'Student Age (1: 18-21, 2: 22-25, 3: above 26)',
    2: 'Sex (1: female, 2: male)',
    3: 'Graduated high-school type: (1: private, 2: state, 3: other)',
    4: 'Scholarship type: (1: None, 2: 25%, 3: 50%, 4: 75%, 5: Full)',
    5: 'Additional work: (1: Yes, 2: No)',
    6: 'Regular artistic or sports activity: (1: Yes, 2: No)',
    7: 'Do you have a partner: (1: Yes, 2: No)',
    8: 'Total salary if available (1: USD 135-200, 2: USD 201-270, 3: USD 271-340, 4: USD 341-410, 5: above 410)',
    9: 'Transportation to the university: (1: Bus, 2: Private car/taxi, 3: bicycle, 4: Other)',
    10: 'Accommodation type in Cyprus: (1: rental, 2: dormitory, 3: with family, 4: Other)',
    11: 'Mothersâ€™ education: (1: primary school, 2: secondary school, 3: high school, 4: university, 5: MSc., 6: Ph.D.)',
    12: 'Fathersâ€™ education: (1: primary school, 2: secondary school, 3: high school, 4: university, 5: MSc., 6: Ph.D.)',
    13: 'Number of sisters/brothers (if available): (1: 1, 2:, 2, 3: 3, 4: 4, 5: 5 or above)',
    14: 'Parental status: (1: married, 2: divorced, 3: died - one of them or both)',
    15: 'Mothersâ€™ occupation: (1: retired, 2: housewife, 3: government officer, 4: private sector employee, 5: self-employment, 6: other)',
    16: 'Fathersâ€™ occupation: (1: retired, 2: government officer, 3: private sector employee, 4: self-employment, 5: other)',
    17: 'Weekly study hours: (1: None, 2: <5 hours, 3: 6-10 hours, 4: 11-20 hours, 5: more than 20 hours)',
    18: 'Reading frequency (non-scientific books/journals): (1: None, 2: Sometimes, 3: Often)',
    19: 'Reading frequency (scientific books/journals): (1: None, 2: Sometimes, 3: Often)',
    20: 'Attendance to the seminars/conferences related to the department: (1: Yes, 2: No)',
    21: 'Impact of your projects/activities on your success: (1: positive, 2: negative, 3: neutral)',
    22: 'Attendance to classes (1: always, 2: sometimes, 3: never)',
    23: 'Preparation to midterm exams 1: (1: alone, 2: with friends, 3: not applicable)',
    24: 'Preparation to midterm exams 2: (1: closest date to the exam, 2: regularly during the semester, 3: never)',
    25: 'Taking notes in classes: (1: never, 2: sometimes, 3: always)',
    26: 'Listening in classes: (1: never, 2: sometimes, 3: always)',
    27: 'Discussion improves my interest and success in the course: (1: never, 2: sometimes, 3: always)',
    28: 'Flip-classroom: (1: not useful, 2: useful, 3: not applicable)',
    29: 'Cumulative grade point average in the last semester (/4.00): (1: <2.00, 2: 2.00-2.49, 3: 2.50-2.99, 4: 3.00-3.49, 5: above 3.49)',
    30: 'Expected Cumulative grade point average in the graduation (/4.00): (1: <2.00, 2: 2.00-2.49, 3: 2.50-2.99, 4: 3.00-3.49, 5: above 3.49)',
    31: 'Course ID',
    32: 'OUTPUT Grade (0: Fail, 1: DD, 2: DC, 3: CC, 4: CB, 5: BB, 6: BA, 7: AA)'
}

# Advice text shown to the student, keyed by feature label.
# get_recommendations looks entries up using the label strings stored as values
# in feature_mapping, so every key here must stay byte-identical to the
# corresponding label there (including its punctuation and character encoding).
# There is no entry for the 'Course ID' or 'OUTPUT Grade ...' labels, so those
# fall back to the default message supplied by get_recommendations.
recommendation_map = {
    'Student Age (1: 18-21, 2: 22-25, 3: above 26)': "Consider finding study groups or mentors who can relate to your age group.",
    'Sex (1: female, 2: male)': "Engage in study groups that balance gender diversity.",
    'Graduated high-school type: (1: private, 2: state, 3: other)': "Leverage your high school network for additional support and resources.",
    'Scholarship type: (1: None, 2: 25%, 3: 50%, 4: 75%, 5: Full)': "Take full advantage of your scholarship by accessing all available resources.",
    'Additional work: (1: Yes, 2: No)': "Balance work and study effectively, prioritizing your academic commitments.",
    'Regular artistic or sports activity: (1: Yes, 2: No)': "Incorporate artistic or sports activities to relieve stress and enhance focus.",
    'Do you have a partner: (1: Yes, 2: No)': "Maintain a healthy balance between your relationship and academic responsibilities.",
    'Total salary if available (1: USD 135-200, 2: USD 201-270, 3: USD 271-340, 4: USD 341-410, 5: above 410)': "Budget your expenses wisely to reduce financial stress.",
    'Transportation to the university: (1: Bus, 2: Private car/taxi, 3: bicycle, 4: Other)': "Optimize your transportation time for better study efficiency.",
    'Accommodation type in Cyprus: (1: rental, 2: dormitory, 3: with family, 4: Other)': "Create a conducive study environment in your accommodation.",
    'Mothersâ€™ education: (1: primary school, 2: secondary school, 3: high school, 4: university, 5: MSc., 6: Ph.D.)': "Seek guidance from family members or mentors with similar educational backgrounds.",
    'Fathersâ€™ education: (1: primary school, 2: secondary school, 3: high school, 4: university, 5: MSc., 6: Ph.D.)': "Engage with family or mentors who understand your educational environment.",
    'Number of sisters/brothers (if available): (1: 1, 2:, 2, 3: 3, 4: 4, 5: 5 or above)': "Create a quiet study environment, especially if you have multiple siblings.",
    'Parental status: (1: married, 2: divorced, 3: died - one of them or both)': "Seek emotional support and balance family responsibilities with studies.",
    'Mothersâ€™ occupation: (1: retired, 2: housewife, 3: government officer, 4: private sector employee, 5: self-employment, 6: other)': "Balance study with family responsibilities, and seek advice from working parents.",
    'Fathersâ€™ occupation: (1: retired, 2: government officer, 3: private sector employee, 4: self-employment, 5: other)': "Consider balancing study with family responsibilities, and seek advice from working parents.",
    'Weekly study hours: (1: None, 2: <5 hours, 3: 6-10 hours, 4: 11-20 hours, 5: more than 20 hours)': "Adjust your study schedule to ensure adequate study time.",
    'Reading frequency (non-scientific books/journals): (1: None, 2: Sometimes, 3: Often)': "Increase reading frequency to enhance comprehension and retention.",
    'Reading frequency (scientific books/journals): (1: None, 2: Sometimes, 3: Often)': "Increase your engagement with scientific literature to improve subject understanding.",
    'Attendance to the seminars/conferences related to the department: (1: Yes, 2: No)': "Attend more seminars and conferences to broaden your knowledge base.",
    'Impact of your projects/activities on your success: (1: positive, 2: negative, 3: neutral)': "Focus on projects and activities that positively impact your success.",
    'Attendance to classes (1: always, 2: sometimes, 3: never)': "Improve your class attendance for better learning outcomes.",
    'Preparation to midterm exams 1: (1: alone, 2: with friends, 3: not applicable)': "Consider preparing for exams with friends for collaborative learning.",
    'Preparation to midterm exams 2: (1: closest date to the exam, 2: regularly during the semester, 3: never)': "Adopt a regular study schedule instead of last-minute cramming.",
    'Taking notes in classes: (1: never, 2: sometimes, 3: always)': "Increase note-taking during classes for better retention.",
    'Listening in classes: (1: never, 2: sometimes, 3: always)': "Enhance active listening in classes to improve understanding.",
    'Discussion improves my interest and success in the course: (1: never, 2: sometimes, 3: always)': "Participate in class discussions to enhance interest and success.",
    'Flip-classroom: (1: not useful, 2: useful, 3: not applicable)': "Utilize flip-classroom techniques to maximize learning efficiency.",
    'Cumulative grade point average in the last semester (/4.00): (1: <2.00, 2: 2.00-2.49, 3: 2.50-2.99, 4: 3.00-3.49, 5: above 3.49)': "Focus on improving your GPA by addressing weak areas.",
    'Expected Cumulative grade point average in the graduation (/4.00): (1: <2.00, 2: 2.00-2.49, 3: 2.50-2.99, 4: 3.00-3.49, 5: above 3.49)': "Set realistic GPA goals and work steadily towards achieving them."
}

# Recommendation function
def get_recommendations(prediction, threshold, important_features_indices,
                        feature_labels=None, advice_by_label=None):
    """Return a list of advice strings for one predicted performance value.

    Parameters
    ----------
    prediction : number
        The model's predicted performance for the student.
    threshold : number
        Predictions strictly below this value trigger per-feature advice;
        anything else yields a single encouragement message.
    important_features_indices : iterable of int
        Keys into ``feature_labels`` identifying the features to advise on.
    feature_labels : dict, optional
        Maps a feature index to its human-readable label. Defaults to the
        notebook-level ``feature_mapping``.
    advice_by_label : dict, optional
        Maps a feature label to its advice text. Defaults to the
        notebook-level ``recommendation_map``.

    Returns
    -------
    list of str
        One advice string per requested feature (in the given order) when
        ``prediction < threshold``; otherwise a one-element list with the
        encouragement message.
    """
    # Fall back to the notebook-level tables only when none are passed in, so
    # existing three-argument calls behave as before.
    if feature_labels is None:
        feature_labels = feature_mapping
    if advice_by_label is None:
        advice_by_label = recommendation_map

    if prediction < threshold:
        fallback = 'No specific recommendation available.'
        # .get() on the label table as well: an index with no label (for
        # example a column position beyond the labelled range) now yields the
        # fallback message instead of raising KeyError.
        recommendations = [
            advice_by_label.get(feature_labels.get(i), fallback)
            for i in important_features_indices
        ]
    else:
        recommendations = ["Your predicted performance is good, keep up the good work!"]
    return recommendations

# Predicted performance below this value triggers improvement advice.
# `threshold` was previously used without being defined anywhere in the
# notebook, so this cell raised NameError on a fresh kernel.
# NOTE(review): 3.0 is an assumed cut-off, chosen only so that the recorded
# prediction of 2.06 still produces advice as in the saved output -- confirm
# the intended value.
threshold = 3.0

# feature_mapping is keyed from 1, while important_features_indices holds
# zero-based column positions of X, so shift each position by one.
important_features_corrected = [i + 1 for i in important_features_indices]

# Generating recommendations for the prediction made for the new student.
recommendations = get_recommendations(new_prediction, threshold, important_features_corrected)

print("Recommendations for improvement:")
for rec in recommendations:
    print(rec)


Recommendations for improvement:
Engage in study groups that balance gender diversity.
Focus on improving your GPA by addressing weak areas.
No specific recommendation available.


In [15]:
# Show the column names of the three most important features selected earlier
# from the forest's importance scores.
print(important_features)

Index(['2', '29', 'COURSE ID'], dtype='object')


In [None]:
# Marker printed when the notebook has run through to its last cell.
print("Program complete")