In [1]:
import pandas as pd # importing pandas and plotly libs
import plotly.express as px
import plotly.graph_objects as go

# Read the raw student records; every later cell derives from this frame.
data_from_dataset = pd.read_csv("dataset.csv")

# Quick orientation: dimensions, column labels, and the class balance of 'Target'.
dataset_shape = data_from_dataset.shape
column_labels = data_from_dataset.columns.tolist()
target_counts = data_from_dataset['Target'].value_counts()

print("Dataset Shape:", dataset_shape)
print("\nColumn Names:", column_labels)
print("\nTarget Variable Counts:\n", target_counts)

# Last expression of the cell, so the notebook renders it as a table preview.
data_from_dataset.head()


Dataset Shape: (4424, 35)

Column Names: ['Marital status', 'Application mode', 'Application order', 'Course', 'Daytime/evening attendance', 'Previous qualification', 'Nacionality', "Mother's qualification", "Father's qualification", "Mother's occupation", "Father's occupation", 'Displaced', 'Educational special needs', 'Debtor', 'Tuition fees up to date', 'Gender', 'Scholarship holder', 'Age at enrollment', 'International', 'Curricular units 1st sem (credited)', 'Curricular units 1st sem (enrolled)', 'Curricular units 1st sem (evaluations)', 'Curricular units 1st sem (approved)', 'Curricular units 1st sem (grade)', 'Curricular units 1st sem (without evaluations)', 'Curricular units 2nd sem (credited)', 'Curricular units 2nd sem (enrolled)', 'Curricular units 2nd sem (evaluations)', 'Curricular units 2nd sem (approved)', 'Curricular units 2nd sem (grade)', 'Curricular units 2nd sem (without evaluations)', 'Unemployment rate', 'Inflation rate', 'GDP', 'Target']

Target Variable Counts:


Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Nacionality,Mother's qualification,Father's qualification,Mother's occupation,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,8,5,2,1,1,1,13,10,6,...,0,0,0,0,0.0,0,10.8,1.4,1.74,Dropout
1,1,6,1,11,1,1,1,1,3,4,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,Graduate
2,1,1,5,5,1,1,1,22,27,10,...,0,6,0,0,0.0,0,10.8,1.4,1.74,Dropout
3,1,8,2,15,1,1,1,23,27,6,...,0,6,10,5,12.4,0,9.4,-0.8,-3.12,Graduate
4,2,12,1,3,0,1,1,22,28,10,...,0,6,6,6,13.0,0,13.9,-0.3,0.79,Graduate


# preprocessing and removal of outliers.

In [2]:
# Step 1: remove outliers that could distort the later analysis.
# Afterwards (in the following cells) the numeric category codes are replaced with readable labels.
def remove_outliers(data_from_dataset, columns):
    """Drop rows that fall outside the 1.5 * IQR fences of each listed column.

    Columns are processed in the order given. The quartiles of each column are
    computed on the rows that survived the previous columns, so the result
    depends on the order of ``columns``. A row whose value in a processed
    column is NaN fails both comparisons and is therefore dropped (and counted
    in the printed total) as well.

    Parameters
    ----------
    data_from_dataset : pandas.DataFrame
        Frame to filter. It is not modified.
    columns : iterable of str
        Names of the numeric columns to screen, in processing order.

    Returns
    -------
    pandas.DataFrame
        An independent copy containing only the retained rows, with their
        original index labels. Returning a copy (also when ``columns`` is
        empty) lets callers assign new columns to the result without writing
        into, or triggering chained-assignment warnings about, the input frame.
    """
    filtered = data_from_dataset
    for column in columns:
        q1 = filtered[column].quantile(0.25)
        q3 = filtered[column].quantile(0.75)
        iqr = q3 - q1
        lower = q1 - 1.5 * iqr
        upper = q3 + 1.5 * iqr

        rows_before = filtered.shape[0]
        # Both fences are inclusive: values equal to a fence are kept.
        filtered = filtered[(filtered[column] >= lower) & (filtered[column] <= upper)]
        rows_after = filtered.shape[0]
        print(f"{column}: Removed {rows_before - rows_after} outliers")

    # Detach the result from the input so downstream column assignments are safe.
    return filtered.copy()

# Numeric columns screened for outliers: the enrollment age plus the
# enrolled / approved / grade measures of both semesters.
cols_to_clean = ["Age at enrollment"] + [
    f"Curricular units {semester} sem ({measure})"
    for semester in ("1st", "2nd")
    for measure in ("enrolled", "approved", "grade")
]

# Filter the raw frame; the helper prints how many rows each column removed.
data_clean = remove_outliers(data_from_dataset, cols_to_clean)

# Final shape after the row filtering.
print("\nCleaned Dataset Shape:", data_clean.shape)


Age at enrollment: Removed 441 outliers
Curricular units 1st sem (enrolled): Removed 362 outliers
Curricular units 1st sem (approved): Removed 0 outliers
Curricular units 1st sem (grade): Removed 438 outliers
Curricular units 2nd sem (enrolled): Removed 62 outliers
Curricular units 2nd sem (approved): Removed 163 outliers
Curricular units 2nd sem (grade): Removed 7 outliers

Cleaned Dataset Shape: (2951, 35)


In [3]:
# Inspect the raw codes first so unexpected values are noticed before relabelling.
print(data_clean['Gender'].unique())

# Relabel the codes (0 -> Female, 1 -> Male) and mark missing entries as 'Unknown'.
# `assign` builds a new frame rather than writing a column into `data_clean`,
# which at this point is a filtered selection of the raw data; assigning a
# column to such a selection can raise pandas' SettingWithCopyWarning.
data_clean = data_clean.assign(
    Gender=data_clean['Gender'].replace({0: 'Female', 1: 'Male'}).fillna('Unknown')
)


[1 0]


# Chart 1: Visualization of the Marital Status Distribution across Students
* This cell shows the overall distribution of
the students by marital status.
*   It helps us identify which group makes up most of the enrollment; in our case, it is mostly single students.
*  The distribution is drawn as a clean, interactive donut-style pie chart.



In [4]:
# The dataset stores marital status as numeric codes 1-6, so map every code to
# a readable label. Code 6 must be mapped too, otherwise it stays a bare
# integer among the string labels and shows up as the slice "6" in the chart.
# NOTE(review): labels for codes 5 and 6 follow the published dataset
# description (5 = facto union, 6 = legally separated) - confirm against the
# metadata shipped with dataset.csv.
data_clean['Marital status'] = data_clean['Marital status'].replace({
    1: 'Single', 2: 'Married', 3: 'Widowed_at_present', 4: 'Divorcedoff',
    5: 'Facto_union', 6: 'Legally_separated'
})

# Number of students per marital status, as a two-column frame for plotting.
Count_of_Marital_status = data_clean['Marital status'].value_counts().reset_index()
Count_of_Marital_status.columns = ['Marital status', 'Count']

# Donut-style pie chart of the distribution.
fig = px.pie(
    Count_of_Marital_status,
    names='Marital status',
    values='Count',
    title='Overall Marital Status Distribution (All Students)',
    hole=0.2
)

# Show label and percentage on each slice and pull every slice out slightly.
fig.update_traces(textinfo='label+percent', pull=[0.03]*len(Count_of_Marital_status))
fig.show()


In [5]:
# Count how many students of each gender remain in the cleaned data
# (all outcomes: dropouts, enrolled students and graduates).
data_clean['Gender'].value_counts()

Unnamed: 0_level_0,count
Gender,Unnamed: 1_level_1
Female,2112
Male,839


# Visualization 2: Student retention outcomes by gender (stacked bar chart)


*   This chart compares the total number of students who

1.   graduated
2.   are currently enrolled
3.   dropped out.


*  A stacked bar chart gives an easy, interactive way to read the status of students for each gender.

* It lets us compare the mix of outcomes between female and male students at a glance.



In [6]:
# Readable gender labels for the grouping key. They are built as a separate
# Series so `data_clean` is not written to again; the mapping changes nothing
# when the column already holds 'Female' / 'Male'.
gender_labels = data_clean['Gender'].replace({0: 'Female', 1: 'Male'})

# Count students for every (gender, outcome) combination.
target_according_to_gender = (
    data_clean
    .assign(Gender=gender_labels)
    .groupby(['Gender', 'Target'])
    .size()
    .reset_index(name='Count')
)

# Stacked bar chart comparing retention outcomes between genders.
figure2_StackbarChart = px.bar(
    target_according_to_gender,
    x='Gender',       # one bar per gender
    y='Count',        # number of students in each (gender, outcome) group
    color='Target',   # one colour per outcome (dropout, enrolled, graduate)
    barmode='stack',  # outcomes are stacked within each gender bar
    title='RETENTION RATES OF STUDENTS ACCORDING TO GENDER',
    text='Count'      # print the count on each bar segment
)

figure2_StackbarChart.update_traces(textposition='inside')
# The bars contain dropouts and graduates as well as enrolled students, so the
# axis is labelled with the neutral 'Number of Students'.
figure2_StackbarChart.update_layout(xaxis_title='Gender', yaxis_title='Number of Students')
figure2_StackbarChart.show()


# Visualization 3: Age at Enrollment vs Retention (line chart)




*  This visualization aims to show whether younger students are the most likely to drop out.
*   At what age are most graduates concentrated?
*   Are there any age groups linked with a high dropout rate?



In [7]:
# Count students for each (enrollment age, outcome) pair, then draw one line
# per outcome:
#   x axis: age at the time of enrollment
#   y axis: number of students
age_target_groups = data_clean.groupby(['Age at enrollment', 'Target'])
targeting_age_to_success = age_target_groups.size().reset_index(name='Count')

figure_lineChart = px.line(
    targeting_age_to_success,
    x='Age at enrollment',
    y='Count',
    color='Target',
    title=' STUDENTS ENROLLMENT VS SUCCEESS RETENTION RATE OUTCOME',
)
figure_lineChart.show()

# Visualization 4: Retention outcomes of scholarship holders vs. non-scholarship students (pie charts)



*   These charts show whether students who hold a scholarship are more likely to graduate.
*   Should universities award more scholarships to improve retention outcomes?



In [8]:
# Swap the 0/1 scholarship codes for 'No' / 'Yes' labels.
scholarship_labels = data_clean['Scholarship holder'].replace({0: 'No', 1: 'Yes'})
data_clean['Scholarship holder'] = scholarship_labels

# Number of students per (scholarship status, outcome) pair.
scholar_target = (
    data_clean
    .groupby(['Scholarship holder', 'Target'])
    .size()
    .reset_index(name='Count')
)

# One donut chart per scholarship status, visited in order of first appearance,
# so holders and non-holders can be compared side by side.
for status, data_generated in scholar_target.groupby('Scholarship holder', sort=False):
    figure_feed_on_data = px.pie(
        data_generated,
        names='Target',
        values='Count',
        title=f'Student Outcome by Scholarship status (receives): {status}',
        hole=0.4
    )
    # Label + percentage on every slice, each slice pulled out slightly.
    slice_offsets = [0.03] * len(data_generated)
    figure_feed_on_data.update_traces(textinfo='label+percent', pull=slice_offsets)
    figure_feed_on_data.show()


# Visualization 5
# AVERAGE COURSE ENROLLMENT VS STUDENT OUTCOMES BASED ON CREDIT HOURS

## Plotted using a LINE CHART
### This chart lets us explore the question: "Is the number of credit hours (curricular units) a student enrolls in linked with dropping out?"


*   How many credit hours are ideal?
*   Do more credit hours lead to more dropouts?
*   Can these insights be used to decrease dropouts?



In [9]:
# Mean number of first-semester enrolled units for each outcome category.
units_column = 'Curricular units 1st sem (enrolled)'
data_for_lineChart = (
    data_clean
    .groupby('Target')[units_column]
    .mean()
    .reset_index()
)

# Line chart with one marker per outcome; styling calls are chained because
# each plotly update method returns the figure itself.
visualization_5 = (
    px.line(
        data_for_lineChart,
        x='Target',          # outcome categories
        y=units_column,      # average enrolled units
        title='AVG NO OF ENROLLED UNITS BY STUDENTS IN 1st SEMESTER/STUDENT OUTCOME',
        markers=True,
    )
    .update_traces(line={'width': 3})
    .update_layout(
        xaxis_title='Student Outcome',
        yaxis_title='Average Enrolled Units (1st Sem)',
        template='plotly_white',
    )
)

visualization_5.show()


# VISUALIZATION 6
## BAR CHART: Average age at enrollment for each student outcome



*    Are younger students more likely to stay enrolled and graduate?
*   Which age has the highest dropout rate?
*   The chart aims to uncover which age groups are in need of more academic support.




In [10]:
# Mean age at enrollment for each outcome category.
age_of_target_students = data_clean.groupby('Target')['Age at enrollment'].mean().reset_index()

# Bar chart showing how the average enrollment age differs between outcomes;
# each bar is annotated with its mean rounded to one decimal.
visualization_6 = px.bar(
    age_of_target_students,
    x='Target',
    y='Age at enrollment',
    color='Target',
    text=age_of_target_students['Age at enrollment'].round(1),
    title='AVG AGE OF ENROLLMENT AND THEIR POSSIBLE OUTCOMES'
)

# Value labels above the bars; the legend is hidden because the x axis already
# names each outcome.
visualization_6.update_traces(textposition='outside')
visualization_6.update_layout(
    xaxis_title='Student Outcome and their categories',
    yaxis_title='Average Age of students  ',
    showlegend=False,
)

# Render explicitly, like every other figure cell, instead of relying on the
# notebook displaying the return value of the last expression.
visualization_6.show()


# VISUALIZATION 7
## PLOTTING THE RELATIONSHIP BETWEEN ENROLLED AND APPROVED UNITS USING A SCATTER PLOT


*   This aims to show the correlation between the workload and the success associated with it.
*   It also aims to spot outliers.



In [11]:
# Scatter plot of first-semester units: enrolled (x) against approved (y),
# drawn from the row-level cleaned data and coloured by outcome.
enrolled_col = 'Curricular units 1st sem (enrolled)'
approved_col = 'Curricular units 1st sem (approved)'

visualization_7 = px.scatter(
    data_clean,
    x=enrolled_col,    # units the student enrolled in
    y=approved_col,    # units the student passed
    color='Target',    # one colour per outcome
    title='ENROLLED VS THE APPROVED UNITS IN 1st SEMESTER',
    labels={enrolled_col: 'Units Enrolled', approved_col: 'Units Approved'},
    opacity=0.8,
).update_layout(
    # Styling: explicit axis titles and a white template.
    xaxis_title='Enrolled Units in first sem by the students',
    yaxis_title='Approved Units in first sem',
    template='plotly_white',
)

visualization_7.show()


# VISUALIZATION 8
## GROUPED BAR CHART VISUALIZATION: Average age at the time of enrollment by gender


## Purpose


*   This chart helps us visualize the demographic variation in terms of gender.
*   Which gender is older on average at the time of joining?
*   Does it suggest that age has an impact on course selection and dropout?



In [12]:
# Mean age at enrollment for each gender label (one row per gender).
avg_age_on_gender = (
    data_clean
    .groupby('Gender')['Age at enrollment']
    .mean()
    .reset_index()
)

# Text printed on the bars: the mean age rounded to one decimal.
bar_value_labels = avg_age_on_gender['Age at enrollment'].round(1)

# Bar chart comparing the average enrollment age between genders.
visualization_8 = px.bar(
    avg_age_on_gender,
    x='Gender',                # gender label
    y='Age at enrollment',     # mean age at enrollment
    color='Gender',
    text=bar_value_labels,
    title='Average Age at Enrollment by Gender',
)

# Value labels above the bars, explicit axis titles, legend kept visible.
visualization_8.update_traces(textposition='outside')
visualization_8.update_layout(
    xaxis_title='Gender of the student.',
    yaxis_title='Average Age of the enrollment',
    showlegend=True,
    template='plotly_white',
)

visualization_8.show()


# VISUALIZATION 9

## TREEMAP: OUTCOMES OF STUDENTS BY COURSE



*   Gives much deeper insights.
*   Shows us which courses have high enrollment or dropout rates across all categories (enrolled, graduates and dropouts).
*   Helps identify hard courses.



In [13]:
# Turn the course codes into readable labels ('Course <code>').
# A 'Course ' prefix left by an earlier run of this cell is stripped first, so
# re-running the cell does not produce labels such as 'Course Course 5'.
course_codes = data_clean['Course'].astype(str).str.replace(r'^Course ', '', regex=True)
data_clean['Course'] = 'Course ' + course_codes

# Number of students per (course, outcome) pair.
grouped_data_for_treemap = data_clean.groupby(['Course', 'Target']).size().reset_index(name='Count')

# Treemap with outcomes as the top level and courses nested inside each
# outcome; tile area is proportional to the student count.
visualization_9 = px.treemap(
    grouped_data_for_treemap,
    path=['Target', 'Course'],
    values='Count',
    color='Target',
    title='Treemap of Student Outcomes on basis of  Course'
)

visualization_9.update_layout(margin=dict(t=50, l=25, r=25, b=25))
visualization_9.show()
