# STUDENT ENGAGEMENT PREDICTION

In [None]:
# import libraries

import pandas as pd #Data manipulation
import numpy as np #Data manipulation
import matplotlib.pyplot as plt # Visualization
%matplotlib inline
import seaborn as sns #Visualization

In [None]:
# Engagement dataset: read the CSV and discard the 'Unnamed: 0' column
# (presumably an index column written out when the file was saved).

pd.set_option('display.max_columns', None)  # show every column of wide frames
df = pd.read_csv('Student Engagement Level.csv').drop(columns=['Unnamed: 0'])

In [None]:
# Preview the first 10 rows of the engagement data.
df.head(10)

In [None]:
# Per-column flag: True when the column holds at least one missing value.
df.isna().any()

In [None]:
# Column dtypes, non-null counts and memory usage of the engagement data.
df.info()

In [None]:
# Summary statistics, transposed so that each feature is one row.
df.describe().T

---------
Plotting data

In [None]:
# One histogram per numeric column; the result is bound to `p` so the cell
# does not echo the array of Axes objects.
p = df.hist(figsize = (20,20))

In [None]:
# Correlation heatmap (lower triangle and diagonal only).
# Only numeric columns are correlated: on pandas >= 2.0 DataFrame.corr() no
# longer drops non-numeric columns silently and raises on them instead.
cor_mat = df.select_dtypes(include='number').corr()
# Boolean mask, True = hide the cell. k=1 hides the strictly-upper triangle.
# The previous mask reused the correlation values as truth values, so a
# coefficient of exactly 0 in the upper triangle was left visible.
mask = np.triu(np.ones_like(cor_mat, dtype=bool), k=1)
fig = plt.gcf()
fig.set_size_inches(30, 12)
sns.heatmap(data=cor_mat, mask=mask, square=True, annot=True, cbar=True)

-----------------------
Predict with K-Means

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the engagement features before clustering.
# 'Student ID' is an identifier, not a feature. 'Engagement_Level' is the label
# column a later cell adds to df; dropping it when present keeps this cell
# re-runnable after the labelling step.
engagement_features = (
    df.drop(columns=["Student ID"])
      .drop(columns=["Engagement_Level"], errors="ignore")
)

sc_X = StandardScaler()
# Column names and index come from the frame itself instead of a hand-typed
# list, so they cannot drift out of sync with the CSV header.
X = pd.DataFrame(
    sc_X.fit_transform(engagement_features),
    columns=engagement_features.columns,
    index=engagement_features.index,
)


In [None]:
# Preview the first 10 rows of the standardised feature matrix.
X.head(10)

In [None]:
# Elbow method: fit K-Means for k = 1..10 and record each model's WCSS
# (within-cluster sum of squares) to help choose the number of clusters.

from sklearn.cluster import KMeans
wcss = []
for i in range(1, 11):
    # n_init is pinned: its default changed from 10 to 'auto' in scikit-learn 1.4,
    # which would otherwise change the fitted models between library versions.
    kmeans = KMeans(n_clusters=i, init='k-means++', n_init=10, random_state=42)
    kmeans.fit(X)
    # inertia_ is the WCSS of the fitted model
    wcss.append(kmeans.inertia_)


In [None]:
# Elbow plot: WCSS against the number of clusters.
plt.figure(figsize=(10, 5))
# x and y are passed by keyword: seaborn >= 0.12 made them keyword-only, so the
# old positional call raises a TypeError there.
sns.lineplot(x=list(range(1, len(wcss) + 1)), y=wcss, marker='o', color='red')
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

In [None]:
# Fit the final 2-cluster K-Means model and get one cluster id per row of X.
# n_init is pinned: its default changed from 10 to 'auto' in scikit-learn 1.4,
# and the cluster ids produced here are later mapped to fixed letters.
kmeans = KMeans(n_clusters=2, init='k-means++', n_init=10, random_state=42)
y_kmeans = kmeans.fit_predict(X)

In [None]:
# Cluster id assigned to each row of X by the fitted model.
Result = kmeans.labels_
Result

In [None]:
# Scatter of the first two standardised features, one colour per cluster,
# with the cluster centroids drawn on top in red.
centroids = kmeans.cluster_centers_
u_labels = np.unique(kmeans.labels_)
plt.figure(figsize=(15, 7))

for cluster_id in u_labels:
    members = X.iloc[y_kmeans == cluster_id]  # rows assigned to this cluster
    plt.scatter(members.iloc[:, 0], members.iloc[:, 1], label=cluster_id)
plt.scatter(centroids[:, 0], centroids[:, 1], s=300, color='red')
plt.legend()
plt.show()

-------
Add the prediction result to each student to see who has high or low engagement

In [None]:
# Wrap the cluster ids in a single-column DataFrame (the column is named 0).
# NOTE(review): `Result` is reassigned further down for the performance data,
# so this cell must run before that section.
Engagement_Level = pd.DataFrame(Result)

In [None]:
# Attach the cluster ids to the engagement data (aligned on the row index).
df['Engagement_Level'] = Engagement_Level.iloc[:, 0]

In [None]:
# Check that the numeric 'Engagement_Level' column was added.
df.head(10)

In [None]:
# Turn the numeric cluster ids into letters: 0 -> 'H', 1 -> 'L'.
# replace() builds a new column instead of writing strings into an integer
# column through .loc, which pandas >= 2.1 deprecates (incompatible-dtype
# setitem). Values that are already letters are left untouched on re-run.
# NOTE(review): K-Means cluster ids are arbitrary; confirm from the centroids
# that cluster 0 really is the high-engagement group.
df['Engagement_Level'] = df['Engagement_Level'].replace({0: 'H', 1: 'L'})

In [None]:
# Check the relabelled 'Engagement_Level' column.
df.head(10)

In [None]:
# Distribution of the engagement labels.
p = df['Engagement_Level'].hist(alpha=0.5, bins=3)

# STUDENT PERFORMANCE PREDICTION

In [None]:
# Performance dataset: read the CSV and discard the 'Unnamed: 0' column
# (presumably an index column written out when the file was saved).
pd.set_option('display.max_columns', None)  # show every column of wide frames
df2 = pd.read_csv('Student Performance Prediction.csv').drop(columns=['Unnamed: 0'])
df2.head(10)

----------------
Predict with K-Means

In [None]:
# Standardise the performance features before clustering.

from sklearn.preprocessing import StandardScaler

# 'Student ID' is an identifier, not a feature. 'Performance' is the label
# column a later cell adds to df2; dropping it when present keeps this cell
# re-runnable after the labelling step.
performance_features = (
    df2.drop(columns=["Student ID"])
       .drop(columns=["Performance"], errors="ignore")
)

sc_X = StandardScaler()
# Column names and index come from the frame itself instead of a hand-typed
# list, so they cannot drift out of sync with the CSV header.
X = pd.DataFrame(
    sc_X.fit_transform(performance_features),
    columns=performance_features.columns,
    index=performance_features.index,
)


In [None]:
# Preview the first 10 rows of the standardised performance features.
X.head(10)

In [None]:
# Elbow method on the performance features: fit K-Means for k = 1..10 and
# record each model's WCSS (within-cluster sum of squares).

from sklearn.cluster import KMeans
wcss = []
for i in range(1, 11):
    # n_init is pinned: its default changed from 10 to 'auto' in scikit-learn 1.4,
    # which would otherwise change the fitted models between library versions.
    kmeans = KMeans(n_clusters=i, init='k-means++', n_init=10, random_state=42)
    kmeans.fit(X)
    # inertia_ is the WCSS of the fitted model
    wcss.append(kmeans.inertia_)


In [None]:
# Elbow plot for the performance features: WCSS against the number of clusters.
plt.figure(figsize=(10, 5))
# x and y are passed by keyword: seaborn >= 0.12 made them keyword-only, so the
# old positional call raises a TypeError there.
sns.lineplot(x=list(range(1, len(wcss) + 1)), y=wcss, marker='o', color='red')
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

In [None]:
# Fit the final 2-cluster K-Means model on the performance features and get
# one cluster id per row of X.
# n_init is pinned: its default changed from 10 to 'auto' in scikit-learn 1.4,
# and the cluster ids produced here are later mapped to fixed letters.
kmeans = KMeans(n_clusters=2, init='k-means++', n_init=10, random_state=42)
y_kmeans = kmeans.fit_predict(X)

In [None]:
# Cluster id assigned to each row of X by the fitted model.
# This rebinds `Result`, which earlier held the engagement cluster ids.
Result = kmeans.labels_
Result

In [None]:
# Scatter of the first two standardised performance features, one colour per
# cluster, with the cluster centroids drawn on top in red.
centroids = kmeans.cluster_centers_
u_labels = np.unique(kmeans.labels_)
plt.figure(figsize=(15, 7))

for cluster_id in u_labels:
    in_cluster = y_kmeans == cluster_id  # boolean row selector for this cluster
    first_feature = X.iloc[in_cluster, 0]
    second_feature = X.iloc[in_cluster, 1]
    plt.scatter(first_feature, second_feature, label=cluster_id)
plt.scatter(centroids[:, 0], centroids[:, 1], s=300, color='red')
plt.legend()
plt.show()

In [None]:
# Wrap the cluster ids in a single-column DataFrame (the column is named 0).
Performance = pd.DataFrame(Result)

In [None]:
# Attach the cluster ids to the performance data (aligned on the row index).
df2['Performance'] = Performance.iloc[:, 0]

In [None]:
# Check that the numeric 'Performance' column was added.
df2.head()

In [None]:
# Turn the numeric cluster ids into letters: 0 -> 'G', 1 -> 'W'.
# replace() builds a new column instead of writing strings into an integer
# column through .loc, which pandas >= 2.1 deprecates (incompatible-dtype
# setitem). Values that are already letters are left untouched on re-run.
# NOTE(review): K-Means cluster ids are arbitrary; confirm from the centroids
# which cluster each letter should stand for.
df2['Performance'] = df2['Performance'].replace({0: 'G', 1: 'W'})

In [None]:
# Check the relabelled 'Performance' column.
df2.head(10)

In [None]:
# Distribution of the performance labels.
p = df2['Performance'].hist(alpha=0.5, bins=3)

# STUDENT PERFORMANCE AND ENGAGEMENT

Now let's merge the two dataframes above in order to observe the correlation between performance and engagement.

In [None]:
# Engagement data with its 'Engagement_Level' label, before merging.
df.head(10)

In [None]:
# Performance data with its 'Performance' label, before merging.
df2.head(10)

----
Merge two dataframes 

In [None]:
# Inner-join the two frames on 'Student ID', then move 'Engagement_Level' to
# the end of the frame under the name 'Engagement Level'.
df_merge_col = df.merge(df2, on='Student ID')
df_merge_col['Engagement Level'] = df_merge_col.pop('Engagement_Level')
df_merge_col.head(10)

In [None]:
# Number of students in each (Performance, Engagement Level) combination.
df_merge_col.groupby(['Performance', 'Engagement Level']).size()

In [None]:
import seaborn as sns

# Student counts per performance class, split by engagement level.
sns.catplot(x='Performance', hue='Engagement Level', kind='count', data=df_merge_col)