### Exploratory Data Analysis - Students Performance

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
import plotly as py
import cufflinks as cf

In [3]:
from plotly.offline import iplot

In [4]:
# Put both plotly and cufflinks into offline mode before any chart is drawn;
# the .iplot() calls further down in this notebook rely on this setup.
py.offline.init_notebook_mode(connected= True)
cf.go_offline()

In [5]:
# Read the students-performance CSV straight from its GitHub raw URL
# and preview the first rows.
DATA_URL = 'https://raw.githubusercontent.com/BharathSN/Exploratory-Data-Analysis-EDA/main/StudentsPerformance.csv'
df = pd.read_csv(DATA_URL)
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [6]:
# Show the column labels as a plain array.
column_names = df.columns.values
print(column_names)

['gender' 'race/ethnicity' 'parental level of education' 'lunch'
 'test preparation course' 'math score' 'reading score' 'writing score']


In [7]:
# Concise summary of the DataFrame: .info() prints the column names,
# each column's dtype, the number of rows and the non-null count per column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   int64 
 6   reading score                1000 non-null   int64 
 7   writing score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


In [8]:
# Data type of every column, shown as a Series indexed by column name.
df.dtypes

gender                         object
race/ethnicity                 object
parental level of education    object
lunch                          object
test preparation course        object
math score                      int64
reading score                   int64
writing score                   int64
dtype: object

In [9]:
# Same dtype information as a plain {column name: dtype} dictionary.
dataDict = df.dtypes.to_dict()
dataDict

{'gender': dtype('O'),
 'race/ethnicity': dtype('O'),
 'parental level of education': dtype('O'),
 'lunch': dtype('O'),
 'test preparation course': dtype('O'),
 'math score': dtype('int64'),
 'reading score': dtype('int64'),
 'writing score': dtype('int64')}

In [10]:
# Number of non-NA/null observations in each column.
df.count()

gender                         1000
race/ethnicity                 1000
parental level of education    1000
lunch                          1000
test preparation course        1000
math score                     1000
reading score                  1000
writing score                  1000
dtype: int64

In [11]:
# DataFrame shape as a (number of rows, number of columns) tuple.
df.shape

(1000, 8)

In [12]:
# Element-wise missing-value mask: True wherever a cell is NA/null.
df.isnull()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...
995,False,False,False,False,False,False,False,False
996,False,False,False,False,False,False,False,False
997,False,False,False,False,False,False,False,False
998,False,False,False,False,False,False,False,False


In [13]:
# Alternative spelling of the same missing-value check.
df.isna()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...
995,False,False,False,False,False,False,False,False
996,False,False,False,False,False,False,False,False
997,False,False,False,False,False,False,False,False
998,False,False,False,False,False,False,False,False


In [14]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric score columns.
df.describe()

Unnamed: 0,math score,reading score,writing score
count,1000.0,1000.0,1000.0
mean,66.089,69.169,68.054
std,15.16308,14.600192,15.195657
min,0.0,17.0,10.0
25%,57.0,59.0,57.75
50%,66.0,70.0,69.0
75%,77.0,79.0,79.0
max,100.0,100.0,100.0


In [15]:
# Bar chart of how many students fall into each race/ethnicity group.
# The `bins` argument was dropped: it configures histogram binning and has
# no effect on a bar chart of value counts.
df['race/ethnicity'].value_counts().iplot(
    kind='bar',
    colors='blue',
    xTitle='Race/Ethnicity',
    yTitle='Count',
)

In [16]:
# Race/ethnicity counts restricted to female students.
# The `bins` argument was dropped: it configures histogram binning and has
# no effect on a bar chart of value counts.
df_female = df.loc[df['gender'] == 'female']
df_female['race/ethnicity'].value_counts().iplot(
    kind='bar',
    colors='blue',
    xTitle='Race/Ethnicity when Gender = Female ',
    yTitle='Count',
)

In [17]:
# Race/ethnicity counts restricted to male students.
# The `bins` argument was dropped: it configures histogram binning and has
# no effect on a bar chart of value counts.
df_male = df.loc[df['gender'] == 'male']
df_male['race/ethnicity'].value_counts().iplot(
    kind='bar',
    colors='blue',
    xTitle='Race/Ethnicity when Gender = Male ',
    yTitle='Count',
)

In [18]:
# Distribution of math scores across all students.
math_hist_options = dict(
    kind='hist',
    colors='blue',
    bins=100,
    xTitle='math score',
    yTitle='Count',
    title='math score',
)
df['math score'].iplot(**math_hist_options)

In [19]:
# Distribution of reading scores across all students.
# The axis label and title previously said 'math score' (copy-paste slip);
# they now name the column that is actually plotted.
df['reading score'].iplot(kind='hist', colors='blue', bins=100,
                          xTitle='reading score', yTitle='Count', title='reading score')

In [20]:
# Distribution of writing scores across all students.
# The axis label and title previously said 'math score' (copy-paste slip);
# they now name the column that is actually plotted.
df['writing score'].iplot(kind='hist', colors='blue', bins=100,
                          xTitle='writing score', yTitle='Count', title='writing score')

In [21]:
import plotly.express as px
import plotly.graph_objects as go

In [22]:
# Overlay the math-score distributions of male and female students.
score_col = 'math score'
traces = [
    go.Histogram(x=df.loc[df['gender'] == gender, score_col], name=label, opacity=0.6)
    for gender, label in (('male', 'Male'), ('female', 'Female'))
]
figure = go.Figure(
    data=traces,
    layout=go.Layout(barmode='overlay', title='Math score Comparison in Male and Female'),
)
figure.show()

In [23]:
# Overlay the reading-score distributions of male and female students.
score_col = 'reading score'
traces = [
    go.Histogram(x=df.loc[df['gender'] == gender, score_col], name=label, opacity=0.6)
    for gender, label in (('male', 'Male'), ('female', 'Female'))
]
figure = go.Figure(
    data=traces,
    layout=go.Layout(barmode='overlay', title='Reading score Comparison in Male and Female'),
)
figure.show()

In [24]:
# Overlay the writing-score distributions of male and female students.
score_col = 'writing score'
traces = [
    go.Histogram(x=df.loc[df['gender'] == gender, score_col], name=label, opacity=0.6)
    for gender, label in (('male', 'Male'), ('female', 'Female'))
]
figure = go.Figure(
    data=traces,
    layout=go.Layout(barmode='overlay', title='Writing  score Comparison in Male and Female'),
)
figure.show()

In [None]:
# Joint KDE of math score against reading score.
sns.jointplot(data=df, x='math score', y='reading score', kind='kde')

<seaborn.axisgrid.JointGrid at 0x1679ad48cd0>

In [None]:
# Joint KDE of math score against writing score.
sns.jointplot(data=df, x='math score', y='writing score', kind='kde')

In [None]:
# Joint KDE of writing score against reading score.
sns.jointplot(data=df, x='writing score', y='reading score', kind='kde')

In [None]:
# Bar chart of how many students fall into each parental education level.
# The `bins` argument was dropped: it configures histogram binning and has
# no effect on a bar chart of value counts.
df['parental level of education'].value_counts().iplot(
    kind='bar',
    colors='blue',
    xTitle='parental level of education',
    yTitle='Count',
)

In [None]:
# Overlay math-score histograms for four parental education levels.
# NOTE(review): "bachelor's degree" shows up in the data preview but is not
# plotted here — confirm the omission is intentional.
edu_col = 'parental level of education'
edu_levels = (
    ('some college', 'Some college'),
    ("associate's degree", 'associate degree'),
    ('high school', 'high school'),
    ("master's degree", 'master degree'),
)
traces = [
    go.Histogram(x=df.loc[df[edu_col] == level, 'math score'], name=label, opacity=0.6)
    for level, label in edu_levels
]
layout = go.Layout(barmode='overlay', title='Comparison of maths score based on Parents Education')
figure = go.Figure(data=traces, layout=layout)
figure.show()

In [None]:
# Overlay reading-score histograms for four parental education levels.
# NOTE(review): "bachelor's degree" shows up in the data preview but is not
# plotted here — confirm the omission is intentional.
edu_col = 'parental level of education'
edu_levels = (
    ('some college', 'Some college'),
    ("associate's degree", 'associate degree'),
    ('high school', 'high school'),
    ("master's degree", 'master degree'),
)
traces = [
    go.Histogram(x=df.loc[df[edu_col] == level, 'reading score'], name=label, opacity=0.6)
    for level, label in edu_levels
]
layout = go.Layout(barmode='overlay', title='Comparison of reading score based on Parents Education')
figure = go.Figure(data=traces, layout=layout)
figure.show()

In [None]:
# Overlay writing-score histograms for four parental education levels.
# NOTE(review): "bachelor's degree" shows up in the data preview but is not
# plotted here — confirm the omission is intentional.
edu_col = 'parental level of education'
edu_levels = (
    ('some college', 'Some college'),
    ("associate's degree", 'associate degree'),
    ('high school', 'high school'),
    ("master's degree", 'master degree'),
)
traces = [
    go.Histogram(x=df.loc[df[edu_col] == level, 'writing score'], name=label, opacity=0.6)
    for level, label in edu_levels
]
layout = go.Layout(barmode='overlay', title='Comparison of writing  score based on Parents Education')
figure = go.Figure(data=traces, layout=layout)
figure.show()

In [None]:
# Female students in group B whose math score is above 10.
is_female = df['gender'] == 'female'
in_group_b = df['race/ethnicity'] == 'group B'
math_above_10 = df['math score'] > 10
temp = df.loc[is_female & in_group_b & math_above_10]
temp

In [None]:
# Male students whose writing score is above 25.
is_male = df['gender'] == 'male'
writing_above_25 = df['writing score'] > 25
tempdf = df.loc[is_male & writing_above_25]
tempdf

In [None]:
# Math-score overlay by gender (this cell repeats the earlier math-score comparison).
male_math = df.loc[df['gender'] == 'male', 'math score']
female_math = df.loc[df['gender'] == 'female', 'math score']

male_trace = go.Histogram(x=male_math, name='Male', opacity=0.6)
female_trace = go.Histogram(x=female_math, name='Female', opacity=0.6)

data = [male_trace, female_trace]
layout = go.Layout(title='Math score Comparison in Male and Female', barmode='overlay')
figure = go.Figure(data=data, layout=layout)
figure.show()

In [None]:
# Encode the test-preparation flag as 1 (completed) / 0 (none).
# Mapping with the dict directly turns every value that is not a key into NaN,
# so re-running the cell on the already-encoded column wiped it out. Falling
# back to the current value keeps the cell safe to run more than once.
m = {'completed': 1, 'none': 0}
df['test preparation course'] = df['test preparation course'].map(lambda value: m.get(value, value))
df.head(5)

#### Train and Test Split - Math Score Prediction

In [None]:
# Features: encoded test-prep flag plus reading and writing scores.
# Target: math score, kept as a one-column DataFrame (list selection).
feature_cols = ['test preparation course', 'reading score', 'writing score']
target_cols = ['math score']
X = df[feature_cols]
y = df[target_cols]

In [None]:
# Hold out 20% of the rows for testing; random_state pins the shuffle.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=51,
)

In [None]:
# Report the shape of every piece of the split.
split_parts = (
    ('X_train = ', X_train),
    ('X_test =', X_test),
    ('y_train =', y_train),
    ('y_test =', y_test),
)
for label, part in split_parts:
    print(label, part.shape)

In [None]:
# LinearRegression baseline: fit on the training split and predict the test split.
from sklearn.linear_model import LinearRegression
algo = LinearRegression()
algo.fit(X_train, y_train)
y_pred = algo.predict(X_test)
# The test-set score (as a percentage) is now the cell's last expression so the
# notebook displays it; previously the assignment after it hid the value.
algo.score(X_test, y_test) * 100

In [None]:
# Lasso (alpha=10): fit on the training split, show the test-set score as a percentage.
# Ridge is imported here as well; the Ridge cell that follows relies on it.
from sklearn.linear_model import Lasso, Ridge

algo = Lasso(alpha=10).fit(X_train, y_train)
algo.score(X_test, y_test) * 100

In [None]:
# Ridge (alpha=7): fit on the training split, show the test-set score as a percentage.
algo = Ridge(alpha=7).fit(X_train, y_train)
algo.score(X_test, y_test) * 100

In [None]:
# Fitted parameters of whichever estimator `algo` currently refers to
# (the Ridge model when the cells are run top to bottom).
for fitted_param in (algo.intercept_, algo.coef_):
    print(fitted_param)

In [None]:
# LinearRegression: refit, predict the test split and report
# MAE, MSE, RMSE and explained-variance score.
from sklearn import metrics
from sklearn.linear_model import LinearRegression

algo = LinearRegression()
algo.fit(X_train, y_train)
y_pred = algo.predict(X_test)

# MSE is computed once and reused for the RMSE line.
mse = metrics.mean_squared_error(y_test, y_pred)
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', mse)
print('RMSE:', np.sqrt(mse))
print('VarScore:', metrics.explained_variance_score(y_test, y_pred))

In [None]:
# Lasso (alpha=10): refit, predict the test split and report
# MAE, MSE, RMSE and explained-variance score.
from sklearn import metrics
from sklearn.linear_model import Lasso, Ridge

algo = Lasso(alpha=10)
algo.fit(X_train, y_train)
y_pred = algo.predict(X_test)

# MSE is computed once and reused for the RMSE line.
mse = metrics.mean_squared_error(y_test, y_pred)
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', mse)
print('RMSE:', np.sqrt(mse))
print('VarScore:', metrics.explained_variance_score(y_test, y_pred))

In [None]:
# Ridge (alpha=7): refit, predict the test split and report
# MAE, MSE, RMSE and explained-variance score.
from sklearn import metrics
from sklearn.linear_model import Lasso, Ridge

algo = Ridge(alpha=7)
algo.fit(X_train, y_train)
y_pred = algo.predict(X_test)

# MSE is computed once and reused for the RMSE line.
mse = metrics.mean_squared_error(y_test, y_pred)
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', mse)
print('RMSE:', np.sqrt(mse))
print('VarScore:', metrics.explained_variance_score(y_test, y_pred))