## EDA (Exploratory Data Analysis)
### Kaggle: Students Performance in Exams
#### To understand the influence of the parents' background, test preparation, etc. on students' performance

#### Example Research Questions
> How effective is the test preparation course?  
> Which major factors contribute to test outcomes?  
> What would be the best way to improve student scores on each test?  
> What patterns and interactions in the data can you find? Let me know in the comments section below.

In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#### Load dataset (*.csv file)

In [None]:
# Read the dataset into `df`; the relative path is resolved against the
# kernel's current working directory.
df = pd.read_csv('StudentsPerformance.csv')

#### Check dataset size

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Preview the first few rows to see the column layout and value formats.
df.head()

In [None]:
# Column labels; several contain spaces or '/', which is why the cells below
# use bracket indexing (df['math score']) rather than attribute access.
df.columns

In [None]:
# List the distinct values (sorted by np.unique) of each categorical column.
categorical_columns = [
    'gender',
    'race/ethnicity',
    'parental level of education',
    'lunch',
    'test preparation course',
]
for column in categorical_columns:
    print(f'{column}: {np.unique(df[column])}')

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the columns
# pandas selects by default.
df.describe()

In [None]:
# Pairwise correlation between the numeric (score) columns.
# The frame also holds string-valued categorical columns (gender, lunch, ...).
# pandas >= 2.0 no longer drops those silently and raises on them, so restrict
# the frame to numeric columns before calling corr().
df.select_dtypes(include='number').corr()

In [None]:
# Scatter-plot matrix of the three exam scores against each other.
sns.pairplot(df[['math score', 'reading score', 'writing score']])
# Optional: uncomment the next line to also write the figure to disk.
# plt.savefig('student_pairplot.png')

In [None]:
# Cross-tabulate parental education against lunch type and add the share of
# free/reduced lunches per education level.
education_order = [
    'some high school',
    'high school',
    'some college',
    "associate's degree",
    "bachelor's degree",
    "master's degree",
]
counts = pd.crosstab(df['parental level of education'], df['lunch'])
row_totals = counts['free/reduced'] + counts['standard']
# Despite its name, 'free%' holds a fraction in [0, 1], not a percentage.
counts['free%'] = counts['free/reduced'] / row_totals
# Present the rows from lowest to highest education level.
counts = counts.reindex(education_order)
counts

In [None]:
# Mean of each exam score for male and female students, computed "by hand"
# from two boolean-mask subsets (rows: gender, columns: score).
df_male = df.loc[df['gender'] == 'male']
df_female = df.loc[df['gender'] == 'female']

score_columns = ['math score', 'reading score', 'writing score']
df1 = pd.DataFrame(
    {
        column: [df_male[column].mean(), df_female[column].mean()]
        for column in score_columns
    },
    index=['male', 'female'],
)
df1

In [None]:
# Group the rows by gender; `grouped1` is reused by the cells that follow.
grouped1 = df.groupby(by='gender')
# Number of rows in each gender group.
grouped1.size()

In [None]:
# Mean of every numeric column per gender.
# numeric_only=True is required on pandas >= 2.0, where the string-valued
# columns (lunch, race/ethnicity, ...) are no longer dropped automatically and
# would make mean() raise.
grouped1.mean(numeric_only=True)

In [None]:
# Per-gender summary statistics, one block of describe() columns per score.
grouped1.describe()

In [None]:
# Stack the per-gender 25%/50%/75% quantiles of the three scores on top of
# each other (two rows per quantile, indexed by gender only).
# The score columns are selected explicitly: calling quantile() on the whole
# grouped frame also hits the string-valued columns, which pandas >= 2.0
# rejects instead of silently dropping.
score_cols = ['math score', 'reading score', 'writing score']
df1 = pd.concat([
    grouped1[score_cols].quantile(q)
    for q in (0.25, 0.5, 0.75)
])
df1

In [None]:
# Same stacked quantile table, now with a two-level row index
# (quantile label, gender).
# - The score columns are selected explicitly because quantile() on the
#   string-valued columns raises on pandas >= 2.0.
# - `keys=` lets pd.concat build the outer index level itself, so the labels
#   no longer have to be spelled out once per gender by hand.
score_cols = ['math score', 'reading score', 'writing score']
quantile_levels = [('25%', 0.25), ('50%', 0.5), ('75%', 0.75)]
df1 = pd.concat(
    [grouped1[score_cols].quantile(q) for _, q in quantile_levels],
    keys=[label for label, _ in quantile_levels],
)
df1

In [None]:
# Flat version of the quantile table: `gender` and `quantile` are ordinary
# columns and the row index is a plain 0..n-1 range.
# - The score columns are selected explicitly because quantile() on the
#   string-valued columns raises on pandas >= 2.0.
# - Each piece is tagged with its own quantile label before concatenation, so
#   the labels stay aligned regardless of how many gender groups exist.
score_cols = ['math score', 'reading score', 'writing score']
quantile_levels = [('25%', 0.25), ('50%', 0.5), ('75%', 0.75)]
df1 = pd.concat(
    [
        grouped1[score_cols].quantile(q).reset_index().assign(quantile=label)
        for label, q in quantile_levels
    ],
    ignore_index=True,
)
df1

In [None]:
# Move `gender` and `quantile` from columns into a two-level row index and
# sort on the outer (gender) level so each gender's rows sit together.
# Note: this rebinds `df1`, so the cell only works once per freshly built df1;
# a second run fails because the two columns are no longer columns.
df1 = df1.set_index(['gender', 'quantile']).sort_index(level=0)
df1

In [None]:
# Swap the two index levels (positions 0 and 1). Row order is left as it was;
# only the level order changes. Re-running this cell swaps them back.
df1 = df1.swaplevel(0, 1)
df1

In [None]:
# Sort the rows by the (swapped) index so equal outer-level labels are adjacent.
# `axis` must be passed by keyword: sort_index() takes keyword-only arguments
# in pandas >= 2.0, so the positional form `sort_index(0)` raises a TypeError.
df1 = df1.sort_index(axis=0)
df1

In [None]:
# Group the rows by race/ethnicity; `grouped2` is reused further down.
grouped2 = df.groupby(['race/ethnicity'])
# Number of rows in each race/ethnicity group.
grouped2.size()

In [None]:
# One math-score histogram per race/ethnicity group, drawn side by side.
# The group labels are read from the data (sorted) instead of being hard-coded,
# so a group present in the file can never be silently left out.
groups = sorted(df['race/ethnicity'].dropna().unique())
# squeeze=False keeps `axes` array-shaped even if there is only one group.
fig, axes = plt.subplots(1, len(groups), figsize=(15, 4), squeeze=False)
axes = axes.ravel()
for ax, group in zip(axes, groups):
    # Single .loc lookup instead of chained df[mask][col] indexing.
    scores = df.loc[df['race/ethnicity'] == group, 'math score']
    ax.hist(scores, bins=10, edgecolor='k')
    # Panel title: group label followed by the number of rows in that group.
    ax.set_title(f'{group}: {len(scores)}')
    ax.set_xlabel('math score')
# Only the left-most panel carries the y-axis label.
axes[0].set_ylabel('# of Students')
plt.tight_layout()

In [None]:
# Math-score quartiles within each race/ethnicity group
# (rows: groups, columns: quartile labels).
math_by_group = grouped2['math score']
df2 = pd.DataFrame({
    label: math_by_group.quantile(q)
    for label, q in (('25%', 0.25), ('50%', 0.5), ('75%', 0.75))
})
df2

In [None]:
# Transposed view of the same table: quartiles as rows, groups as columns.
df2.T

In [None]:
# Line chart of the math-score quartiles across race/ethnicity groups.
# The chart is configured through the Axes returned by pandas, and gets a
# y-axis label and title so the figure can be read on its own.
ax = df2.plot.line(marker='D')
# One tick per group, labelled with the group name.
ax.set_xticks(np.arange(len(df2.index)))
ax.set_xticklabels(df2.index)
ax.set_ylabel('math score')
# Trailing ';' keeps the return value from being echoed as cell output.
ax.set_title('Math score quartiles by race/ethnicity');

In [None]:
# Math-score quartiles for each parental education level, with the rows
# arranged from lowest to highest level.
education_order = [
    'some high school',
    'high school',
    'some college',
    "associate's degree",
    "bachelor's degree",
    "master's degree",
]
grouped3 = df.groupby('parental level of education')
math_by_education = grouped3['math score']
df3 = pd.DataFrame({
    label: math_by_education.quantile(q)
    for label, q in (('25%', 0.25), ('50%', 0.5), ('75%', 0.75))
}).reindex(education_order)
df3

In [None]:
# Line chart of the math-score quartiles across parental education levels.
# The chart is configured through the Axes returned by pandas, and gets a
# y-axis label and title so the figure can be read on its own.
ax = df3.plot.line(marker='D')
# One tick per education level; labels are rotated because they are long.
ax.set_xticks(np.arange(len(df3.index)))
ax.set_xticklabels(df3.index, rotation=90)
ax.set_ylabel('math score')
# Trailing ';' keeps the return value from being echoed as cell output.
ax.set_title('Math score quartiles by parental level of education');

In [None]:
# Strip plot (catplot's default kind) of math score per parental education level.
# The levels are ordinal, so the x-axis order is fixed from lowest to highest
# instead of following the order in which values happen to appear in the data.
education_order = [
    'some high school',
    'high school',
    'some college',
    "associate's degree",
    "bachelor's degree",
    "master's degree",
]
g = sns.catplot(x='parental level of education', y='math score', data=df,
                order=education_order)
# Rotate the long category labels; ';' suppresses the echoed return value.
g.set_xticklabels(rotation=90);

In [None]:
# Box plot of math score per parental education level.
# The levels are ordinal, so the x-axis order is fixed from lowest to highest
# instead of following the order in which values happen to appear in the data.
education_order = [
    'some high school',
    'high school',
    'some college',
    "associate's degree",
    "bachelor's degree",
    "master's degree",
]
g = sns.catplot(x='parental level of education', y='math score', data=df,
                kind='box', width=0.4, order=education_order)
# Rotate the long category labels; ';' suppresses the echoed return value.
g.set_xticklabels(rotation=90);

In [None]:
# Box plot of math score per parental education level, one panel per gender.
# The levels are ordinal, so the x-axis order is fixed from lowest to highest
# instead of following the order in which values happen to appear in the data.
education_order = [
    'some high school',
    'high school',
    'some college',
    "associate's degree",
    "bachelor's degree",
    "master's degree",
]
g = sns.catplot(x='parental level of education', y='math score', col='gender',
                data=df, kind='box', width=0.4, order=education_order)
# Rotate the long category labels; ';' suppresses the echoed return value.
g.set_xticklabels(rotation=90);

In [None]:
# Box plot of math score per parental education level, one panel per gender,
# with boxes split by lunch type.
# The levels are ordinal, so the x-axis order is fixed from lowest to highest
# instead of following the order in which values happen to appear in the data.
education_order = [
    'some high school',
    'high school',
    'some college',
    "associate's degree",
    "bachelor's degree",
    "master's degree",
]
g = sns.catplot(x='parental level of education', y='math score',
                col='gender', hue='lunch',
                data=df, kind='box', width=0.4, order=education_order)
# Rotate the long category labels; ';' suppresses the echoed return value.
g.set_xticklabels(rotation=90);

In [None]:
# Median of each exam score per lunch type, with shortened column names
# (rows: lunch type, columns: math / reading / writing).
grouped4 = df.groupby('lunch')
short_names = {
    'math score': 'math',
    'reading score': 'reading',
    'writing score': 'writing',
}
df4 = grouped4[list(short_names)].median().rename(columns=short_names)
df4

In [None]:
# Grouped bar chart of df4 (one bar cluster per lunch type).
# rot=0 keeps the x tick labels horizontal.
ax = df4.plot.bar(rot=0)
# df4 holds per-group medians, so say so on the axis.
ax.set_ylabel('median score')
# Place the legend just outside the right edge of the axes;
# ';' suppresses the echoed Legend repr.
ax.legend(loc=[1.05, 0.5]);

In [None]:
# Three side-by-side box plots, one per exam, comparing scores by
# test-preparation status.
# sns.boxplot (axes-level) is used instead of sns.catplot: catplot is a
# figure-level function that creates its own figure and, in current seaborn,
# ignores `ax=`. The previous catplot + plt.close() combination therefore left
# the prepared subplots empty.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
courses = ['math score', 'reading score', 'writing score']
for course, ax in zip(courses, axes):
    sns.boxplot(x='test preparation course', y=course, data=df, width=0.3, ax=ax)
plt.tight_layout()