In [2]:
from pathlib import Path #! Filesystem paths

import pandas as pd #! Data Processing
import numpy as np #! linear algebra
import matplotlib.pyplot as plt #! Data visualization
import seaborn as sns #! Advance plots


### **Data Loading**

In [3]:
#! Data Loading: read the raw exam-score CSV; cells containing '-' are parsed as missing
raw_csv_path = "../Data/raw/student_exam_scores.csv"
load_data = pd.read_csv(raw_csv_path, na_values='-')

In [4]:
#! Work on an independent (deep) copy so the loaded frame stays untouched
df = load_data.copy(deep=True)

In [5]:
#! Preview the first five rows of the table
df.head(n=5)

Unnamed: 0,student_id,hours_studied,sleep_hours,attendance_percent,previous_scores,exam_score
0,S001,8.0,8.8,72.1,45,30.2
1,S002,1.3,8.6,60.7,55,25.0
2,S003,4.0,8.2,73.7,86,35.8
3,S004,3.5,4.8,95.1,66,34.0
4,S005,9.1,6.4,89.8,71,40.3


In [6]:
#! Show the column labels of the working frame
df.columns

Index(['student_id', 'hours_studied', 'sleep_hours', 'attendance_percent',
       'previous_scores', 'exam_score'],
      dtype='object')

In [7]:
#! Print per-column dtypes, non-null counts and the memory footprint
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   student_id          200 non-null    object 
 1   hours_studied       200 non-null    float64
 2   sleep_hours         200 non-null    float64
 3   attendance_percent  200 non-null    float64
 4   previous_scores     200 non-null    int64  
 5   exam_score          200 non-null    float64
dtypes: float64(4), int64(1), object(1)
memory usage: 9.5+ KB


In [8]:
#! Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,hours_studied,sleep_hours,attendance_percent,previous_scores,exam_score
count,200.0,200.0,200.0,200.0,200.0
mean,6.3255,6.622,74.83,66.8,33.955
std,3.227317,1.497138,14.249905,15.663869,6.789548
min,1.0,4.0,50.3,40.0,17.1
25%,3.5,5.3,62.2,54.0,29.5
50%,6.15,6.7,75.25,67.5,34.05
75%,9.0,8.025,87.425,80.0,38.75
max,12.0,9.0,100.0,95.0,51.3


In [9]:
#! Count how often each student_id occurs (all ones means the column is a pure row identifier)
student_ids = df["student_id"]
student_ids.value_counts()

student_id
S001    1
S002    1
S003    1
S004    1
S005    1
       ..
S196    1
S197    1
S198    1
S199    1
S200    1
Name: count, Length: 200, dtype: int64

In [10]:
#! Every student_id occurs exactly once (see the value counts), so it is only a row label: drop it.
#! Reassigning with errors="ignore" keeps this cell safe to re-run once the column is already gone
#! (the in-place version raised KeyError on a second execution).
df = df.drop(columns=["student_id"], errors="ignore")

In [11]:
#! Give the row index an explicit label, then preview the frame
df.index.names = ['index']
df.head(n=5)

Unnamed: 0_level_0,hours_studied,sleep_hours,attendance_percent,previous_scores,exam_score
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,8.0,8.8,72.1,45,30.2
1,1.3,8.6,60.7,55,25.0
2,4.0,8.2,73.7,86,35.8
3,3.5,4.8,95.1,66,34.0
4,9.1,6.4,89.8,71,40.3


In [12]:
#! NOTE(review): disabled export of the frame at this stage (file name says v0.0.1).
#! Presumably superseded by the later v0.0.2 export - TODO confirm, then delete or re-enable this cell.
# df.to_csv("../Data/processed/processed_v0.0.1.csv", index=False)

In [13]:
#! Compact overview of the working frame: dimensions, schema, data-quality counts and a small sample
basic_info = {
    "shape": df.shape,
    "columns": df.columns.tolist(),
    "missing_values": df.isnull().sum().to_dict(),
    #! .sum() returns a NumPy scalar; cast so this count is a plain int like the others
    "duplicates": int(df.duplicated().sum()),
    "dtypes": df.dtypes.astype(str).to_dict(),
    "sample_data": df.head(5).to_dict(),
}

basic_info

{'shape': (200, 5),
 'columns': ['hours_studied',
  'sleep_hours',
  'attendance_percent',
  'previous_scores',
  'exam_score'],
 'missing_values': {'hours_studied': 0,
  'sleep_hours': 0,
  'attendance_percent': 0,
  'previous_scores': 0,
  'exam_score': 0},
 'duplicates': np.int64(0),
 'dtypes': {'hours_studied': 'float64',
  'sleep_hours': 'float64',
  'attendance_percent': 'float64',
  'previous_scores': 'int64',
  'exam_score': 'float64'},
 'sample_data': {'hours_studied': {0: 8.0, 1: 1.3, 2: 4.0, 3: 3.5, 4: 9.1},
  'sleep_hours': {0: 8.8, 1: 8.6, 2: 8.2, 3: 4.8, 4: 6.4},
  'attendance_percent': {0: 72.1, 1: 60.7, 2: 73.7, 3: 95.1, 4: 89.8},
  'previous_scores': {0: 45, 1: 55, 2: 86, 3: 66, 4: 71},
  'exam_score': {0: 30.2, 1: 25.0, 2: 35.8, 3: 34.0, 4: 40.3}}}

In [15]:
#! Handle missing values: impute each numeric column with its own mean.
#! numeric_only=True keeps the mean computation from failing if a text column is present,
#! and reassignment replaces the inplace=True mutation.

df = df.fillna(df.mean(numeric_only=True))

In [17]:
#! Outlier detection (IQR method)
def drop_iqr_outliers(frame, multiplier=1.5):
    """Return the rows of ``frame`` whose numeric values all lie within the IQR fences.

    For every numeric column the fences are ``Q1 - multiplier * IQR`` and
    ``Q3 + multiplier * IQR``. All fences are computed from the unfiltered
    ``frame`` and applied as one combined mask, so the outcome does not depend
    on the order of the columns (filtering column by column would compute later
    quartiles on rows already trimmed by earlier columns).

    Parameters
    ----------
    frame : pandas.DataFrame
        Table to filter; non-numeric columns are ignored when building the mask.
    multiplier : float, default 1.5
        Width of the fences in units of the inter-quartile range.

    Returns
    -------
    pandas.DataFrame
        The rows of ``frame`` that are inside the fences for every numeric column.
        Rows holding NaN in a numeric column are dropped, because comparisons
        with NaN evaluate to False.
    """
    numeric = frame.select_dtypes("number")
    q1 = numeric.quantile(0.25)
    q3 = numeric.quantile(0.75)
    iqr = q3 - q1

    lower_fence = q1 - multiplier * iqr
    upper_fence = q3 + multiplier * iqr

    # Compare every column against its own fence, then require all columns to pass.
    inside = numeric.ge(lower_fence, axis="columns") & numeric.le(upper_fence, axis="columns")
    return frame[inside.all(axis="columns")]


df = drop_iqr_outliers(df)

In [19]:
#! Preview the first five rows after cleaning
df.head(n=5)

Unnamed: 0_level_0,hours_studied,sleep_hours,attendance_percent,previous_scores,exam_score
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,8.0,8.8,72.1,45,30.2
1,1.3,8.6,60.7,55,25.0
2,4.0,8.2,73.7,86,35.8
3,3.5,4.8,95.1,66,34.0
4,9.1,6.4,89.8,71,40.3


In [18]:
#! Persist the cleaned frame (without the index column).
#! The output folder is created first so the export does not fail when it is missing.
processed_path = Path("../Data/processed/processed_v0.0.2.csv")
processed_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(processed_path, index=False)