# Project Title: Student Performance Prediction


## 1. Problem Definition:
- Task: Predict whether a student will pass or fail a course based on various features.
- Dataset: Hypothetical dataset with features such as attendance, study hours, previous grades, etc.

In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

## 2. Project Flow:
- Data Preprocessing:
- Load the dataset.
- Explore the dataset to understand its structure and features.
- Handle any missing values if necessary.
- Encode categorical variables if needed.
- Create a binary target variable indicating pass or fail.
- Split the dataset into training and testing sets.

In [25]:
# Load the dataset
file_path = 'xAPI-Edu-Data.csv'
df = pd.read_csv(file_path)

In [26]:
# Explore the dataset
print(df.head())
print(df.info())

  gender NationalITy  ... StudentAbsenceDays Class
0      M          KW  ...            Under-7     M
1      M          KW  ...            Under-7     M
2      M          KW  ...            Above-7     L
3      M          KW  ...            Above-7     L
4      M          KW  ...            Above-7     M

[5 rows x 17 columns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 480 entries, 0 to 479
Data columns (total 17 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   gender                    480 non-null    object
 1   NationalITy               480 non-null    object
 2   PlaceofBirth              480 non-null    object
 3   StageID                   480 non-null    object
 4   GradeID                   480 non-null    object
 5   SectionID                 480 non-null    object
 6   Topic                     480 non-null    object
 7   Semester                  480 non-null    object
 8   Relation             

In [27]:
# Handle missing values
# Check for missing values
missing_values = df.isnull().sum()
print(missing_values)

gender                      0
NationalITy                 0
PlaceofBirth                0
StageID                     0
GradeID                     0
SectionID                   0
Topic                       0
Semester                    0
Relation                    0
raisedhands                 0
VisITedResources            0
AnnouncementsView           0
Discussion                  0
ParentAnsweringSurvey       0
ParentschoolSatisfaction    0
StudentAbsenceDays          0
Class                       0
dtype: int64


In [28]:
# Encode categorical variables
# Assuming 'gender' and 'Class' are categorical variables
encoded_df = pd.get_dummies(df, columns=['gender', 'Class'], drop_first=True)
print(encoded_df.head())
# Convert 'GradeID' to numerical values
encoded_df['GradeID'] = encoded_df['GradeID'].str.extract('(\d+)').astype(float)

  NationalITy PlaceofBirth     StageID  ... gender_M Class_L Class_M
0          KW       KuwaIT  lowerlevel  ...        1       0       1
1          KW       KuwaIT  lowerlevel  ...        1       0       1
2          KW       KuwaIT  lowerlevel  ...        1       1       0
3          KW       KuwaIT  lowerlevel  ...        1       1       0
4          KW       KuwaIT  lowerlevel  ...        1       0       1

[5 rows x 18 columns]


In [29]:
# Create a binary target variable
# Assuming 'pass' is the target variable based on grades
encoded_df['target'] = (encoded_df['GradeID'] >= 10).astype(int)
print(encoded_df.head())

  NationalITy PlaceofBirth     StageID  ...  Class_L Class_M target
0          KW       KuwaIT  lowerlevel  ...        0       1      0
1          KW       KuwaIT  lowerlevel  ...        0       1      0
2          KW       KuwaIT  lowerlevel  ...        1       0      0
3          KW       KuwaIT  lowerlevel  ...        1       0      0
4          KW       KuwaIT  lowerlevel  ...        0       1      0

[5 rows x 19 columns]


In [30]:
# Create a binary target variable
# Assuming 'pass' is the target variable based on grades
encoded_df['target'] = (encoded_df['GradeID'] >= 10).astype(int)
print(encoded_df.head())

  NationalITy PlaceofBirth     StageID  ...  Class_L Class_M target
0          KW       KuwaIT  lowerlevel  ...        0       1      0
1          KW       KuwaIT  lowerlevel  ...        0       1      0
2          KW       KuwaIT  lowerlevel  ...        1       0      0
3          KW       KuwaIT  lowerlevel  ...        1       0      0
4          KW       KuwaIT  lowerlevel  ...        0       1      0

[5 rows x 19 columns]


In [31]:
# Split the dataset into training and testing sets
X = encoded_df.drop('target', axis=1)
y = encoded_df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Model Building

- Choose a binary classification algorithm (e.g., Logistic Regression, Random Forest, or Gradient Boosting).
- Create a pipeline for preprocessing (if needed) and model training.
- Define hyperparameters for the model.

In [32]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [33]:
# Define preprocessing for categorical and numerical features
numeric_features = ['raisedhands', 'VisITedResources', 'AnnouncementsView', 'Discussion']
categorical_features = ['gender', 'NationalITy', 'PlaceofBirth', 'StageID', 'GradeID', 'SectionID', 'Topic', 'Semester', 'Relation', 'ParentAnsweringSurvey', 'ParentschoolSatisfaction', 'StudentAbsenceDays']

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])



In [34]:
# Define the model
model = RandomForestClassifier()

# Create a pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', model)])

# Define hyperparameters for the model
param_grid = {
    'model__n_estimators': [100, 200, 300],
    'model__max_depth': [None, 5, 10, 15],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4],
    'model__bootstrap': [True, False]
}

# Use GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(pipeline, param_grid, cv=5)
grid_search

# Model Training:

- Fit the model to the training data.
- Monitor training performance.
### - Fine-tune hyperparameters if necessary.

### Model Evaluation:

- Evaluate the trained model on the testing set.
- Calculate metrics like accuracy, precision, recall, and F1-score.
- Generate a confusion matrix for a detailed understanding of performance.
- Visualize the results (e.g., using matplotlib or seaborn).


### Predictive Analysis:

- Use the trained model to predict whether new students are likely to pass or fail.
- Analyze the importance of different features in making predictions.
- Provide insights into factors that contribute to student success.