# Random Forest Regression 

In [9]:
# Import the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import pearsonr, pointbiserialr
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import LabelEncoder

In [10]:
#import the data set
# Import the dataset
math = pd.read_csv("./student-mat.csv", sep=';', header=0)
por = pd.read_csv("./student-por.csv", sep=';', header=0)

In [11]:
#feature selection 

from scipy.stats import pearsonr, pointbiserialr
from sklearn.preprocessing import LabelEncoder

# Specify the target column
target_column = 'G3'  # Target column name

# Separate numerical and categorical columns
numerical_features = math.select_dtypes(include=['number']).columns
categorical_features = math.select_dtypes(include=['object', 'category']).columns

# Dictionary to store correlation results
correlation_results = {}

# Handle numerical columns
for col in numerical_features:
    if col != target_column:
        correlation, p_value = pearsonr(math[col], math[target_column])
        correlation_results[col] = {'correlation': abs(correlation), 'p_value': p_value}

# Handle categorical columns
for col in categorical_features:
    if col != target_column:
        # Encode categorical values
        encoded_col = LabelEncoder().fit_transform(math[col])
        correlation, p_value = pointbiserialr(encoded_col, math[target_column])
        correlation_results[col] = {'correlation': abs(correlation), 'p_value': p_value}

# Sort features by correlation
sorted_features = sorted(correlation_results.items(), key=lambda x: x[1]['correlation'], reverse=True)

# Display top features with p-values
print("Feature Correlations and P-values with Target Variable:")
for feature, stats in sorted_features:
    print(f"{feature}: Correlation = {stats['correlation']:.2f}, P-value = {stats['p_value']:.3e}")


Feature Correlations and P-values with Target Variable:
G2: Correlation = 0.90, P-value = 7.626e-148
G1: Correlation = 0.80, P-value = 9.001e-90
failures: Correlation = 0.36, P-value = 1.466e-13
Medu: Correlation = 0.22, P-value = 1.336e-05
higher: Correlation = 0.18, P-value = 2.668e-04
age: Correlation = 0.16, P-value = 1.271e-03
Fedu: Correlation = 0.15, P-value = 2.380e-03
goout: Correlation = 0.13, P-value = 8.229e-03
romantic: Correlation = 0.13, P-value = 9.713e-03
reason: Correlation = 0.12, P-value = 1.527e-02
traveltime: Correlation = 0.12, P-value = 1.987e-02
address: Correlation = 0.11, P-value = 3.563e-02
sex: Correlation = 0.10, P-value = 3.987e-02
Mjob: Correlation = 0.10, P-value = 4.259e-02
paid: Correlation = 0.10, P-value = 4.277e-02
internet: Correlation = 0.10, P-value = 5.048e-02
studytime: Correlation = 0.10, P-value = 5.206e-02
schoolsup: Correlation = 0.08, P-value = 1.004e-01
famsize: Correlation = 0.08, P-value = 1.062e-01
guardian: Correlation = 0.07, P-valu

In [18]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import pandas as pd

# Assuming 'math' is your DataFrame and 'G3' is the target column
target_column = 'G3'

# Encode categorical columns (if any)
df_encoded = math.copy()
for col in categorical_features:
    df_encoded[col] = LabelEncoder().fit_transform(df_encoded[col])

# Separate features and target
X = df_encoded.drop(columns=[target_column])
y = df_encoded[target_column]

# Handle categorical features with encoding
X_encoded = pd.get_dummies(X, drop_first=True)  # One-hot encoding for categorical features

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Fit Random Forest Regressor
regressor = RandomForestRegressor(random_state=0, n_estimators=30)
regressor.fit(X_train, y_train)

# Get feature importances
importances = regressor.feature_importances_

# Map feature names to their importance
feature_importance = pd.DataFrame({
    'Feature': X_encoded.columns,
    'Importance': importances
})

# Sort features by importance
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)

# Display top features
print("Feature Importance from Random Forest:")
print(feature_importance)



Feature Importance from Random Forest:
       Feature  Importance
31          G2    0.782230
29    absences    0.111876
2          age    0.020800
23      famrel    0.008860
10      reason    0.008789
30          G1    0.007537
25       goout    0.006726
15   schoolsup    0.005189
28      health    0.004268
22    romantic    0.004108
27        Walc    0.003926
8         Mjob    0.003904
14    failures    0.003863
9         Fjob    0.003418
24    freetime    0.003360
7         Fedu    0.003133
18  activities    0.002992
11    guardian    0.002925
13   studytime    0.002257
6         Medu    0.001780
12  traveltime    0.001523
4      famsize    0.000889
1          sex    0.000810
3      address    0.000800
17        paid    0.000797
26        Dalc    0.000746
20      higher    0.000722
21    internet    0.000420
0       school    0.000405
16      famsup    0.000397
19     nursery    0.000345
5      Pstatus    0.000205


In [20]:
# Predict using the test data
y_pred = regressor.predict(X_test)

In [23]:
# Display predictions vs actual values
np.set_printoptions(precision=2)  # Display only 2 decimals after the column for the numerical values
comparison = np.concatenate((y_pred.reshape(len(y_pred), 1), y_test.values.reshape(len(y_test), 1)), axis=1)  # Fixed the axis argument
print(comparison)

[[ 8.5  10.  ]
 [11.9  12.  ]
 [ 6.87  5.  ]
 [ 9.57 10.  ]
 [ 9.07  9.  ]
 [12.63 13.  ]
 [18.87 18.  ]
 [ 7.03  6.  ]
 [ 6.53  0.  ]
 [13.4  14.  ]
 [15.7  15.  ]
 [ 6.63  7.  ]
 [14.27 15.  ]
 [11.77 10.  ]
 [14.07 14.  ]
 [ 8.4   8.  ]
 [ 3.1   8.  ]
 [10.5  11.  ]
 [15.2  15.  ]
 [ 5.1   0.  ]
 [13.97 14.  ]
 [15.73 16.  ]
 [15.27 16.  ]
 [ 6.53  6.  ]
 [ 8.63  0.  ]
 [18.37 19.  ]
 [10.4  11.  ]
 [ 8.2  12.  ]
 [17.97 17.  ]
 [10.63 10.  ]
 [ 8.3   8.  ]
 [ 8.33 10.  ]
 [15.5  15.  ]
 [13.1  13.  ]
 [ 2.23  8.  ]
 [ 6.73  5.  ]
 [ 0.    0.  ]
 [15.03 15.  ]
 [11.77 14.  ]
 [ 7.97  8.  ]
 [ 6.2   5.  ]
 [ 9.97 11.  ]
 [13.97 14.  ]
 [ 9.    9.  ]
 [15.   15.  ]
 [ 9.17 10.  ]
 [11.73 11.  ]
 [14.2  13.  ]
 [12.93 13.  ]
 [15.67 16.  ]
 [13.1  13.  ]
 [15.5  15.  ]
 [10.47 12.  ]
 [ 8.9  10.  ]
 [ 5.97  6.  ]
 [13.13 12.  ]
 [10.5  11.  ]
 [ 3.    0.  ]
 [15.4  16.  ]
 [16.17 17.  ]
 [13.67 14.  ]
 [ 9.1  10.  ]
 [ 8.6  10.  ]
 [ 6.93  6.  ]
 [ 7.53  9.  ]
 [18.   17.  ]
 [ 8.97  8

In [24]:
# Evaluating the model performance

from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.8253704282534027

In [28]:
# Encode categorical columns (if any)
df_encoded = math.copy()

numerical_features = math.select_dtypes(include=['number']).columns
categorical_features = math.select_dtypes(include=['object', 'category']).columns

for col in categorical_features:
    df_encoded[col] = LabelEncoder().fit_transform(df_encoded[col])

# Define the target column and features
X = df_encoded[['G3', 'Medu', 'failures', 'Dalc', 'Walc', 'absences', 'G1', 'G2', 'sex', 'Mjob', 'schoolsup', 'famsup', 'activities', 'nursery', 'higher']]
y = df_encoded[target_column]

# Handle categorical features with encoding
X_encoded = pd.get_dummies(X, drop_first=True)  # One-hot encoding for categorical features

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Fit Random Forest Regressor
regressor = RandomForestRegressor(random_state=0, n_estimators=30)
regressor.fit(X_train, y_train)

# Get feature importances
importances = regressor.feature_importances_

# Map feature names to their importance
feature_importance = pd.DataFrame({
    'Feature': X_encoded.columns,
    'Importance': importances
})

# Sort features by importance
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)

# Display top features
print("Feature Importance from Random Forest:")
print(feature_importance)


Feature Importance from Random Forest:
       Feature  Importance
0           G3    0.993123
5     absences    0.005914
7           G2    0.000674
2     failures    0.000094
4         Walc    0.000053
1         Medu    0.000045
9         Mjob    0.000037
12  activities    0.000027
11      famsup    0.000015
6           G1    0.000009
8          sex    0.000009
3         Dalc    0.000000
10   schoolsup    0.000000
13     nursery    0.000000
14      higher    0.000000


In [30]:
# Predict using the test data
y_pred = regressor.predict(X_test)

In [31]:
# Evaluating the model performance

from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.9997901103366361