Implementing Random Forest for Classification Tasks

In [2]:
# Import required libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import warnings
warnings.filterwarnings('ignore')  # Suppress unnecessary warning messages

# LOAD AND PREPARE THE DATA
# Load the Titanic dataset from a CSV file
titanic_data = pd.read_csv('titanic.csv')

# Drop any rows where the target variable 'Survived' is missing
titanic_data = titanic_data.dropna(subset=['Survived'])

# Select the features (inputs) we want to use for prediction
# 'Pclass' = passenger class (1st, 2nd, 3rd)
# 'Sex' = gender
# 'Age' = passenger age
# 'SibSp' = number of siblings/spouses aboard
# 'Parch' = number of parents/children aboard
# 'Fare' = ticket price
#
# .copy() makes X an independent DataFrame rather than a slice of
# titanic_data, so the cleaning steps below modify X itself and never
# trigger pandas' "setting on a copy" warnings.
X = titanic_data[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']].copy()

# Target variable (output) — whether the passenger survived or not
y = titanic_data['Survived']

# DATA CLEANING AND ENCODING
# Convert categorical data (Sex) into numeric form:
# female = 0, male = 1
# Plain column assignment replaces the text column with the mapped codes.
X['Sex'] = X['Sex'].map({'female': 0, 'male': 1})

# Fill missing Age values with the median age.
# The result is assigned back to the column: calling
# fillna(..., inplace=True) on the temporary Series returned by
# X.loc[:, 'Age'] is not guaranteed to update X (with pandas Copy-on-Write
# it never does), which would leave NaN ages in the training data.
X['Age'] = X['Age'].fillna(X['Age'].median())

# SPLIT DATA INTO TRAINING AND TESTING SETS
# Hold out 20% of the rows for evaluation; the remaining 80% train the model.
# random_state is fixed so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# CREATE AND TRAIN THE RANDOM FOREST MODEL
# Build a forest of 100 decision trees and fit it on the training rows.
# fit() returns the estimator itself, so construction and training can be
# written as one expression.
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# MAKE PREDICTIONS AND EVALUATE PERFORMANCE
# Predicted survival labels for the held-out passengers
y_pred = rf_classifier.predict(X_test)

# Per-class precision, recall and F1-score, as a formatted text table
classification_rep = classification_report(y_test, y_pred)

# Fraction of test passengers whose label was predicted correctly
accuracy = accuracy_score(y_test, y_pred)

# Show the headline accuracy first, then the detailed report
print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:\n", classification_rep)

# TEST THE MODEL ON A SINGLE SAMPLE
# Take the first passenger from the test set. Slicing with 0:1 keeps the
# result a one-row DataFrame, so the model receives the same columns it was
# trained on.
sample = X_test.iloc[0:1]

# Predict whether this specific passenger survived
prediction = rf_classifier.predict(sample)

# Convert the sample's features into a dictionary for easy reading.
# orient='records' builds the dictionary column by column, so each value
# keeps its own column's type. Going through sample.iloc[0] would first
# collapse the row into a single Series and coerce every value to one
# common dtype (e.g. integer class codes shown as 3.0).
sample_dict = sample.to_dict(orient='records')[0]

# Print the passenger's data and predicted result
print(f"\nSample Passenger: {sample_dict}")
print(f"Predicted Survival: {'Survived' if prediction[0] == 1 else 'Did Not Survive'}")


Accuracy: 0.80

Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.85      0.83       105
           1       0.77      0.73      0.75        74

    accuracy                           0.80       179
   macro avg       0.79      0.79      0.79       179
weighted avg       0.80      0.80      0.80       179


Sample Passenger: {'Pclass': 3, 'Sex': 1, 'Age': 28.0, 'SibSp': 1, 'Parch': 1, 'Fare': 15.2458}
Predicted Survival: Did Not Survive


Implementing Random Forest for Regression Tasks

In [3]:
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# 1. Load and prepare the dataset
# Fetch the California housing dataset bundled with scikit-learn
california_housing = fetch_california_housing()

# Build a DataFrame with one column per entry in feature_names, then append
# the target as an extra column named 'MEDV'.
# NOTE: 'MEDV' is only the column name chosen in this notebook. According to
# the scikit-learn dataset description, the target is the median house value
# of a district expressed in units of $100,000.
california_data = pd.DataFrame(
    data=california_housing.data,
    columns=california_housing.feature_names,
).assign(MEDV=california_housing.target)

# 2. Split features (X) and target (y)
# y = the value to predict (the 'MEDV' column)
# X = every remaining column, used as model input
y = california_data['MEDV']
X = california_data.drop(columns=['MEDV'])

# 3. Split the data into training and testing sets
# 20% of the rows are held out for testing; random_state fixes the split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 4. Create and train the Random Forest Regressor
# A forest of 100 decision trees, fitted on the training rows.
# fit() returns the estimator itself, so both steps form one expression.
rf_regressor = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# 5. Make predictions using the trained model
# Predicted target values for every row of the test set
y_pred = rf_regressor.predict(X_test)

# 6. Evaluate model performance
# R-squared: share of the target's variance explained by the predictions
r2 = r2_score(y_test, y_pred)

# Mean Squared Error: average squared gap between actual and predicted values
mse = mean_squared_error(y_test, y_pred)

# 7. Test the model with a single data point
# Select the first row of the test set as a one-row DataFrame.
# Indexing with a list ([[0]]) keeps the column names; converting the row to
# a bare NumPy array would drop them and make scikit-learn warn that X has no
# valid feature names, because the model was fitted on a named DataFrame.
single_data = X_test.iloc[[0]]

# Predict the target value for that single sample
predicted_value = rf_regressor.predict(single_data)

# Print results for the single prediction and overall performance
print(f"Predicted Value: {predicted_value[0]:.2f}")
print(f"Actual Value: {y_test.iloc[0]:.2f}")

print(f"Mean Squared Error: {mse:.2f}")
print(f"R-squared Score: {r2:.2f}")


Predicted Value: 0.51
Actual Value: 0.48
Mean Squared Error: 0.26
R-squared Score: 0.81
