# In [None]:
# Errol Ian Ave Acosta
# Data Science | Google Colab
# Grok 3 Free | Data Science
# February 16, 2025

""" Important Notes:

Replace 'path/to/your/dataset.csv' with the actual path to your dataset.
Adjust numeric_features and categorical_features according to your dataset's features.

The SimpleImputer strategies ('mean', 'median', 'constant') should be chosen based on the nature of your data.
You might need to tune hyperparameters of the models for better performance,
which can be done using techniques like GridSearchCV.

This template provides a structured approach to data science projects, but remember,
the specifics can vary greatly depending on your dataset and the problem you're solving.
"""

# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix

# Load the dataset
# Notes: Ensure the file path is correct, and the dataset is suitable for your analysis.
df = pd.read_csv('path/to/your/dataset.csv')

# Initial exploration
# Notes: Check for basic statistics, data types, missing values, etc.
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()
print(df.describe())        # summary statistics of the numeric columns
print(df.isnull().sum())    # number of missing values per column

# Data Cleaning
# Notes: Deal with missing values, outliers, and wrong data types here.
# Example: fill the gaps in one column with that column's mean.
# NOTE(review): this step runs on the full DataFrame, before the train/test
# split further down, so the mean is computed from every row -- confirm that
# is acceptable for your evaluation setup.
imputer = SimpleImputer(strategy='mean')
imputer.fit(df[['column_with_missing_values']])
df['column_with_missing_values'] = imputer.transform(df[['column_with_missing_values']])

# Data Visualization
# Notes: Visualize data distribution, correlations, etc.
# Correlation heatmap
# Correlation is only computed on numeric columns: selecting them explicitly
# keeps text/categorical columns from making DataFrame.corr() raise on
# pandas >= 2.0, and works on older pandas versions as well.
plt.figure(figsize=(10, 8))
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap of Features')
plt.show()

# Distribution of a specific feature
# Histogram with a kernel density estimate overlaid (kde=True).
sns.histplot(df['feature_name'], kde=True)
plt.title('Distribution of Feature Name')
plt.show()

# Feature Engineering
# Notes: Derive extra columns that may carry predictive signal.
# Example: element-wise product of two existing columns.
df['new_feature'] = df['feature1'].mul(df['feature2'])

# Pre-processing
# Notes: Separate the target from the predictors before any modelling.
y = df['target_column']
X = df.drop(columns='target_column')

# Split the dataset
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Define preprocessing steps for numerical and categorical data
# 'new_feature' is the column engineered earlier in this script. ColumnTransformer
# drops every column that is not listed in one of its transformers
# (remainder='drop' is the default), so it must be named here to reach the models.
numeric_features = ['num_feature1', 'num_feature2', 'new_feature']
categorical_features = ['cat_feature1', 'cat_feature2']

# Numeric columns: fill missing values with the median, then standardize.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Categorical columns: replace missing values with the literal 'missing', then
# one-hot encode. handle_unknown='ignore' keeps transform() from raising on
# categories that were not present when the encoder was fitted.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Route each group of columns through its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Model Pipelines
# Notes: Define pipelines for different machine learning models with preprocessing included.
# Each pipeline gets its own unfitted copy of the preprocessor via clone().
# Pipeline.fit() fits its steps in place, so sharing a single ColumnTransformer
# instance would let every fit() call overwrite the state the other pipelines
# rely on.
rf_pipeline = Pipeline(steps=[('preprocessor', clone(preprocessor)),
                              # Fixed seed so the forest is reproducible, like the seeded split.
                              ('classifier', RandomForestClassifier(random_state=42))])

lr_pipeline = Pipeline(steps=[('preprocessor', clone(preprocessor)),
                              ('classifier', LogisticRegression())])

svm_pipeline = Pipeline(steps=[('preprocessor', clone(preprocessor)),
                               ('classifier', SVC())])

# Fit models
# Notes: Train every pipeline on the same training split
# (random forest, then logistic regression, then SVM).
for model_pipeline in (rf_pipeline, lr_pipeline, svm_pipeline):
    model_pipeline.fit(X_train, y_train)

# Predictions
# Notes: Run each fitted pipeline on the held-out test rows.
y_pred_rf, y_pred_lr, y_pred_svm = (
    fitted.predict(X_test)
    for fitted in (rf_pipeline, lr_pipeline, svm_pipeline)
)

# Model Evaluation
# Notes: Print a per-class classification report for each model, with the
# model name on the line above its report.
print("Random Forest:", classification_report(y_test, y_pred_rf), sep="\n")
print("Logistic Regression:", classification_report(y_test, y_pred_lr), sep="\n")
print("Support Vector Machine:", classification_report(y_test, y_pred_svm), sep="\n")

# Confusion Matrix Visualization
# Notes: Visualize how well the model performs in terms of true positives, false positives, etc.
for name, y_pred in [('Random Forest', y_pred_rf), ('Logistic Regression', y_pred_lr), ('SVM', y_pred_svm)]:
    # Sorted union of the true and predicted labels. Passing it explicitly
    # fixes the row/column order of the matrix and lets the heatmap ticks show
    # the real class labels instead of positional indices 0..k-1.
    class_labels = np.union1d(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred, labels=class_labels)
    plt.figure(figsize=(8,6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_labels, yticklabels=class_labels)
    plt.title(f'Confusion Matrix for {name}')
    # Rows of the matrix are true labels, columns are predicted labels.
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()

# Feature Importance (for Random Forest only)
# Notes: Rank the transformed features by the importance the fitted forest assigns them.
feature_names = rf_pipeline.named_steps['preprocessor'].get_feature_names_out()
feature_importance = rf_pipeline.named_steps['classifier'].feature_importances_

# Ascending order, so the most important feature ends up as the top bar.
sorted_idx = feature_importance.argsort()
# Bar positions along the y axis: 0.5, 1.5, 2.5, ...
pos = np.arange(len(sorted_idx)) + 0.5

fig, ax = plt.subplots(figsize=(12, 6))
ax.barh(pos, feature_importance[sorted_idx], align='center')
ax.set_yticks(pos)
ax.set_yticklabels(feature_names[sorted_idx])
ax.set_xlabel('Feature Importance')
ax.set_title('Feature Importance (MDI)')
plt.tight_layout()
plt.show()