In [None]:
# ============================
# 1. Configuration and Setup
# ============================

import os
from io import StringIO

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from bs4 import BeautifulSoup
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Set random seed for reproducibility
SEED = 42
np.random.seed(SEED)

# Define paths
DATA_PATH = "data/"  # Path to local data files
OUTPUT_PATH = "output/"  # Path to save results
os.makedirs(OUTPUT_PATH, exist_ok=True)  # Create output folder if it doesn't exist

In [None]:
# ============================
# 2. Data Loading
# ============================
# Exactly one of the three options below should be active. Each of them binds
# the loaded table to `data`, which every later section reads.

# Option 1: Load data from a local file
file_path = os.path.join(DATA_PATH, "sample.csv")  # Replace with your file name
data = pd.read_csv(file_path)  # Adjust to read_excel, read_json, etc., if necessary
print("Data loaded from local file.")

# Option 2: uncomment the following block if loading from a web URL is required instead
# url = "https://example.com/sample.csv"  # Replace with your data URL
# response = requests.get(url, timeout=30)  # without a timeout a stalled server hangs the kernel
# response.raise_for_status()  # fail on HTTP errors instead of parsing an error page as CSV
# data = pd.read_csv(StringIO(response.text))
# print("Data loaded from URL.")

# Option 3: uncomment the following block if loading via web scraping is required
# web_url = "https://example.com/sample_table"  # Replace with your web scraping target URL
# response = requests.get(web_url, timeout=30)
# response.raise_for_status()
# soup = BeautifulSoup(response.text, 'html.parser')
# table = soup.find('table')  # First <table> element in the page, or None
# if table is None:
#     raise ValueError(f"No <table> element found at {web_url}")
# # Wrap the markup in StringIO: passing literal HTML to read_html is deprecated since pandas 2.1
# data = pd.read_html(StringIO(str(table)))[0]  # Convert the table into a DataFrame
# print("Data loaded from web scraping.")

In [None]:
# ============================
# 3. Data Preprocessing
# ============================
# The label column is deliberately excluded from imputation, outlier filtering,
# one-hot encoding and scaling. Treating it like a feature can delete a whole
# class (IQR filter on an imbalanced 0/1 label), turn class ids into fractions
# (min-max scaling) or replace the column with dummy columns (text labels).
#
# NOTE(review): the imputation statistics and the scaler are fitted on the whole
# dataset, i.e. before the train/test split in section 5, so test rows influence
# them. Fit them on the training rows only if an unbiased estimate matters.

LABEL_COLUMN = "target"  # same column name the EDA and train/test split sections use
MAX_MISSING_FRACTION = 0.8  # feature columns with a larger share of gaps are dropped
IQR_MULTIPLIER = 1.5  # half-width of the outlier fence, in IQR units

print(f"Initial Data Shape: {data.shape}")

# 3.1 Handle Missing Values
# A row without a label cannot be used for supervised learning, and imputing a
# label would invent data, so such rows are removed rather than filled.
if LABEL_COLUMN in data.columns:
    data = data.dropna(subset=[LABEL_COLUMN])

# Drop feature columns with more than 80% missing data. The comparison is
# "fraction > limit", so an empty frame (NaN fractions) keeps all its columns.
missing_fraction = data.isna().mean()
too_sparse = (missing_fraction > MAX_MISSING_FRACTION).to_numpy() & (data.columns != LABEL_COLUMN)
data = data.loc[:, ~too_sparse].copy()

feature_cols = [col for col in data.columns if col != LABEL_COLUMN]
numeric_features = list(data[feature_cols].select_dtypes(include="number").columns)

# Fill remaining gaps: column mean for numeric features, most frequent value for
# the others. The values are collected first and applied with a single
# DataFrame.fillna call; `data[col].fillna(..., inplace=True)` operates on a
# temporary object and is not guaranteed to modify `data`.
fill_values = {}
for col in feature_cols:
    if not data[col].isna().any():
        continue
    if col in numeric_features:
        fill_values[col] = data[col].mean()
    else:
        modes = data[col].mode()
        if not modes.empty:  # mode() is empty when the column has no observed value
            fill_values[col] = modes.iloc[0]
if fill_values:
    data = data.fillna(value=fill_values)

print(f"Shape after handling missing values: {data.shape}")

# 3.2 Handle Outliers (Remove data points beyond 1.5*IQR range for numerical features)
# Columns are processed one after another, so each fence is computed on the rows
# that survived the previous columns.
for col in numeric_features:
    first_quartile = data[col].quantile(0.25)
    third_quartile = data[col].quantile(0.75)
    iqr = third_quartile - first_quartile
    if not iqr > 0:
        # Zero (or NaN) IQR: the fence would collapse onto a single value and
        # remove every row that differs from it, so the column is skipped.
        continue
    inside_fence = data[col].between(
        first_quartile - IQR_MULTIPLIER * iqr,
        third_quartile + IQR_MULTIPLIER * iqr,
    )
    data = data[inside_fence]
data = data.copy()  # detach from the filtered views before adding columns

print(f"Shape after outlier removal: {data.shape}")

# 3.3 Feature Engineering
# Create new features (example: total, ratio, etc.)
if 'feature1' in data.columns and 'feature2' in data.columns:
    data = data.assign(
        # the small constant avoids division by zero when feature2 is 0
        feature_ratio=data['feature1'] / (data['feature2'] + 1e-6),
        feature_sum=data['feature1'] + data['feature2'],
    )
    print("New features created.")

# Numeric feature columns to scale in 3.5. They are collected before encoding so
# that the 0/1 dummy columns are never picked up, whatever dtype get_dummies emits.
numeric_cols = (
    data.drop(columns=[LABEL_COLUMN], errors="ignore")
    .select_dtypes(include="number")
    .columns
)

# 3.4 Encode Categorical Variables
# One-hot encode categorical feature columns; the label is set aside first so a
# text label is not expanded into dummy columns.
if LABEL_COLUMN in data.columns:
    label_values = data[LABEL_COLUMN]
    data = pd.get_dummies(data.drop(columns=[LABEL_COLUMN]), drop_first=True)
    data[LABEL_COLUMN] = label_values
else:
    data = pd.get_dummies(data, drop_first=True)
print("Categorical variables encoded.")

# 3.5 Feature Scaling
# Normalize numerical features using Min-Max Scaling
scaler = MinMaxScaler()
if len(numeric_cols) > 0:  # fit_transform rejects a frame without columns
    data[numeric_cols] = scaler.fit_transform(data[numeric_cols])
print("Numerical features scaled.")

In [None]:
# ============================
# 4. Exploratory Data Analysis (EDA)
# ============================

# 4.1 Statistical Summaries
print("\nBasic Statistics:")
print(data.describe())

print("\nMissing Values:")
print(data.isnull().sum())

# 4.2 Correlation Analysis
# Compute the correlation matrix on numeric and boolean columns only. Since
# pandas 2.0 DataFrame.corr no longer drops other columns silently and raises
# if, for example, a text column is still present.
correlation_matrix = data.select_dtypes(include=["number", "bool"]).corr()

# Display the correlation heatmap (skipped when there is nothing to correlate)
if not correlation_matrix.empty:
    fig, ax = plt.subplots(figsize=(12, 8))
    sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
    ax.set_title("Correlation Matrix")
    plt.show()
    plt.close(fig)

# 4.3 Feature Distributions
# Plot distributions for all numerical columns
numeric_columns = data.select_dtypes(include='number').columns
for column in numeric_columns:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.histplot(data[column], kde=True, ax=ax)
    ax.set(title=f"Distribution of {column}", xlabel=column, ylabel="Frequency")
    plt.show()
    plt.close(fig)  # release the figure; this loop can create many of them

# 4.4 Class Balance (for classification tasks)
if 'target' in data.columns:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.countplot(x='target', data=data, ax=ax)
    ax.set(title="Class Distribution", xlabel="Class", ylabel="Count")
    plt.show()
    plt.close(fig)

# 4.5 Pairwise Relationships (for small datasets)
# Visualize pairwise relationships between features
if data.shape[1] <= 10:  # Limit to small datasets
    sns.pairplot(data, diag_kind="kde")  # pairplot builds its own figure
    plt.show()

# 4.6 Detect Multicollinearity
# Identify highly correlated features (above a threshold, e.g., 0.9).
# The label is left out: a feature that tracks the label closely is not
# multicollinearity, and the label must not be reported as a redundant feature.
threshold = 0.9
feature_corr = correlation_matrix.drop(index="target", columns="target", errors="ignore")
# Only the strict lower triangle is inspected, so each pair is looked at once and
# the later column of a correlated pair is the one that gets flagged.
below_diagonal = np.tril(np.ones(feature_corr.shape, dtype=bool), k=-1)
exceeds_threshold = (feature_corr.abs().to_numpy() > threshold) & below_diagonal
high_corr_features = set(feature_corr.columns[exceeds_threshold.any(axis=1)])

print(f"Highly correlated features (above {threshold}): {high_corr_features}")

# 4.7 Visualizing Outliers (Boxplots)
for column in numeric_columns:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.boxplot(x=data[column], ax=ax)
    ax.set_title(f"Boxplot of {column}")
    plt.show()
    plt.close(fig)

In [None]:
# ============================
# 5. Train-Test Split
# ============================

# Define features and target
target_column = "target"  # Replace with your target column
features = data.drop(columns=[target_column])
target = data[target_column]

# Split the data into training and testing sets.
# The split is stratified on the label so both parts keep the class proportions;
# with a plain random split a rare class can end up missing from the test set.
# Stratification is only requested when scikit-learn can honour it: at least two
# classes, at least two rows per class, and room for every class in both parts.
test_size = 0.2
class_counts = target.value_counts()
n_test = int(np.ceil(test_size * len(target)))
n_train = len(target) - n_test
can_stratify = (
    len(class_counts) >= 2
    and class_counts.min() >= 2
    and min(n_train, n_test) >= len(class_counts)
)
if not can_stratify:
    print("Stratified split not possible for this target; using a plain random split.")

X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=test_size,
    random_state=SEED,
    stratify=target if can_stratify else None,
)

In [None]:
# ============================
# 6. Model Training
# ============================

# Build a Random Forest with default hyper-parameters (seeded for repeatability)
# and fit it on the training split; fit() returns the fitted estimator itself.
model = RandomForestClassifier(random_state=SEED).fit(X_train, y_train)
print("Model training completed.")

In [None]:
# ============================
# 7. Model Evaluation
# ============================

# Predict labels for the held-out test rows
predictions = model.predict(X_test)

# Per-class precision / recall / F1 summary
report_text = classification_report(y_test, predictions)
print("Classification Report:")
print(report_text)

# Overall share of correctly predicted test rows
test_accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {test_accuracy:.2f}")