In [None]:
# CRUD (Create, Read, Update, Delete)

In [None]:
import os
import random
from io import StringIO

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

#Read

In [None]:
# ============================
# 1. Configuration and Setup
# ============================
# Define global constants and configurations
SEED = 42  # Fixed seed so stochastic steps are repeatable
DATA_PATH = 'data/'  # Directory the input datasets are read from
OUTPUT_PATH = 'output/'  # Directory for saved models, plots, etc.

# Make sure the output directory exists (no error if it is already there)
os.makedirs(OUTPUT_PATH, exist_ok=True)

# Helper function for reproducibility
def set_seed(seed=SEED):
    """
    Seed the process-wide random number generators.

    Seeds both Python's built-in ``random`` module and NumPy's global
    generator, so code drawing from either source repeats across runs.

    Args:
        seed (int): Seed value. Defaults to SEED.
    """
    random.seed(seed)
    np.random.seed(seed)

set_seed()

In [None]:

# ============================
# 2. Data Loading Functions
# ============================

def load_from_local(file_path, file_type="csv"):
    """
    Read a file from the local filesystem into a DataFrame.

    Args:
        file_path (str): Location of the file on disk.
        file_type (str): "csv", "excel", "json" or "txt" (tab-delimited text).
    Returns:
        pd.DataFrame: The parsed data, or None when the type is unsupported
        or reading fails (the error is printed, not raised).
    """
    try:
        # Ordered (type, reader) pairs; the first matching type is used.
        readers = (
            ("csv", pd.read_csv),
            ("excel", pd.read_excel),
            ("json", pd.read_json),
            ("txt", lambda path: pd.read_csv(path, delimiter="\t")),
        )
        for supported_type, read in readers:
            if file_type == supported_type:
                return read(file_path)
        raise ValueError("Unsupported file type.")
    except Exception as err:
        print(f"Error loading local file: {err}")
        return None

def load_from_colab(file_id, file_type="csv"):
    """
    Read a Google Drive file, addressed by its file ID, into a DataFrame.

    The file is fetched through a ``drive.google.com/uc?id=...`` URL that is
    handed directly to the pandas reader.

    Args:
        file_id (str): Google Drive file ID.
        file_type (str): "csv" or "excel".
    Returns:
        pd.DataFrame: Data as a DataFrame, or None when the type is
        unsupported or reading fails (the error is printed, not raised).
    """
    try:
        drive_url = f"https://drive.google.com/uc?id={file_id}"
        if file_type == "csv":
            frame = pd.read_csv(drive_url)
            return frame
        if file_type == "excel":
            frame = pd.read_excel(drive_url)
            return frame
        # Any other type is rejected; the handler below reports it.
        raise ValueError("Unsupported file type for Colab.")
    except Exception as err:
        print(f"Error loading file from Colab: {err}")
        return None

def load_from_web(url, file_type="csv", timeout=30):
    """
    Download a text file over HTTP(S) and parse it into a DataFrame.

    Args:
        url (str): Address of the file to download.
        file_type (str): "csv", "json" or "txt" (tab-delimited text).
        timeout (float or tuple): Seconds to wait for the server, passed to
            ``requests.get``. Without a timeout a stalled server would block
            the notebook indefinitely.
    Returns:
        pd.DataFrame: The parsed data, or None when the type is unsupported,
        the request fails, or parsing fails (the error is printed, not
        raised).
    """
    try:
        # Reject unsupported types before spending a network round trip.
        if file_type not in ("csv", "json", "txt"):
            raise ValueError("Unsupported file type.")

        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Turn HTTP 4xx/5xx into an exception

        buffer = StringIO(response.text)
        if file_type == "json":
            return pd.read_json(buffer)
        if file_type == "txt":
            return pd.read_csv(buffer, delimiter="\t")
        return pd.read_csv(buffer)
    except Exception as e:
        print(f"Error loading file from web: {e}")
        return None

In [None]:
# ============================
# 3. Data Preprocessing
# ============================

def preprocess_data(df, min_non_null_frac=0.8):
    """
    Drop sparse columns, then drop every row that still has a missing value.

    Args:
        df (pd.DataFrame): Input data.
        min_non_null_frac (float): Fraction (0 to 1) of rows that must be
            non-null for a column to be kept. With the default of 0.8,
            columns with more than roughly 20% missing values are dropped.
    Returns:
        pd.DataFrame: A DataFrame containing no missing values.
    Raises:
        ValueError: If min_non_null_frac is outside [0, 1].
    """
    if not 0 <= min_non_null_frac <= 1:
        raise ValueError("min_non_null_frac must be between 0 and 1.")

    print(f"Initial Data Shape: {df.shape}")
    # `thresh` is the minimum number of NON-null values a column needs to be
    # kept; int() truncates, so the bound is rounded down.
    min_non_null = int(min_non_null_frac * len(df))
    df = df.dropna(thresh=min_non_null, axis=1)
    df = df.dropna()  # Drop rows with missing values
    print(f"Shape after handling missing values: {df.shape}")
    return df

In [None]:
# ============================
# 4. Exploratory Data Analysis (EDA)
# ============================

def perform_eda(df):
    """
    Print summary statistics and per-column missing-value counts, then show
    a histogram with a KDE overlay for every numeric column.
    """
    print("Basic Statistics:")
    summary = df.describe()
    print(summary)
    print("\nMissing Values:")
    missing_counts = df.isnull().sum()
    print(missing_counts)

    # One figure per numeric column
    numeric_columns = df.select_dtypes(include="number").columns
    for name in numeric_columns:
        _, ax = plt.subplots(figsize=(6, 4))
        sns.histplot(df[name], kde=True, ax=ax)
        ax.set_title(f"Distribution of {name}")
        plt.show()

In [None]:

# ============================
# 5. Model Training
# ============================

def train_model(X_train, y_train):
    """
    Fit a Random Forest classifier on the training data and return it.

    The classifier is created with random_state=SEED.
    """
    # fit() returns the fitted estimator itself, so build and fit in one step
    return RandomForestClassifier(random_state=SEED).fit(X_train, y_train)

In [None]:
# ============================
# 6. Model Evaluation
# ============================

def evaluate_model(model, X_test, y_test):
    """
    Predict on the test set and print a classification report followed by
    the accuracy, formatted to two decimal places.
    """
    y_pred = model.predict(X_test)

    print("Classification Report:")
    report = classification_report(y_test, y_pred)
    print(report)

    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.2f}")


In [None]:
# ============================
# Main Pipeline
# ============================

if __name__ == "__main__":
    # Step 1: Load the data
    local_file = os.path.join(DATA_PATH, "sample.csv")  # Replace with your file path
    data = load_from_local(local_file, file_type="csv")
    if data is None:
        print("Data loading failed. Exiting.")
        # Raise SystemExit rather than calling exit(): exit() is a helper the
        # `site` module adds for interactive shells, and in a notebook it can
        # shut the kernel down. A non-zero status also marks the run as failed.
        raise SystemExit(1)

    # Step 2: Preprocess the data
    data = preprocess_data(data)

    # Step 3: Perform EDA
    perform_eda(data)

    # Step 4: Split the data into training and testing sets
    target_column = "target"  # Replace with your target column
    if target_column not in data.columns:
        # Fail with an actionable message instead of the bare KeyError that
        # drop()/indexing would raise further down.
        raise KeyError(
            f"Target column '{target_column}' not found; "
            f"available columns: {list(data.columns)}"
        )
    features = data.drop(columns=[target_column])
    target = data[target_column]
    # Hold out 20% of the rows for testing; the split is seeded with SEED
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=SEED
    )

    # Step 5: Train the model
    model = train_model(X_train, y_train)

    # Step 6: Evaluate the model
    evaluate_model(model, X_test, y_test)