In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# Sample dataset: REplace with dataset loading: df = pd.read_csv(path)
data = {
    'age': [25, 32, 47, 51, 62],
    'income': [50000, 64000, 120000, 110000, 150000],
    'gender': ['Male', 'Female', 'Female', 'Male', 'Female'],
    'city': ['Mumbai', 'Delhi', 'Delhi', 'Bangalore', 'Mumbai'],
    'bought_insurance': [0, 1, 1, 0, 1]
}

df = pd.DataFrame(data)

# Features and target
X = df.drop('bought_insurance', axis=1) #output column
y = df['bought_insurance']

# Define column types
numeric_features = ['age', 'income']  #replace with actual numeric and categoric columns
categorical_features = ['gender', 'city']

# Define preprocessing: standard
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

# Define pipeline
model = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('classifier', LogisticRegression())  #replace with other classifiers
])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

# Train model
model.fit(X_train, y_train)

custom_input = pd.DataFrame([{
    'age': 30,
    'income': 70000,
    'gender': 'Male',
    'city': 'Delhi'
}])
# Predict
y_pred = model.predict(custom_input)
custom_prob = model.predict_proba(custom_input)
print("Probability of each class:", custom_prob)


# Output results
print("Predictions on test data:", y_pred)


import numpy as np

# Predict probabilities for custom input
proba = model.predict_proba(custom_input)

# Custom threshold
threshold = 0.2

# Apply threshold manually
custom_prediction = (proba[:, 1] >= threshold).astype(int)

print("Custom prediction:", custom_prediction)

Probability of each class: [[0.75616539 0.24383461]]
Predictions on test data: [0]
Custom prediction: [1]


In [15]:
df = pd.read_csv('play_tennis.csv')
df = df.drop('day', axis=1)
df.head()


Unnamed: 0,outlook,temp,humidity,wind,play
0,Sunny,Hot,High,Weak,No
1,Sunny,Hot,High,Strong,No
2,Overcast,Hot,High,Weak,Yes
3,Rain,Mild,High,Weak,Yes
4,Rain,Cool,Normal,Weak,Yes


In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
import numpy as np

def logistic_regression_pipeline(
    df,                      # Input DataFrame
    target_column,           # Name of target column
    numeric_features,        # List of numeric feature column names
    categorical_features,    # List of categorical feature column names
    test_size=0.2,           # Test set size
    threshold=0.5,           # Custom threshold for prediction
    custom_input=None        # Optional custom input for prediction (as DataFrame)
):
    # Split features and target
    X = df.drop(target_column, axis=1)
    y = df[target_column]

    # Define preprocessing
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numeric_features),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
        ])

    # Define pipeline
    model = Pipeline(steps=[
        ('preprocess', preprocessor),
        ('classifier', LogisticRegression())
    ])

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

    # Train model
    model.fit(X_train, y_train)

    # Predict on test set
    test_proba = model.predict_proba(X_test)
    test_prediction = (test_proba[:, 1] >= threshold).astype(int)

    print("Test predictions with threshold", threshold, ":\n", test_prediction)

    # Custom input prediction (if provided)
    if custom_input is not None:
        custom_proba = model.predict_proba(custom_input)
        custom_prediction = (custom_proba[:, 1] >= threshold).astype(int)

        print("\nCustom input:")
        print(custom_input)
        print("Predicted class:", custom_prediction)
        print("Class probabilities:", custom_proba)

    return model  # Return model in case user wants to reuse it

# ------------------ Example usage ------------------

# Sample dataset
df = pd.read_csv('play_tennis.csv')
df = df.drop('day', axis=1)

# Example custom input
custom_input = pd.DataFrame([{
    'outlook': 'Sunny',
    'temp': 'Hot',
    'humidity': 'High',
    'wind': 'Weak'
}])

# Run the pipeline
logistic_regression_pipeline(
    df=df,
    target_column='play',
    numeric_features=[],
    categorical_features=['outlook', 'temp', 'humidity','wind'],
    test_size=0.4,
    threshold=0.7,
    custom_input=custom_input
)


Test predictions with threshold 0.7 :
 [1 0 0 1 1 1]

Custom input:
  outlook temp humidity  wind
0   Sunny  Hot     High  Weak
Predicted class: [0]
Class probabilities: [[0.49344756 0.50655244]]


In [19]:
import pandas as pd
import numpy as np
import math

# ---------------------- Helper Functions ----------------------

def entropy(y):
    values, counts = np.unique(y, return_counts=True)
    probs = counts / counts.sum()
    return -np.sum(probs * np.log2(probs))

def information_gain(df, feature, target):
    total_entropy = entropy(df[target])
    values, counts = np.unique(df[feature], return_counts=True)
    weighted_entropy = sum(
        (counts[i] / np.sum(counts)) * entropy(df[df[feature] == values[i]][target])
        for i in range(len(values))
    )
    return total_entropy - weighted_entropy

def best_split(df, features, target):
    gains = {feature: information_gain(df, feature, target) for feature in features}
    return max(gains, key=gains.get)

def majority_class(y):
    return y.value_counts().idxmax()

# ---------------------- ID3 Core ----------------------

class ID3Node:
    def __init__(self, feature=None, children=None, is_leaf=False, prediction=None):
        self.feature = feature
        self.children = children if children else {}
        self.is_leaf = is_leaf
        self.prediction = prediction

class ID3Classifier:
    def __init__(self, max_depth=None):
        self.max_depth = max_depth
        self.tree = None

    def fit(self, df, target_column):
        features = [col for col in df.columns if col != target_column]
        self.tree = self._build_tree(df, features, target_column, depth=0)

    def _build_tree(self, df, features, target_column, depth):
        y = df[target_column]

        # Stopping conditions
        if len(np.unique(y)) == 1:
            return ID3Node(is_leaf=True, prediction=y.iloc[0])
        if len(features) == 0 or (self.max_depth is not None and depth >= self.max_depth):
            return ID3Node(is_leaf=True, prediction=majority_class(y))

        best_feature = best_split(df, features, target_column)
        node = ID3Node(feature=best_feature)

        for value in df[best_feature].unique():
            subset = df[df[best_feature] == value]
            if subset.empty:
                child = ID3Node(is_leaf=True, prediction=majority_class(y))
            else:
                new_features = [f for f in features if f != best_feature]
                child = self._build_tree(subset, new_features, target_column, depth + 1)
            node.children[value] = child

        return node

    def _predict_instance(self, node, instance):
        if node.is_leaf:
            return node.prediction
        feature_value = instance[node.feature]
        if feature_value in node.children:
            return self._predict_instance(node.children[feature_value], instance)
        else:
            return None  # unknown value

    def predict(self, df):
        return df.apply(lambda x: self._predict_instance(self.tree, x), axis=1)

# ---------------------- Example Usage ----------------------

# Sample dataset (same as before)
df = pd.read_csv('play_tennis.csv')
df = df.drop('day', axis=1)

# Train
model = ID3Classifier(max_depth=3)
model.fit(df, target_column='play')

# Predict on custom input
custom_input = pd.DataFrame([{
    'outlook': 'Sunny',
    'temp': 'Hot',
    'humidity': 'High',
    'wind': 'Weak'
}])

predictions = model.predict(custom_input)

print("Custom input prediction:", predictions.tolist())


Custom input prediction: ['No']
