In [1]:
#VERSION 1

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
import numpy as np

from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
'''
class DecisionTree:
    def __init__(self, max_depth=None, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.tree = None

    def fit(self, X, y):
        self.tree = self._build_tree(X, y)

    def predict(self, X):
        return np.array([self._traverse_tree(x, self.tree) for x in X])

    def _gini(self, y):
        gini = 1.0
        for i in [0,1]:
            p = len(y[y == i]) / len(y)
            gini -= p**2
        return gini

    def _split(self, X, y, feature, threshold):
        left_mask = X[:, feature] <= threshold
        right_mask = X[:, feature] > threshold
        return X[left_mask], X[right_mask], y[left_mask], y[right_mask]

    def _best_split(self, X, y):
        best_gini = float('inf')
        best_split = None
        for feature in range(X.shape[1]):
            thresholds = np.unique(X[:, feature])
            for threshold in thresholds:
                X_left, X_right, y_left, y_right = self._split(X, y, feature, threshold)
                if len(y_left) == 0 or len(y_right) == 0:
                    continue

                gini_left = self._gini(y_left)
                gini_right = self._gini(y_right)
                gini = (len(y_left) / len(y)) * gini_left + (len(y_right) / len(y)) * gini_right

                if gini < best_gini:
                    best_gini = gini
                    best_split = {
                        'feature': feature,
                        'threshold': threshold,
                        'X_left': X_left,
                        'X_right': X_right,
                        'y_left': y_left,
                        'y_right': y_right
                    }
        return best_split

    def _build_tree(self, X, y, depth=0):
        num_samples_per_class = [np.sum(y == i) for i in np.unique(y)]
        predicted_class = np.argmax(num_samples_per_class)
        node = {
            'depth': depth,
            'num_samples': len(y),
            'num_samples_per_class': num_samples_per_class,
            'predicted_class': predicted_class,
        }

        if depth < self.max_depth and len(y) >= self.min_samples_split:
            best_split = self._best_split(X, y)
            if best_split:
                node['feature'] = best_split['feature']
                node['threshold'] = best_split['threshold']
                node['left'] = self._build_tree(best_split['X_left'], best_split['y_left'], depth + 1)
                node['right'] = self._build_tree(best_split['X_right'], best_split['y_right'], depth + 1)
        return node

    def _traverse_tree(self, x, node):
        if 'threshold' in node:
            if x[node['feature']] <= node['threshold']:
                return self._traverse_tree(x, node['left'])
            else:
                return self._traverse_tree(x, node['right'])
        else:
            return node['predicted_class']

class RandomForest:
    def __init__(self, n_estimators=100, max_depth=None, min_samples_split=2):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.trees = []

    def fit(self, X, y):
        self.trees = []
        for _ in range(self.n_estimators):
            tree = DecisionTree(max_depth=self.max_depth, min_samples_split=self.min_samples_split)
            X_sample, y_sample = self._bootstrap_sample(X, y)
            tree.fit(X_sample, y_sample)
            self.trees.append(tree)

    def predict(self, X):
        tree_preds = np.array([tree.predict(X) for tree in self.trees])
        return np.array([np.bincount(tree_preds[:, i]).argmax() for i in range(X.shape[0])])

    def _bootstrap_sample(self, X, y):
        n_samples = X.shape[0]
        indices = np.random.choice(n_samples, n_samples, replace=True)
        return X[indices], y[indices]


# Example usage with churn dataset

# Load the dataset
df = pd.read_csv('dataset.csv')
df.dropna(inplace=True)

# Encoding categorical features
label_encoders = {}
for column in df.select_dtypes(include=['object']).columns:
    label_encoders[column] = LabelEncoder()
    df[column] = label_encoders[column].fit_transform(df[column])
    

X = df.drop(columns=['Churn'])
y = df['Churn']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the decision tree classifier
clf = DecisionTree(max_depth=5, min_samples_split=10)
#random_forest = RandomForest(n_estimators=100, max_depth=10, min_samples_split=10)

# Train the classifier
clf.fit(X_train.values, y_train.values)  # Convert to numpy arrays
#random_forest.fit(X_train.values, y_train.values)

# Make predictions on the test set
y_pred = clf.predict(X_test.values)  # Convert to numpy arrays
#y_pred_rf = random_forest.predict(X_test.values)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)

accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f'Random Forest Accuracy: {accuracy_rf}')
print('Classification Report:')
print(classification_report(y_test, y_pred_rf))
'''

'\nclass DecisionTree:\n    def __init__(self, max_depth=None, min_samples_split=2):\n        self.max_depth = max_depth\n        self.min_samples_split = min_samples_split\n        self.tree = None\n\n    def fit(self, X, y):\n        self.tree = self._build_tree(X, y)\n\n    def predict(self, X):\n        return np.array([self._traverse_tree(x, self.tree) for x in X])\n\n    def _gini(self, y):\n        gini = 1.0\n        for i in [0,1]:\n            p = len(y[y == i]) / len(y)\n            gini -= p**2\n        return gini\n\n    def _split(self, X, y, feature, threshold):\n        left_mask = X[:, feature] <= threshold\n        right_mask = X[:, feature] > threshold\n        return X[left_mask], X[right_mask], y[left_mask], y[right_mask]\n\n    def _best_split(self, X, y):\n        best_gini = float(\'inf\')\n        best_split = None\n        for feature in range(X.shape[1]):\n            thresholds = np.unique(X[:, feature])\n            for threshold in thresholds:\n           

In [3]:
#Version 2.0 with entropy as well

class DecisionTree:
    def __init__(self, max_depth=None, min_samples_split=2, criterion='entropy'):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.criterion = criterion  # 'gini' or 'entropy'
        self.tree = None

    def fit(self, X, y):
        self.tree = self._build_tree(X, y)

    def predict(self, X):
        return np.array([self._traverse_tree(x, self.tree) for x in X])

    def _gini(self, y):
        gini = 1.0
        for i in [0,1]:
            p = len(y[y == i]) / len(y)
            gini -= p ** 2
        return gini

    def _entropy(self, y):
        classes = np.unique(y)
        entropy = 0.0
        for i in [0,1]:
            p = len(y[y == i]) / len(y)
            entropy -= p * np.log2(p)
        return entropy

    def _split(self, X, y, feature, threshold):
        left_mask = X[:, feature] <= threshold
        right_mask = X[:, feature] > threshold
        return X[left_mask], X[right_mask], y[left_mask], y[right_mask]

    def _best_split(self, X, y):
        best_criterion_value = float('inf')
        best_split = None
        for feature in range(X.shape[1]):
            thresholds = np.unique(X[:, feature])
            for threshold in thresholds:
                X_left, X_right, y_left, y_right = self._split(X, y, feature, threshold)
                if len(y_left) == 0 or len(y_right) == 0:
                    continue

                if self.criterion == 'gini':
                    criterion_left = self._gini(y_left)
                    criterion_right = self._gini(y_right)
                elif self.criterion == 'entropy':
                    criterion_left = self._entropy(y_left)
                    criterion_right = self._entropy(y_right)

                criterion_value = (len(y_left) / len(y)) * criterion_left + (len(y_right) / len(y)) * criterion_right

                if criterion_value < best_criterion_value:
                    best_criterion_value = criterion_value
                    best_split = {
                        'feature': feature,
                        'threshold': threshold,
                        'X_left': X_left,
                        'X_right': X_right,
                        'y_left': y_left,
                        'y_right': y_right
                    }
        return best_split

    def _build_tree(self, X, y, depth=0):
        num_samples_per_class = [np.sum(y == i) for i in np.unique(y)]
        predicted_class = np.argmax(num_samples_per_class)
        node = {
            'depth': depth,
            'num_samples': len(y),
            'num_samples_per_class': num_samples_per_class,
            'predicted_class': predicted_class,
        }

        if depth < self.max_depth and len(y) >= self.min_samples_split:
            best_split = self._best_split(X, y)
            if best_split:
                node['feature'] = best_split['feature']
                node['threshold'] = best_split['threshold']
                node['left'] = self._build_tree(best_split['X_left'], best_split['y_left'], depth + 1)
                node['right'] = self._build_tree(best_split['X_right'], best_split['y_right'], depth + 1)
        return node

    def _traverse_tree(self, x, node):
        if 'threshold' in node:
            if x[node['feature']] <= node['threshold']:
                return self._traverse_tree(x, node['left'])
            else:
                return self._traverse_tree(x, node['right'])
        else:
            return node['predicted_class']

    def get_params(self, deep=True):
        return {
            'max_depth': self.max_depth,
            'min_samples_split': self.min_samples_split
        }

    def set_params(self, **params):
        for key, value in params.items():
            setattr(self, key, value)
        return self

df = pd.read_csv('dataset.csv')
df.dropna(inplace=True)

# Encoding categorical features
label_encoders = {}
for column in df.select_dtypes(include=['object']).columns:
    label_encoders[column] = LabelEncoder()
    df[column] = label_encoders[column].fit_transform(df[column])
    

X = df.drop(columns=['Churn'])
y = df['Churn']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


clf = DecisionTree(max_depth=5, min_samples_split=10, criterion = 'entropy')
#random_forest = RandomForest(n_estimators=100, max_depth=10, min_samples_split=10)

# Train the classifier
clf.fit(X_train.values, y_train.values)  # Convert to numpy arrays
#random_forest.fit(X_train.values, y_train.values)

# Make predictions on the test set
y_pred = clf.predict(X_test.values)  # Convert to numpy arrays
#y_pred_rf = random_forest.predict(X_test.values)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)

  entropy -= p * np.log2(p)
  entropy -= p * np.log2(p)


Accuracy: 0.7977288857345636
Confusion Matrix:
 [[887 149]
 [136 237]]
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.86      0.86      1036
           1       0.61      0.64      0.62       373

    accuracy                           0.80      1409
   macro avg       0.74      0.75      0.74      1409
weighted avg       0.80      0.80      0.80      1409



In [10]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bagging_clf = BaggingClassifier(base_estimator=DecisionTree(max_depth=10, min_samples_split=10, criterion = 'entropy'), n_estimators=100, random_state=42)

# Fit the model to the training data
bagging_clf.fit(X_train, y_train)

# Make predictions on the test data
y_pred_bagging = bagging_clf.predict(X_test)

# Evaluate the model
accuracy_bagging = accuracy_score(y_test, y_pred_bagging)
print(f'Bagging Classifier Accuracy: {accuracy_bagging}')
print('Classification Report:')
print(classification_report(y_test, y_pred_bagging))

  entropy -= p * np.log2(p)
  entropy -= p * np.log2(p)


Bagging Classifier Accuracy: 0.8119233498935415
Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.91      0.88      1036
           1       0.68      0.55      0.61       373

    accuracy                           0.81      1409
   macro avg       0.76      0.73      0.74      1409
weighted avg       0.80      0.81      0.81      1409



In [5]:
from sklearn.ensemble import RandomForestClassifier

random_forest_model = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest_model.fit(X_train, y_train)

y_pred_rf = random_forest_model.predict(X_test)

# Evaluate the model
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f'Random Forest Accuracy: {accuracy_rf}')

Random Forest Accuracy: 0.7970191625266146


In [6]:
#Better Random Forest

model_rf2 = RandomForestClassifier(n_estimators=500 , oob_score = True, n_jobs = -1,
                                  random_state =50, 
                                   #max_features = "auto",
                                  max_leaf_nodes = 30)
model_rf2.fit(X_train, y_train)

# Make predictions
y_pred_rf2 = model_rf2.predict(X_test)
print (accuracy_score(y_test, y_pred_rf2))

0.8055358410220014


In [7]:
from sklearn.ensemble import ExtraTreesClassifier

# Initialize the Extra Trees model
extra_trees_model = ExtraTreesClassifier(n_estimators=100, random_state=42)

# Fit the model to the training data
extra_trees_model.fit(X_train, y_train)

# Make predictions on the test data
y_pred_et = extra_trees_model.predict(X_test)

# Evaluate the model
accuracy_et = accuracy_score(y_test, y_pred_et)
print(f'Extra Trees Accuracy: {accuracy_et}')

Extra Trees Accuracy: 0.794180269694819


In [8]:
from sklearn.linear_model import LogisticRegression


log_clf = LogisticRegression()
tree_clf = DecisionTreeClassifier()
# SVC does not support predict_proba by default, use a different classifier or enable probability estimates
# svm_clf = SVC(probability=True)

# Use classifiers that support predict_proba
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

nb_clf = GaussianNB()
rf_clf = RandomForestClassifier()
svm_clf = SVC()


# Create a voting classifier with soft voting
voting_clf_soft = VotingClassifier(
    estimators=[('lr', log_clf), ('dt', tree_clf), ('nb', nb_clf), ('rf', rf_clf)],
    voting='soft'
)

# Fit the model
voting_clf_soft.fit(X_train, y_train)

# Make predictions
y_pred_soft = voting_clf_soft.predict(X_test)

# Evaluate the model
accuracy_soft = accuracy_score(y_test, y_pred_soft)
print(f'Soft Voting Classifier Accuracy: {accuracy_soft}')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Soft Voting Classifier Accuracy: 0.7842441447835344


In [9]:
voting_clf_hard = VotingClassifier(
    estimators=[('lr', log_clf), ('dt', tree_clf), ('svm', svm_clf)],
    voting='hard'
)

# Fit the model
voting_clf_hard.fit(X_train, y_train)

# Make predictions
y_pred_hard = voting_clf_hard.predict(X_test)

# Evaluate the model
accuracy_hard = accuracy_score(y_test, y_pred_hard)
print(f'Hard Voting Classifier Accuracy: {accuracy_hard}')


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Hard Voting Classifier Accuracy: 0.7885024840312278
