In [1]:
#VERSION 1

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder, StandardScaler
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
'''
class DecisionTree:
    def __init__(self, max_depth=None, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.tree = None

    def fit(self, X, y):
        self.tree = self._build_tree(X, y)

    def predict(self, X):
        return np.array([self._traverse_tree(x, self.tree) for x in X])

    def _gini(self, y):
        gini = 1.0
        for i in [0,1]:
            p = len(y[y == i]) / len(y)
            gini -= p**2
        return gini

    def _split(self, X, y, feature, threshold):
        left_mask = X[:, feature] <= threshold
        right_mask = X[:, feature] > threshold
        return X[left_mask], X[right_mask], y[left_mask], y[right_mask]

    def _best_split(self, X, y):
        best_gini = float('inf')
        best_split = None
        for feature in range(X.shape[1]):
            thresholds = np.unique(X[:, feature])
            for threshold in thresholds:
                X_left, X_right, y_left, y_right = self._split(X, y, feature, threshold)
                if len(y_left) == 0 or len(y_right) == 0:
                    continue

                gini_left = self._gini(y_left)
                gini_right = self._gini(y_right)
                gini = (len(y_left) / len(y)) * gini_left + (len(y_right) / len(y)) * gini_right

                if gini < best_gini:
                    best_gini = gini
                    best_split = {
                        'feature': feature,
                        'threshold': threshold,
                        'X_left': X_left,
                        'X_right': X_right,
                        'y_left': y_left,
                        'y_right': y_right
                    }
        return best_split

    def _build_tree(self, X, y, depth=0):
        num_samples_per_class = [np.sum(y == i) for i in np.unique(y)]
        predicted_class = np.argmax(num_samples_per_class)
        node = {
            'depth': depth,
            'num_samples': len(y),
            'num_samples_per_class': num_samples_per_class,
            'predicted_class': predicted_class,
        }

        if depth < self.max_depth and len(y) >= self.min_samples_split:
            best_split = self._best_split(X, y)
            if best_split:
                node['feature'] = best_split['feature']
                node['threshold'] = best_split['threshold']
                node['left'] = self._build_tree(best_split['X_left'], best_split['y_left'], depth + 1)
                node['right'] = self._build_tree(best_split['X_right'], best_split['y_right'], depth + 1)
        return node

    def _traverse_tree(self, x, node):
        if 'threshold' in node:
            if x[node['feature']] <= node['threshold']:
                return self._traverse_tree(x, node['left'])
            else:
                return self._traverse_tree(x, node['right'])
        else:
            return node['predicted_class']

class RandomForest:
    def __init__(self, n_estimators=100, max_depth=None, min_samples_split=2):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.trees = []

    def fit(self, X, y):
        self.trees = []
        for _ in range(self.n_estimators):
            tree = DecisionTree(max_depth=self.max_depth, min_samples_split=self.min_samples_split)
            X_sample, y_sample = self._bootstrap_sample(X, y)
            tree.fit(X_sample, y_sample)
            self.trees.append(tree)

    def predict(self, X):
        tree_preds = np.array([tree.predict(X) for tree in self.trees])
        return np.array([np.bincount(tree_preds[:, i]).argmax() for i in range(X.shape[0])])

    def _bootstrap_sample(self, X, y):
        n_samples = X.shape[0]
        indices = np.random.choice(n_samples, n_samples, replace=True)
        return X[indices], y[indices]


# Example usage with churn dataset

# Load the dataset
df = pd.read_csv('dataset.csv')
df.dropna(inplace=True)

# Encoding categorical features
label_encoders = {}
for column in df.select_dtypes(include=['object']).columns:
    label_encoders[column] = LabelEncoder()
    df[column] = label_encoders[column].fit_transform(df[column])
    

X = df.drop(columns=['Churn'])
y = df['Churn']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the decision tree classifier
clf = DecisionTree(max_depth=5, min_samples_split=10)
#random_forest = RandomForest(n_estimators=100, max_depth=10, min_samples_split=10)

# Train the classifier
clf.fit(X_train.values, y_train.values)  # Convert to numpy arrays
#random_forest.fit(X_train.values, y_train.values)

# Make predictions on the test set
y_pred = clf.predict(X_test.values)  # Convert to numpy arrays
#y_pred_rf = random_forest.predict(X_test.values)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)

accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f'Random Forest Accuracy: {accuracy_rf}')
print('Classification Report:')
print(classification_report(y_test, y_pred_rf))
'''

'\nclass DecisionTree:\n    def __init__(self, max_depth=None, min_samples_split=2):\n        self.max_depth = max_depth\n        self.min_samples_split = min_samples_split\n        self.tree = None\n\n    def fit(self, X, y):\n        self.tree = self._build_tree(X, y)\n\n    def predict(self, X):\n        return np.array([self._traverse_tree(x, self.tree) for x in X])\n\n    def _gini(self, y):\n        gini = 1.0\n        for i in [0,1]:\n            p = len(y[y == i]) / len(y)\n            gini -= p**2\n        return gini\n\n    def _split(self, X, y, feature, threshold):\n        left_mask = X[:, feature] <= threshold\n        right_mask = X[:, feature] > threshold\n        return X[left_mask], X[right_mask], y[left_mask], y[right_mask]\n\n    def _best_split(self, X, y):\n        best_gini = float(\'inf\')\n        best_split = None\n        for feature in range(X.shape[1]):\n            thresholds = np.unique(X[:, feature])\n            for threshold in thresholds:\n           

In [34]:
#Version 2.0 with entropy as well

class DecisionTree:
    def __init__(self, max_depth=None, min_samples_split=2, criterion='entropy'):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.criterion = criterion  # 'gini' or 'entropy'
        self.tree = None

    def fit(self, X, y):
        self.tree = self._build_tree(X, y)

    def predict(self, X):
        return np.array([self._traverse_tree(x, self.tree) for x in X])

    def _gini(self, y):
        gini = 1.0
        for i in [0,1]:
            p = len(y[y == i]) / len(y)
            gini -= p ** 2
        return gini

    def _entropy(self, y):
        classes = np.unique(y)
        entropy = 0.0
        for i in [0,1]:
            p = len(y[y == i]) / len(y)
            entropy -= p * np.log2(p)
        return entropy

    def _split(self, X, y, feature, threshold):
        left_mask = X[:, feature] <= threshold
        right_mask = X[:, feature] > threshold
        return X[left_mask], X[right_mask], y[left_mask], y[right_mask]

    def _best_split(self, X, y):
        best_criterion_value = float('inf')
        best_split = None
        for feature in range(X.shape[1]):
            thresholds = np.unique(X[:, feature])
            for threshold in thresholds:
                X_left, X_right, y_left, y_right = self._split(X, y, feature, threshold)
                if len(y_left) == 0 or len(y_right) == 0:
                    continue

                if self.criterion == 'gini':
                    criterion_left = self._gini(y_left)
                    criterion_right = self._gini(y_right)
                elif self.criterion == 'entropy':
                    criterion_left = self._entropy(y_left)
                    criterion_right = self._entropy(y_right)

                criterion_value = (len(y_left) / len(y)) * criterion_left + (len(y_right) / len(y)) * criterion_right

                if criterion_value < best_criterion_value:
                    best_criterion_value = criterion_value
                    best_split = {
                        'feature': feature,
                        'threshold': threshold,
                        'X_left': X_left,
                        'X_right': X_right,
                        'y_left': y_left,
                        'y_right': y_right
                    }
        return best_split

    def _build_tree(self, X, y, depth=0):
        num_samples_per_class = [np.sum(y == i) for i in np.unique(y)]
        predicted_class = np.argmax(num_samples_per_class)
        node = {
            'depth': depth,
            'num_samples': len(y),
            'num_samples_per_class': num_samples_per_class,
            'predicted_class': predicted_class,
        }

        if depth < self.max_depth and len(y) >= self.min_samples_split:
            best_split = self._best_split(X, y)
            if best_split:
                node['feature'] = best_split['feature']
                node['threshold'] = best_split['threshold']
                node['left'] = self._build_tree(best_split['X_left'], best_split['y_left'], depth + 1)
                node['right'] = self._build_tree(best_split['X_right'], best_split['y_right'], depth + 1)
        return node

    def _traverse_tree(self, x, node):
        if 'threshold' in node:
            if x[node['feature']] <= node['threshold']:
                return self._traverse_tree(x, node['left'])
            else:
                return self._traverse_tree(x, node['right'])
        else:
            return node['predicted_class']

    def get_params(self, deep=True):
        return {
            'max_depth': self.max_depth,
            'min_samples_split': self.min_samples_split
        }

    def set_params(self, **params):
        for key, value in params.items():
            setattr(self, key, value)
        return self

df = pd.read_csv('dataset.csv')
df.dropna(inplace=True)

# Encoding categorical features
label_encoders = {}
for column in df.select_dtypes(include=['object']).columns:
    label_encoders[column] = LabelEncoder()
    df[column] = label_encoders[column].fit_transform(df[column])
    

X = df.drop(columns=['Churn'])
y = df['Churn']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X, y)  # Resample the data to balance classes

# Split the resampled data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)

# Standardize the feature variables using StandardScaler
scaler = StandardScaler()

# Fit the scaler to the training and testing data
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

clf = DecisionTree(max_depth=5, min_samples_split=10, criterion = 'entropy')
#random_forest = RandomForest(n_estimators=100, max_depth=10, min_samples_split=10)

# Train the classifier
clf.fit(X_train, y_train)  # Convert to numpy arrays
#random_forest.fit(X_train.values, y_train.values)

# Make predictions on the test set
y_pred = clf.predict(X_test)  # Convert to numpy arrays
#y_pred_rf = random_forest.predict(X_test.values)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)

  entropy -= p * np.log2(p)
  entropy -= p * np.log2(p)


KeyboardInterrupt: 

In [3]:
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train,y_train)
predictdt_y = dt_model.predict(X_test)
accuracy_dt = dt_model.score(X_test,y_test)
print("Decision Tree accuracy is :",accuracy_dt)

Decision Tree accuracy is : 0.8


In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bagging_clf = BaggingClassifier(base_estimator=DecisionTree(max_depth=10, min_samples_split=10, criterion = 'entropy'), n_estimators=100, random_state=42)

# Fit the model to the training data
bagging_clf.fit(X_train, y_train)

# Make predictions on the test data
y_pred_bagging = bagging_clf.predict(X_test)

# Evaluate the model
accuracy_bagging = accuracy_score(y_test, y_pred_bagging)
print(f'Bagging Classifier Accuracy: {accuracy_bagging}')
print('Classification Report:')
print(classification_report(y_test, y_pred_bagging))



In [37]:
from sklearn.ensemble import RandomForestClassifier
for i in [50,100,200,300,400,500,600]:
    
    random_forest_model = RandomForestClassifier(n_estimators=i, random_state=180)
    random_forest_model.fit(X_train, y_train)
    
    y_pred_rf = random_forest_model.predict(X_test)
    
    # Evaluate the model
    accuracy_rf = accuracy_score(y_test, y_pred_rf)
    print(f'Random Forest Accuracy {i}: {accuracy_rf}')



Random Forest Accuracy 50: 0.8589371980676328
Random Forest Accuracy 100: 0.8642512077294686
Random Forest Accuracy 200: 0.8584541062801933
Random Forest Accuracy 300: 0.8584541062801933
Random Forest Accuracy 400: 0.8608695652173913
Random Forest Accuracy 500: 0.8603864734299517
Random Forest Accuracy 600: 0.8599033816425121


In [33]:
class_report_rf = classification_report(y_pred_rf, y_test)
print("Random Forest Report:\n", class_report_rf)

Random Forest Report:
               precision    recall  f1-score   support

           0       0.86      0.86      0.86      1018
           1       0.87      0.87      0.87      1052

    accuracy                           0.86      2070
   macro avg       0.86      0.86      0.86      2070
weighted avg       0.86      0.86      0.86      2070



In [20]:
#Better Random Forest

model_rf2 = RandomForestClassifier(n_estimators=500 , oob_score = True, n_jobs = -1,
                                  random_state =50, 
                                   #max_features = "auto",
                                  max_leaf_nodes = 30)
model_rf2.fit(X_train, y_train)

# Make predictions
y_pred_rf2 = model_rf2.predict(X_test)
print(accuracy_score(y_test, y_pred_rf2))

class_report_rf2 = classification_report(y_pred_rf2, y_test)
print("RF2 report:\n", class_report_rf2)

0.8342995169082126
RF2 report:
               precision    recall  f1-score   support

           0       0.82      0.84      0.83       990
           1       0.85      0.83      0.84      1080

    accuracy                           0.83      2070
   macro avg       0.83      0.83      0.83      2070
weighted avg       0.83      0.83      0.83      2070



In [18]:
from sklearn.ensemble import ExtraTreesClassifier

# Initialize the Extra Trees model
extra_trees_model = ExtraTreesClassifier(n_estimators=100, random_state=42)

# Fit the model to the training data
extra_trees_model.fit(X_train, y_train)

# Make predictions on the test data
y_pred_et = extra_trees_model.predict(X_test)

# Evaluate the model
accuracy_et = accuracy_score(y_test, y_pred_et)
print(f'Extra Trees Accuracy: {accuracy_et}')

class_report_et = classification_report(y_pred_et, y_test)
print("Extra Trees Report:\n", class_report_et)

Extra Trees Accuracy: 0.855072463768116
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.85      0.85      1041
           1       0.85      0.86      0.86      1029

    accuracy                           0.86      2070
   macro avg       0.86      0.86      0.86      2070
weighted avg       0.86      0.86      0.86      2070



In [9]:
svc_model = SVC(random_state = 1)
svc_model.fit(X_train,y_train)
predict_y = svc_model.predict(X_test)
accuracy_svc = svc_model.score(X_test,y_test)
print("SVC accuracy is :",accuracy_svc)
print(classification_report(y_test, predict_y))

SVC accuracy is : 0.851207729468599
              precision    recall  f1-score   support

           0       0.86      0.84      0.85      1021
           1       0.85      0.86      0.85      1049

    accuracy                           0.85      2070
   macro avg       0.85      0.85      0.85      2070
weighted avg       0.85      0.85      0.85      2070



In [None]:
from sklearn.linear_model import LogisticRegression


log_clf = LogisticRegression()
tree_clf = DecisionTreeClassifier()
# SVC does not support predict_proba by default, use a different classifier or enable probability estimates
# svm_clf = SVC(probability=True)

# Use classifiers that support predict_proba
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

nb_clf = GaussianNB()
rf_clf = RandomForestClassifier()
svm_clf = SVC()


# Create a voting classifier with soft voting
voting_clf_soft = VotingClassifier(
    estimators=[('lr', log_clf), ('dt', tree_clf), ('nb', nb_clf), ('rf', rf_clf)],
    voting='soft'
)

# Fit the model
voting_clf_soft.fit(X_train, y_train)

# Make predictions
y_pred_soft = voting_clf_soft.predict(X_test)

# Evaluate the model
accuracy_soft = accuracy_score(y_test, y_pred_soft)
print(f'Soft Voting Classifier Accuracy: {accuracy_soft}')

In [None]:
voting_clf_hard = VotingClassifier(
    estimators=[('lr', log_clf), ('dt', tree_clf), ('svm', svm_clf)],
    voting='hard'
)

# Fit the model
voting_clf_hard.fit(X_train, y_train)

# Make predictions
y_pred_hard = voting_clf_hard.predict(X_test)

# Evaluate the model
accuracy_hard = accuracy_score(y_test, y_pred_hard)
print(f'Hard Voting Classifier Accuracy: {accuracy_hard}')
