In [None]:
import os
import time # to measure how long the models take
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import datasets
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, precision_recall_curve
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Directory holding the semicolon-separated train/test CSV files. Override it
# with the BANK_DATA_DIR environment variable; the default is the original
# hard-coded location, so existing setups keep working unchanged.
DATA_DIR = Path(os.environ.get("BANK_DATA_DIR", "/Users/youziya/Downloads"))

test = pd.read_csv(DATA_DIR / "test.csv", sep=";")
train = pd.read_csv(DATA_DIR / "train.csv", sep=";")

# Stack both files into one frame. ignore_index=True gives the combined frame
# a fresh 0..n-1 index instead of repeating each file's own row labels.
df = pd.concat([test, train], ignore_index=True)

### Data Preprocessing

In [None]:
# Bare expression: shows the combined frame as this cell's output.
df

In [None]:
# Column names, non-null counts and dtypes of the combined frame.
df.info()

In [None]:
# Summary statistics for every column; include='all' also covers the non-numeric ones.
df.describe(include = 'all')

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Drop the columns that are not used in the rest of the notebook.
# Assigning the result (instead of inplace=True) together with errors='ignore'
# keeps the cell idempotent: running it a second time is a no-op rather than a
# KeyError for columns that are already gone.
df = df.drop(columns=['contact', 'day', 'month', 'pdays', 'poutcome'], errors='ignore')

In [None]:
# Preview the first rows of the frame.
df.head()

In [None]:
# Number of distinct values in each column.
df.nunique()

In [None]:

# Summary statistics for the numeric columns.
numerical_summary = df.describe()

# Exploring the unique values of some of the categorical variables.
# The lookup uses df (the frame analysed throughout this notebook); the name
# `train_data` used previously is not defined anywhere in the notebook.
# 'contact', 'month' and 'poutcome' are dropped from df in an earlier cell, so
# only the listed columns that are still present are inspected.
categorical_columns = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome', 'y']
categorical_values = {
    column: df[column].unique()
    for column in categorical_columns
    if column in df.columns
}

# Show both summaries as the cell output.
numerical_summary, categorical_values


In [None]:
# 'y' is the yes/no target label rather than a numeric measurement, so a box
# plot cannot be drawn for it; show how many rows fall in each class instead.
plt.figure(figsize=(6, 4))
sns.countplot(x=df['y'])
plt.title('Class Distribution of Target Variable y')
plt.xlabel('y')
plt.ylabel('Count')
plt.show()

In [None]:
# Keep only the rows at or below an upper outlier limit.
# NOTE(review): the previous version of this cell named a 'Value' column, which
# is not among the columns used elsewhere in this notebook, and gave no limit.
# 'balance' and Tukey's upper fence (Q3 + 1.5 * IQR) are assumptions made to get
# a runnable cell — confirm the intended column and threshold.
OUTLIER_COLUMN = 'balance'
q1, q3 = df[OUTLIER_COLUMN].quantile([0.25, 0.75])
upper_limit = q3 + 1.5 * (q3 - q1)
df_no_outliers = df[df[OUTLIER_COLUMN] <= upper_limit]

In [None]:
# Boolean mask of rows that repeat an earlier row on every column other than
# the five excluded here.
ignored_columns = ['age', 'y', 'job', 'marital', 'education']
compared_columns = df.columns.difference(ignored_columns)
duplicate_rows = df.duplicated(subset=compared_columns)

In [None]:
# Count the rows flagged in the duplicate mask (each True adds 1) and show the total.
num_duplicate_rows = duplicate_rows.sum() 
num_duplicate_rows

In [None]:
# Names of the columns treated as numeric features in this notebook.
numerical_columns = ['age', 'balance', 'duration', 'campaign', 'previous']

In [None]:
# Split the feature columns by dtype for the plots that follow.
# X is only created further down the notebook, so the feature frame is derived
# from df here (every column except the target 'y'); this keeps the cell
# runnable on a fresh kernel executed top to bottom.
feature_frame = df.drop(columns='y')
num_cols = feature_frame.select_dtypes(include=['int64']).columns
cat_cols = feature_frame.select_dtypes(include=['object']).columns

In [None]:
# One count plot per categorical feature, with the bars split by the target 'y'.
for feature in cat_cols:
    label = feature.capitalize()

    plt.figure(figsize=(10, 5))
    sns.countplot(x=feature, hue='y', data=df)

    # Titles and axis labels use the capitalized column name
    plt.title(f'{label} Distribution by Subscription', fontsize=15)
    plt.xlabel(label, fontsize=12)
    plt.ylabel('Count', fontsize=12)
    plt.xticks(rotation=45)

    plt.show()


In [None]:
# One box plot per numeric feature to eyeball spread and outliers.
# Iterates over `numerical_columns`, the list defined earlier in the notebook
# (the name `numerical_features` is not defined anywhere).
for feature in numerical_columns:
    plt.figure(figsize=(8, 4))
    sns.boxplot(x=df[feature])
    plt.title(f'Boxplot of {feature.capitalize()}', fontsize=15)
    plt.xlabel(feature.capitalize(), fontsize=12)
    plt.show()

In [None]:
# Pairwise plots of the columns of df, colored by the target 'y'.
sns.pairplot(data=df, hue='y')
plt.show()

In [None]:
# Create a box plot of 'duration' for each value of the target 'y'.
# NOTE(review): this cell previously plotted "total_bill" by "day" from a frame
# named `data`; neither that frame nor those columns exist in this notebook.
# 'duration' split by 'y' is an assumed replacement — confirm the intended view.
sns.boxplot(x="y", y="duration", data=df)

# Add labels and title
plt.xlabel("Subscription (y)")
plt.ylabel("Duration")
plt.title("Box Plot of Duration by Subscription")

# Show the plot
plt.show()


In [None]:
# Pairwise correlations between the numeric columns only.
# df still contains string columns (e.g. the yes/no target), and recent pandas
# versions raise when corr() meets non-numeric data, so the numeric columns are
# selected explicitly; select_dtypes works the same on older pandas too.
correlation_matrix = df.select_dtypes(include='number').corr()

plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

In [None]:
# Summary statistics of df, printed as plain text.
descriptive_stats = df.describe()
print(descriptive_stats)

In [None]:
# Separate the feature columns from the target column 'y'.
X = df.drop(columns="y")
y = df["y"]

In [None]:
# Column labels per dtype: int64 columns are treated as numerical,
# object columns as categorical.
num_cols = X.select_dtypes(include='int64').columns
cat_cols = X.select_dtypes(include='object').columns

In [None]:
# Create a column transformer: standardize the numerical columns and one-hot
# encode the categorical ones.
# handle_unknown='ignore' makes the encoder emit an all-zero block for a
# category it did not see while fitting, instead of raising at predict time.
# That can happen when a rare category is missing from a training split or
# from one of the cross-validation folds.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)])

In [None]:
# Hold out 30% of the rows for evaluation, with a fixed seed for repeatability.
# stratify=y keeps the proportion of the target classes the same in both parts,
# so the per-class metrics computed on the test part are not skewed by an
# unlucky split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y)

In [None]:
# Base learner for the bagging ensemble: a decision tree capped at depth 20.
base_estimator = DecisionTreeClassifier(max_depth=20)

# Create the BaggingClassifier with the specified base estimator.
# The estimator is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# later removed; the first positional slot works on both sides of the rename.
bag = BaggingClassifier(base_estimator, n_estimators=10, random_state=42)

In [None]:
# Set up models to compare - I am adding some initial parameters
knn = KNeighborsClassifier(n_neighbors=10)
log_reg = LogisticRegression(C=0.1)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
ada = AdaBoostClassifier(n_estimators=100, random_state=42)
# The bagged estimator is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 (old name later removed),
# and the positional form is accepted by both old and new releases.
bag = BaggingClassifier(base_estimator, n_estimators=10, random_state=42)
# Ensemble over three of the models above, using VotingClassifier's default voting mode
voting = VotingClassifier(estimators=[('knn', knn), ('bag', bag), ('lr', log_reg)])

In [None]:
# Display name -> estimator, in the order the evaluation loops visit them.
classifiers = dict([
    ('K-Nearest Neighbors', knn),
    ('Logistic Regression', log_reg),
    ('Random Forest', rf),
    ('AdaBoost', ada),
    ('Bagging', bag),
    ('Voting', voting),
])

In [None]:
# Empty container for per-model results (the evaluation cell that follows re-creates it before filling it).
results = {}

In [None]:
# Per-model metrics and wall-clock time, keyed by the model's display name
results = {}

# Fit and score every candidate model on the same train/test split
for name, clf in classifiers.items():
    start_time = time.time()

    # Chain preprocessing and the classifier so both are fitted together
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', clf)])
    pipeline.fit(X_train, y_train)

    # Predict on the held-out rows
    y_pred = pipeline.predict(X_test)

    # 'yes' is the positive class for precision / recall / F1
    scores = {
        'Precision': precision_score(y_test, y_pred, pos_label='yes'),
        'Recall': recall_score(y_test, y_pred, pos_label='yes'),
        'F1-Score': f1_score(y_test, y_pred, pos_label='yes'),
        'Accuracy': accuracy_score(y_test, y_pred),
    }

    # The timer spans fitting, prediction and metric computation
    scores['Time (s)'] = time.time() - start_time
    results[name] = scores

# One row per model, one column per metric
results_df = pd.DataFrame(results).T
print(results_df)


# Interpretation

### The KNN method is one of the most inefficient, as it takes a long time to run. Its precision is high and its recall is low, meaning that the customers it predicts as subscribers are mostly correct, but it misses a significant number of actual subscribers.
### Logistic regression is the most efficient model, as it takes the least time. Its overall performance is good but not the best.
### The Random Forest model's overall performance is good, even though it takes 11 seconds.
### AdaBoost is faster than KNN but slower than logistic regression. Its F1 score and accuracy are similar to those of logistic regression.
### The Bagging method is relatively efficient, as it takes 6 seconds. Its precision and recall scores are good overall.
### The Voting method is the most inefficient, as it takes 40 seconds. Its performance is not good either.

## Tuning parameters

In [None]:
# Define the base estimator (e.g., DecisionTreeClassifier)
base_estimator = DecisionTreeClassifier(max_depth=20)

# Create the BaggingClassifier with the specified base estimator.
# The estimator is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# later removed; the first positional slot works on both sides of the rename.
bag = BaggingClassifier(base_estimator, n_estimators=10, random_state=42)

In [None]:
# Set up models to compare - I am adding some initial parameters
knn = KNeighborsClassifier(n_neighbors=10)
# multi_class is no longer passed: it is deprecated in recent scikit-learn
# releases, and with the liblinear solver on a two-class target the default
# already resolves to the same one-vs-rest behaviour.
log_reg = LogisticRegression(penalty='l2',
    C=0.1,
    solver='liblinear',
    max_iter=100,
    class_weight='balanced')
rf = RandomForestClassifier(n_estimators=100, random_state=42)
ada = AdaBoostClassifier(n_estimators=100, random_state=42)
# The bagged estimator is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 (old name later removed),
# and the positional form is accepted by both old and new releases.
bag = BaggingClassifier(base_estimator, n_estimators=10, random_state=42)
voting = VotingClassifier(estimators=[('knn', knn), ('bag', bag), ('lr', log_reg)])

In [None]:
# Fresh estimators to be tuned. Candidate hyperparameter values belong in the
# grids handed to GridSearchCV, not in the constructors: a dict passed as the
# first positional constructor argument is taken as that argument's value
# (e.g. n_neighbors for KNN, penalty for logistic regression) and is invalid.
# NOTE(review): the `classifiers` dict built earlier still refers to the
# estimator objects created before this cell; rebuild it if these instances
# are the ones that should be tuned.
knn = KNeighborsClassifier()
log_reg = LogisticRegression()
rf = RandomForestClassifier(random_state=42)
ada = AdaBoostClassifier(random_state=42)
# Same depth-limited tree as the other bagging set-ups, passed positionally so
# it works whether the keyword is spelled `base_estimator` or `estimator`.
bag = BaggingClassifier(base_estimator, random_state=42)
voting = VotingClassifier(estimators=[('lr', log_reg), ('knn', knn), ('rf', rf)])

In [None]:
# Hyperparameter grids for tuning. Every key carries the 'classifier__' prefix,
# matching the name of the pipeline step that holds the model.
knn_params = dict(classifier__n_neighbors=[3, 5, 7, 20, 30, 50, 100])
log_reg_params = dict(classifier__C=[0.1, 1, 10])
rf_params = dict(
    classifier__n_estimators=[50, 100, 150],
    classifier__max_depth=[None, 10, 20, 30, 50],
)
ada_params = dict(classifier__n_estimators=[25, 50, 75])
bag_params = dict(classifier__n_estimators=[5, 10, 20])
voting_params = dict(classifier__voting=['hard', 'soft'])

# Grid lookup by model display name
params_dict = dict(zip(
    ['K-Nearest Neighbors', 'Logistic Regression', 'Random Forest',
     'AdaBoost', 'Bagging', 'Voting'],
    [knn_params, log_reg_params, rf_params,
     ada_params, bag_params, voting_params],
))

# Initialize results dictionary for tuned models
tuned_results = {}

In [None]:
# Loop through classifiers for tuning
for name, clf in classifiers.items():
    start_time = time.time()

    # Create pipeline
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', clf)])

    # 5-fold cross-validated search over this model's hyperparameter grid
    grid = GridSearchCV(pipeline, params_dict[name], cv=5)

    # Fit the model
    grid.fit(X_train, y_train)

    # Score the best pipeline found by the search. Predictions must come from
    # best_estimator_: GridSearchCV fits clones, so `pipeline` itself never
    # receives the searched hyperparameters, and predicting with it would
    # report numbers unrelated to the tuning.
    best_model = grid.best_estimator_
    y_true = y_test
    y_pred = best_model.predict(X_test)

    # Compute metrics ('yes' is the positive class)
    precision = precision_score(y_true, y_pred, pos_label='yes')
    recall = recall_score(y_true, y_pred, pos_label='yes')
    f1 = f1_score(y_true, y_pred, pos_label='yes')
    accuracy = accuracy_score(y_true, y_pred)

    # The timer spans the whole search, the final prediction and the metrics
    end_time = time.time()
    elapsed_time = end_time - start_time

    # Store results; accuracy is recorded too so the table has the same
    # metric columns as the untuned comparison
    tuned_results[name] = {
        'Best Params': grid.best_params_,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'Accuracy': accuracy,
        'Time (s)': elapsed_time
    }

# Convert results to DataFrame for easier viewing
tuned_results_df = pd.DataFrame(tuned_results).T
print(tuned_results_df)
