<a href="https://colab.research.google.com/github/clariceG/BIProject/blob/main/BIproject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Milestone 3: Training the model

In [173]:
import pandas as pd
from sklearn.metrics import accuracy_score
import numpy as np

# Loading the Dataset
file_name= 'census.csv'
df=pd.read_csv(file_name)

In [174]:
# To drop the columns that will not be used
columns_to_drop = ['race', 'native-country', 'hours-per-week', 'education-num']
df.drop(columns_to_drop, axis=1, inplace=True)


In [175]:
# To see whether the data is duplicated
# It has 16239 duplicates
df[df.duplicated()].shape[0]

16239

In [176]:
# To remove the duplicated data
df.drop_duplicates(inplace=True)

# To see whether there are any duplicates now
df[df.duplicated()].shape[0]

0

In [177]:
# To find size of dataset after droping duplicates
df.shape

(28983, 10)

In [178]:
from sklearn.preprocessing import MinMaxScaler

# Initialize a scaler, then apply it to the features
scaler = MinMaxScaler() # default=(0, 1)

# Scale numerical data
numerical=['age','capital-gain','capital-loss']

df[numerical]=scaler.fit_transform(df[numerical])

In [179]:
df[numerical]


Unnamed: 0,age,capital-gain,capital-loss
0,0.301370,0.021740,0.0
1,0.452055,0.000000,0.0
2,0.287671,0.000000,0.0
3,0.493151,0.000000,0.0
4,0.150685,0.000000,0.0
...,...,...,...
45212,0.424658,0.000000,0.0
45216,0.424658,0.000000,0.0
45217,0.219178,0.000000,0.0
45218,0.301370,0.000000,0.0


In [180]:
# To Get raw income and drop it from our census and initialise as Y
y= df['income']
y.head()

0    <=50K
1    <=50K
2    <=50K
3    <=50K
4    <=50K
Name: income, dtype: object

In [181]:
# To drop income our census
df.drop('income',axis=1,inplace=True)


In [182]:
df

Unnamed: 0,age,workclass,education_level,marital-status,occupation,relationship,sex,capital-gain,capital-loss
0,0.301370,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,Male,0.021740,0.0
1,0.452055,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,Male,0.000000,0.0
2,0.287671,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,Male,0.000000,0.0
3,0.493151,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Male,0.000000,0.0
4,0.150685,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Female,0.000000,0.0
...,...,...,...,...,...,...,...,...,...
45212,0.424658,Private,HS-grad,Married-civ-spouse,Adm-clerical,Husband,Male,0.000000,0.0
45216,0.424658,Local-gov,Masters,Divorced,Other-service,Not-in-family,Male,0.000000,0.0
45217,0.219178,Private,Bachelors,Never-married,Prof-specialty,Own-child,Male,0.000000,0.0
45218,0.301370,Private,Bachelors,Divorced,Prof-specialty,Not-in-family,Female,0.000000,0.0


In [183]:
# Encode the 'income' data to numerical values
from sklearn.preprocessing import LabelEncoder

encoder=LabelEncoder()
y=y.apply(lambda x:0 if x=='<=50K' else 1) # To convert it to 0 or 1

In [184]:
# Changing categorical data to numbers since a model will use numbers to make predictions
# Features after one-hot encoding
X = pd.get_dummies(df)
X


Unnamed: 0,age,capital-gain,capital-loss,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Private,workclass_ Self-emp-inc,workclass_ Self-emp-not-inc,workclass_ State-gov,workclass_ Without-pay,...,occupation_ Tech-support,occupation_ Transport-moving,relationship_ Husband,relationship_ Not-in-family,relationship_ Other-relative,relationship_ Own-child,relationship_ Unmarried,relationship_ Wife,sex_ Female,sex_ Male
0,0.301370,0.021740,0.0,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,1
1,0.452055,0.000000,0.0,0,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,1
2,0.287671,0.000000,0.0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
3,0.493151,0.000000,0.0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
4,0.150685,0.000000,0.0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45212,0.424658,0.000000,0.0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
45216,0.424658,0.000000,0.0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
45217,0.219178,0.000000,0.0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
45218,0.301370,0.000000,0.0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0


In [185]:
y

0        0
1        0
2        0
3        0
4        0
        ..
45212    0
45216    0
45217    0
45218    0
45220    0
Name: income, Length: 28983, dtype: int64

## Selection of Data Mining Algorithm

In [186]:
# Spliting of the dataset
from sklearn.model_selection import train_test_split

X_train,X_test,Y_train,Y_test= train_test_split(X,y,test_size=0.3,random_state=0)

In [187]:
X_train

Unnamed: 0,age,capital-gain,capital-loss,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Private,workclass_ Self-emp-inc,workclass_ Self-emp-not-inc,workclass_ State-gov,workclass_ Without-pay,...,occupation_ Tech-support,occupation_ Transport-moving,relationship_ Husband,relationship_ Not-in-family,relationship_ Other-relative,relationship_ Own-child,relationship_ Unmarried,relationship_ Wife,sex_ Female,sex_ Male
10492,0.328767,0.0,0.000000,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
1516,0.260274,0.0,0.000000,0,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,1
29865,0.260274,0.0,0.000000,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
35185,0.493151,0.0,0.000000,1,0,0,0,0,0,0,...,0,1,1,0,0,0,0,0,0,1
34710,0.191781,0.0,0.345271,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17305,0.219178,0.0,0.000000,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
28085,0.315068,0.0,0.000000,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
12319,0.301370,0.0,0.000000,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
13719,0.260274,0.0,0.000000,0,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,1


In [188]:
X_test

Unnamed: 0,age,capital-gain,capital-loss,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Private,workclass_ Self-emp-inc,workclass_ Self-emp-not-inc,workclass_ State-gov,workclass_ Without-pay,...,occupation_ Tech-support,occupation_ Transport-moving,relationship_ Husband,relationship_ Not-in-family,relationship_ Other-relative,relationship_ Own-child,relationship_ Unmarried,relationship_ Wife,sex_ Female,sex_ Male
16582,0.369863,0.0,0.000000,0,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,1
5247,0.136986,0.0,0.000000,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
9061,0.671233,0.0,0.000000,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
38562,0.191781,0.0,0.000000,0,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,1
31559,0.520548,0.0,0.000000,0,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22905,0.438356,0.0,0.000000,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
37941,0.452055,0.0,0.000000,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
19585,0.602740,0.0,0.000000,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
41698,0.164384,0.0,0.436639,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0


In [189]:
Y_train

10492    0
1516     0
29865    0
35185    1
34710    0
        ..
17305    1
28085    0
12319    0
13719    1
2936     0
Name: income, Length: 20288, dtype: int64

In [190]:
Y_test

16582    0
5247     0
9061     0
38562    0
31559    0
        ..
22905    0
37941    0
19585    0
41698    1
15594    0
Name: income, Length: 8695, dtype: int64

In [191]:
X

Unnamed: 0,age,capital-gain,capital-loss,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Private,workclass_ Self-emp-inc,workclass_ Self-emp-not-inc,workclass_ State-gov,workclass_ Without-pay,...,occupation_ Tech-support,occupation_ Transport-moving,relationship_ Husband,relationship_ Not-in-family,relationship_ Other-relative,relationship_ Own-child,relationship_ Unmarried,relationship_ Wife,sex_ Female,sex_ Male
0,0.301370,0.021740,0.0,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,1
1,0.452055,0.000000,0.0,0,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,1
2,0.287671,0.000000,0.0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
3,0.493151,0.000000,0.0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
4,0.150685,0.000000,0.0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45212,0.424658,0.000000,0.0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
45216,0.424658,0.000000,0.0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
45217,0.219178,0.000000,0.0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
45218,0.301370,0.000000,0.0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0


In [192]:
y

0        0
1        0
2        0
3        0
4        0
        ..
45212    0
45216    0
45217    0
45218    0
45220    0
Name: income, Length: 28983, dtype: int64

### Decision Tree Classifier


In [193]:
# Import the DecisionTreeClassifier class from scikit-learn's tree module.
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.utils import resample
import numpy as np

# Create an instance of the DecisionTreeClassifier class with specific hyperparameter settings.
dt=DecisionTreeClassifier()

# Train (fit) the Decision Tree classifier on the training data.
dt.fit(X_train, Y_train)

# Print accuracy on the training data
print("Accuracy on Training Data:", dt.score(X_train, Y_train))


Accuracy on Training Data: 0.9564274447949527


In [194]:
# Bootstrapping
n_bootstraps = 1000
bootstrap_accuracies = []

for _ in range(n_bootstraps):
    # Create a bootstrap sample
    X_bootstrap, Y_bootstrap = resample(X_train, Y_train, random_state=np.random.randint(0, 100))

    # Create an instance of the DecisionTreeClassifier
    dt_bootstrap = DecisionTreeClassifier()

    # Train the Decision Tree classifier on the bootstrap sample
    dt_bootstrap.fit(X_bootstrap, Y_bootstrap)

    # Evaluate the accuracy on the original training data
    accuracy = dt_bootstrap.score(X_train, Y_train)
    bootstrap_accuracies.append(accuracy)

# Print average accuracy over bootstraps
print("Average Accuracy with Bootstrapping:", np.mean(bootstrap_accuracies))

# Cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)  # Adjust the number of folds as needed

# Perform cross-validation and print the mean accuracy
cross_val_scores = cross_val_score(dt, X_train, Y_train, cv=cv)
print("Cross-Validation Mean Accuracy:", np.mean(cross_val_scores))

# Make predictions on the test data
y_pred = dt.predict(X_test)

# Print accuracy on the test data
print("Accuracy on Test Data:", dt.score(X_test, Y_test))

Average Accuracy with Bootstrapping: 0.8995173501577287
Cross-Validation Mean Accuracy: 0.7705542981464355
Accuracy on Test Data: 0.7694077055779184


In [195]:
y_pred

array([1, 0, 0, ..., 0, 1, 0])

In [196]:
dt_pred = dt.predict(X_train)

dt_pred

array([0, 0, 0, ..., 0, 1, 0])

In [197]:
# Trying to make a whole new prediction
dt.predict(X_test[2:3])

array([0])

In [198]:
from sklearn.metrics import classification_report, confusion_matrix
# Generate a classification report
report = classification_report(Y_test, y_pred)

# Print the report
print("Classification Report:\n", report)

Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.85      0.84      6345
           1       0.57      0.57      0.57      2350

    accuracy                           0.77      8695
   macro avg       0.71      0.71      0.71      8695
weighted avg       0.77      0.77      0.77      8695



In [199]:
# Generate Confusion Matrix
confusion_matrix = confusion_matrix(Y_test, y_pred)

# Print Confusion Matrix
print("Confusion Matrix:\n", confusion_matrix)

Confusion Matrix:
 [[5362  983]
 [1022 1328]]


In [200]:
input_data = [
    [0, 0.5, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1,
     0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1,
     0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0]
]

# Assuming ensemble_classifier is your VotingClassifier instance
predictions = dt.predict(input_data)

print(predictions)

[0]




### Logistic Regression


In [201]:
# Import Logistic Regression
from sklearn.linear_model import LogisticRegression

# Create a Logistic Regression Model
log=LogisticRegression(max_iter=1000)

# Train (fit) the Decision Tree classifier on the training data.
log.fit(X_train, Y_train)

# Print accuracy on the training data
print("Accuracy on Training Data:", log.score(X_train, Y_train))


Accuracy on Training Data: 0.8271391955835962


In [None]:
# Bootstrapping for Logistic Regression
bootstrap_accuracies_log = []

for _ in range(n_bootstraps):
    # Create a bootstrap sample
    X_bootstrap, Y_bootstrap = resample(X_train, Y_train, random_state=np.random.randint(0, 100))

    # Create an instance of the LogisticRegression
    log_bootstrap = LogisticRegression(max_iter=1000)

    # Train the Logistic Regression model on the bootstrap sample
    log_bootstrap.fit(X_bootstrap, Y_bootstrap)

    # Evaluate the accuracy on the original training data
    accuracy_log = log_bootstrap.score(X_train, Y_train)
    bootstrap_accuracies_log.append(accuracy_log)

# Print average accuracy over bootstraps for Logistic Regression
print("Average Accuracy with Bootstrapping :", np.mean(bootstrap_accuracies_log))

# Cross-validation for Logistic Regression
cross_val_scores_log = cross_val_score(log, X_train, Y_train, cv=cv)
print("Cross-Validation Mean Accuracy :", np.mean(cross_val_scores_log))

# Make predictions on the test data
lg_pred = log.predict(X_test)

# Print accuracy on the test data
print("Accuracy on Test Data:", log.score(X_test, Y_test))

In [109]:
lg_pred

array([0, 0, 0, ..., 0, 0, 0])

In [110]:
log_pred = log.predict(X_train)

log_pred

array([0, 0, 0, ..., 0, 1, 0])

In [113]:
from sklearn.metrics import confusion_matrix as sk_confusion_matrix

# Generate Confusion Matrix
matrix = sk_confusion_matrix(Y_test, lg_pred)

# Print Confusion Matrix
print("Confusion Matrix for Logistic Regression:\n", matrix)

Confusion Matrix for Logistic Regression:
 [[5833  512]
 [ 995 1355]]


In [114]:
# Generate a classification report
report = classification_report(Y_test, lg_pred)

# Print the report
print("Classification Report:\n", report)

Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.92      0.89      6345
           1       0.73      0.58      0.64      2350

    accuracy                           0.83      8695
   macro avg       0.79      0.75      0.76      8695
weighted avg       0.82      0.83      0.82      8695



In [127]:
from sklearn.metrics import classification_report

# Model Performance comparison using resamples
print("\nModel Performance Comparison:")
print("Decision Tree:")
dt_predictions = dt.predict(X_train)
print(classification_report(Y_train, dt_predictions))

print("Logistic Regression:")
log_predictions = log.predict(X_train)
print(classification_report(Y_train, log_predictions))



Model Performance Comparison:
Decision Tree:
              precision    recall  f1-score   support

           0       0.94      1.00      0.97     14850
           1       1.00      0.84      0.91      5438

    accuracy                           0.96     20288
   macro avg       0.97      0.92      0.94     20288
weighted avg       0.96      0.96      0.96     20288

Logistic Regression:
              precision    recall  f1-score   support

           0       0.86      0.92      0.89     14850
           1       0.72      0.57      0.64      5438

    accuracy                           0.83     20288
   macro avg       0.79      0.75      0.76     20288
weighted avg       0.82      0.83      0.82     20288



In [171]:
input_data = [
    [0, 0.5, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1,
     0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1,
     0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0]
]

# Assuming ensemble_classifier is your VotingClassifier instance
predictions = log.predict(input_data)

print(predictions)

[1]




# Milestone 4: Hyperparameter Tuning


## Hypertuning of Decision Tree

In [128]:
# GridSearchCV is a class in scikit-learn's model_selection module that facilitates
# hyperparameter tuning for machine learning models through an exhaustive search
# over a specified parameter grid.

#This process helps find the optimal combination of hyperparameters that yields
#the best performance for a given model and dataset.
from sklearn.model_selection import GridSearchCV

In [129]:
# Specifying the parameters
parameters={'min_samples_split':range(10,500,20),'max_depth':range(1,20,2),'criterion':['gini','entropy']}

clf_tree=DecisionTreeClassifier()

# Create an Instance:
clf=GridSearchCV(clf_tree,parameters,cv=5)

# Fitting the Grid Search
clf.fit(X_train,Y_train)

In [130]:
# Best_estimator_ represents the best-performing model (estimator)
# found during the hyperparameter tuning process.
clf.best_estimator_

In [131]:
# Make a prediction using X_test
clf_predict=clf.predict(X_test)

In [132]:
# To calculate the accuracy score
accuracy_score(clf_predict,Y_test)

0.8427832087406556

In [134]:
# Generate Confusion Matrix
confusion_matrix = sk_confusion_matrix(Y_test,clf_predict)

# Print Confusion Matrix
print("Confusion Matrix:\n", confusion_matrix)

Confusion Matrix:
 [[5989  356]
 [1011 1339]]


In [135]:
# Generate Classification report
report= classification_report(Y_test,clf_predict)

# Print the report
print("Classification Report:\n", report)

Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.94      0.90      6345
           1       0.79      0.57      0.66      2350

    accuracy                           0.84      8695
   macro avg       0.82      0.76      0.78      8695
weighted avg       0.84      0.84      0.83      8695



### Decision Tree with Parameters

In [136]:
from sklearn.tree import DecisionTreeClassifier
dt=DecisionTreeClassifier(max_depth=17, min_samples_split=110,criterion='entropy') # The parameters from hypertuning
dt.fit(X_train,Y_train)


print("Accuracy on Training Data:",(dt.score(X_train,Y_train)))

Accuracy on Training Data: 0.8552839116719243


In [139]:
prediction=dt.predict(X_test)

# Print accuracy on the test data
print("Accuracy on Test Data:", clf.score(X_test, Y_test))

Accuracy on Test Data: 0.8427832087406556


In [140]:
prediction

array([1, 0, 0, ..., 0, 1, 0])

In [142]:
# Generate Confusion Matrix
confusion_matrix = sk_confusion_matrix(Y_test,prediction)

# Print Confusion Matrix
print("Confusion Matrix:\n", confusion_matrix)

Confusion Matrix:
 [[5989  356]
 [1011 1339]]


In [143]:
from sklearn.metrics import classification_report, confusion_matrix
# Generate Classification Report
classification_report= classification_report(Y_test,prediction)

# Print the report
print("Classification Report:\n", classification_report)

Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.94      0.90      6345
           1       0.79      0.57      0.66      2350

    accuracy                           0.84      8695
   macro avg       0.82      0.76      0.78      8695
weighted avg       0.84      0.84      0.83      8695



In [170]:
input_data = [
    [0, 0.5, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1,
     0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1,
     0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0]
]

# Assuming ensemble_classifier is your VotingClassifier instance
predictions = dt.predict(input_data)

print(predictions)

[0]




## Hypretuning of Logistic Regression

In [144]:
# Create a Logistic Regression Model
model = LogisticRegression(max_iter=1000)

# Define Hyperparameter Grid
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01, 0.001]

parameters = dict(solver=solvers, penalty=penalty, C=c_values)

# Set up Grid Search
grid_search_log = GridSearchCV(estimator=model, param_grid=parameters, cv=5)

# Fit the Grid Search
grid_search_log.fit(X_train, Y_train)


In [145]:
print("Best Hyperparameters:", grid_search_log.best_params_)
print("Best Estimator:", grid_search_log.best_estimator_)
print("Best Cross-validated Score:", grid_search_log.best_score_)

Best Hyperparameters: {'C': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Best Estimator: LogisticRegression(C=100, max_iter=1000, solver='liblinear')
Best Cross-validated Score: 0.8285192172216199


In [146]:
grid_log=grid_search_log.predict(X_test)

In [148]:
grid_log

array([0, 0, 0, ..., 0, 0, 0])

In [149]:
print("Accuracy score on Test data:",accuracy_score(grid_log,Y_test))

Accuracy score on Test data: 0.8289821736630247


### Logistic Regression with Parameters


In [151]:
# Import Logistic Regression
from sklearn.linear_model import LogisticRegression

# Create a Logistic Regression Model
log=LogisticRegression(C=100, max_iter=1000, solver='liblinear')

# Fit the Model
log.fit(X_train,Y_train)

# Print the Accuracy
print("Accuracy score on Test data:",log.score(X_train,Y_train))

Accuracy score on Test data: 0.8299980283911672


In [152]:
# To Display the Model
log

In [153]:
# Make prediction
log_test=log.predict(X_test)

In [154]:
log_test

array([0, 0, 0, ..., 0, 0, 0])

In [155]:
# Generate Confusion Matrix
confusion_matrix = sk_confusion_matrix(log_test,Y_test)

# Print Confusion Matrix
print("Confusion Matrix:\n", confusion_matrix)


Confusion Matrix:
 [[5846  988]
 [ 499 1362]]


In [156]:
from sklearn.metrics import classification_report, confusion_matrix
classification_report= classification_report(log_test,Y_test)

# Print the report
print("Classification Report:\n", classification_report)


Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.86      0.89      6834
           1       0.58      0.73      0.65      1861

    accuracy                           0.83      8695
   macro avg       0.75      0.79      0.77      8695
weighted avg       0.85      0.83      0.84      8695



In [169]:
input_data = [
    [0, 0.5, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1,
     0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1,
     0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0]
]

# Assuming ensemble_classifier is your VotingClassifier instance
predictions = log.predict(input_data)

print(predictions)

[1]




## Ensembles


In [157]:
# use ensemble methods like VotingClassifier from scikit-learn
# to combine the predictions of multiple classifiers,
# such as Decision Tree and Logistic Regression.
from sklearn.ensemble import VotingClassifier

# Split the data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Create individual classifiers
dt_classifier = DecisionTreeClassifier()
log_classifier = LogisticRegression(max_iter=1000)

# Create an ensemble of classifiers using a VotingClassifier
ensemble_classifier = VotingClassifier(estimators=[('decision_tree', dt_classifier), ('logistic_regression', log_classifier)], voting='hard')

# Train (fit) the ensemble classifier on the training data
ensemble_classifier.fit(X_train, Y_train)

# Make predictions on the testing data
ensemble_predictions = ensemble_classifier.predict(X_test)

# Evaluate the accuracy of the ensemble classifier
ensemble_accuracy = accuracy_score(Y_test, ensemble_predictions)
print("Ensemble Accuracy:", ensemble_accuracy)

Ensemble Accuracy: 0.8115008625646923


In [162]:
# Generate Confusion Matrix
confusion_matrix = sk_confusion_matrix(ensemble_predictions, Y_test)

# Print Confusion Matrix
print("Confusion Matrix:\n", confusion_matrix)

Confusion Matrix:
 [[6039 1362]
 [ 277 1017]]


In [163]:
from sklearn.metrics import classification_report, confusion_matrix
classification_report= classification_report(ensemble_predictions,Y_test)

# Print the report
print("Classification Report:\n", classification_report)

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.82      0.88      7401
           1       0.43      0.79      0.55      1294

    accuracy                           0.81      8695
   macro avg       0.69      0.80      0.72      8695
weighted avg       0.88      0.81      0.83      8695



In [168]:
input_data = [
    [0, 0.5, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1,
     0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1,
     0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0]
]

# Assuming ensemble_classifier is your VotingClassifier instance
predictions = ensemble_classifier.predict(input_data)

print(predictions)

[0]


