In [518]:
import warnings
# Set the warnings to be ignored
warnings.filterwarnings('ignore')

import os
import sys
import logging
import pandas as pd 
import numpy as np 
from sklearn.linear_model  import Ridge,Lasso,RidgeCV, LassoCV, ElasticNet, ElasticNetCV, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns
import pickle

In [519]:
df = pd.read_csv("D:\Data-Science-D-drive\sensor_placement_LogisticRegression\data\sensor_placement\df_resampled.csv")

In [520]:
df.head()

Unnamed: 0,Time,avg_rss12,var_rss12,avg_rss13,var_rss13,avg_rss23,var_rss23,Output
0,1.069633,0.766911,-0.431809,2.254039,0.127606,2.528193,-0.795727,bending1
1,1.076813,-0.590542,-0.388056,-1.512858,1.40578,1.893224,-0.070364,bending1
2,1.091174,-0.500045,-0.431809,-0.591224,-0.609801,1.321752,-0.740403,bending1
3,1.141435,0.721663,-0.247077,1.814568,1.092381,2.59169,-0.900229,bending1
4,1.148616,-0.138058,-0.524174,0.119464,0.029285,2.337703,-0.611313,bending1


# Loading Dependent and Independent Features

In [521]:
x = pd.read_csv("D:\Data-Science-D-drive\sensor_placement_LogisticRegression\data\sensor_placement\X_scaled_resampled.csv")

In [522]:
x.head()

Unnamed: 0,Time,avg_rss12,var_rss12,avg_rss13,var_rss13,avg_rss23,var_rss23
0,1.069633,0.766911,-0.431809,2.254039,0.127606,2.528193,-0.795727
1,1.076813,-0.590542,-0.388056,-1.512858,1.40578,1.893224,-0.070364
2,1.091174,-0.500045,-0.431809,-0.591224,-0.609801,1.321752,-0.740403
3,1.141435,0.721663,-0.247077,1.814568,1.092381,2.59169,-0.900229
4,1.148616,-0.138058,-0.524174,0.119464,0.029285,2.337703,-0.611313


In [523]:
y = pd.read_csv("D:\Data-Science-D-drive/sensor_placement_LogisticRegression/data/sensor_placement/Y_resampled.csv")

In [524]:
y.head()

Unnamed: 0,Output
0,0
1,0
2,0
3,0
4,0


In [525]:
x = x.values

In [526]:
y = y.values

In [527]:
len(np.unique(y))

7

# Train Test Split:

In [528]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.30, random_state = 144)

## Stratified Sampling: 

Maintain class distribution in each subset.

In [529]:
np.unique(y_train)

array([0, 1, 2, 3, 4, 5, 6], dtype=int64)

In [530]:
len(np.unique(y_train))

7

In [531]:
len(np.unique(y_test))

7

# Model Selection:

## Newton-CG

### l2:

In [532]:
logr_newcg_l2 = LogisticRegression(verbose = 1, solver = 'newton-cg', penalty = 'l2')

In [533]:
logr_newcg_l2.fit(x_train,y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.9s finished


In [534]:
logr_newcg_l2.predict([x_test[1]])

array([2], dtype=int64)

In [535]:
y_test[1]

array([2], dtype=int64)

In [536]:
y_pred_newcg_l2 = logr_newcg_l2.predict(x_test)

In [537]:
confusion_matrix(y_test,y_pred_newcg_l2)

array([[1657,   29,   24,    0,  157,   41,    0],
       [ 127, 1434,   67,  103,   33,    5,   17],
       [  34,   82, 1055,    5,   87,   16,  561],
       [   0,   74,   11, 1658,   21,  102,    8],
       [ 387,  143,   41,  218,  547,  482,    4],
       [  72,    9,   37,  141,  408, 1181,    6],
       [   3,   77,  646,    3,   11,    3, 1219]], dtype=int64)

## Lib-Linear

In [538]:
# Liblinear wont perform well here as its not advised for multi-class classification
logr_liblinear = LogisticRegression(verbose=1,solver='liblinear')

In [539]:
logr_liblinear.fit(x_train,y_train)

[LibLinear]

In [540]:
logr_liblinear.predict([x_test[1]])

array([2], dtype=int64)

In [541]:
y_test[1]

array([2], dtype=int64)

In [542]:
logr_liblinear

In [543]:
y_pred_liblinear = logr_liblinear.predict(x_test)
y_pred_liblinear

array([3, 2, 3, ..., 4, 0, 6], dtype=int64)

In [544]:
confusion_matrix(y_test,y_pred_liblinear)

array([[1684,   32,   14,    1,  104,   73,    0],
       [ 130, 1428,   56,  126,   16,   13,   17],
       [  54,  120,  890,   11,   58,   30,  677],
       [   0,   90,   10, 1693,    8,   66,    7],
       [ 471,  172,   48,  261,  275,  588,    7],
       [  98,   15,   45,  193,  257, 1239,    7],
       [   2,  108,  463,    3,    5,    8, 1373]], dtype=int64)

## Default-lbfgs:

In [545]:
logr = LogisticRegression(verbose=1)

In [546]:
logr.fit(x_train,y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.7s finished


In [547]:
logr

In [548]:
y_pred_default = logr.predict(x_test)

In [549]:
y_pred_default

array([3, 2, 3, ..., 4, 0, 6], dtype=int64)

In [550]:
confusion_matrix(y_test,y_pred_default)

array([[1657,   29,   24,    0,  157,   41,    0],
       [ 127, 1434,   67,  103,   33,    5,   17],
       [  34,   82, 1055,    5,   87,   16,  561],
       [   0,   74,   11, 1658,   21,  102,    8],
       [ 387,  143,   41,  218,  547,  482,    4],
       [  72,    9,   37,  141,  407, 1182,    6],
       [   3,   77,  646,    3,   11,    3, 1219]], dtype=int64)

## Automated

In [551]:
solver_params = {
    'lbfgs': ['l2', None],
    'liblinear': ['l1', 'l2'],
    'newton-cg': ['l2', None],
    'newton-cholesky': ['l2', None],
    'sag': ['l2', None],
    'saga': ['elasticnet', 'l1', 'l2', None]
}

In [552]:
solver_list = list(solver_params.keys())

In [553]:
solver_list

['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']

In [554]:
penalty_list = list(solver_params.values())

In [555]:
penalty_list

[['l2', None],
 ['l1', 'l2'],
 ['l2', None],
 ['l2', None],
 ['l2', None],
 ['elasticnet', 'l1', 'l2', None]]

In [556]:
total_length = 0
for inner_list in penalty_list:
    total_length = total_length + len(inner_list)

In [557]:
total_length

14

In [558]:
for solver in solver_list:
    print(solver)

lbfgs
liblinear
newton-cg
newton-cholesky
sag
saga


In [559]:
for penalty in penalty_list[0]:
    print(penalty)

l2
None


In [560]:
solver_params

{'lbfgs': ['l2', None],
 'liblinear': ['l1', 'l2'],
 'newton-cg': ['l2', None],
 'newton-cholesky': ['l2', None],
 'sag': ['l2', None],
 'saga': ['elasticnet', 'l1', 'l2', None]}

In [561]:
for solver_f in solver_params:
    print(solver_f)
    #for i in range(len(solver_params[solver_f])):
    for penalty_f in solver_params[solver_f]:
        print(penalty_f)

lbfgs
l2
None
liblinear
l1
l2
newton-cg
l2
None
newton-cholesky
l2
None
sag
l2
None
saga
elasticnet
l1
l2
None


### Calculation:

In [562]:
"""
# Calculate confusion matrix
cm = confusion_matrix(y_true, y_pred)

# Display the confusion matrix
print("Confusion Matrix:")
print(cm)

# Plot the confusion matrix using seaborn
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=range(7), yticklabels=range(7))
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Confusion Matrix")
plt.show()

# Additional metrics
accuracy = metrics.accuracy_score(y_true, y_pred)
precision = metrics.precision_score(y_true, y_pred, average='weighted')
recall = metrics.recall_score(y_true, y_pred, average='weighted')
f1_score = metrics.f1_score(y_true, y_pred, average='weighted')

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1_score:.4f}")
"""

'\n# Calculate confusion matrix\ncm = confusion_matrix(y_true, y_pred)\n\n# Display the confusion matrix\nprint("Confusion Matrix:")\nprint(cm)\n\n# Plot the confusion matrix using seaborn\nplt.figure(figsize=(8, 6))\nsns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=range(7), yticklabels=range(7))\nplt.xlabel("Predicted")\nplt.ylabel("True")\nplt.title("Confusion Matrix")\nplt.show()\n\n# Additional metrics\naccuracy = metrics.accuracy_score(y_true, y_pred)\nprecision = metrics.precision_score(y_true, y_pred, average=\'weighted\')\nrecall = metrics.recall_score(y_true, y_pred, average=\'weighted\')\nf1_score = metrics.f1_score(y_true, y_pred, average=\'weighted\')\n\nprint(f"Accuracy: {accuracy:.4f}")\nprint(f"Precision: {precision:.4f}")\nprint(f"Recall: {recall:.4f}")\nprint(f"F1 Score: {f1_score:.4f}")\n'

In [563]:
models = {}
predictions = {}
conf_m_list = {}

for solver_f in solver_params:
    for penalty_f in solver_params[solver_f]:
        model_name = f"{solver_f}_{penalty_f}"
        
        if penalty_f == 'elasticnet':
            # Specify a value for l1_ratio, e.g., 0.5 (you can adjust this value)
            l1_ratio_value = 0.5
        else:
            # If penalty is not elasticnet, set l1_ratio to None
            l1_ratio_value = None
        
        models[model_name] = LogisticRegression(verbose=1, solver=solver_f, penalty=penalty_f, l1_ratio=l1_ratio_value)
        models[model_name].fit(x_train, y_train)
        
        pred = f"y_pred_{solver_f}_{penalty_f}"
        predictions[pred] = models[model_name].predict(x_test)
        predictions[pred] = predictions[pred].reshape(-1,1)
        
        conf_m = f"confm{solver_f}_{penalty_f}"
        conf_m_list[conf_m] = confusion_matrix(y_test,predictions[pred])

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.0s finished


[LibLinear][LibLinear]

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Newton iter=1
  Check Convergence
    1. max |gradient| 0.08081939195860625 <= 0.0001
Newton iter=2
  Check Convergence
    1. max |gradient| 0.033050454116673136 <= 0.0001
Newton iter=3
  Check Convergence
    1. max |gradient| 0.013369321605954488 <= 0.0001
Newton iter=4
  Check Convergence
    1. max |gradient| 0.004489517833530798 <= 0.0001
Newton iter=5
  Check Convergence
    1. max |gradient| 0.0010136922624206862 <= 0.0001
Newton iter=6
  Check Convergence
    1. max |gradient| 0.00012232196364289128 <= 0.0001
Newton iter=7
  Check Convergence
    1. max |gradient| 3.196774448739338e-06 <= 0.0001
    2. Newton decrement 5.518555613160944e-06 <= 0.0001
  Solver did converge at loss = 0.16808671056012006.
Newton iter=1
  Check Convergence
    1. max |gradient| 0.08317947282223107 <= 0.0001
Newton iter=2
  Check Convergence
    1. max |gradient| 0.027890923331777583 <= 0.0001
Newton iter=3
  Check Convergence
    1. max |gradient| 0.008643381348009956 <= 0.0001
Newton iter=4
  Che

[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Newton iter=1
  Check Convergence
    1. max |gradient| 0.06913591193128552 <= 0.0001
Newton iter=2
  Check Convergence
    1. max |gradient| 0.018150161149654287 <= 0.0001
Newton iter=3
  Check Convergence
    1. max |gradient| 0.0034352484626171903 <= 0.0001
Newton iter=4
  Check Convergence
    1. max |gradient| 0.00021265163301065258 <= 0.0001
Newton iter=5
  Check Convergence
    1. max |gradient| 9.103931898719539e-07 <= 0.0001
    2. Newton decrement 1.2136939356784137e-06 <= 0.0001
  Solver did converge at loss = 0.30560948289180806.
Newton iter=1
  Check Convergence
    1. max |gradient| 0.08390172710314409 <= 0.0001
Newton iter=2
  Check Convergence
    1. max |gradient| 0.033309267315075476 <= 0.0001
Newton iter=3
  Check Convergence
    1. max |gradient| 0.013149524779894443 <= 0.0001
Newton iter=4
  Check Convergence
    1. max |gradient| 0.004354682224616898 <= 0.0001
Newton iter=5
  Check Convergence
    1. max |gradient| 0.0008828203101153394 <= 0.0001
Newton iter=6
  C

[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


convergence after 33 epochs took 1 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.9s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


convergence after 43 epochs took 1 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.9s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


convergence after 77 epochs took 4 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


convergence after 32 epochs took 2 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.8s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


convergence after 17 epochs took 1 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


convergence after 19 epochs took 0 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.7s finished


In [564]:
models

{'lbfgs_l2': LogisticRegression(verbose=1),
 'lbfgs_None': LogisticRegression(penalty=None, verbose=1),
 'liblinear_l1': LogisticRegression(penalty='l1', solver='liblinear', verbose=1),
 'liblinear_l2': LogisticRegression(solver='liblinear', verbose=1),
 'newton-cg_l2': LogisticRegression(solver='newton-cg', verbose=1),
 'newton-cg_None': LogisticRegression(penalty=None, solver='newton-cg', verbose=1),
 'newton-cholesky_l2': LogisticRegression(solver='newton-cholesky', verbose=1),
 'newton-cholesky_None': LogisticRegression(penalty=None, solver='newton-cholesky', verbose=1),
 'sag_l2': LogisticRegression(solver='sag', verbose=1),
 'sag_None': LogisticRegression(penalty=None, solver='sag', verbose=1),
 'saga_elasticnet': LogisticRegression(l1_ratio=0.5, penalty='elasticnet', solver='saga', verbose=1),
 'saga_l1': LogisticRegression(penalty='l1', solver='saga', verbose=1),
 'saga_l2': LogisticRegression(solver='saga', verbose=1),
 'saga_None': LogisticRegression(penalty=None, solver='sag

In [492]:
solver_params

{'lbfgs': ['l2', None],
 'liblinear': ['l1', 'l2'],
 'newton-cg': ['l2', None],
 'newton-cholesky': ['l2', None],
 'sag': ['l2', None],
 'saga': ['elasticnet', 'l1', 'l2', None]}

In [565]:
conf_m_list["confmlbfgs_None"]

array([[1658,   29,   24,    0,  156,   41,    0],
       [ 127, 1434,   67,  103,   33,    5,   17],
       [  34,   82, 1055,    5,   87,   16,  561],
       [   0,   74,   11, 1658,   21,  102,    8],
       [ 385,  143,   40,  218,  549,  483,    4],
       [  72,    9,   37,  140,  408, 1182,    6],
       [   3,   77,  646,    3,   11,    3, 1219]], dtype=int64)

In [588]:
predictions["y_pred_lbfgs_l2"]

array([[3],
       [2],
       [3],
       ...,
       [4],
       [0],
       [6]], dtype=int64)

## Model Evaluation:

In [582]:
def model_eval(y_test, model_penalty, conf_dict_name, pred_dict_name):
    """
    Evaluate the performance of a predictive model using various metrics.

    Parameters:
    - y_test (array-like): Ground test labels.
    - model_penalty (str): The penalty associated with the model, used to retrieve the confusion matrix and prediction made against that model_penalty combination.
    - conf_dict_name (dict): Dictionary containing confusion matrices with keys formatted as "confm{model_penalty}".
    - pred_dict_name (dict): Dictionary containing model predictions with keys formatted as "y_pred_{model_penalty}".

    Returns:
    dict: A dictionary containing the evaluation metrics including Accuracy, Precision, Recall, and F1 Score.

    Notes:
    - The function uses the confusion matrix from conf_dict_name to calculate performance metrics.
    - The confusion matrix is flattened for binary or multiclass classification.
    - Metrics are calculated using scikit-learn functions: accuracy_score, precision_score, recall_score, f1_score.

    Example:
    >>> y_test = [0, 1, 1, 0, 1, 0]
    >>> model_penalty = "l1"
    >>> conf_dict_name = {"confmL1": [[2, 1], [1, 2]]}
    >>> pred_dict_name = {"y_pred_l1": [0, 1, 1, 0, 1, 0]}
    >>> result = model_eval(y_test, model_penalty, conf_dict_name, pred_dict_name)
    >>> print(result)
    {'Accuracy': 0.6667, 'Precision': 0.6667, 'Recall': 0.6667, 'F1 Score': 0.6667}
    """
    # Example for one confusion matrix
    conf_matrix = conf_dict_name[f"confm{model_penalty}"]

    # Flatten the matrix if needed
    y_test = y_test.flatten()
    y_pred = pred_dict_name[f"y_pred_{model_penalty}"].flatten()

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    result = round(accuracy, 4)

    return result


In [583]:
model_eval(y_test,"lbfgs_None",conf_m_list,predictions)

0.6711

In [606]:
scores = {}
for pred in predictions:
    model_penalty = f"{pred.split('_')[-2]}_{pred.split('_')[-1]}"
    result = model_eval(y_test,model_penalty,conf_m_list,predictions)
    scores.update({model_penalty:result})

In [607]:
scores

{'lbfgs_l2': 0.6709,
 'lbfgs_None': 0.6711,
 'liblinear_l1': 0.658,
 'liblinear_l2': 0.6578,
 'newton-cg_l2': 0.6708,
 'newton-cg_None': 0.6711,
 'newton-cholesky_l2': 0.658,
 'newton-cholesky_None': 0.6581,
 'sag_l2': 0.6709,
 'sag_None': 0.671,
 'saga_elasticnet': 0.6709,
 'saga_l1': 0.671,
 'saga_l2': 0.6708,
 'saga_None': 0.6711}

### DF Report:

In [621]:
df_scores = pd.DataFrame(list(scores.items()),columns = ["Model Penalty","Score"])

In [619]:
df_scores

Unnamed: 0,Model Penalty,Score
0,lbfgs_l2,0.6709
1,lbfgs_None,0.6711
2,liblinear_l1,0.658
3,liblinear_l2,0.6578
4,newton-cg_l2,0.6708
5,newton-cg_None,0.6711
6,newton-cholesky_l2,0.658
7,newton-cholesky_None,0.6581
8,sag_l2,0.6709
9,sag_None,0.671


In [None]:
df_scores.to_csv(r"D:\Data-Science-D-drive\sensor_placement_LogisticRegression\notebooks\

#### DF Report Calculations:

In [624]:
# Filter rows where "Macro_Avg_Accuracy" column doesn't have null values
max_index = df_scores["Score"].idxmax()
max_model_name = df_scores.loc[max_index]["Model Penalty"]
max_model_acc = df_scores.loc[max_index]["Score"]*100

min_index = df_scores["Score"].idxmin()
min_model_name = df_scores.loc[min_index]["Model Penalty"]
min_model_acc = df_scores.loc[min_index]["Score"]*100

print("Min Accuracy Model: ", min_model_name)
print("Accuracy: ",min_model_acc)

print("="*60)
print("Max Accuracy Model: ",max_model_name)
print("Accuracy: ",max_model_acc)

Min Accuracy Model:  liblinear_l2
Accuracy:  65.78
Max Accuracy Model:  lbfgs_None
Accuracy:  67.11


In [626]:
max_y_pred = predictions[f"y_pred_{max_model_name}"]

In [628]:
# Assuming y_true and max_y_pred are your arrays
print("y_test shape:", np.array(y_test).shape)
print("max_y_pred shape:", np.array(max_y_pred).shape)

y_test shape: (13046, 1)
max_y_pred shape: (13046, 1)


In [629]:
auc = roc_auc_score(y_test,max_y_pred,multi_class='ovr')

ValueError: Target scores need to be probabilities for multiclass roc_auc, i.e. they should sum up to 1.0 over classes