Logistic regression for Iris dataset

In [43]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [44]:
# Load the Iris measurements from the CSV file and echo the frame for a quick
# visual sanity check.
data = pd.read_csv(r'/IrisNew.csv')
print(data)

# Keep the four measurements plus the label, in a fixed order. Plain column
# selection raises KeyError when a header is missing or misspelled, whereas
# pd.DataFrame(data, columns=...) would silently create an all-NaN column.
iris_columns = ["Sepal Length", "Sepal Width", "Petal Length", "Petal Width", "Class"]
df = data[iris_columns]

     Sepal Length  Sepal Width  Petal Length  Petal Width      Class
0             4.3          3.0           1.1          0.1     setosa
1             4.4          2.9           1.4          0.2     setosa
2             4.4          3.0           1.3          0.2     setosa
3             4.4          3.2           1.3          0.2     setosa
4             4.5          2.3           1.3          0.3     setosa
..            ...          ...           ...          ...        ...
145           7.7          2.6           6.9          2.3  virginica
146           7.7          2.8           6.7          2.0  virginica
147           7.7          3.0           6.1          2.3  virginica
148           7.7          3.8           6.7          2.2  virginica
149           7.9          3.8           6.4          2.0  virginica

[150 rows x 5 columns]


In [45]:
# Independent variables: the first four columns, as a float array. Calling
# df.values on the whole frame mixes the numeric columns with the string label
# column and produces an object-dtype array, so the feature columns are
# selected first and converted explicitly.
X = df.iloc[:, 0:4].to_numpy(dtype=float)
# Dependent variable: the fifth column (the class label).
Y = df.iloc[:, 4].to_numpy()

# Hold out 30% of the rows for testing and train on the remaining 70%.
# random_state pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=100)



In [46]:
# Standardize the features. The scaler is fitted on the training split only and
# then applied unchanged to the test split, so no test-set statistics are used
# during training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Create and train the logistic regression model. The `multi_class` argument is
# deprecated in recent scikit-learn releases; with the lbfgs solver and more
# than two classes the default behaviour is already the multinomial (softmax)
# formulation, so omitting it keeps the same model without the deprecation.
logistic_reg = LogisticRegression(solver='lbfgs')
logistic_reg.fit(X_train, y_train)

# Predict a class label for every row of the held-out test split.
y_pred = logistic_reg.predict(X_test)
print(y_pred)

# Evaluate the predictions against the true test labels.
accuracy = accuracy_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", confusion)
print("Classification Report:\n", classification_rep)


['virginica' 'setosa' 'virginica' 'setosa' 'virginica' 'virginica'
 'versicolor' 'setosa' 'virginica' 'setosa' 'setosa' 'versicolor' 'setosa'
 'setosa' 'virginica' 'versicolor' 'versicolor' 'virginica' 'virginica'
 'virginica' 'versicolor' 'setosa' 'virginica' 'setosa' 'versicolor'
 'virginica' 'versicolor' 'setosa' 'versicolor' 'versicolor' 'virginica'
 'versicolor' 'virginica' 'setosa' 'versicolor' 'setosa' 'versicolor'
 'versicolor' 'virginica' 'virginica' 'versicolor' 'versicolor'
 'virginica' 'virginica' 'setosa']
Accuracy: 0.9555555555555556
Confusion Matrix:
 [[13  0  0]
 [ 0 14  1]
 [ 0  1 16]]
Classification Report:
               precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        13
  versicolor       0.93      0.93      0.93        15
   virginica       0.94      0.94      0.94        17

    accuracy                           0.96        45
   macro avg       0.96      0.96      0.96        45
weighted avg       0.96      0.96      0

In [47]:
# Rank the classes for every test sample by predicted probability, most likely
# class first, using the model's predict_proba output.
y_probabilities = logistic_reg.predict_proba(X_test)

# y_probabilities is expected to hold one row per sample and one column per
# class. argsort is ascending, so the probabilities are negated to obtain
# column indices in descending order of probability.
ranking = np.argsort(np.negative(y_probabilities), axis=1)
top_three = ranking[:, :3]
print("Ranking of Results (Top 3 classes for each sample):\n", top_three)


Ranking of Results (Top 3 classes for each sample):
 [[2 1 0]
 [0 1 2]
 [2 1 0]
 [0 1 2]
 [2 1 0]
 [2 1 0]
 [1 0 2]
 [0 1 2]
 [2 1 0]
 [0 1 2]
 [0 1 2]
 [1 2 0]
 [0 1 2]
 [0 1 2]
 [2 1 0]
 [1 2 0]
 [1 0 2]
 [2 1 0]
 [2 1 0]
 [2 1 0]
 [1 2 0]
 [0 1 2]
 [2 1 0]
 [0 1 2]
 [1 2 0]
 [2 1 0]
 [1 0 2]
 [0 1 2]
 [1 2 0]
 [1 2 0]
 [2 1 0]
 [1 2 0]
 [2 1 0]
 [0 1 2]
 [1 0 2]
 [0 1 2]
 [1 2 0]
 [1 2 0]
 [2 1 0]
 [2 1 0]
 [1 2 0]
 [1 2 0]
 [2 1 0]
 [2 1 0]
 [0 1 2]]


In [48]:
# Two unseen flowers, given in the same four-feature order used for training.
new_data = np.array([[5.8, 2.8, 5.1, 2.4], [6.0, 2.2, 4.0, 1.0]])

# Standardize the new data using the same (already fitted) scaler as before.
new_data_standardized = scaler.transform(new_data)

# Predict probabilities for each class.
predicted_probabilities = logistic_reg.predict_proba(new_data_standardized)

# Column k of predict_proba corresponds to logistic_reg.classes_[k]. Taking the
# labels from the fitted model guarantees the names line up with the
# probability columns, instead of relying on a hand-written list happening to
# be in the same order.
class_names = list(logistic_reg.classes_)

# Rank the classes by predicted probability for each record (descending).
rankings = np.argsort(-predicted_probabilities, axis=1)

# Print the ranking for each new data record.
for i, record_ranking in enumerate(rankings):
    print(f"Ranking for New Record {i + 1}:")
    for j, rank in enumerate(record_ranking):
        class_name = class_names[rank]
        probability = predicted_probabilities[i][rank]
        print(f"Rank {j + 1}: {class_name} (Probability: {probability:.4f})")

Ranking for New Record 1:
Rank 1: virginica (Probability: 0.9621)
Rank 2: versicolor (Probability: 0.0376)
Rank 3: setosa (Probability: 0.0002)
Ranking for New Record 2:
Rank 1: versicolor (Probability: 0.9507)
Rank 2: virginica (Probability: 0.0418)
Rank 3: setosa (Probability: 0.0074)


Logistic Regression for Wiki Pass/Fail example data

In [49]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Hours of study per student, reshaped into a single-feature column because
# the estimator expects a 2-D feature matrix.
hours_studied = [0.50, 0.75, 1.00, 1.25, 1.50, 1.75, 1.75, 2.00, 2.25, 2.50,
                 2.75, 3.00, 3.25, 3.50, 4.00, 4.25, 4.50, 4.75, 5.00, 5.50]
X = np.array(hours_studied).reshape(-1, 1)

# Exam outcome per student: 1 = pass, 0 = fail.
y = np.array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1])

# 80/20 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a logistic regression with default settings on the training portion.
logistic_reg = LogisticRegression().fit(X_train, y_train)

# Predicted labels for the held-out portion.
y_pred = logistic_reg.predict(X_test)

# Score the predictions against the true test labels.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(y_pred)
for metric_label, metric_value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
):
    print(metric_label, metric_value)


[0 1 1 0]
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0


In [50]:
# Estimated probability of passing the exam for several amounts of study time.
study_hours = np.array([1, 2,2.7, 3, 4, 5]).reshape(-1, 1)
# Column 1 of predict_proba is read as the probability of the "pass" outcome
# (label 1) -- assumes logistic_reg is the pass/fail model fitted on 0/1 labels.
probabilities = logistic_reg.predict_proba(study_hours)[:, 1]
for row, pass_probability in zip(study_hours, probabilities):
    print(f"Study Hours: {row[0]}, Probability of Passing: {pass_probability:.4f}")



Study Hours: 1.0, Probability of Passing: 0.1531
Study Hours: 2.0, Probability of Passing: 0.3214
Study Hours: 2.7, Probability of Passing: 0.4817
Study Hours: 3.0, Probability of Passing: 0.5537
Study Hours: 4.0, Probability of Passing: 0.7647
Study Hours: 5.0, Probability of Passing: 0.8949


Model evaluation

In [51]:
import numpy as np
import statsmodels.api as sm

# Hours of study, passed through add_constant so the design matrix carries a
# constant (intercept) column alongside the hours.
X = sm.add_constant(
    np.array([0.50, 0.75, 1.00, 1.25, 1.50, 1.75, 1.75, 2.00, 2.25, 2.50,
              2.75, 3.00, 3.25, 3.50, 4.00, 4.25, 4.50, 4.75, 5.00, 5.50])
)

# Exam outcome per observation: 1 = pass, 0 = fail.
y = np.array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1])

# Fit the logit model with statsmodels.
logit_model = sm.Logit(y, X)
result = logit_model.fit()

# The summary table reports coefficients together with their standard errors,
# z-values and p-values.
summary = result.summary()
print(summary)


Optimization terminated successfully.
         Current function value: 0.401494
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                   20
Model:                          Logit   Df Residuals:                       18
Method:                           MLE   Df Model:                            1
Date:                Thu, 21 Sep 2023   Pseudo R-squ.:                  0.4208
Time:                        23:32:58   Log-Likelihood:                -8.0299
converged:                       True   LL-Null:                       -13.863
Covariance Type:            nonrobust   LLR p-value:                 0.0006365
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -4.0777      1.761     -2.316      0.021      -7.529      -0.626
x1             1.5046      0.

In [52]:
import numpy as np
from sklearn.linear_model import LogisticRegression

# Hours of study per student, reshaped into a single-feature column because
# the estimator expects a 2-D feature matrix.
hours_studied = [0.50, 0.75, 1.00, 1.25, 1.50, 1.75, 1.75, 2.00, 2.25, 2.50,
                 2.75, 3.00, 3.25, 3.50, 4.00, 4.25, 4.50, 4.75, 5.00, 5.50]
X = np.array(hours_studied).reshape(-1, 1)

# Exam outcome per student: 1 = pass, 0 = fail.
y = np.array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1])

# Fit a default logistic regression on all observations (no train/test split).
logistic_reg = LogisticRegression().fit(X, y)

# Study durations at which the pass probability is evaluated.
study_hours = np.array([1, 2, 2.7,3, 4, 5]).reshape(-1, 1)

# Column 1 of predict_proba is the probability of label 1 (pass).
probabilities = logistic_reg.predict_proba(study_hours)[:, 1]

# Report one line per queried duration.
for row, pass_probability in zip(study_hours, probabilities):
    print(f"Study Hours: {row[0]}, Probability of Passing: {pass_probability:.4f}")


Study Hours: 1.0, Probability of Passing: 0.1202
Study Hours: 2.0, Probability of Passing: 0.3010
Study Hours: 2.7, Probability of Passing: 0.4904
Study Hours: 3.0, Probability of Passing: 0.5760
Study Hours: 4.0, Probability of Passing: 0.8108
Study Hours: 5.0, Probability of Passing: 0.9311


In [53]:
import numpy as np
import statsmodels.api as sm

# Input data (hours of study).
X = np.array([0.50, 0.75, 1.00, 1.25, 1.50, 1.75, 1.75, 2.00, 2.25, 2.50, 2.75, 3.00, 3.25, 3.50, 4.00, 4.25, 4.50, 4.75, 5.00, 5.50])
X = sm.add_constant(X)  # Add a constant term (intercept) to the input data

# Target data (pass or fail, where 1 represents pass and 0 represents fail).
y = np.array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1])

# Fit the logistic regression model using statsmodels.
logit_model = sm.Logit(y, X)
result = logit_model.fit()

# Values of study hours for which to calculate probabilities.
study_hours = np.array([1, 2, 3, 4, 5])
# Build the prediction design matrix with the same [const, hours] layout used
# for fitting. has_constant='add' forces the intercept column to be added even
# when the query itself looks constant (a single value, or all values equal);
# the default 'skip' would drop it in that case and predict() would then
# receive a matrix with the wrong number of columns.
study_hours_with_const = sm.add_constant(study_hours, has_constant='add')

# Calculate probabilities of passing for the given study hours.
probabilities = result.predict(study_hours_with_const)

# Print the probabilities.
for hours, probability in zip(study_hours, probabilities):
    print(f"Study Hours: {hours}, Probability of Passing: {probability:.4f}")


Optimization terminated successfully.
         Current function value: 0.401494
         Iterations 7
Study Hours: 1, Probability of Passing: 0.0709
Study Hours: 2, Probability of Passing: 0.2557
Study Hours: 3, Probability of Passing: 0.6074
Study Hours: 4, Probability of Passing: 0.8744
Study Hours: 5, Probability of Passing: 0.9691
