# [9660] Logistic Regression 1
Data file:
* seaborn : iris

In [None]:
from datetime import datetime

# Record when the notebook was executed.
# "%D" and "%T" are platform-specific strftime shortcuts (not available on
# every OS), so spell them out with the portable directives they stand for:
#   %D -> %m/%d/%y     %T -> %H:%M:%S
run_time = datetime.now().strftime("%m/%d/%y %H:%M:%S")
print(f'Run time: {run_time}')

Run time: 09/24/24 12:11:17


### Import libraries

In [None]:
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

### Load data

In [None]:
# Ask seaborn for the names of the example data sets it can load
dataset_names = sns.get_dataset_names()
dataset_names

['anagrams',
 'anscombe',
 'attention',
 'brain_networks',
 'car_crashes',
 'diamonds',
 'dots',
 'dowjones',
 'exercise',
 'flights',
 'fmri',
 'geyser',
 'glue',
 'healthexp',
 'iris',
 'mpg',
 'penguins',
 'planets',
 'seaice',
 'taxis',
 'tips',
 'titanic']

In [None]:
# Load seaborn's "iris" example data set.
# `df` is the frame every later cell in this notebook reads from.
df = sns.load_dataset("iris")

Independent variables (in centimeters)
* sepal_length
* sepal_width
* petal_length
* petal_width

Dependent variable
* species
  * classes: setosa, versicolor, virginica (lowercase, exactly as they appear in the data)

### Examine data

In [None]:
# Size of the data set as (rows, columns)
df.shape

(150, 5)

In [None]:
# Review data: preview the first rows to check the column names and
# the kind of values each column holds
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [None]:
# Distinct target labels (classes) found in the species column
species_labels = df["species"].unique()
species_labels

array(['setosa', 'versicolor', 'virginica'], dtype=object)

In [None]:
# Count the rows per class to check whether the classes are balanced
df['species'].value_counts()

Unnamed: 0_level_0,count
species,Unnamed: 1_level_1
setosa,50
versicolor,50
virginica,50


### Prepare data

In [None]:
# Prepare data set for training.
# Select columns by name rather than by position, so the feature/target
# split cannot silently change if the column order of `df` ever differs.
X = df.drop(columns="species")   # independent variables - every column except the target
y = df["species"]                # dependent variable - the class label

In [None]:
# Hold out 20% of the rows for testing and train on the remaining 80%.
# Stratifying on y keeps the class proportions equal in both splits, and the
# fixed random_state makes the split repeatable from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

### Train Logistic Regression model

In [None]:
# Instantiate logistic regression model.
# No arguments are passed, so every hyperparameter keeps its default value.
model = LogisticRegression()

In [None]:
# Display model (default) hyperparameters.
# Take note of max_iter in the output; it is the setting changed when the
# model is trained a second time further down.
model.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'deprecated',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [None]:
# Train logistic regression model on the training split, still using the
# default hyperparameters (including max_iter=100, as listed by get_params).
# NOTE(review): the markdown cell that follows refers to a message printed by
# this fit - presumably a convergence warning; confirm when re-running.
model.fit(X_train, y_train)

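#### Look closely at the message above: it is most likely a convergence warning rather than a fatal error (the solver stopped at the default `max_iter` of 100), so the next cell retrains with a higher limit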

In [None]:
# Build a fresh model that allows up to 200 solver iterations
# (double the default of 100), then fit it on the same training split
model = LogisticRegression(max_iter=200)
model.fit(X=X_train, y=y_train)

#### Similar to linear regression, logistic regression calculates intercept and coefficient values
#### For multiclass (more than 2 classes) problems, logistic regression learns one set of parameters per class
For example, in this case where the dataset has 3 classes, sklearn's LogisticRegression:
* learns 3 sets of parameters (one per class), each with its own intercept and set of coefficients; with the default `lbfgs` solver these are fit jointly as a single multinomial (softmax) model rather than as 3 independent classifiers
* to make a prediction for a new data point, the decision function is evaluated for each class, and the class with the highest probability is chosen as the predicted class

In [None]:
# Report what the fitted model learned, one labelled attribute at a time.
# Each entry is printed as "<label>: <value>" followed by a blank line.
fitted_attributes = (
    ("classes", model.classes_),
    ("number of features", model.n_features_in_),
    ("features", model.feature_names_in_),
    ("intercept", model.intercept_),
    ("coefficents", model.coef_),
)
for label, value in fitted_attributes:
    print(f"{label}: {value}\n")

classes: ['setosa' 'versicolor' 'virginica']

number of features: 4

features: ['sepal_length' 'sepal_width' 'petal_length' 'petal_width']

intercept: [ 10.12545498   1.79786094 -11.92331592]

coefficents: [[-5.29592038e-01  8.27322710e-01 -2.34730421e+00 -9.93436525e-01]
 [ 5.29720064e-01 -3.04811706e-01 -1.70960320e-01 -8.56124216e-01]
 [-1.28026113e-04 -5.22511005e-01  2.51826453e+00  1.84956074e+00]]



### Test model

In [None]:
# Test the model: predict a class label for every row of the held-out
# test split (rows the model was not trained on)
predictions = model.predict(X_test)

# Print predictions (one predicted species name per test row)
print(predictions)

['setosa' 'virginica' 'versicolor' 'versicolor' 'setosa' 'versicolor'
 'setosa' 'setosa' 'virginica' 'versicolor' 'virginica' 'virginica'
 'virginica' 'versicolor' 'setosa' 'setosa' 'setosa' 'versicolor'
 'versicolor' 'virginica' 'setosa' 'virginica' 'versicolor' 'virginica'
 'virginica' 'virginica' 'versicolor' 'setosa' 'virginica' 'setosa']


### Model accuracy

In [None]:
# Score the model on the held-out test split; the score is a fraction,
# so convert it to a percentage rounded to 2 decimals for display
accuracy = model.score(X_test, y_test)
accuracy_pct = round(accuracy * 100, 2)
print(f"accuracy = {accuracy_pct} %")

accuracy = 96.67 %
