# Logistic Regression Exercise


Task: You are walking in the forest, spot an iris, and measure its sepal length, sepal width, petal length and petal width: 4.8, 2.5, 5.3, 2.4

Is this an Iris Virginica or not? 

In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

## Load Dataset

In [2]:
# Import the iris loader from scikit-learn and load the dataset into `iris`
from sklearn.datasets import load_iris
iris = load_iris()

In [3]:
# Display the whole object returned by load_iris(); the output begins with its 'data' array
iris

{'data': array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
  

In [4]:
# Wrap the raw measurement array in a DataFrame and preview the first five rows
data = pd.DataFrame(data=iris.data)
data.head(5)

Unnamed: 0,0,1,2,3
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [5]:
# Give the four measurement columns readable names, then preview the first five rows again
data.columns = [
    'sepal_length',
    'sepal_width',
    'petal_length',
    'petal_width',
]
data.head(5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [6]:
# Build a one-column DataFrame ('target') holding the class label of every sample.
# Class labels: 0 = Iris Setosa, 1 = Iris Versicolour, 2 = Iris Virginica
target = pd.DataFrame(iris.target, columns=['target'])
target

Unnamed: 0,target
0,0
1,0
2,0
3,0
4,0
...,...
145,2
146,2
147,2
148,2


In [7]:
# Put the measurements and the target labels side by side in one DataFrame
df = pd.concat([data, target], axis='columns')
df.head(5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


## Create and fit Model

In [8]:
# Separate the feature columns (x) from the label column (y); df itself is left untouched
x = df.drop(columns='target')
y = df['target'].copy()

In [9]:
# Hold out 20% of the samples for testing and train on the remaining 80%.
# random_state fixes the shuffle; stratify=y keeps the class proportions in both parts.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
    stratify=y,
)

In [10]:
# Report the shape of each train/test split
for label, part in (
    ('x train', x_train),
    ('y train', y_train),
    ('x test', x_test),
    ('y test', y_test),
):
    print(label, part.shape)

x train (120, 4)
y train (120,)
x test (30, 4)
y test (30,)


In [11]:
# Standardize the features: the scaler is fitted on the training data only and
# the same fitted transformation is then applied to the test data.
# NOTE: x_train and x_test are rebound to their scaled versions here, so running
# this cell a second time would scale the already-scaled values again.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
train_scaled = scaler.fit_transform(x_train)
test_scaled = scaler.transform(x_test)
x_train = pd.DataFrame(train_scaled, columns=x_train.columns)
x_test = pd.DataFrame(test_scaled, columns=x_test.columns)

In [12]:
# Train a scikit-learn LogisticRegression on the training split and
# show its accuracy on the held-out test split
from sklearn.linear_model import LogisticRegression

model = LogisticRegression().fit(x_train, y_train)
model.score(x_test, y_test)

0.9666666666666667

In [13]:
# Predict a class for every test sample, then show the (standardized) test
# features followed by the predicted labels
predictions = model.predict(x_test)
print(x_test)
predictions

    sepal_length  sepal_width  petal_length  petal_width
0       1.707371    -0.391013      1.407265     0.774900
1      -1.156478     0.087778     -1.283919    -1.436003
2      -0.917824    -1.348595     -0.442924    -0.135472
3      -1.275805     0.805965     -1.227853    -1.305950
4      -1.037151     1.045361     -1.227853    -0.785738
5      -0.917824     1.045361     -1.339986    -1.305950
6       0.394774     0.805965      0.902668     1.425166
7       0.633428    -0.869804      0.846602     0.904953
8      -0.321189    -0.630408      0.622336     1.035006
9       1.110736    -0.630408      0.566270     0.254687
10     -0.559843     2.002943     -1.396052    -1.045844
11     -0.440516    -1.827386      0.117739     0.124634
12      1.110736    -0.151617      0.958734     1.165059
13      0.156120    -2.066782      0.117739    -0.265525
14     -0.201862    -1.348595      0.678403     1.035006
15     -0.201862     3.199921     -1.283919    -1.045844
16      1.468717    -0.151617  

array([2, 0, 1, 0, 0, 0, 2, 2, 2, 1, 0, 1, 2, 1, 2, 0, 2, 1, 1, 2, 1, 1,
       0, 0, 2, 1, 0, 0, 1, 1])

In [14]:
# Show the confusion matrix for the test predictions, with classes ordered 2, 1, 0.
# Rows are the labels from y_test, columns are the predicted labels.
# The model misclassified one sample (actual class 2 predicted as class 1).
from sklearn.metrics import confusion_matrix

class_order = [2, 1, 0]
cm = confusion_matrix(y_test, predictions, labels=class_order)
pd.DataFrame(cm, index=class_order, columns=class_order)

Unnamed: 0,2,1,0
2,9,1,0
1,0,10,0
0,0,0,10


## Predict class for Iris found

In [15]:
# Build a one-row DataFrame from the raw forest measurements (4.8, 2.5, 5.3, 2.4),
# using the same column order as the model's features, then standardize it with
# the scaler that was fitted on the training data.
# The frame is constructed directly rather than by enlarging an empty slice of
# x_test with .loc, which triggered pandas' SettingWithCopyWarning.
forest_iris_raw = pd.DataFrame([[4.8, 2.5, 5.3, 2.4]], columns=x_test.columns)
test = pd.DataFrame(scaler.transform(forest_iris_raw), columns=forest_iris_raw.columns)
test


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,-1.275805,-1.348595,0.846602,1.555219


In [16]:
# Predict the class of the iris found in the forest and report the result.
# Class labels: 0 = Iris Setosa, 1 = Iris Versicolour, 2 = Iris Virginica
# NOTE: this rebinds `predictions`, which previously held the test-set predictions.
predictions = model.predict(test)
if predictions[0] == 2:
    print("Hooray! We found an Iris Virginica. Our Model predicted class", predictions[0])
else:
    print("Sorry, the Iris found is not an Iris Virginica.")

Hooray! We found an Iris Virginica. Our Model predicted class 2
