# Logistic Regression (Binary Classification)

In [None]:
import pandas as pd
from plotnine import *

##### We will use the iris data set

In [None]:
# Read the iris measurements from the CSV file.
iris = pd.read_csv("iris.csv")

# Plain-text preview of the first rows.
print(iris.head())

# Box plot of sepal width per species; as the cell's last expression it is rendered.
(
    ggplot(iris, aes(x="species", y='sepal_width'))
    + geom_boxplot()
)

##### Setosa flowers are more distinct from the other two species in terms of sepal width

##### We know that the data set contains 3 classes (species) of flower; let's turn it into a binary classification problem by adding an isSetosa attribute

In [None]:
# Binary target: 1 when the species is 'setosa', 0 otherwise.
iris['isSetosa'] = iris['species'].eq('setosa').map({False:0,True:1})

In [None]:
# Scatter the 0/1 target against sepal width.
(
    ggplot(iris, aes(x='sepal_width', y='isSetosa'))
    + geom_point()
)

##### Notice how the data is spread across sepal width, small sepal width means not Setosa, larger sepal width means Setosa

##### Let's see what will happen if we fit a Linear Regression model on this data

In [None]:
from sklearn.linear_model import LinearRegression

# Single-feature design matrix (kept 2-D, as scikit-learn expects) and a 1-D target.
sepal_width_X = iris[['sepal_width']]
is_setosa_y = iris['isSetosa']

lm = LinearRegression()
lm.fit(sepal_width_X, is_setosa_y)

# With a 1-D target, predict() returns a 1-D array, which fits a single column
# (the original 2-D target produced an (n, 1) array here).
iris['lm_fit'] = lm.predict(sepal_width_X)

lm.score(sepal_width_X, is_setosa_y) # r-squared value

##### And here is what the best fit line looks like

In [None]:
# Observed 0/1 target as points, with the linear model's fitted values drawn as a line.
(
    ggplot(iris, aes(x='sepal_width', y='isSetosa'))
    + geom_point()
    + geom_line(aes(y="lm_fit"))
)

##### As we can see, the linear regression model fails to fit the data, so we can now try the Logistic Regression model, which predicts the probability that a flower belongs to the Setosa species

##### Splitting data into training and testing sets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    iris[['sepal_width']],
    iris['isSetosa'],
    test_size=0.2,
    random_state=0,
)

# Shapes of the training pair, then of the test pair.
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(features.shape, target.shape)

##### And now loading the model and fitting it on the data

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with scikit-learn's default settings.
logmodel = LogisticRegression()

# Fit on the training split; as the cell's last expression, the returned value is displayed.
logmodel.fit(X=X_train, y=y_train)

##### Predicting species for our test set and using it to calculate the accuracy

In [None]:
from sklearn.metrics import accuracy_score

# Predicted labels for the held-out rows.
predictions = logmodel.predict(X_test)

# Accuracy of those predictions against the true test labels.
test_accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print(test_accuracy)

##### An important check for a model is to see the confusion matrix generated

In [None]:
from sklearn import metrics

# Confusion matrix of the true test labels against the predicted labels.
cm = metrics.confusion_matrix(y_true=y_test, y_pred=predictions)
print(cm)

##### The matrix tells us that the model correctly predicted non-Setosa for 19 out of 19 flowers, but correctly predicted Setosa for only 2 out of 11 flowers. So it is very good at predicting if a flower is not Setosa, which is quite what we wanted, although it still misses most of the actual Setosa flowers!

##### Here's a way to visualize the Confusion Matrix for binary classification

In [None]:
# plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2;
# ConfusionMatrixDisplay.from_estimator is its replacement.
from sklearn.metrics import ConfusionMatrixDisplay

# Keep the old name bound so any cell that still calls plot_confusion_matrix(...) keeps working.
plot_confusion_matrix = ConfusionMatrixDisplay.from_estimator

plot_confusion_matrix(logmodel,X_test,y_test)

##### How about we use sepal width and length attributes to predict the classification?

In [None]:
# 80/20 split using sepal width and sepal length as predictors of isSetosa.
X_train, X_test, y_train, y_test = train_test_split(
    iris[['sepal_width','sepal_length']],
    iris['isSetosa'],
    test_size=0.2,
    random_state=0,
)

# Shapes of the training pair, then of the test pair.
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(features.shape, target.shape)

In [None]:
# Fresh default logistic regression, fitted on the current training split
# (fit() returns the estimator itself, so construction and fitting can be chained).
logmodel = LogisticRegression().fit(X_train, y_train)

# Predict the held-out rows and report the accuracy against the true labels.
predictions = logmodel.predict(X_test)
test_accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print(test_accuracy)

##### That's a 100% accuracy!

In [None]:
# ConfusionMatrixDisplay.from_estimator replaces plot_confusion_matrix,
# which was removed in scikit-learn 1.2.
metrics.ConfusionMatrixDisplay.from_estimator(logmodel,X_test,y_test)

### We can even predict multiclass data using Logistic Regression!

##### Using all 4 attributes to predict the 3 species

In [None]:
# 80/20 split with the first four columns of the frame as predictors and the species label as target.
# NOTE(review): positional selection assumes the four measurements are the first four CSV columns — confirm.
X_train, X_test, y_train, y_test = train_test_split(
    iris.iloc[:, :4],
    iris['species'],
    test_size=0.2,
    random_state=0,
)

# Shapes of the training pair, then of the test pair.
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(features.shape, target.shape)

In [None]:
# Fresh default logistic regression, fitted on the current training split
# (fit() returns the estimator itself, so construction and fitting can be chained).
logmodel = LogisticRegression().fit(X_train, y_train)

# Predict the held-out rows and report the accuracy against the true labels.
predictions = logmodel.predict(X_test)
test_accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print(test_accuracy)

##### 100% accuracy!

In [None]:
# ConfusionMatrixDisplay.from_estimator replaces plot_confusion_matrix,
# which was removed in scikit-learn 1.2.
metrics.ConfusionMatrixDisplay.from_estimator(logmodel,X_test,y_test)

## Confusion Matrix (Measures)

### Recall
* TP / (TP+FN)
* Out of all the samples that are actually positive, how many did we predict correctly? Should be as high as possible

In [None]:
# Recall of the current predictions on the test labels, using average='weighted'.
metrics.recall_score(y_true=y_test, y_pred=predictions, average='weighted')

### Precision 
* TP / (TP + FP)
* Out of all the samples we predicted as positive, how many are actually positive?

In [None]:
# Precision of the current predictions on the test labels, using average='weighted'.
metrics.precision_score(y_true=y_test, y_pred=predictions, average='weighted')

### Accuracy
* (TP+TN) / Total
* Out of all samples, how many did we predict correctly?

In [None]:
# Accuracy of the current predictions on the test labels.
accuracy_score(y_true=y_test, y_pred=predictions)