# Logistic Regression

In [3]:
# Standard Headers
# You are welcome to add additional headers here if you wish, as long as they are
# built-in or included in requirements.txt
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Enable inline mode for matplotlib so that Jupyter displays graphs
%matplotlib inline

## Heart Dataset 

In this project we will work with a dataset of patients. 
We have access to 303 patients' data. The features are listed below. 

In [4]:
# Read the patient records from "Heart.csv" (path is relative to the
# notebook's working directory).
heart_csv_path = "Heart.csv"
heart_df = pd.read_csv(heart_csv_path)

# A bare trailing expression makes Jupyter render the frame as a table.
heart_df

Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,Target
0,63,1,typical,145,233,1,2,150,0,2.3,3,0.0,fixed,No
1,67,1,asymptomatic,160,286,0,2,108,1,1.5,2,3.0,normal,Yes
2,67,1,asymptomatic,120,229,0,2,129,1,2.6,2,2.0,reversable,Yes
3,37,1,nonanginal,130,250,0,0,187,0,3.5,3,0.0,normal,No
4,41,0,nontypical,130,204,0,2,172,0,1.4,1,0.0,normal,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,45,1,typical,110,264,0,0,132,0,1.2,2,0.0,reversable,Yes
299,68,1,asymptomatic,144,193,1,0,141,0,3.4,2,2.0,reversable,Yes
300,57,1,asymptomatic,130,131,0,0,115,1,1.2,2,1.0,reversable,Yes
301,57,0,nontypical,130,236,0,2,174,0,0.0,2,1.0,normal,Yes


**Age:** The person’s age in years

**Sex:** The person’s sex (1 = male, 0 = female)

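**ChestPain:** chest pain type. In the loaded file this column holds text labels (`typical`, `nontypical`, `nonanginal`, `asymptomatic`) rather than the numeric codes listed here: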

* Value 0: asymptomatic
* Value 1: atypical angina
* Value 2: non-anginal pain
* Value 3: typical angina

**RestBP:** The person’s resting blood pressure (mm Hg on admission to the hospital)

**Chol:** The person’s cholesterol measurement in mg/dl

**Fbs:** The person’s fasting blood sugar (> 120 mg/dl, 1 = true; 0 = false)

**RestECG:** resting electrocardiographic results

* Value 0: showing probable or definite left ventricular hypertrophy by Estes’ criteria
* Value 1: normal
* Value 2: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)

**MaxHR:** The person’s maximum heart rate achieved

**ExAng:** Exercise induced angina (1 = yes; 0 = no)

**Oldpeak:** ST depression induced by exercise relative to rest (‘ST’ relates to positions on the ECG plot. See more here)

**Slope:** the slope of the peak exercise ST segment

* 0: downsloping
* 1: flat
* 2: upsloping

**Ca:** The number of major vessels (0–3)

**Thal:** A blood disorder called thalassemia

* Value 0: NULL (dropped from the dataset previously)

* Value 1: fixed defect (no blood flow in some part of the heart)
* Value 2: normal blood flow
* Value 3: reversible defect (a blood flow is observed but it is not normal)

**Target:** Heart disease diagnosis (`Yes` = heart disease present, `No` = no heart disease)

# Q1
We want to use logistic regression to predict whether a patient has heart disease. The "Target" column in our dataset records the diagnosis: if the patient had heart disease, the patient's "Target" value is "Yes". Otherwise, "Target" is "No".

Prepare your data set for predicting heart disease ("Target" column) by using 3 features:

* Age of the patient (Column **"Age"**)
* Gender of the patient (male or female - Column **"Sex"**)
* Cholesterol level of the patient (Column **"Chol"**)

Split your data into 80% training data and 20% test data.

<!-- BEGIN QUESTION -->



In [5]:
# Build the feature matrix "X" from the three predictor columns and the
# label vector "Y" from the "Target" column, both as NumPy arrays.
feature_columns = ["Age", "Sex", "Chol"]
target_column = "Target"

X = heart_df.loc[:, feature_columns].to_numpy()
Y = heart_df.loc[:, target_column].to_numpy()

In [7]:
# Partition "X" and "Y" into training and testing subsets.
from sklearn.model_selection import train_test_split

# test_size=0.2 holds out 20% of the rows for testing; the fixed
# random_state makes the split repeatable from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=123,
)

# Show the training features that came out of the split.
print(X_train)

[[ 57   0 303]
 [ 58   0 319]
 [ 54   1 206]
 [ 56   1 221]
 [ 48   1 245]
 [ 65   1 282]
 [ 50   0 254]
 [ 45   0 236]
 [ 60   1 206]
 [ 64   1 212]
 [ 57   1 274]
 [ 54   1 309]
 [ 41   0 268]
 [ 46   0 177]
 [ 53   1 246]
 [ 43   1 177]
 [ 45   1 308]
 [ 58   0 283]
 [ 66   1 246]
 [ 77   1 304]
 [ 62   0 263]
 [ 56   1 236]
 [ 71   0 149]
 [ 39   1 321]
 [ 57   1 335]
 [ 60   0 240]
 [ 57   0 354]
 [ 54   0 214]
 [ 44   1 226]
 [ 55   0 327]
 [ 70   1 269]
 [ 29   1 204]
 [ 70   1 174]
 [ 58   0 225]
 [ 52   1 223]
 [ 42   0 265]
 [ 50   1 196]
 [ 54   1 283]
 [ 55   0 342]
 [ 52   1 212]
 [ 57   1 276]
 [ 44   1 233]
 [ 54   1 266]
 [ 62   0 164]
 [ 58   1 224]
 [ 62   0 394]
 [ 42   1 295]
 [ 62   1 281]
 [ 45   0 234]
 [ 57   1 192]
 [ 46   1 311]
 [ 43   1 247]
 [ 58   1 270]
 [ 46   1 197]
 [ 46   1 249]
 [ 59   1 234]
 [ 41   1 250]
 [ 60   1 185]
 [ 58   1 220]
 [ 42   1 315]
 [ 44   1 220]
 [ 52   1 186]
 [ 46   1 231]
 [ 41   0 204]
 [ 45   1 260]
 [ 54   1 239]
 [ 54   0 

<!-- END QUESTION -->

# Q2

Generate a logistic regression model using your training data. 

Print out the accuracy of your model.

<!-- BEGIN QUESTION -->



In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Instantiate the classifier; fit_intercept=True asks it to learn a bias
# term in addition to the feature weights.
model = LogisticRegression(fit_intercept=True)

# Train on the training split. The call is left as the cell's final
# expression, so Jupyter displays the value it returns.
model.fit(X=X_train, y=y_train)

LogisticRegression()

In [16]:
# Predict a label for every row of the held-out test split.
y_pred = model.predict(X_test)

# Score the predictions against the true test labels, then leave the value
# as the final expression so Jupyter displays it.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
accuracy

0.6885245901639344

<!-- END QUESTION -->

# Q3
Generate the classification report for your logistic regression model and interpret your results regarding precision, recall and f1-score.


**Description**: The figures discussed here are those of the "Yes" (heart disease)
class in the classification report. The precision tells us that out of all the
patients that the model predicted would have heart disease, only 65% actually
do. The recall tells us that out of all the patients that actually do have
heart disease, the model predicted this outcome correctly for 71% of them.
The f1-score, the harmonic mean of precision and recall, is 0.68; since this
is still well below 1, the model does only an OK job of predicting whether or
not the patients have heart disease.

<!-- BEGIN QUESTION -->



In [17]:
from sklearn.metrics import classification_report

# Build the text report comparing the true test labels with the model's
# predictions, then print it so its line breaks are rendered.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

          No       0.73      0.67      0.70        33
         Yes       0.65      0.71      0.68        28

    accuracy                           0.69        61
   macro avg       0.69      0.69      0.69        61
weighted avg       0.69      0.69      0.69        61



<!-- END QUESTION -->

