<a href="https://colab.research.google.com/github/fernandovieira1/PracticalMachineLearningWithRandPython/blob/master/cap5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### 1. Logistic regression

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification, make_blobs

In [6]:
from sklearn.metrics import confusion_matrix
from matplotlib.colors import ListedColormap
from sklearn.datasets import load_breast_cancer

**1.1 Load the cancer data**

In [8]:
(X_cancer, y_cancer) = load_breast_cancer(return_X_y=True)

In [10]:
X_cancer.shape

(569, 30)

In [12]:
y_cancer.shape

(569,)

**1.2 Split data into train and test**

In [14]:
X_train, X_test, y_train, y_test = train_test_split(
    X_cancer, y_cancer, random_state=0)

**1.3 Call the Logistic Regression function**

In [16]:
clf = LogisticRegression(max_iter=10000).fit(X_train, y_train)

In [18]:
clf.score(X_train, y_train)

0.960093896713615

In [20]:
clf.score(X_test, y_test)

0.951048951048951

In [22]:
y_predicted = clf.predict(X_test)

**1.4 Compute and print confusion matrix**

In [24]:
confusion = confusion_matrix(y_test, y_predicted)

In [26]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [28]:
accuracy_score(y_test, y_predicted).round(2)

0.95

In [30]:
precision_score(y_test, y_predicted).round(2)

0.99

In [32]:
recall_score(y_test, y_predicted).round(2)

0.93

In [34]:
f1_score(y_test, y_predicted).round(2)

0.96

### 2. Logistic Regression with dummy variables

In [36]:
import numpy as numpy
import pandas as pd
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

#### 2.1 Read data

In [38]:
df = pd.read_csv("adult1.csv", encoding="ISO-8859-1", na_values=["","", "?"])

In [40]:
df1 = df.dropna()

In [42]:
print(df1.shape)

(32560, 16)


In [44]:
df1.head()

Unnamed: 0.1,Unnamed: 0,age,workclass,fnlwgt,education,educationNum,maritalStatus,occupation,relationship,race,sex,capitalGain,capital-loss,hours-per-week,native-country,salary
0,1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
1,2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
2,3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
3,4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0
4,5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,0


In [46]:
df1.tail()

Unnamed: 0.1,Unnamed: 0,age,workclass,fnlwgt,education,educationNum,maritalStatus,occupation,relationship,race,sex,capitalGain,capital-loss,hours-per-week,native-country,salary
32555,32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,0
32556,32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,1
32557,32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,0
32558,32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,0
32559,32560,52,Self-emp-inc,287927,HS-grad,9,Married-civ-spouse,Exec-managerial,Wife,White,Female,15024,0,40,United-States,1


In [48]:
adult = df1[["age", "occupation", "education", "educationNum", "capitalGain", "capital-loss", "hours-per-week", "native-country", "salary"]]

In [50]:
X = adult[["age", "occupation", "education", "educationNum", "capitalGain", "capital-loss", "hours-per-week", "native-country"]]

#### 2.2 Set dummy values for factors variables

In [52]:
X_adult = pd.get_dummies(X, columns=["occupation", "education", "native-country"])

In [54]:
y = adult["salary"]

#### 2.3 Running the model

In [56]:
X_adult_train, X_adult_test, y_train, y_test = train_test_split(X_adult, y, random_state=0)

In [58]:
clf = LogisticRegression().fit(X_adult_train, y_train)

In [60]:
print('Accuracy of Logistic regression classifier on training set: {:.2f}'.format(clf.score(X_adult_train, y_train)))

Accuracy of Logistic regression classifier on training set: 0.81


In [62]:
print('Accuracy of Logistic regression classifier on test set: {:.2f}'.format(clf.score(X_adult_test, y_test)))

Accuracy of Logistic regression classifier on test set: 0.81


In [64]:
y_predicted=clf.predict(X_adult_test)

In [66]:
confusion = confusion_matrix(y_test, y_predicted)

In [68]:
print('Accuracy: {:.2f}'.format(accuracy_score(y_test, y_predicted)))

Accuracy: 0.81


In [70]:
print('Precision: {:.2f}'.format(precision_score(y_test, y_predicted)))

Precision: 0.66


In [72]:
print('Recall: {:.2f}'.format(recall_score(y_test, y_predicted)))

Recall: 0.39


In [74]:
print('F1: {:.2f}'.format(f1_score(y_test, y_predicted)))

F1: 0.49


Pulei a parte que fala de cross validation. Estudar isso depois