<a href="https://colab.research.google.com/github/hussain0048/Machine-Learning/blob/master/Feature_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **The Breast Cancer Dataset**


# **Building the Baseline Model – Logistic Regression**

In [1]:
#import the necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from time import time
from sklearn.datasets import load_breast_cancer

In [2]:
#load the breast cancer dataset bundled with scikit-learn
cancer_dict = load_breast_cancer()


In [3]:
#list the fields available on the loaded dataset object
cancer_dict.keys()


dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

In [4]:
#pull the raw feature matrix and the feature names out of the loaded dataset
data = cancer_dict.data
columns = cancer_dict.feature_names

#labels as a Series named 'target', features as a DataFrame with named columns
y = pd.Series(cancer_dict.target, name='target')
X = pd.DataFrame(data, columns=columns)

#place the target next to the features and preview 10 randomly chosen rows
df = pd.concat([X, y], axis="columns")
df.sample(10)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
355,12.56,19.07,81.92,485.8,0.0876,0.1038,0.103,0.04391,0.1533,0.06184,...,22.43,89.02,547.4,0.1096,0.2002,0.2388,0.09265,0.2121,0.07188,1
328,16.27,20.71,106.9,813.7,0.1169,0.1319,0.1478,0.08488,0.1948,0.06277,...,30.38,129.8,1121.0,0.159,0.2947,0.3597,0.1583,0.3103,0.082,0
451,19.59,25.0,127.7,1191.0,0.1032,0.09871,0.1655,0.09063,0.1663,0.05391,...,30.96,139.8,1421.0,0.1528,0.1845,0.3977,0.1466,0.2293,0.06091,0
393,21.61,22.28,144.4,1407.0,0.1167,0.2087,0.281,0.1562,0.2162,0.06606,...,28.74,172.0,2081.0,0.1502,0.5717,0.7053,0.2422,0.3828,0.1007,0
318,9.042,18.9,60.07,244.5,0.09968,0.1972,0.1975,0.04908,0.233,0.08743,...,23.4,68.62,297.1,0.1221,0.3748,0.4609,0.1145,0.3135,0.1055,1
160,11.75,20.18,76.1,419.8,0.1089,0.1141,0.06843,0.03738,0.1993,0.06453,...,26.21,88.91,543.9,0.1358,0.1892,0.1956,0.07909,0.3168,0.07987,1
116,8.95,15.76,58.74,245.2,0.09462,0.1243,0.09263,0.02308,0.1305,0.07163,...,17.07,63.34,270.0,0.1179,0.1879,0.1544,0.03846,0.1652,0.07722,1
175,8.671,14.45,54.42,227.2,0.09138,0.04276,0.0,0.0,0.1722,0.06724,...,17.04,58.36,259.2,0.1162,0.07057,0.0,0.0,0.2592,0.07848,1
273,9.742,15.67,61.5,289.9,0.09037,0.04689,0.01103,0.01407,0.2081,0.06312,...,20.88,68.09,355.2,0.1467,0.0937,0.04043,0.05159,0.2841,0.08175,1
485,12.45,16.41,82.85,476.7,0.09514,0.1511,0.1544,0.04846,0.2082,0.07325,...,21.03,97.82,580.6,0.1175,0.4061,0.4896,0.1342,0.3231,0.1034,1


In [5]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, precision_score

#standardize the features; note the scaler is fit on the full dataset here,
#i.e. before apply_model performs its train/test split
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [6]:
def apply_model(X, y, test_size=0.3, random_state=1):
    '''
    Train and evaluate a Logistic Regression classifier on the given data.

    The data is split into train and test sets, a LogisticRegression model
    (default settings) is fit on the train set, and its predictions on the
    test set are scored.

    Parameters
    ----------
    X : array-like
        Feature matrix (one row per sample).
    y : array-like
        Target labels. recall_score and precision_score are called with their
        default settings, so binary labels are assumed.
    test_size : float, default 0.3
        Passed to train_test_split; fraction of the data held out for testing.
    random_state : int, default 1
        Passed to train_test_split so the split is reproducible.

    Returns
    -------
    tuple of str
        Three formatted strings, in this order: the accuracy score, the recall
        score and the precision score computed on the test set.
    '''

    #split the data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=random_state, test_size=test_size
    )

    #apply the Logistic Regression algorithm on the train data
    model = LogisticRegression()
    model.fit(X_train, y_train)

    #make prediction on the held-out test data
    y_pred = model.predict(X_test)

    #compute the metrics
    accuracy = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)

    return(f"Accuracy score: {accuracy}",
           f"Recall score: {recall}",
           f"Precision score: {precision}"
          )

In [7]:
#baseline: time and score the model on the full set of scaled features
%time apply_model(X_scaled, y)


CPU times: user 26.9 ms, sys: 17.4 ms, total: 44.3 ms
Wall time: 74.1 ms


('Accuracy score: 0.9707602339181286',
 'Recall score: 0.9722222222222222',
 'Precision score: 0.9813084112149533')

# **1-Feature Extraction with Principal Component Analysis (PCA)**

In [8]:
from sklearn.decomposition import PCA

#reduce the scaled features to 2 principal components
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

#put the components (columns 0 and 1) next to the target labels
df_pca = pd.concat([pd.DataFrame(X_pca), y], axis="columns")

In [None]:
#preview the first rows: the two PCA components plus the target column
df_pca.head()


In [None]:
#visualize the two PCA components, colored by target class
#x and y are passed by keyword: recent seaborn releases only accept `data` positionally
sns.scatterplot(x=df_pca.iloc[:, 0], y=df_pca.iloc[:, 1], hue=df_pca['target'])

In [11]:
%time apply_model(X_scaled, y)

CPU times: user 28 ms, sys: 15.3 ms, total: 43.2 ms
Wall time: 86.2 ms


('Accuracy score: 0.9707602339181286',
 'Recall score: 0.9722222222222222',
 'Precision score: 0.9813084112149533')

# **2-Feature Extraction with Linear Discriminant Analysis (LDA)**

In [12]:
#import the LDA class
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

#project the scaled features onto a single linear discriminant
lda = LinearDiscriminantAnalysis(n_components=1)

#LDA is supervised, so the labels are passed along when fitting
X_lda = lda.fit_transform(X_scaled, y)

#put the discriminant (column 0) next to the target labels
df_lda = pd.concat([pd.DataFrame(X_lda), y], axis="columns")

In [None]:
%time apply_model(X_scaled, y)

# **3-Feature Extraction with Isomap**


In [14]:
#import Isomap
from sklearn.manifold import Isomap

#embed the scaled features into 2 dimensions with Isomap
iso = Isomap(n_components=2)
X_iso = iso.fit_transform(X_scaled)

#put the embedding (columns 0 and 1) next to the target labels
df_iso = pd.concat([pd.DataFrame(X_iso), y], axis="columns")

In [None]:
#visualize the two Isomap components, colored by target class
#x and y are passed by keyword: recent seaborn releases only accept `data` positionally
sns.scatterplot(x=df_iso.iloc[:, 0], y=df_iso.iloc[:, 1], hue=df_iso['target'])


In [None]:
%time apply_model(X_scaled, y)


# **4-Feature Extraction with Locally Linear Embedding (LLE)**

In [17]:
#import LLE
from sklearn.manifold import LocallyLinearEmbedding

#embed the scaled features into 2 dimensions with Locally Linear Embedding
lle = LocallyLinearEmbedding(n_components=2)
X_lle = lle.fit_transform(X_scaled)

#put the embedding (columns 0 and 1) next to the target labels
df_lle = pd.concat([pd.DataFrame(X_lle), y], axis="columns")

In [18]:
%time apply_model(X_scaled, y)

CPU times: user 21.6 ms, sys: 13.5 ms, total: 35.2 ms
Wall time: 25.2 ms


('Accuracy score: 0.9707602339181286',
 'Recall score: 0.9722222222222222',
 'Precision score: 0.9813084112149533')