## <span style="color:orange">Techniques: </span>
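- **Random Forest** (main technique, also cross-validated), compared against Decision Tree, Gaussian Naive Bayes, K-Nearest Neighbors and Logistic Regression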

### <span style="color:purple">**Dataset: Breast-Cancer Prediction !**</span>

### <span style="color:red">Problem: **Classification**</span>

### <span style="color:cyan">Step: 00</span>: (Import Libraries)

In [None]:
import pandas as pd
import numpy as np
import scipy.stats as stats

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

### <span style="color:cyan">Step: 01</span>: (Import Dataset)

In [None]:
# import Dataset !
# The CSV is read through a relative path, so it must sit in the working directory.

df = pd.read_csv('Breast_cancer_data.csv')
# Preview 5 randomly drawn rows; no random_state is given, so the preview changes between runs.
df.sample(5)

***

##### Missing Values

In [None]:
# Report the percentage of missing values for every column that has at least one,
# and print the "no nulls" message only when no column has any.
# (Previously the `else` belonged to the `for` loop, so the message was printed on
# every run, and `> 1` silently skipped columns with exactly one missing value.)
null_share = df.isnull().mean()
features_with_nulls = null_share[null_share > 0]

for feature, share in features_with_nulls.items():
    # `mean()` yields a fraction in [0, 1]; scale it so the '%' label is truthful.
    print(feature, ":", np.round(share * 100, 4), '%')

if features_with_nulls.empty:
    print("There is no Null Values")

In [None]:
# Number of missing values in each column.
df.isnull().sum()

#### Numerical Features

In [None]:
# A column is treated as numerical when its dtype is anything other than object ('O').
numerical_features = [name for name, dtype in df.dtypes.items() if dtype != 'O']
print("The length of Numerical_values is :", len(numerical_features))
df[numerical_features].head()

#### Discrete Variables 

In [None]:
# Numerical columns with fewer than 25 distinct values (NaN counted as a value) are
# classified as discrete.
discrete_Feature = [
    feature
    for feature in numerical_features
    if df[feature].nunique(dropna=False) < 25
]

print("The Discrete feature is :", len(discrete_Feature))
discrete_Feature

#### Continuous Variables

In [None]:
# Every numerical column that was not classified as discrete is continuous;
# the order of numerical_features is preserved.
continous_Feature = list(
    filter(lambda name: name not in discrete_Feature, numerical_features)
)

print("The Continous feature is :", len(continous_Feature))
continous_Feature

In [None]:
# First rows of the columns classified as continuous.
df[continous_Feature].head()

#### Categorical Features

In [None]:
# Columns stored with the object dtype ("O") are treated as categorical.
categorical_feature = [name for name, dtype in df.dtypes.items() if dtype == "O"]
print(len(categorical_feature))
categorical_feature

In [None]:
# First rows of the object-dtype columns (an empty selection if there are none).
df[categorical_feature].head()

***

In [None]:
# (number of rows, number of columns)
df.shape

In [None]:
# Count of distinct values in each column.
df.nunique()

In [None]:
# Column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

In [None]:
# Missing values per column, re-checked before separating features and target.
df.isnull().sum()

In [None]:
# Features: every column except the target.
X = df.drop(columns=['diagnosis'])
# Target: kept as a one-column DataFrame (double brackets), not a Series.
y = df.loc[:, ['diagnosis']]

In [None]:
# 5 random feature rows (unseeded, so the rows vary per run).
X.sample(5)

In [None]:
# 5 random target rows (unseeded, so the rows vary per run).
y.sample(5)

In [None]:
# Distinct class labels present in the target column.
y['diagnosis'].unique()

---

### <span style="color:cyan">Step: 02</span>: Data Engineering

#### Normality Check (Distribution and Q-Q Plots)

In [None]:
# Plot each feature's distribution (histogram + KDE) next to a normal Q-Q plot,
# without any transformation applied.
# `sns.distplot` is deprecated in seaborn; `histplot(kde=True, stat="density")`
# draws the same histogram-plus-density view. Explicit axes replace the pyplot
# state machine so each panel is addressed by name.

for col in X.columns:
    fig, (ax_hist, ax_qq) = plt.subplots(1, 2, figsize=(14, 4))

    sns.histplot(X[col], kde=True, stat="density", ax=ax_hist)
    ax_hist.set_title(col)

    # Sample quantiles against theoretical normal quantiles; points on the fitted
    # line indicate an approximately normal feature.
    stats.probplot(X[col], dist="norm", plot=ax_qq)
    ax_qq.set_title(col)

    plt.show()
    # Release the figure so looping over many columns does not accumulate open figures.
    plt.close(fig)

In [None]:
# Feature column names (target excluded).
X.columns

In [None]:
# All column names of the full frame, target included.
df.columns

***

#### Data Standardization (z-score scaling)

In [None]:
# transformation !
# Standardize every feature column to zero mean and unit variance. The target is
# not part of X.columns, so it keeps its original values.
# NOTE: the scaler is fitted on the whole dataset, i.e. before the train/test split.

feature = X.columns
scaling = StandardScaler().fit(df[feature])
# The scaled values are written back into df, so later filters on df operate on z-scores.
df[feature] = scaling.transform(df[feature])
X = df[feature]
X.head(3)


In [None]:
# Encoding
# ohe = OneHotEncoder(drop='first', sparse=False, handle_unknown='ignore', dtype=np.int32)
# y = ohe.fit_transform(y)
# y

***

### <span style="color:cyan">Step: 03</span>: Data Cleaning (Outlier Removal)

In [None]:
# Missing values per column, checked again after the scaling step.
df.isnull().sum()

In [None]:
# Row/column count before any outlier filter is applied.
print("Before Cleaning:", df.shape)

In [None]:
# Overview boxplot of all feature columns before outlier filtering.
# X holds the standardized features, so every box is drawn on the same z-score scale.
print("Before Removing Outliers")
plt.figure(figsize=(15,6))
# The theme is set after the figure is created; statement order is kept as is.
sns.set_theme(style="whitegrid", color_codes=True)
sns.boxplot(data=X,
            palette="Set1")
plt.show()            

***

In [None]:
# mean_radius: boxplot, drop the upper tail, boxplot again.
# df's feature columns were standardized earlier, so the cut-off is in z-score units.

print("Before Removing (mean_radius) outliers")
sns.set_theme(style="whitegrid", color_codes=True)
sns.boxplot(data=df, x="mean_radius", palette="Set1")
plt.show()

# Keep only rows strictly below the cut-off (upper side only).
df = df.loc[df['mean_radius'].lt(2.2)]

print("After Removing (mean_radius) outliers")
sns.set_theme(style="whitegrid", color_codes=True)
sns.boxplot(data=df, x="mean_radius", palette="Set1")
plt.show()

In [None]:
# Rows remaining after the mean_radius filter.
print("Shape:", df.shape)

***

In [None]:
# mean_texture: boxplot, drop the upper tail, boxplot again.
# df's feature columns were standardized earlier, so the cut-off is in z-score units.

print("Before Removing (mean_texture) outliers")
sns.set_theme(style="whitegrid", color_codes=True)
sns.boxplot(data=df, y="mean_texture", palette="Set2")
plt.show()

# Keep only rows strictly below the cut-off (upper side only).
df = df.loc[df['mean_texture'].lt(2.5)]

print("After Removing (mean_texture) outliers")
sns.set_theme(style="whitegrid", color_codes=True)
sns.boxplot(data=df, y="mean_texture", palette="Set2")
plt.show()

In [None]:
# Rows remaining after the mean_texture filter.
print("Shape:", df.shape)

***

In [None]:
# mean_perimeter: boxplot, drop the upper tail, boxplot again.
# df's feature columns were standardized earlier, so the cut-off is in z-score units.

print("Before Removing (mean_perimeter) Outliers")
sns.set_theme(style="whitegrid", color_codes=True)
sns.boxplot(data=df, x="mean_perimeter", palette="Paired")
plt.show()

# Keep only rows strictly below the cut-off (upper side only).
df = df.loc[df['mean_perimeter'].lt(2.2)]

print("After Removing (mean_perimeter) Outliers")
sns.set_theme(style="whitegrid", color_codes=True)
sns.boxplot(data=df, x="mean_perimeter", palette="Paired")
plt.show()

In [None]:
# Rows remaining after the mean_perimeter filter.
print("Shape:", df.shape)

***

In [None]:
# mean_area: boxplot, drop the upper tail, boxplot again.
# df's feature columns were standardized earlier, so the cut-off is in z-score units.

print("Before Removing (mean_area) outliers")
sns.set_theme(style="whitegrid", color_codes=True)
sns.boxplot(data=df, y="mean_area", palette="hls")
plt.show()

# Keep only rows strictly below the cut-off (upper side only).
df = df.loc[df['mean_area'].lt(1.3)]

print("After Removing (mean_area) outliers")
sns.set_theme(style="whitegrid", color_codes=True)
sns.boxplot(data=df, y="mean_area", palette="hls")
plt.show()

In [None]:
# Rows remaining after the mean_area filter.
print("Shape:", df.shape)

***

In [None]:
# mean_smoothness: boxplot, drop the upper tail, boxplot again.
# df's feature columns were standardized earlier, so the cut-off is in z-score units.

print("Before Removing (mean_smoothness) outliers")
sns.set_theme(style="whitegrid", color_codes=True)
sns.boxplot(data=df, x="mean_smoothness", palette="Set3")
plt.show()

# Keep only rows strictly below the cut-off (upper side only).
df = df.loc[df['mean_smoothness'].lt(2.5)]

print("After Removing (mean_smoothness) outliers")
sns.set_theme(style="whitegrid", color_codes=True)
sns.boxplot(data=df, x="mean_smoothness", palette="Set3")
plt.show()

In [None]:
# Rows remaining after the mean_smoothness filter.
print("Shape:", df.shape)

***

In [None]:
# Row/column count once all five outlier filters have been applied.
print("After Cleaning:", df.shape)

In [None]:
# Overview boxplot of the feature columns after outlier filtering.
# The "before" overview plotted the features only (X); drop the target here as
# well so the two figures show the same set of boxes and are directly comparable.
print("After Removing Outliers")
plt.figure(figsize=(15,6))
sns.set_theme(style="whitegrid", color_codes=True)
sns.boxplot(data=df.drop(columns=['diagnosis']),
            palette="Set1")
plt.show()

***

## **Model Building**

### <span style="color:cyan">Step: 04</span>: Splitting Dataset into (Training and Testing) Data 

In [None]:
from sklearn.model_selection import train_test_split

# Rebuild X and y from the cleaned frame before splitting. The outlier filters
# reassigned `df` only, so the X / y created earlier still contained every removed
# row and the cleaning step had no effect on the models.
X = df.drop(columns=['diagnosis'])
y = df[['diagnosis']]  # kept as a one-column DataFrame, as it was defined originally

# 80/20 hold-out split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

***

### <span style="color:cyan">Step: 05</span>: Apply Techniques !

In [None]:
# DecisionTreeClassifier
# random_state pins the otherwise random choice between equally good splits, so the
# reported score is reproducible. np.ravel passes the single-column target as the
# 1-D label array scikit-learn expects.

dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, np.ravel(y_train))
dt_model

In [None]:
# RandomForestClassifier
# random_state fixes the bootstrap samples and feature sub-sampling, so the reported
# score is reproducible. np.ravel passes the single-column target as a 1-D label
# array, avoiding the column-vector DataConversionWarning.

rf_model = RandomForestClassifier(random_state=42).fit(X_train, np.ravel(y_train))
rf_model

In [None]:
# GaussianNaiveBayes
# np.ravel passes the single-column target as a 1-D label array, avoiding the
# column-vector DataConversionWarning raised for a DataFrame target.

gnb_model = GaussianNB().fit(X_train, np.ravel(y_train))
gnb_model

In [None]:
# KNeighborsClassifier
# np.ravel passes the single-column target as a 1-D label array, avoiding the
# column-vector DataConversionWarning raised for a DataFrame target.

knn_model = KNeighborsClassifier().fit(X_train, np.ravel(y_train))
knn_model

In [None]:
# LogisticRegression
# np.ravel passes the single-column target as a 1-D label array, avoiding the
# column-vector DataConversionWarning raised for a DataFrame target.

lr_model = LogisticRegression().fit(X_train, np.ravel(y_train))
lr_model

### <span style="color:cyan">Step: 06</span>: Evaluate or Test Model Accuracy

In [None]:
# Decision tree: predict the held-out split and report accuracy as a percentage
# rounded to 3 decimals.
y_pred_dt = dt_model.predict(X_test)
dt_score = np.round(100 * accuracy_score(y_test, y_pred_dt), 3)
print("Decision tree:", dt_score)

In [None]:
# Random forest: predict the held-out split and report accuracy as a percentage
# rounded to 3 decimals.
y_pred_rf = rf_model.predict(X_test)
rf_score = np.round(100 * accuracy_score(y_test, y_pred_rf), 3)
print("Random forest:", rf_score)

In [None]:
# Gaussian Naive Bayes: predict the held-out split and report accuracy as a
# percentage rounded to 3 decimals.
y_pred_gnb = gnb_model.predict(X_test)
gnb_score = np.round(100 * accuracy_score(y_test, y_pred_gnb), 3)
print("GussianNB:", gnb_score)

In [None]:
# K-Nearest Neighbours: predict the held-out split and report accuracy as a
# percentage rounded to 3 decimals.
y_pred_knn = knn_model.predict(X_test)
knn_score = np.round(100 * accuracy_score(y_test, y_pred_knn), 3)
print("KNN:", knn_score)

In [None]:
# Logistic Regression: predict the held-out split and report accuracy as a
# percentage rounded to 3 decimals.
y_pred_lr = lr_model.predict(X_test)
lr_score = np.round(100 * accuracy_score(y_test, y_pred_lr), 3)
print("Logistic Regression:", lr_score)

###  Compare the Results of Techniques !

In [None]:
# One single-element list per model so the dict becomes a one-row comparison table.
data = dict(zip(
    ['Decision tree', 'Random Forest', 'Gaussian_NB', 'KNN', 'Logistic Regression'],
    ([score] for score in (dt_score, rf_score, gnb_score, knn_score, lr_score)),
))

In [None]:
# One-row table: columns are the models, the single row 'Score' holds accuracy in percent.
Evaluation = pd.DataFrame(data, index=['Score'])
Evaluation

***

### Apply Cross Validation !

In [None]:
# Mean 5-fold cross-validated accuracy of the random forest, as a percentage
# rounded to 3 decimals. cross_val_score refits clones of rf_model on each fold.
# np.ravel passes the single-column target as the 1-D label array scikit-learn
# expects, avoiding a column-vector DataConversionWarning on every fold.
rf_validate = np.round(cross_val_score(rf_model, X, np.ravel(y), cv=5, scoring='accuracy').mean()*100, 3)
print("After cross validation of RF:", rf_validate)

***

In [None]:
# Hold-out accuracy next to the cross-validated accuracy of the random forest.
# This reuses (and overwrites) the name `data` from the earlier comparison.
data = dict([
    ('Random Forest', [rf_score]),
    ('Cross-validate RF', [rf_validate]),
])

In [None]:
# One-row table comparing the two random-forest scores; replaces the earlier `Evaluation` frame.
Evaluation = pd.DataFrame(data, index=['Score'])
Evaluation

***

***

***