## Iris Dataset

In [None]:
%%time
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import pandas as pd
import sklearn
import scipy
import matplotlib.pyplot as plt
import seaborn as sns 

from pandas.plotting import scatter_matrix
from tqdm import tqdm, tqdm_notebook
from scipy import stats
from sklearn import model_selection
# models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
# metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

<h4>Descriptive Analysis of dataset</h4>

In [None]:
# Load the iris data from a remote CSV; column names are supplied via `names`.
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/iris.csv"
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class']
df = pd.read_csv(url, names=names) # loading via URL adds latency
df

In [None]:
# Re-load the data from the file 'iris.csv' (path relative to the working directory).
# NOTE(review): `url` is reassigned here but is not used by the read below.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"  
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class']
df = pd.read_csv('iris.csv', names=names) # loading via URL adds latency, so a local file is read instead
df.head()

In [None]:
# Display 10 randomly sampled rows.
df.sample(10)

In [None]:
# Print a concise summary of the DataFrame.
df.info()

In [None]:
# Display pandas' summary statistics for the DataFrame.
df.describe()

In [None]:
# Keep every column except the last one ('class' is last in `names`) and
# compute scipy's descriptive statistics on the remaining columns.
only_numeric_columns = df.iloc[:,:-1]
stats.describe(only_numeric_columns)

In [None]:
# Count missing values per column.
df.isnull().sum()

<h3>Data Visualizations </h3>

In [None]:
# Pairwise plots of the DataFrame's columns; `plot_kws` sets marker
# transparency, size and edge colour, and `height` sets the facet size.
sns.pairplot(df,
             plot_kws = {'alpha': 0.4, 's': 80, 'edgecolor': 'k'},
             height = 3)
plt.show()


In [None]:
# Same pairwise grid, drawn with kind="reg".
sns.pairplot(df, kind="reg")
plt.show()

In [None]:
# Pairwise grid coloured by the 'class' column, with KDE plots on the
# diagonal and three marker shapes supplied via `markers`.
sns.pairplot(df , hue='class',  diag_kind = 'kde',
             plot_kws = {'alpha': 0.4, 's': 80, 'edgecolor': 'k'},
             height = 3, markers=["o", "s", "D"],)
plt.show()

In [None]:
# Scatter plot of sepal length vs. sepal width, coloured by class and sized
# by petal width.
fig, ax = plt.subplots(figsize=(15, 10))

# x and y are passed by keyword: current seaborn releases accept only `data`
# positionally, so positional column names are rejected or misinterpreted.
sns.scatterplot(
    x='sepal-length',
    y='sepal-width',
    hue='class',
    size='petal-width',
    alpha=0.75,
    legend='brief',
    data=df,
    ax=ax,
)

ax.set_xlabel('Sepal Length', size=22)
ax.set_ylabel('Sepal Width', size=22)

# Title and subtitle are positioned in axes coordinates, just above the plot area.
ax.text(x=0.5, y=1.1, s='Sepal Length vs Width', fontsize=30, weight='bold', ha='center', va='bottom', transform=ax.transAxes)
# Marker size is mapped to 'petal-width' above, so the subtitle must say petal width.
ax.text(x=0.5, y=1.05, s='The size of each point corresponds to petal width', fontsize=20, alpha=0.75, ha='center', va='bottom', transform=ax.transAxes)

# Keep only the first four legend entries (presumably the hue title plus the
# three classes - confirm against the rendered legend) and move the legend
# outside the axes.
h, l = ax.get_legend_handles_labels()
ax.legend(h[:4], l[:4], bbox_to_anchor=(1.05, 1), loc=2)

fig.tight_layout()
plt.show()

In [None]:
# Draw pandas' default histograms for the DataFrame.
df.hist()
plt.show()

In [None]:
# One box plot per column in a 2x2 grid, with independent x and y axes,
# followed by the printed summary statistics.
df.plot(kind='box', subplots=True, layout=(2,2), sharex=False, sharey=False)
plt.show()
print(df.describe())

In [None]:
# Annotated correlation heatmap of the numeric feature columns.
plt.figure(figsize=(18, 5))
# The string 'class' column is excluded explicitly: pandas >= 2.0 raises on
# non-numeric columns in DataFrame.corr() instead of silently dropping them.
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True, linewidths=.5, cmap='coolwarm', robust=True, fmt=".3f", annot_kws={'size': 14})
plt.title("Correlation HeatMap")
plt.show()

In [None]:
# Correlation heatmap showing only the lower triangle of the matrix.
sns.set(style="white")

# Correlate numeric columns only: pandas >= 2.0 raises on the string 'class'
# column instead of silently dropping it.
corr = df.select_dtypes(include='number').corr()

# Generate a mask for the upper triangle (diagonal included).
# `np.bool` was removed in NumPy 1.24; the builtin `bool` is the equivalent dtype.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(15, 10))

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, annot=True, fmt=".3f", annot_kws={'size': 16}, cmap='coolwarm',
            square=True, linewidths=.5, cbar_kws={"shrink": .5})
plt.show()

<h3>Modeling using ML techniques</h3>

In [None]:
# Split the DataFrame into feature and label arrays, then hold out a test set.
test_size = 0.20
seed = 7
np_array = df.values

# INPUT: the first four columns
X = np_array[:, :4]
print("INPUT\n", X[:5, :5])

# OUTPUT: the fifth column (classifications)
Y = np_array[:, 4]
print("\nOUTPUT\n", Y[:5])

# A fixed random_state keeps the split reproducible between runs.
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
    X, Y, test_size=test_size, random_state=seed
)

In [None]:
# Report the array shapes of the training and test partitions.
print("TRAINING SHAPE ")
for part in (X_train, Y_train):
    print(part.shape)

print("\nTEST SHAPE")
for part in (X_test, Y_test):
    print(part.shape)


In [None]:
# SCALING (currently disabled: the import below is the only active statement)
from sklearn import preprocessing
# Intended approach: fit the scaler on the training set only, so that no
# information from the test set leaks into training (data leakage), then
# scale the test set with the mean and std learned from the training set.
# NOTE(review): the sketch previously left here assigned the fitted scaler to
# `X_train_scaled` and referenced an undefined `scaler`. Corrected sketch:
# scaler = preprocessing.StandardScaler().fit(X_train)
# X_train_scaled = scaler.transform(X_train)
# X_test_scaled = scaler.transform(X_test)



In [None]:
# Candidate classifiers as (label, estimator) pairs. The settings are fixed;
# no hyperparameter search is performed here.
models = [
    ('LR', LogisticRegression(solver='liblinear', multi_class='ovr')),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('NB', GaussianNB()),
    # The second element must be the estimator itself: consumers unpack each
    # entry as `name, model`, and a nested tuple is not an estimator.
    ('SVM', SVC(gamma='auto')),
]

#### 10-FOLD CROSS-VALIDATION TRAINING

In [None]:
from warnings import simplefilter

# Suppress FutureWarning messages process-wide from this point on.
simplefilter(action='ignore', category=FutureWarning)

# Per-model results, appended in the order the models are evaluated.
model_names = []
cv_metrics_result = []          # raw per-fold scores
cv_mean_result_collection = []  # mean score per model, rounded to 3 decimals
cv_std_result_collection = []   # std of the scores per model, rounded to 3 decimals
scoring = "accuracy"

# Ten consecutive, unshuffled folds, shared by every model. `random_state` is
# deliberately not passed: it has no effect without shuffling, and current
# scikit-learn raises ValueError when it is combined with shuffle=False.
kfold = model_selection.KFold(n_splits=10, shuffle=False)

for name, model in models:
    # `scoring` must be passed by keyword: the positional slot after `y` is
    # `groups`, not `scoring`.
    cv_results = model_selection.cross_val_score(model, X_train, Y_train, scoring=scoring, cv=kfold)

    cv_metrics_result.append(cv_results)
    model_names.append(name)
    cv_mean_result_collection.append(round(cv_results.mean(), 3))
    cv_std_result_collection.append(round(cv_results.std(), 3))

In [None]:
# Mean CV error per model: the complement of each mean accuracy, rounded to
# three decimals. Printed together with the model names and the mean accuracies.
error_list = [round(1 - mean_accuracy, 3) for mean_accuracy in cv_mean_result_collection]
for summary in (model_names, cv_mean_result_collection, error_list):
    print(summary)

In [None]:
# Table of cross-validation scores: one row per model (indexed by its name),
# one column per fold, plus the mean (μ) and standard deviation (σ) columns.
# The table keeps its own name so that the iris DataFrame `df` is not
# overwritten, which would break re-running the earlier cells.
df_cv_results = pd.DataFrame(cv_metrics_result, index=model_names)
df_cv_results['μ'] = cv_mean_result_collection
df_cv_results['σ'] = cv_std_result_collection
df_cv_results

In [None]:
# Bar chart of `error_list` (1 - mean CV accuracy) for each model name.
plt.bar(model_names, error_list)
plt.title('ML Models X Mean of training erros (10K CV)', size=14)
plt.ylabel('MEAN ERROR')
plt.show()

In [None]:
from pathlib import Path

# Box plot of the per-fold cross-validation scores, one box per model,
# saved to plots/benchmark.png.
fig = plt.figure()
fig.suptitle('ML Models X Mean of training erros distribution')
ax = fig.add_subplot(111)
ax.boxplot(cv_metrics_result)
ax.set_xticklabels(model_names)

# savefig does not create missing directories, so make sure the target exists.
Path('plots').mkdir(parents=True, exist_ok=True)
fig.savefig('plots/benchmark.png')

<h3>Test models with blind data</h3>

In [None]:
# Fresh, unfitted classifiers as (label, estimator) pairs for the held-out
# test evaluation. The settings are fixed; no hyperparameter search is done.
models = [
    ('LR', LogisticRegression(solver='liblinear', multi_class='ovr')),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('NB', GaussianNB()),
    # The second element must be the estimator itself: the loop below unpacks
    # each entry as `name, model`, and a nested tuple is not an estimator.
    ('SVM', SVC(gamma='auto')),
]
for name, model in models:
    print(name, model)

In [None]:
def fit_and_predict_with_model(model, X_train, Y_train, X_test):
    """Fit ``model`` on the training data and predict labels for ``X_test``.

    Args:
        model: Object exposing ``fit(X, y)``, whose return value exposes
            ``predict(X)``.
        X_train: Training features passed to ``fit``.
        Y_train: Training labels passed to ``fit``.
        X_test: Features passed to ``predict``.

    Returns:
        Whatever ``predict`` returns for ``X_test``.
    """
    # predict() is called on the object returned by fit(), not on `model` directly.
    return model.fit(X_train, Y_train).predict(X_test)
    

In [None]:
def create_model_confusion_matrix(model_name, blind_test, predictions):
    knn_confusion_matrix = confusion_matrix(predictions,blind_test)
    f, ax = plt.subplots(figsize=(15, 5))
    cmap = sns.diverging_palette(220, 10, as_cmap=True)
    sns.heatmap(knn_confusion_matrix,annot=True, fmt=".3f", annot_kws={'size':16}, cmap=cmap, 
                square=True, linewidths=.5, cbar_kws={"shrink": .5})
    plt.suptitle("{} Confusion Matrix".format(model_name), size=20)
    accuracy = accuracy_score(blind_test, predictions)
    plt.title("Accuracy score: {} ".format(accuracy), size=15)
    plt.show()

In [None]:
%%javascript
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

In [None]:
blind_test = Y_test
for model_name, model in tqdm(models):
    create_model_confusion_matrix(model_name, fit_and_predict_with_model(model, X_train, Y_train, X_test ), blind_test)

In [None]:
from tqdm.notebook import tqdm
from time import sleep

In [None]:
# Progress-bar demo: 100 iterations, pausing 0.04 s on each one.
for i in tqdm(range(100)):
    sleep(0.04)