# The Machine Learning Process
__MATH 3480__ - Dr. Michael Olson

## Before everything else, import packages

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

### Step 1 - Obtain and Load Data

In [None]:
# scikit-learn bundles the iris data set; load_iris() hands it back as one object.
# Later cells read its 'data', 'target' and 'feature_names' entries.
from sklearn.datasets import load_iris
iris_dataset = load_iris()
# Bare expression on the cell's last line so the notebook displays the loaded object.
iris_dataset

In [None]:
# Feature measurements as a DataFrame, one named column per feature.
iris = pd.DataFrame(
    data=iris_dataset['data'],
    columns=iris_dataset['feature_names'],
)
# Integer class codes kept in their own single-column DataFrame.
species = pd.DataFrame(
    data=iris_dataset['target'],
    columns=['species_num'],
)

In [None]:
# Lookup table from integer class code (0, 1, 2) to species name.
species_names = dict(enumerate(['setosa', 'versicolor', 'virginica']))
# Translate every code to its name; an unknown code raises KeyError.
iris['Species'] = [species_names[code] for code in species['species_num']]
# Display the new column as the cell output.
iris['Species']

### Step 2 - Clean the Data

In [None]:
# What does the data look like?
# .shape is (rows, columns); by this point iris holds the feature columns plus 'Species'.
iris.shape

In [None]:
# Shape of the single-column frame of integer species codes, to compare with iris above.
species.shape

In [None]:
# Count the missing entries in each column: features first, species codes second.
print(iris.isnull().sum(), " --- ", species.isnull().sum())

# Draw the boolean missing-value mask as a heatmap so any gap stands out visually.
sns.heatmap(data=iris.isnull())

In [None]:
# Find outliers in X
# Plot one histogram series per numeric feature. The labels are taken from the
# same columns that are plotted ('Species' removed from both), so the legend
# always has exactly one entry per series.
plt.hist(iris.drop('Species', axis=1), label=iris.columns.drop('Species'))
plt.legend()

### Step 3 - Exploratory Data Analysis (EDA)

In [None]:
# Print a concise summary of the iris DataFrame (columns, dtypes, non-null counts).
iris.info()

In [None]:
# Descriptive statistics for the numeric columns of iris.
iris.describe()

In [None]:
# Grid of pairwise plots of the features, with points colored by the 'Species' column.
sns.pairplot(iris, hue='Species')

### Step 4 - Cross Validation (Hold-Out Train/Test Split)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the samples for testing. The split is shuffled at random, so
# it is seeded (like the classifier trained below) to make the evaluation
# results reproducible from one run of the notebook to the next.
X_train, X_test, y_train, y_test = train_test_split(
    iris_dataset['data'],
    iris_dataset['target'],
    test_size=0.25,
    random_state=0,
)

In [None]:
# Print the training-feature array's shape, then let the notebook display the array itself.
print(X_train.shape)
X_train

In [None]:
# Shape of the held-out feature array, for comparison with the training split.
X_test.shape

### Step 5 - Build and Train the Model

In [None]:
from sklearn import tree

# Build a seeded decision tree and fit it to the training split in one step;
# fit() returns the fitted estimator, so clf ends up bound to the trained model.
clf = tree.DecisionTreeClassifier(random_state=0).fit(X_train, y_train)

# Draw only the top levels of the fitted tree to keep the figure readable.
tree.plot_tree(clf, max_depth=2)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Predictions on the same data the model was trained on.
y_model = clf.predict(X_train)

# Confusion matrix first, then the per-class report, both on the training split.
print(confusion_matrix(y_true=y_train, y_pred=y_model))
print(classification_report(y_true=y_train, y_pred=y_model))

### Step 6 - Evaluation

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Predictions on the held-out samples the model has not seen during fitting.
y_predict = clf.predict(X_test)

# Confusion matrix first, then the per-class report, both on the test split.
print(confusion_matrix(y_true=y_test, y_pred=y_predict))
print(classification_report(y_true=y_test, y_pred=y_predict))

In [None]:
# Column index 2 of every held-out row (all rows, third column of the array).
X_test[:,2]

In [None]:
# Put the held-out features next to their true and predicted species names.
test_results = pd.DataFrame(X_test, columns=iris_dataset['feature_names'])
test_results['Actual Species'] = [species_names[code] for code in y_test]
test_results['Predicted Species'] = [species_names[code] for code in y_predict]

# Left panel: true labels. Right panel: model predictions.
# hue_order is pinned to the species_names order in both panels; otherwise each
# panel assigns colors independently from its own column, so the same species
# could be drawn in different colors left and right (or be missing from one
# legend), which makes the side-by-side comparison misleading.
# The frame is passed as data= by keyword so the call does not depend on
# scatterplot accepting a positional data argument.
fig, ax = plt.subplots(1, 2, figsize=(10, 4))
sns.scatterplot(data=test_results, x='petal length (cm)', y='petal width (cm)',
                hue='Actual Species', hue_order=list(species_names.values()),
                ax=ax[0])

sns.scatterplot(data=test_results, x='petal length (cm)', y='petal width (cm)',
                hue='Predicted Species', hue_order=list(species_names.values()),
                ax=ax[1])