# The Machine Learning Process
__MATH 3480__ - Dr. Michael Olson

## Before everything else, import packages

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

### Step 1 - Obtain and Load Data

In [None]:
# Read the feature values and the target values from their text files.
X, y = (np.loadtxt(path) for path in ('Data/X.txt', 'Data/y.txt'))

# Pair the two arrays up as the 'data' and 'target' columns of one DataFrame.
df = pd.DataFrame({'data': X, 'target': y})

In [None]:
# Display the assembled DataFrame (a notebook renders a cell's last expression).
df

### Step 2 - Clean the Data

In [None]:
# What does the data look like?
# .shape is the tuple (number of rows, number of columns).
df.shape

In [None]:
# Boolean frame that is True wherever a value is missing.
missing_mask = df.isna()

# Numerically: missing-value count for 'data', then for 'target'.
print(missing_mask['data'].sum(), " --- ", missing_mask['target'].sum())

# Graphically: heatmap of the same mask, showing where the gaps are.
sns.heatmap(missing_mask)

In [None]:
# Remove every row that contains a missing value.
df = df.dropna(axis=0)

# Re-run both checks on the cleaned frame to confirm what remains.
remaining_missing = df.isna()

# Numerically: missing-value count for 'data', then for 'target'.
print(remaining_missing['data'].sum(), " --- ", remaining_missing['target'].sum())

# Graphically: heatmap of the mask after the drop.
sns.heatmap(remaining_missing)

In [None]:
# Find outliers in X
# Histogram of the 'data' column, used to look for values far from the rest.
plt.hist(df['data'])

In [None]:
# List the rows whose 'data' value exceeds 1000 so they can be inspected.
df[df['data'] > 1000]

In [None]:
# Find outliers in y
# Histogram of the 'target' column, used to look for values far from the rest.
plt.hist(df['target'])

In [None]:
# Keep only the rows whose 'data' value is below 0.2e6 (200,000).
below_cutoff = df['data'] < 0.2e6
df = df.loc[below_cutoff]

# Histogram of 'data' again, now on the filtered frame.
plt.hist(df['data'])

### Step 3 - Exploratory Data Analysis (EDA)

In [None]:
# Print the column names, dtypes, non-null counts and memory usage of df.
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
df.describe()

In [None]:
# Box plot of the 'data' column, drawn along the x-axis.
sns.boxplot(data=df, x='data')

In [None]:
# Scatter plot of 'target' (y-axis) against 'data' (x-axis).
plt.scatter(df['data'], df['target'])

### Step 4 - Cross Validation (Train/Test Split)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing and train on the remaining 80%.
# random_state pins the shuffle, so the split -- and every plot and metric
# computed from it -- is the same each time the notebook is run.
X_train, X_test, y_train, y_test = train_test_split(
    df['data'].values,
    df['target'].values,
    test_size=0.20,
    random_state=42,
)

In [None]:
# Number of elements in the training feature array.
print(X_train.size)
# Reshape into a single-column 2-D array; -1 lets NumPy infer the row count.
# Re-running this cell is harmless: an (n, 1) array reshapes to itself.
X_train = X_train.reshape(-1,1)
# Display the reshaped array.
X_train

In [None]:
# Number of elements in the held-out test feature array.
X_test.size

### Step 5 - Build and Train the Model

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

# Degree-2 polynomial expansion of the training inputs. include_bias=False
# leaves out the constant column; LinearRegression fits its own intercept.
poly_features = PolynomialFeatures(degree=2, include_bias=False)
poly_features.fit(X_train)
X_poly = poly_features.transform(X_train)

# Least-squares fit on the expanded features (fit() returns the estimator).
lin_reg = LinearRegression().fit(X_poly, y_train)

# Display the fitted intercept and coefficients.
(lin_reg.intercept_, lin_reg.coef_)

In [None]:
# Training data.
plt.scatter(X_train, y_train)

# Rebuild the fitted quadratic by hand from the learned parameters:
# intercept + coef_[0] * x + coef_[1] * x^2.
linear_term = lin_reg.coef_[0] * X_train
quadratic_term = lin_reg.coef_[1] * X_train**2
y_model = lin_reg.intercept_ + linear_term + quadratic_term

# Overlay the hand-computed model values on the training data.
plt.scatter(X_train, y_model)

In [None]:
# Training data.
plt.scatter(X_train, y_train)

# Model predictions for the polynomial features currently stored in X_poly,
# overlaid on the training data.
y_predict = lin_reg.predict(X_poly)
plt.scatter(X_train, y_predict)

### Step 6 - Evaluation

In [None]:
# Expand the test inputs with the transformer that was fitted on the training
# data: transform(), not fit_transform(), so nothing is re-fitted on the
# held-out set. X_test is reshaped to a single column to match the layout
# used for training.
# NOTE: X_poly is rebound here and now holds the test-set features.
X_poly = poly_features.transform(X_test.reshape(-1,1))

# Predictions for the test inputs; reused by the metric cell that follows.
y_predict = lin_reg.predict(X_poly)

# True test targets, with the model's predictions overlaid.
plt.scatter(X_test,y_test)
plt.scatter(X_test,y_predict)

In [None]:
# Mean Squared Error between the true test targets and the predictions.
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(y_true=y_test, y_pred=y_predict)

# Display the score.
mse