# Pre-Processing Template

## Load your common imports

In [None]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import re
import sklearn
# Compare the version numerically. A plain string comparison is
# lexicographic, so e.g. "0.9" >= "0.20" would wrongly pass.
_sk_version = re.match(r"(\d+)\.(\d+)", sklearn.__version__)
assert _sk_version and tuple(int(part) for part in _sk_version.groups()) >= (0, 20)

# Common imports
#Import your Libraries
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import sklearn.metrics as metrics
%matplotlib inline

# if you get any errors - make sure you do a !pip install xxxx

## Load your dataset

In [None]:
# Uncomment the next line to time the load (it must be the first line of the cell):
# %%timeit -n 1
# Load a local CSV file:
# df = pd.read_csv('./data/<put your data here>')
# Load a delimited file that does not use commas (e.g. semicolons or tabs):
# df = pd.read_csv('./data/<put your data here>', sep=';')
# Load an Excel workbook (needs an Excel engine such as openpyxl installed):
# df = pd.read_excel('./data/<put your data here>.xlsx', sheet_name=0)
# Example dataset used by the rest of this template:
df = pd.read_csv("https://raw.githubusercontent.com/fenago/regress/main/data/data.csv")

In [None]:
# Eyeball your data
df.sample(3)

## Basic Data Cleaning

In [None]:
# Basic Data Cleaning
# Normalise the column names: trim surrounding whitespace, lower-case them
# and replace the remaining (inner) spaces with underscores.
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')

# Names of the text (object-dtype) columns
string_columns = list(df.dtypes[df.dtypes == 'object'].index)

# Apply the same normalisation to the values of every text column.
# Strip BEFORE replacing spaces: otherwise leading/trailing blanks are turned
# into underscores and a later .str.strip() can no longer remove them.
for col in string_columns:
    df[col] = df[col].str.strip().str.lower().str.replace(' ', '_')

In [None]:
# Strip leading/trailing whitespace from the VALUES of each text column
# (this does not touch the column names).
# NOTE(review): plain spaces in the values have already been replaced by
# underscores in the cleaning cell above, so at this point the strip can only
# remove other surrounding whitespace such as tabs or newlines.
for col in string_columns:
    # nunique() is 0 when the column holds no non-null values; skip those
    if df[col].nunique():
        df[col] = df[col].str.strip()
        # print(df[col].head(1))


In [None]:
len(df)

In [None]:
df.describe()

In [None]:
df.shape

In [None]:
df.info()

In [None]:
df.nunique()

In [None]:
# Pairwise correlation of the numeric columns only.
# Calling df.corr() on a frame that still has text columns raises in
# pandas >= 2.0, so the numeric/bool columns are selected explicitly.
df.select_dtypes(include=["number", "bool"]).corr()

## Remove all duplicates

In [None]:
print(df.duplicated().sum())

In [None]:
# Remove duplicate rows with DataFrame.drop_duplicates().
# Every option is written out with the value used here, for reference.
dedup_options = {
    'subset': None,         # columns to compare; None means all of them
    'keep': 'first',        # which record of each duplicate group survives
    'inplace': False,       # False returns a new frame (re-assigned below)
    'ignore_index': False,  # False keeps the original index labels
}
df = df.drop_duplicates(**dedup_options)

In [None]:
print(df.duplicated().sum())

## Deal with Nulls

In [None]:
# Identify which columns have nulls
print(df.isnull().sum())

#### Note from Dr. Lee:
When you have null values, you are left with essentially 4 options:
   - Drop the Feature/Column
   - Drop the Rows
   - Impute
   - Replace
    
*important, pick only one of these 4 options per feature!

### Option 1 and 2:  Drop the Feature and/or Rows

In [None]:
# Exploring the Pandas .dropna() method
# The result is only displayed here; it is NOT assigned back to df.
df.dropna(
    axis=0,         # 0 drops rows, 1 drops columns
    how='any',      # drop when 'any' value is missing, or only when 'all' are missing
    # thresh=N,     # alternative to how=: keep only rows/columns with at least N non-missing values
    subset=None,    # labels along the other axis to check for missing values
    inplace=False   # Whether to drop in place (i.e., without needing to re-assign)
)
# you can just call df.dropna() and it will behave the same way
# how= and thresh= are mutually exclusive: recent pandas versions raise a
# TypeError when both are passed, so only one of them is used above
# thresh=: the minimum number of NON-missing values required to keep a row/column
# subset=: the names of columns to look at when considering missing values

In [None]:
# Option 1 (drop the feature): remove the "market_category" column.
# It is not a duplicate of another column; it is dropped because its values
# can be comma-separated lists, so it cannot be used as a single category as-is.
# NOTE: re-running this cell raises KeyError because the column is already gone.
df = df.drop(columns="market_category")

In [None]:
# I would most likely always run this cell
# Drop a row only when EVERY value in that row is missing
df = df.dropna(how='all')
# print(df)

### Option 3: Impute

Missing values often plague data, and given that there are not too many of them, they can be imputed (filled in).
Before using KNN and other distance-based algorithms, the data needs to be scaled or normalized to eliminate differences in scale (for example, one column representing number of children and another representing annual salary — these values cannot be taken at face value). Using KNN imputing follows the following process:
- Scale/normalize the data.
- KNN-impute to fill in missing values.
- Inverse scale/normalize the data.


Simple imputing methods fill in NaN (missing) values with a single statistic computed per column, such as the mean or the median. The `strategy` parameter can be set to ‘mean’, ‘median’, ‘most_frequent’ (mode), or ‘constant’ (a manual value supplied through the `fill_value` parameter).

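Note: the ‘mean’ and ‘median’ strategies only work on numeric columns. Fitting the imputer on a DataFrame that still contains text columns raises:

ValueError: Cannot use mean strategy with non-numeric data:
could not convert string to float: 'bmw'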

In [None]:
from sklearn.impute import SimpleImputer

# The 'mean' strategy only accepts numeric data, so restrict the imputer to
# the numeric columns; fitting it on text columns raises a ValueError.
numeric_cols = df.select_dtypes(include="number").columns
imputer = SimpleImputer(strategy='mean')
# data is a NumPy array with one column per entry of numeric_cols
data = imputer.fit_transform(df[numeric_cols])

KNN Imputing is the most popular and complex method for imputing missing values, in which the KNN algorithm finds other data points similar to one with a missing value within multidimensional space.

In [None]:
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler

# KNN imputing is distance based: it needs numeric data on a comparable scale.
numeric_cols = df.select_dtypes(include="number").columns

# 1. Scale (StandardScaler ignores NaNs when fitting and keeps them when transforming)
scaler = StandardScaler()
scaled = scaler.fit_transform(df[numeric_cols])

# 2. KNN-impute the missing values in the scaled space
imputer = KNNImputer()
imputed = imputer.fit_transform(scaled)

# 3. Undo the scaling so the values are back in their original units
# data is a NumPy array with one column per entry of numeric_cols
data = scaler.inverse_transform(imputed)

### Option 4:  Replace / fillna()

In [None]:
# For instance - in the CreditScoring dataset - there are numerous 99999999 that need to be replaced
# Obviously don't run this with your dataset
# for c in ['income', 'assets', 'debt']:
#    df[c] = df[c].replace(to_replace=99999999, value=np.nan)
#df = df[df.status != 'unk']   # Also make sure to treat the target variable

# 

# df['TotalCharges'] = df['TotalCharges'].fillna(0)

## Outlier Detection

Isolation Forest is an algorithm that scores how anomalous each sample is. It isolates observations by randomly selecting a feature and then a random split value; the number of splits needed to isolate a sample (its path length) measures how normal it is. Shorter paths represent anomalies — when a forest of random trees collectively produces shorter path lengths for particular samples, they are highly likely to be anomalies. `predict` returns an array of labels: 1 for inliers and -1 for outliers. For a continuous score use `decision_function`, where negative values indicate outliers and lower values are more anomalous.

Like imputing, this only works on numeric (encoded) data; scaling is additionally needed for the distance-based detectors below.

In [None]:
from sklearn.ensemble import IsolationForest

# The detector needs a purely numeric matrix, so use the numeric columns and
# drop rows with missing values. (The feature matrix X is only created
# further down in this notebook and still contains text columns.)
X_outlier = df.select_dtypes(include="number").dropna()
identifier = IsolationForest().fit(X_outlier)
# predict() labels every row: 1 = inlier, -1 = outlier
identifier.predict(X_outlier)

One Class SVM is another unsupervised method for detecting outliers, suited for high-dimensional distributions where an anomaly detection method like Isolation Forest would develop too much variance.

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import OneClassSVM

# The detector needs a purely numeric matrix without missing values, and the
# kernel is distance based, so the columns are standardised first. (The
# feature matrix X is only created further down in this notebook.)
X_outlier = df.select_dtypes(include="number").dropna()
X_outlier_scaled = StandardScaler().fit_transform(X_outlier)
identifier = OneClassSVM().fit(X_outlier_scaled)
# predict() labels every row: 1 = inlier, -1 = outlier
identifier.predict(X_outlier_scaled)

Local Outlier Factor is the third of three commonly used outlier identifiers. The anomaly score of each sample — the Local Outlier Factor — measures the local deviation of density given a sample with respect to its neighbors. Based on the K-Nearest Neighbors, samples that have substantially lower density than their neighbors are considered outliers.
Because this algorithm is distance based, the data needs to be scaled or normalized before it is used. This algorithm can be seen as a non-linear high-variance alternative to Isolation Forest.

In [None]:
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler

# The detector needs a purely numeric matrix without missing values, scaled
# because it is distance based. (The feature matrix X is only created further
# down in this notebook.)
X_outlier = df.select_dtypes(include="number").dropna()
X_outlier_scaled = StandardScaler().fit_transform(X_outlier)

# With the default novelty=False, LocalOutlierFactor has no predict();
# fit_predict() fits and labels the same data: 1 = inlier, -1 = outlier
model = LocalOutlierFactor()
model.fit_predict(X_outlier_scaled)

For all three anomaly algorithms, it is the data scientist’s choice whether to eliminate the anomalies. Make sure the flagged points are not simply a legitimate cluster of their own, and that the number of anomalies is not excessive. A PCA visualization can help confirm this.

## Fix all Data Types

In [None]:
df.head(1).T

In [None]:
df.dtypes

In [None]:
# Cast the listed columns to the 'object' dtype (the 'category' dtype is
# deliberately avoided). Edit the list / target dtype and re-run this cell
# once for every kind of conversion you need.
cols_to_include = ['year', 'number_of_doors', 'engine_cylinders', 'engine_hp']
for col in cols_to_include:
    # Ignore names that are not in the frame and columns without any
    # non-null value (nunique() == 0)
    if col in df.columns and df[col].nunique():
        df[col] = df[col].astype('object')

In [None]:
# df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
# You can also convert objects to numeric one column at a time

# OR - if you have a binary column - you can use this template
# df.churn = (df.churn == 'yes').astype(int)

In [None]:
# %%timeit
# One row per column with its number of distinct non-null values,
# ordered from the lowest to the highest cardinality.
unique_counts = pd.DataFrame(
    {
        'Column_Name': list(df.columns),
        'Num_Unique': [df[name].nunique() for name in df.columns],
    }
).sort_values(by='Num_Unique')
unique_counts

In [None]:
from sklearn.compose import make_column_selector as selector

numerical_columns_selector = selector(dtype_exclude=object)
categorical_columns_selector = selector(dtype_include=object)

numerical_columns = numerical_columns_selector(df)
categorical_columns = categorical_columns_selector(df)
print(categorical_columns)
print(numerical_columns)

## Drop columns

In [None]:
# drop the duplicated column `"education-num"` 
# df = df.drop(columns="education-num")

## Replace Values

In [None]:
# MAKE SURE THAT YOU WRANGLE YOUR DATA.  THIS IS AN EXAMPLE OF THE TYPES OF THINGS THAT ARE NEEDED
# SKIP THIS CELL - IT IS ONLY TO REITERATE THE NEED TO CLEAN 
# For instance - in the CreditScoring dataset - there are numerous 99999999 that need to be replaced
# Obviously don't run this with your dataset
# for c in ['income', 'assets', 'debt']:
#    df[c] = df[c].replace(to_replace=99999999, value=np.nan)
#df = df[df.status != 'unk']   # Also make sure to treat the target variable

## Categorical Analysis (Mutual Information)

In [None]:
from IPython.display import display
target_name = "msrp"

In [None]:
global_mean = df[target_name].mean()
global_mean

In [None]:
# For every categorical column, compare the mean target within each category
# against the global mean of the target.
for col in categorical_columns:
    # Group the full frame: df[categorical_columns] only holds the object
    # columns, so the numeric target would not be available on it.
    df_group = df.groupby(by=col)[target_name].agg(['mean'])
    df_group['diff'] = df_group['mean'] - global_mean   # absolute gap to the global mean
    df_group['risk'] = df_group['mean'] / global_mean   # ratio to the global mean
    display(df_group)

In [None]:
target_name = "msrp"
y = df[target_name]
X = df.drop(columns=[target_name])

In [None]:
df.dtypes

In [None]:
# %%timeit
unique_counts = pd.DataFrame.from_records([(col, df[col].nunique()) for col in df.columns],
                          columns=['Column_Name', 'Num_Unique']).sort_values(by=['Num_Unique'])

In [None]:
unique_counts

The simplest way to convert a column to a categorical type is to use astype('category') . We can use a loop to convert all the columns we care about using astype('category').

This code snippet can be reused to change all of the data types to the correct data type.  I would not use "category" as the data type, because the dtype-based column selection used later (object vs. non-object) would not treat such columns as categorical.
Python Data Types
![Python Data Types](https://external-content.duckduckgo.com/iu/?u=https%3A%2F%2Fmiro.medium.com%2Fmax%2F3576%2F1*QfI8H_8HplGa1v9IrrWjBA.png&f=1&nofb=1 "Python Data Types")
Pandas Data Types
![Pandas Data Types](https://external-content.duckduckgo.com/iu/?u=https%3A%2F%2Fcdn-images-1.medium.com%2Fmax%2F2000%2F1*wrXMq7iTWih7lsBBRQFxXg.png&f=1&nofb=1 "Pandas Data Types")

In [None]:
# Cast the listed columns to 'object' so they are handled as categorical.
cols_to_include = ['year', 'number_of_doors', 'engine_cylinders', 'engine_hp']
# Keep only listed columns that hold at least one non-null value
convertible = [
    name for name in df.columns
    if df[name].nunique() and name in cols_to_include
]
for col in convertible:
    df[col] = df[col].astype('object')

In [None]:
df.dtypes

In [None]:
from sklearn.compose import make_column_selector as selector

numerical_columns_selector = selector(dtype_exclude=object)
categorical_columns_selector = selector(dtype_include=object)

# Build the column lists from the FEATURES only. If the target were listed
# here, the ColumnTransformer built from these lists would pass it to the
# model as an input whenever it is fitted on a frame that still contains the
# target (such as the full df), leaking the answer into the features.
feature_frame = df.drop(columns=[target_name])
numerical_columns = numerical_columns_selector(feature_frame)
categorical_columns = categorical_columns_selector(feature_frame)

In [None]:
print("categorical columns: ", categorical_columns)
print(" ")
print("numerical columns: ", numerical_columns)

In [None]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler

categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")
numerical_preprocessor = StandardScaler()

In [None]:
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer([
    ('one-hot-encoder', categorical_preprocessor, categorical_columns),
    ('standard_scaler', numerical_preprocessor, numerical_columns)])

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline

model = make_pipeline(preprocessor, LinearRegression())

In [None]:
from sklearn import set_config
set_config(display='diagram')
model

In [None]:
from sklearn.model_selection import train_test_split

# Hold out a test set; random_state fixes the shuffle so the split is repeatable.
# NOTE(review): df still contains the target column. This is only safe when the
# preprocessor's column lists exclude the target; verify that, or consider
# splitting the feature frame X instead of df.
data_train, data_test, target_train, target_test = train_test_split(
    df, y, random_state=42)

In [None]:
%%time
_ = model.fit(data_train, target_train)

In [None]:
data_test.head()

In [None]:
model.predict(data_test)[:5]

In [None]:
target_test[:5]

In [None]:
model.score(data_test, target_test)

In [None]:
from sklearn.model_selection import cross_validate

# 5-fold cross-validation of the whole pipeline (preprocessing + regressor).
# NOTE(review): df still contains the target column. This is only safe when the
# preprocessor's column lists exclude the target; verify before trusting the scores.
cv_results = cross_validate(model, df, y, cv=5)
cv_results

In [None]:
scores = cv_results["test_score"]
# cross_validate was called without scoring=, so these values come from the
# estimator's default score(); for a regressor that is R2, not accuracy.
print("The mean cross-validation R2 score is: "
      f"{scores.mean():.3f} +/- {scores.std():.3f}")

In [None]:
# Harness

In [None]:
from sklearn.metrics import explained_variance_score,mean_absolute_error,r2_score

In [None]:
from time import time

from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Candidate regressors compared by the harness, all with default hyper-parameters.
# NOTE(review): no random_state is set, so the randomised estimators may give
# slightly different results from run to run.
regressors = [
    KNeighborsRegressor(),
    GradientBoostingRegressor(),
    ExtraTreesRegressor(),
    RandomForestRegressor(),
    DecisionTreeRegressor(),
    LinearRegression(),
    Lasso(),
    Ridge()
]

In [None]:
# Fit, time and score each candidate regressor on the same train/test split.
head = 10  # evaluate at most this many regressors from the list
for lr_model in regressors[:head]:
    # The raw frames contain text columns and unscaled numbers, so every
    # regressor is run behind the shared preprocessing step.
    candidate = make_pipeline(preprocessor, lr_model)
    start = time()
    candidate.fit(data_train, target_train)
    train_time = time() - start      # includes fitting the preprocessor
    start = time()
    y_pred = candidate.predict(data_test)
    predict_time = time() - start
    print(lr_model)                  # the regressor being evaluated
    print("\tTraining time: %0.3fs" % train_time)
    print("\tPrediction time: %0.3fs" % predict_time)
    print("\tExplained variance:", explained_variance_score(target_test, y_pred))
    print("\tMean absolute error:", mean_absolute_error(target_test, y_pred))
    print("\tR2 score:", r2_score(target_test, y_pred))
    print()