- **Classification**: Target variable consists of categories
- **Regression**: Target variable is continuous

#### Naming Conventions:
- Feature = predictor variable = independent variable
- Target variable = response variable = dependent variable

In [None]:
## scikit-learn syntax
## Generic template: `sklearn.module` and `Model` stand in for a real
## scikit-learn submodule and estimator class; X, y and X_new come from the data at hand.

from sklearn.module import Model   # import model
model = Model()   # initialize model
model.fit(X, y)   # fit model with training data
predictions = model.predict(X_new)   # use new data to predict
print(predictions)

## 1. Classification ##

### k-Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Features: two charge columns of churn_df; target: the "churn" column as an array.
X = churn_df[['total_day_charge', 'total_eve_charge']]
y = churn_df['churn'].values

# Classify each point by a vote among its 15 nearest training points.
knn = KNeighborsClassifier(n_neighbors = 15)
knn.fit(X, y)

# NOTE(review): X_new is not defined in this cell — it must hold new observations
# with the same two feature columns as X.
y_pred = knn.predict(X_new)
print(y_pred)

### Train/test split

In [None]:
from sklearn.model_selection import train_test_split

# Features: every column except the label; target: the "churn" column.
X = churn_df.drop("churn", axis = 1).values
y = churn_df["churn"].values

# Hold out 30% of the rows as a test set.
# Note: stratify to ensure the target label proportions reflect that of the original data set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 21, stratify = y)

knn = KNeighborsClassifier(n_neighbors = 6)
knn.fit(X_train, y_train)
print(knn.score(X_test, y_test))   # use .score function to calculate accuracy

### Model complexity and over/underfitting

In [None]:
# Accuracy on the training and test sets, keyed by the number of neighbors used.
train_accuracies = {}
test_accuracies = {}
neighbors = np.arange(1, 26)   # k = 1, 2, ..., 25
for neighbor in neighbors:
    knn = KNeighborsClassifier(n_neighbors = neighbor)
    knn.fit(X_train, y_train)
    train_accuracies[neighbor] = knn.score(X_train, y_train)
    test_accuracies[neighbor] = knn.score(X_test, y_test)

In [None]:
# Plot training vs. testing accuracy against the number of neighbors.
plt.figure(figsize = (8, 6))
plt.title("KNN: Varying Number of Neighbors")
# Materialize the dict views as lists: a dict_values object is not a sequence,
# so plotting it directly can fail with a shape mismatch against `neighbors`.
plt.plot(neighbors, list(train_accuracies.values()), label = "Training Accuracy")
plt.plot(neighbors, list(test_accuracies.values()), label = "Testing Accuracy")
plt.legend()
plt.xlabel("Number of Neighbors")
plt.ylabel("Accuracy")
plt.show()

## 2. Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Fit on (X_bmi, y) and predict back on the same inputs to draw the fitted line.
reg = LinearRegression().fit(X_bmi, y)
predictions = reg.predict(X_bmi)

# Observations as points, model predictions as a line on the same axes.
plt.scatter(X_bmi, y)
plt.plot(X_bmi, predictions)
plt.xlabel("Body Mass Index")
plt.ylabel("Blood Glucose (mg/dl)")
plt.show()

### Ordinary Least Squares (OLS)

- **Residual Sum of Squares:** $RSS = \Sigma_{i = 1}^n (y_i - \hat{y_i})^2$ <br><br>
- **Mean Squared Error:** $MSE = \frac{1}{n} \Sigma_{i = 1}^n (y_i - \hat{y_i})^2$ <br><br>
- **Root Mean Squared Error:** $RMSE = \sqrt{MSE}$

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.3, random_state = 42
)

# Fit on the training rows, then predict the held-out rows.
reg_all = LinearRegression().fit(X_train, y_train)
y_pred = reg_all.predict(X_test)

# R2 on the test set (displayed as the cell output)
reg_all.score(X_test, y_test)

In [None]:
from sklearn.metrics import mean_squared_error
# RMSE = sqrt(MSE). Computed explicitly because the `squared = False` flag is
# deprecated/removed in newer scikit-learn releases.
np.sqrt(mean_squared_error(y_test, y_pred))   # RMSE

### Cross-validation

In [None]:
from sklearn.model_selection import cross_val_score, KFold
# 6 shuffled folds with a fixed seed so the splits are reproducible.
kf = KFold(n_splits = 6, shuffle = True, random_state = 42)

reg = LinearRegression()
# The splitter must be passed as `cv = kf`; passed positionally it does not bind to `cv`.
cv_results = cross_val_score(reg, X, y, cv = kf)

print(cv_results)   # one score per fold (R2 is the default for a regressor)
print(np.mean(cv_results), np.std(cv_results))
print(np.quantile(cv_results, [0.025, 0.975]))   # 95% CI

### Regularized Regression
- **Ridge regression:** OLS loss function + $\alpha \times \Sigma_{i = 1}^n a_i^2$

In [None]:
from sklearn.linear_model import Ridge
# Test-set R2 for each regularization strength, in the order the alphas are tried.
scores = []
for alpha in [0.1, 1.0, 10.0, 100.0, 1000.0]:
    ridge = Ridge(alpha = alpha)
    ridge.fit(X_train, y_train)
    y_pred = ridge.predict(X_test)
    scores.append(ridge.score(X_test, y_test))
print(scores)

- **Lasso regression:** OLS loss function + $\alpha \times \Sigma_{i = 1}^n |a_i|$

In [None]:
from sklearn.linear_model import Lasso
# Test-set R2 for each regularization strength, in the order the alphas are tried.
scores = []
for alpha in [0.1, 1.0, 10.0, 100.0, 1000.0]:
    lasso = Lasso(alpha = alpha)
    lasso.fit(X_train, y_train)
    y_pred = lasso.predict(X_test)
    scores.append(lasso.score(X_test, y_test))
print(scores)

**Lasso for feature selection** 

In [None]:
from sklearn.linear_model import Lasso

# Features: every column except "glucose"; keep the column names for the bar labels.
X = diabetes_df.drop("glucose", axis = 1).values
y = diabetes_df["glucose"].values
names = diabetes_df.drop("glucose", axis = 1).columns

# Fit Lasso and read one coefficient per feature.
lasso = Lasso(alpha = 0.1)
lasso_coef = lasso.fit(X, y).coef_

# One bar per feature; the L1 penalty shrinks less useful coefficients towards zero.
plt.bar(names, lasso_coef)
plt.xticks(rotation = 45)
plt.show()

## 3. Model Tuning

- **Class Imbalance**
- **Confusion Matrix**
    - **Accuracy** = $\frac{\text{true positives} + \text{true negatives}}{\text{true positives} + \text{true negatives} + \text{false positives} + \text{false negatives}}$ <br><br>
    - **Precision** = $\frac{\text{true positives}}{\text{true positives} + \text{false positives}}$ <br><br>
    - **Recall** = $\frac{\text{true positives}}{\text{true positives} + \text{false negatives}}$ <br><br>
    - **F1 Score** = $2 \times \frac{precision \times recall}{precision + recall}$ <br><br>

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# 60/40 train/test split, then a 7-neighbor classifier fitted on the training rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42)
knn = KNeighborsClassifier(n_neighbors = 7)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

# Confusion matrix first, then the per-class precision/recall/F1 report.
print(
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep = "\n",
)

### Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression

# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

logreg = LogisticRegression()
logreg.fit(X_train, y_train)

# Hard class labels for the test rows ...
y_pred = logreg.predict(X_test)
# ... and the probabilities in column 1 of predict_proba (the second class).
y_pred_probs = logreg.predict_proba(X_test)[:, 1]

- **Receiver Operating Characteristic (ROC) curve** for tuning thresholds

In [None]:
from sklearn.metrics import roc_curve
# False/true positive rates at each decision threshold, from the predicted probabilities.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_probs)

plt.plot([0, 1], [0, 1], 'k--')   # dashed diagonal as a reference line
plt.plot(fpr, tpr)
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Logistic Regression ROC Curve")
plt.show()

- **Area Under ROC Curve (AUC)**

In [None]:
from sklearn.metrics import roc_auc_score

# Area under the ROC curve, computed from the predicted probabilities.
auc = roc_auc_score(y_test, y_pred_probs)
print(auc)

### Hyperparameter tuning
- **Grid search cross-validation**

In [None]:
from sklearn.model_selection import GridSearchCV
kf = KFold(n_splits = 5, shuffle = True, random_state = 42)
# 10 evenly spaced alphas in [0.0001, 1]. The (start, stop, 10) arguments are a
# linspace-style grid; as an arange with step 10 they would yield a single value.
param_grid = {"alpha": np.linspace(0.0001, 1, 10),
             "solver": ["sag", "lsqr"]}
ridge = Ridge()
# Exhaustive search: every alpha/solver combination is cross-validated on the folds in kf.
ridge_cv = GridSearchCV(ridge, param_grid, cv = kf)
ridge_cv.fit(X_train, y_train)

print(ridge_cv.best_params_, ridge_cv.best_score_)

- **RandomizedSearchCV**

In [None]:
from sklearn.model_selection import RandomizedSearchCV
kf = KFold(n_splits = 5, shuffle = True, random_state = 42)
# 10 evenly spaced alphas in [0.0001, 1]. The (start, stop, 10) arguments are a
# linspace-style grid; as an arange with step 10 they would yield a single value.
param_grid = {"alpha": np.linspace(0.0001, 1, 10),
             "solver": ["sag", "lsqr"]}
ridge = Ridge()
# Randomized search: samples parameter combinations from the grid instead of trying all of them.
ridge_cv = RandomizedSearchCV(ridge, param_grid, cv = kf)
ridge_cv.fit(X_train, y_train)

print(ridge_cv.best_params_, ridge_cv.best_score_)

In [None]:
# Score the fitted search object on the held-out test set
test_score = ridge_cv.score(X = X_test, y = y_test)
print(test_score)

## 4. Preprocessing and Pipelines
### Preprocessing
#### Dealing with categorical features:
- scikit-learn: **OneHotEncoder()**
- pandas: **get_dummies()**

In [None]:
import pandas as pd

music_df = pd.read_csv('music.csv')

# One-hot encode "genre"; drop_first leaves out the first category's column.
genre_dummies = pd.get_dummies(music_df["genre"], drop_first = True)

# Append the indicator columns, then remove the original "genre" column.
music_dummies = pd.concat([music_df, genre_dummies], axis = 1).drop("genre", axis = 1)

#### Linear regression with dummy variables

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score, KFold

# Target is "popularity"; every other column of music_dummies is a feature.
y = music_dummies["popularity"].values
X = music_dummies.drop("popularity", axis = 1).values

# 80/20 split; cross-validation below only sees the training portion.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
kf = KFold(n_splits = 5, shuffle = True, random_state = 42)

linreg = LinearRegression()
linreg_cv = cross_val_score(
    linreg, X_train, y_train, cv = kf, scoring = "neg_mean_squared_error"
)

# The scorer returns negated MSE, so flip the sign before taking the root: RMSE per fold.
rmse_per_fold = np.sqrt(-linreg_cv)
print(rmse_per_fold)

#### Missing Data
- drop
- imputing values: mean/median/mode

In [None]:
# Drop every row that has a missing value in any of these columns.
required_columns = ['genre', 'popularity', 'loudness', 'liveness', 'tempo']
music_df = music_df.dropna(subset = required_columns)

In [None]:
# imputation with sklearn
from sklearn.impute import SimpleImputer
X_cat = music_df['genre'].values.reshape(-1, 1)   # single categorical column as a 2-D array
X_num = music_df.drop(['genre', 'popularity'], axis = 1).values
y = music_df['popularity'].values

# Split the categorical features, numeric features and target in ONE call so the
# same rows land in train/test for all three arrays.
(X_train_cat, X_test_cat,
 X_train_num, X_test_num,
 y_train, y_test) = train_test_split(X_cat, X_num, y, test_size = 0.2, random_state = 12)

# Learn imputation values on the training split only and reuse them on the
# test split (transform, not fit_transform) so no test-set statistics leak in.
imp_cat = SimpleImputer(strategy = 'most_frequent')
X_train_cat = imp_cat.fit_transform(X_train_cat)
X_test_cat = imp_cat.transform(X_test_cat)

imp_num = SimpleImputer()   # by default fill with mean
X_train_num = imp_num.fit_transform(X_train_num)
X_test_num = imp_num.transform(X_test_num)

# Recombine: numeric columns first, categorical column last.
X_train = np.append(X_train_num, X_train_cat, axis = 1)
X_test = np.append(X_test_num, X_test_cat, axis = 1)

### Pipeline

In [None]:
from sklearn.pipeline import Pipeline

music_df = music_df.dropna(subset = ['genre', 'popularity', 'loudness', 'liveness', 'tempo'])
# Binary target: 1 for 'Rock', 0 for every other genre.
music_df['genre'] = np.where(music_df['genre'] == 'Rock', 1, 0)
X = music_df.drop("genre", axis = 1).values
y = music_df['genre'].values   # the column is 'genre'; 'genres' does not exist

# Impute remaining missing values (mean by default), then fit the classifier.
steps = [("imputation", SimpleImputer()),
        ("logistic_regression", LogisticRegression())]
pipeline = Pipeline(steps)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)
pipeline.fit(X_train, y_train)
pipeline.score(X_test, y_test)   # accuracy on the test set

- **Centering and scaling**

In [None]:
from sklearn.preprocessing import StandardScaler
X = music_df.drop("genre", axis = 1).values
y = music_df['genre'].values   # the column is 'genre'; 'genres' does not exist
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# Fit the scaler on the training rows only, then apply the same scaling to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# scaling with pipeline
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 21)

# Standardize the features, then classify with 6 nearest neighbors.
steps = [
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier(n_neighbors = 6)),
]
pipeline = Pipeline(steps)

# fit() returns the pipeline itself, so knn_scaled refers to the fitted pipeline.
knn_scaled = pipeline.fit(X_train, y_train)
y_pred = knn_scaled.predict(X_test)

- **CV**

In [None]:
from sklearn.model_selection import GridSearchCV
steps = [('scaler', StandardScaler()),
        ('knn', KNeighborsClassifier())]
pipeline = Pipeline(steps)

# "<step name>__<parameter>" addresses a parameter of a pipeline step; try k = 1..49.
parameters = {"knn__n_neighbors": np.arange(1, 50)}
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 21)

cv = GridSearchCV(pipeline, param_grid = parameters)
cv.fit(X_train, y_train)

#### Evaluating Classification Models

In [None]:
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

# NOTE(review): `music` is not defined in the visible cells (earlier cells use `music_df`) — confirm the intended frame.
X = music.drop("genre", axis=1).values
y = music["genre"].values
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Fit the scaler on the training rows only, then apply it to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

models = {"Logistic Regression": LogisticRegression(), "KNN": KNeighborsClassifier(),
          "Decision Tree": DecisionTreeClassifier()}

# The same seeded 6-fold splitter is used for every model, so it is built once.
kf = KFold(n_splits=6, random_state=42, shuffle=True)

# Cross-validated scores on the scaled training data, one array per model.
results = []
for model in models.values():
    cv_results = cross_val_score(model, X_train_scaled, y_train, cv=kf)
    results.append(cv_results)

# One box per model, labelled with the model names.
plt.boxplot(results, labels=models.keys())
plt.show()

In [None]:
# Pipeline stages: impute (mean by default) -> standardize -> logistic regression
steps = [
    ("imp_mean", SimpleImputer()),
    ("scaler", StandardScaler()),
    ("logreg", LogisticRegression()),
]
pipeline = Pipeline(steps)

# Search space: three solvers crossed with 10 evenly spaced C values in [0.001, 1.0]
params = {
    "logreg__solver": ["newton-cg", "saga", "lbfgs"],
    "logreg__C": np.linspace(0.001, 1.0, 10),
}

# Grid-search the whole pipeline on the training data
tuning = GridSearchCV(pipeline, param_grid=params)
tuning.fit(X_train, y_train)
y_pred = tuning.predict(X_test)

# Report the best parameters and the test-set score
best_params = tuning.best_params_
test_accuracy = tuning.score(X_test, y_test)
print("Tuned Logistic Regression Parameters: {}, Accuracy: {}".format(best_params, test_accuracy))