## Chapter 1. KNeighborsClassifier

In [None]:
# Libraries used throughout the notebook; imported up front so every cell
# works after Restart Kernel -> Run All.
import matplotlib.pyplot as plt
import numpy as np

# Import KNeighborsClassifier
from sklearn.neighbors import KNeighborsClassifier

# Build the feature matrix (two numeric columns) and the target vector.
# NOTE(review): churn_df is not created in the visible notebook — confirm it is
# loaded before this cell runs.
feature_columns = ["account_length", "customer_service_calls"]
X = churn_df[feature_columns].values
y = churn_df["churn"].values

# k-NN classifier that uses the 6 nearest training points.
knn = KNeighborsClassifier(n_neighbors=6)

# Train on the full data; fit() is the last expression, so the fitted
# estimator is also what the cell displays.
knn.fit(X, y)

In [None]:
# Classify the unseen observations with the fitted k-NN model.
# NOTE(review): X_new is not defined in the visible notebook — confirm it exists
# and has the same two feature columns used for fitting.
y_pred = knn.predict(X_new)

# Report the predicted label for each row of X_new.
prediction_message = "Predictions: {}".format(y_pred)
print(prediction_message)

### Evaluate performance
$\text{accuracy} = \dfrac{\text{correct predictions}}{\text{total observations}}$

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; stratifying on y keeps the class
# proportions the same in both splits, and random_state fixes the shuffle.
split_options = {"test_size": 0.3, "random_state": 32, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Refit a 6-neighbour classifier on the training split only.
knn = KNeighborsClassifier(n_neighbors=6)
knn.fit(X_train, y_train)

# For a classifier, score() is the mean accuracy on the given data.
test_accuracy = knn.score(X_test, y_test)
print(test_accuracy)

In [None]:
# Candidate neighbour counts: 1 through 25.
neighbors = np.arange(1, 26)

# Accuracy on each split, keyed by the neighbour count that produced it.
train_accuracies, test_accuracies = {}, {}
for neighbor in neighbors:
    # fit() returns the estimator itself, so the call can be chained.
    knn = KNeighborsClassifier(n_neighbors=neighbor).fit(X_train, y_train)
    train_accuracies[neighbor] = knn.score(X_train, y_train)
    test_accuracies[neighbor] = knn.score(X_test, y_test)

In [None]:
# Plot training vs. testing accuracy against the number of neighbours.
plt.figure(figsize=(8, 6))

# One line per split; the dict values are in the same order as `neighbors`
# because they were inserted while iterating over it.
accuracy_curves = (
    ("Training Accuracy", train_accuracies),
    ("Testing Accuracy", test_accuracies),
)
for curve_label, accuracy_by_k in accuracy_curves:
    plt.plot(neighbors, accuracy_by_k.values(), label=curve_label)

plt.title("KNN: Varying Number of Neighbors")
plt.xlabel("Number of Neighbors")
plt.ylabel("Accuracy")
plt.legend()
plt.show()

## Chapter 2. Regression

In [None]:
import pandas as pd

# Load the diabetes data set into a DataFrame.
# NOTE(review): the path "diabetes" has no file extension — confirm the file name.
diabetes_df = pd.read_csv("diabetes")

# Preview the first five rows.
first_rows = diabetes_df.head()
print(first_rows)

In [None]:
# Features: every column except the regression target "glucose".
feature_frame = diabetes_df.drop(columns="glucose")
X = feature_frame.values

# Target: the "glucose" column as a 1-D array.
y = diabetes_df["glucose"].values

In [None]:
# Keep a single predictor: column index 3 of the feature matrix.
# NOTE(review): assumed to be the BMI column — confirm against the column
# order of diabetes_df after "glucose" is dropped.
bmi_column_index = 3
X_bmi = X[:, bmi_column_index]

# Both arrays are 1-D at this point.
print(y.shape, X_bmi.shape)

In [None]:
# scikit-learn estimators expect features as a 2-D array of shape
# (n_samples, n_features), so turn the 1-D BMI vector into a single column.
X_bmi = X_bmi.reshape(-1, 1)

# Show the new shape and a short preview instead of dumping every row.
print(X_bmi.shape)
print(X_bmi[:5])

In [None]:
import matplotlib.pyplot as plt

# Scatter of blood glucose against BMI on an explicit Axes object.
fig, ax = plt.subplots()
ax.scatter(X_bmi, y)
ax.set_xlabel("Body Mass Index")
ax.set_ylabel("Blood Glucose (mg/dL)")
plt.show()

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a simple linear regression of glucose on BMI, then predict on the same
# inputs to obtain the fitted line.
reg = LinearRegression()
reg.fit(X_bmi, y)
predictions = reg.predict(X_bmi)

# Observations as points, fitted values as a line, on one Axes.
fig, ax = plt.subplots()
ax.scatter(X_bmi, y)
ax.plot(X_bmi, predictions)
ax.set_xlabel("Body Mass Index")
ax.set_ylabel("Blood Glucose (mg/dL)")
plt.show()

In [None]:
# Features are every column except "sales"; "sales" is the target.
# NOTE(review): sales_df is not created in the visible notebook — confirm it is
# loaded (and fully numeric) before this cell runs.
X = sales_df.drop(columns="sales").values
y = sales_df["sales"].values

# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Ordinary least squares fitted on the training split only.
reg = LinearRegression()
reg.fit(X_train, y_train)

# Predict the held-out rows and compare the first two with the true values.
y_pred = reg.predict(X_test)
first_predicted, first_actual = y_pred[:2], y_test[:2]
print("Predictions: {}, Actual Values: {}".format(first_predicted, first_actual))

In [None]:
from sklearn.metrics import mean_squared_error

# RMSE is the square root of the MSE. Taking the root explicitly works on every
# scikit-learn release; the `squared=False` shortcut was deprecated in 1.4 and
# removed in 1.6.
np.sqrt(mean_squared_error(y_test, y_pred))

In [None]:
# Import mean_squared_error
from sklearn.metrics import mean_squared_error

# Compute R-squared (score() on a regressor returns R^2 on the given data)
r_squared = reg.score(X_test, y_test)

# Compute RMSE as sqrt(MSE); `mean_squared_error(..., squared=False)` was
# deprecated in scikit-learn 1.4 and removed in 1.6, so take the root explicitly.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Print the metrics
print("R^2: {}".format(r_squared))
print("RMSE: {}".format(rmse))

### Cross Validation

In [None]:
from sklearn.model_selection import cross_val_score, KFold

# Six shuffled folds with a fixed seed, so the partition is the same on every run.
kf = KFold(n_splits=6, shuffle=True, random_state=42)

# A fresh, unfitted model; cross_val_score fits a clone on each fold and
# returns one score per fold (R^2, the regressor's default scorer).
reg = LinearRegression()
cv_results = cross_val_score(estimator=reg, X=X, y=y, cv=kf)

In [None]:
# Per-fold scores, followed by their mean and standard deviation.
print(cv_results)
mean_score, score_spread = cv_results.mean(), cv_results.std()
print(mean_score, score_spread)

In [None]:
# 2.5% and 97.5% quantiles of the fold scores, i.e. the bounds of a 95% interval.
interval_bounds = np.quantile(cv_results, [0.025, 0.975])
print(interval_bounds)

### Regularized regression
- Avoiding overfitting
- Penalizing large coefficients

In [None]:
#ridge regression
from sklearn.linear_model import Ridge

# Test-set R^2 for regularisation strengths spanning five orders of magnitude
# (each alpha appears exactly once).
scores = []
for alpha in [0.1, 1, 10, 100, 1000]:
    ridge = Ridge(alpha=alpha)
    ridge.fit(X_train, y_train)
    # Kept so y_pred holds the predictions of the last model after the loop.
    y_pred = ridge.predict(X_test)
    scores.append(ridge.score(X_test, y_test))
print(scores)

In [None]:
#lasso regression
from sklearn.linear_model import Lasso

# Test-set R^2 for regularisation strengths spanning five orders of magnitude
# (each alpha appears exactly once).
scores = []
for alpha in [0.1, 1, 10, 100, 1000]:
    # Instantiate the Lasso class; the lowercase name `lasso` is the instance.
    lasso = Lasso(alpha=alpha)
    lasso.fit(X_train, y_train)
    # Kept so lasso_pred holds the predictions of the last model after the loop.
    lasso_pred = lasso.predict(X_test)
    scores.append(lasso.score(X_test, y_test))
print(scores)

In [None]:
#Lasso for feature selection in scikit-learn
from sklearn.linear_model import Lasso

# Features are every column except "glucose"; keep the column names so each
# coefficient can be labelled on the plot.
X = diabetes_df.drop("glucose", axis=1).values
y = diabetes_df["glucose"].values
names = diabetes_df.drop("glucose", axis=1).columns

# Fit on all rows and read the coefficients; lasso shrinks the coefficients of
# unhelpful features towards zero.
lasso = Lasso(alpha=0.1)
lasso_coef = lasso.fit(X, y).coef_

# One bar per feature; rotate the tick labels so the names do not overlap.
plt.bar(names, lasso_coef)
plt.xticks(rotation=45)
plt.show()

### confusion matrix in scikit-learn


In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# NOTE(review): X and y are whatever the preceding cells left in the kernel;
# a classifier needs a categorical target — confirm the intended data set.
knn = KNeighborsClassifier(n_neighbors=7)

# 60/40 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.4,
                                                    random_state=42)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

# Rows are the true classes, columns the predicted classes.
print(confusion_matrix(y_test, y_pred))

In [None]:
# Per-class precision, recall and F1 for the test-set predictions.
report_text = classification_report(y_test, y_pred)
print(report_text)

## Creating features

In [None]:
import numpy as np

# Single predictor: the "radio" column reshaped to one column, i.e. the 2-D
# (n_samples, 1) layout scikit-learn expects for features.
X = sales_df['radio'].values.reshape(-1, 1)
y = sales_df['sales'].values

print(X.shape, y.shape)

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Fit sales on radio spend and predict on the same inputs.
reg = LinearRegression()
reg.fit(X, y)
predictions = reg.predict(X)

# Preview the first five fitted values.
first_five = predictions[:5]
print(first_five)

In [None]:
import matplotlib.pyplot as plt

# Observations in blue, fitted regression line in red, on one Axes.
fig, ax = plt.subplots()
ax.scatter(X, y, color='blue')
ax.plot(X, predictions, color='red')
ax.set_xlabel("Radio Expenditure ($)")
ax.set_ylabel("Sales ($)")

plt.show()

## Logistic regression in SKLearn

In [None]:
from sklearn.linear_model import LogisticRegression

# NOTE(review): X and y are whatever the preceding cells left in the kernel;
# logistic regression needs a categorical target — confirm the intended data set.
logreg = LogisticRegression()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

logreg.fit(X_train, y_train)

# Hard class labels for the test set.
y_pred = logreg.predict(X_test)

# Column 1 of predict_proba is the probability of the second class listed in
# logreg.classes_ (for 0/1 labels, the positive class).
y_pred_probs = logreg.predict_proba(X_test)[:, 1]

# Preview the first ten probabilities (printed once).
print(y_pred_probs[:10])

In [None]:
# Plotting the ROC curve
from sklearn.metrics import roc_curve

# False/true positive rates at every decision threshold.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_probs)

fig, ax = plt.subplots()
# Dashed diagonal drawn first as the reference line, then the model's curve.
ax.plot([0, 1], [0, 1], 'k--')
ax.plot(fpr, tpr)
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Logistic Regression ROC Curve')
plt.show()

In [None]:
from sklearn.metrics import roc_auc_score

# Area under the ROC curve, computed from the predicted probabilities.
auc_value = roc_auc_score(y_test, y_pred_probs)
print(auc_value)

# Confusion matrix and per-class report, both computed from the hard labels.
for summarise in (confusion_matrix, classification_report):
    print(summarise(y_test, y_pred))

## Hyperparameter Tuning
#### Ridge/lasso regression: choosing `alpha`
#### KNN: choosing `n_neighbors`
#### Hyperparameters are parameters we specify before fitting the model

In [None]:
#grid search cross-validation
from sklearn.model_selection import GridSearchCV

# Five shuffled folds with a fixed seed.
kf = KFold(n_splits=5,
          shuffle=True,
          random_state=42)

# 10 evenly spaced alphas in [0.0001, 1] crossed with two solvers = 20 candidates.
# np.linspace takes the number of points; np.arange(0.0001, 1, 10) would treat
# 10 as the step and produce a single alpha.
param_grid = {"alpha": np.linspace(0.0001, 1, 10),
             "solver": ['sag','lsqr']}
ridge = Ridge()

# Exhaustively cross-validate every combination in the grid.
ridge_cv = GridSearchCV(ridge, param_grid, cv=kf)
ridge_cv.fit(X_train, y_train)
print(ridge_cv.best_params_, ridge_cv.best_score_)

In [None]:
#RandomizedSearchCV
from sklearn.linear_model import Lasso
from sklearn.model_selection import RandomizedSearchCV

# Five shuffled folds with a fixed seed.
kf = KFold(n_splits=5,
          shuffle=True,
          random_state=42)

# Set up the parameter grid: 10 evenly spaced alphas in [0.0001, 1] x 2 solvers.
# np.linspace takes the number of points; np.arange(0.0001, 1, 10) would treat
# 10 as the step and produce a single alpha.
param_grid = {"alpha": np.linspace(0.0001, 1, 10),
             "solver": ['sag','lsqr']}
ridge = Ridge()

# Evaluate only 2 randomly drawn combinations; random_state makes the draw repeatable.
ridge_cv = RandomizedSearchCV(ridge, param_grid, cv=kf, n_iter=2, random_state=42)
ridge_cv.fit(X_train, y_train)
print(ridge_cv.best_params_, ridge_cv.best_score_)

# NOTE(review): lasso_cv was used below without ever being created; it is
# assumed to be the same kind of randomized search over a lasso's alpha — confirm.
lasso_param_grid = {"alpha": np.linspace(0.0001, 1, 10)}
lasso_cv = RandomizedSearchCV(Lasso(), lasso_param_grid, cv=kf, n_iter=2, random_state=42)

# Fit to the training data
lasso_cv.fit(X_train, y_train)
print("Tuned lasso paramaters: {}".format(lasso_cv.best_params_))
print("Tuned lasso score: {}".format(lasso_cv.best_score_))

In [None]:
# Evaluate the tuned ridge search object on the held-out test split.
test_score = ridge_cv.score(X=X_test, y=y_test)
print(test_score)

In [None]:
# Create the parameter space
# The solver is pinned to "liblinear" because it supports both the l1 and l2
# penalties; with the default "lbfgs" solver every l1 candidate fails to fit.
params = {"penalty": ["l1", "l2"],
         "solver": ["liblinear"],
         "tol": np.linspace(0.0001, 1.0, 50),
         "C": np.linspace(0.1, 1.0, 50),
         "class_weight": ["balanced", {0:0.8, 1:0.2}]}

# Instantiate the RandomizedSearchCV object (seeded so the sampled candidates
# are the same on every run)
logreg_cv = RandomizedSearchCV(logreg, params, cv=kf, random_state=42)

# Fit the data to the model
logreg_cv.fit(X_train, y_train)

# Print the tuned parameters and score
print("Tuned Logistic Regression Parameters: {}".format(logreg_cv.best_params_))
print("Tuned Logistic Regression Best Accuracy Score: {}".format(logreg_cv.best_score_))

### preprocessing data

In [None]:
# Music data set: one-hot encode the "genre" column.
import pandas as pd

music_df = pd.read_csv('music.csv')

# One indicator column per genre; drop_first leaves out the first category so
# the indicators are not perfectly collinear.
genre_column = music_df['genre']
music_dummies = pd.get_dummies(genre_column, drop_first=True)

# Preview the first five rows of the indicator columns.
print(music_dummies.head())

In [None]:
# Recompute the genre indicators from music_df so this cell depends only on
# music_df and gives the same result however many times it is run.
genre_dummies = pd.get_dummies(music_df["genre"], drop_first=True)

#to bring dummies to df
music_dummies = pd.concat([music_df, genre_dummies], axis=1)
#remove genres column
music_dummies = music_dummies.drop("genre", axis = 1)

In [None]:
#linear regression with dummy variables
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LinearRegression

# Features are every column except "popularity"; "popularity" is the target.
X = music_dummies.drop("popularity", axis = 1)
y = music_dummies["popularity"].values

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42)

# Five shuffled folds (the keyword is `n_splits`, plural).
kf = KFold(n_splits=5, shuffle=True, random_state=42)
linreg = LinearRegression()

# scikit-learn scorers are "higher is better", so MSE comes back negated.
linreg_cv = cross_val_score(linreg, X_train, y_train, cv=kf,
                            scoring="neg_mean_squared_error")

# Flip the sign and take the root to get one RMSE per fold.
print(np.sqrt(-linreg_cv))

In [None]:
# Feature matrix (everything but the target) and target vector.
X = music_dummies.drop(columns='popularity').values
y = music_dummies['popularity'].values

# Ridge model with a small regularisation strength.
ridge = Ridge(alpha=0.2)

# Cross-validate with the existing folds; the scorer returns negated MSE.
scores = cross_val_score(
    ridge, X, y, cv=kf, scoring="neg_mean_squared_error"
)

# Negate and take the root for per-fold RMSE, then compare its average with
# the spread of the target itself.
rmse = np.sqrt(-scores)
average_rmse = rmse.mean()
target_std = y.std()
print("Average RMSE: {}".format(average_rmse))
print("Standard Deviation of the target array: {}".format(target_std))
print("Standard Deviation of the target array: {}".format(np.std(y)))