### Pre-process Data

In [None]:
## Read the dataframe:
df = pd.read_csv('DiabetesDataset.csv')

## Show statistics:
# Independent inspection helpers; each one is printed explicitly because a
# notebook cell only displays its last expression.
df.info()              # column names, non-null counts, dtypes, memory usage
print(df.describe())   # summary statistics of the numeric columns
print(df.dtypes)       # dtype per column (the attribute is `dtypes`)
print(df.head())       # first five rows
print(df.columns)      # column labels (`columns` is an attribute, not a method)

## Selecting certain columns:
# By position: all rows, columns 1..3 (end index is exclusive).
df.iloc[:, 1:4]
# By label: pass a *list* of column names to get a DataFrame back.
selected_cols = list(df.columns[1:4])  # replace with the column names you need
df[selected_cols]

## Drop NaN values:
print(df.shape)
print(df.isnull().sum())  # number of missing values per column
# dropna(inplace=True) returns None and therefore cannot be chained; take the
# returned copy instead and renumber its rows from 0.
data = df.dropna(axis=0).reset_index(drop=True)

## Drop columns:
# NOTE(review): these column names look like they belong to a different
# dataset than the CSV loaded above -- adjust them to the columns of `data`.
data = data.drop(columns=['Cabin', 'Name', 'PassengerId', 'Ticket'])

## Fill NaN values (all or columns):
# Whole frame: replace every NaN with a constant.
df.fillna(130, inplace = True)
# Single column: assign the result back. Calling fillna(inplace=True) on
# df["Calories"] acts on an intermediate object; pandas 2.x warns about it and
# it does not modify df once Copy-on-Write is active.
df["Calories"] = df["Calories"].fillna(130)
# Alternative fill values -- keep only the one you want (the second
# assignment overrides the first):
x = df["Calories"].mean()
x = df["Calories"].median()
df["Calories"] = df["Calories"].fillna(x)

## Replace values:
# Replaces every 0 in the frame with NaN, modifying the frame in place.
# NOTE(review): `pima2` is not defined in the visible part of this file --
# presumably a DataFrame loaded elsewhere; confirm before running.
pima2.replace(0, np.nan,inplace=True)

## Remove duplicates:
# duplicated() yields a boolean Series that is True for rows repeating an earlier row.
print(df.duplicated())
df.drop_duplicates(inplace = True)

## Change values in dataframe:
# The dict maps old value -> new value; values without an entry stay unchanged.
df['Duration'] = df['Duration'].replace({60:'Miss', 'Mme':'Mrs', 'Ms':'Miss'})

## Change the date:
# Converts the 'Date' column to pandas datetime values.
df['Date'] = pd.to_datetime(df['Date'])

## Selecting X and Y:
# Target as a NumPy array; features are all remaining columns.
y = df['Diabetes'].values
X = df.drop(columns=['Diabetes'])

## Train-Test Split: (Train-Test Dataset): Select correct split amount:
# 10 % of the rows are held out as the test set; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=1
)
print(f"Training Set Size: {len(X_train)}")
print(f"Test Set Size: {len(X_test)}")

## Create a Validation Dataset:
# 20 % of the training rows are split off again for validation.
X_train_val, X_val, y_train_val, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=1
)
print(f"Training Set Size: {len(X_train_val)}")
print(f"Validation Set Size: {len(X_val)}")

## Polynomial and Log-Transform:
from sklearn.preprocessing import FunctionTransformer, PolynomialFeatures

# Polynomial transform: fitted on the training data, reused for the test data.
poly = PolynomialFeatures(degree=2)  # Change degree as per requirements
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)
print("Original X_train shape: ", X_train.shape)
print("Transformed X_train shape: ", X_train_poly.shape)
# Log Transform: np.log1p(x) = log(1 + x), so every value must be > -1.
transformer = FunctionTransformer(np.log1p, validate=True)
# Fit on the training data like the other transformers, so the input layout
# seen during fit is what the test data is validated against.
X_train_log = transformer.fit_transform(X_train)
X_test_log = transformer.transform(X_test)
print("Log-transformed X_train shape: ", X_train_log.shape)

## Use StandardScaler or PowerTransformer or MinMaxScaler:
from sklearn.preprocessing import StandardScaler, PowerTransformer, MinMaxScaler
# Swap in StandardScaler() or MinMaxScaler() here to try a different scaling.
scaler = PowerTransformer()
# The scaler is fitted on the training data only and then applied to the test data.
Xtrain_scale = scaler.fit_transform(X_train)
Xtest_scale = scaler.transform(X_test)
# Transform and show histogram:
# Wrap the scaled array in a DataFrame so each feature gets its own histogram.
Xtrain = pd.DataFrame(data=Xtrain_scale, columns=X.columns)
Xtrain.hist(figsize=(20, 15), bins=50)
plt.show()

## One-Hot Encoding of a categorical Feature:
from sklearn.preprocessing import OneHotEncoder
one_hot = OneHotEncoder()
# fit_transform returns a sparse matrix; toarray() densifies it.
one_hot_degree = one_hot.fit_transform(data[["Degree"]]).toarray()
# Reuse data's index: pd.concat(axis=1) aligns rows by index label, so a fresh
# 0..n-1 index would misalign (and introduce NaN rows) whenever data's index
# is not the default one, e.g. after filtering rows.
one_hot_degree = pd.DataFrame(
    one_hot_degree,
    columns=one_hot.get_feature_names_out(),
    index=data.index,
)
data_tree = pd.concat([data, one_hot_degree], axis=1)

## Ordinal encoding (i.e. Male&Female -> 0,1) Categorical Features:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
oe=OrdinalEncoder()
# select variables for label encoding
categorical_cols=['Sex', 'BP', 'Cholesterol', 'Drug']
# set up your preprocessor (name, transformer, columns to transform)
preprocessor = ColumnTransformer([('categorical', oe, categorical_cols)], remainder='passthrough')
# ColumnTransformer outputs the transformed columns first, followed by the
# passthrough columns in their original order; derive the names from df
# instead of hard-coding them so the labels always match the data.
passthrough_cols = [col for col in df.columns if col not in categorical_cols]
encoded_df = pd.DataFrame(preprocessor.fit_transform(df), columns=categorical_cols + passthrough_cols)

## Training Loop:

# (short name, estimator) pairs that are trained and compared below.
models =[("RF", RandomForestClassifier()),
         ("kNN", KNeighborsClassifier(n_neighbors=3)) ]
results = []        # test precision per model
names = []          # model names, same order as results
finalResults = []   # (name, test precision, train precision)
for name, model in models:
    model.fit(X_train, y_train)
    model_results = model.predict(X_test)
    # Macro-averaged precision on the training and on the test data.
    train_score = precision_score(y_train, model.predict(X_train), average='macro')
    print(train_score)
    test_score = precision_score(y_test, model_results, average='macro')
    print(test_score)
    results.append(test_score)
    names.append(name)
    finalResults.append((name, test_score, train_score))
    print(model)
    print(model.classes_)
    # Take the class labels from the fitted model so that the rows/columns of
    # the matrix and the displayed tick labels are in the same order.
    classes = model.classes_
    cm = confusion_matrix(y_test, model_results, labels=classes)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=classes)
    disp.plot()
    plt.show()
# Best test precision first.
finalResults.sort(key=lambda k:k[1],reverse=True)

## Grid Search CV:
from sklearn.model_selection import GridSearchCV
# Candidate values: number of trees and maximum tree depth (1 to 6).
params = {
    'n_estimators': [10, 50, 100, 200],
    'max_depth': [1, 2, 3, 4, 5, 6],
}
rf = RandomForestClassifier()
# Every combination is evaluated with 10-fold cross-validation and scored by
# macro F1; n_jobs=-1 uses all available cores.
estimator = GridSearchCV(
    rf,
    params,
    scoring='f1_macro',
    cv=10,
    n_jobs=-1,
)
estimator.fit(X_train, y_train)
RF_par = estimator.best_params_

## Permutation Importance:
from sklearn.inspection import permutation_importance
best_rf_model = estimator.best_estimator_
# Each feature of the test data is shuffled n_repeats times; the drop in the
# model's score is its importance.
perm_importances = permutation_importance(
    best_rf_model, X_test, y_test, n_repeats=10, random_state=42, n_jobs=2)
# Label the importances with the columns of the data they were computed on.
# assumes X_test is a DataFrame -- TODO confirm
forest_importances = pd.Series(perm_importances.importances_mean, index=list(X_test.columns))
# Positions of the features from most to least important. Sorting the raw
# values and indexing with .iloc avoids integer look-ups on a string index.
sort_index = np.argsort(forest_importances.to_numpy())[::-1]
# plot the importances
fig, ax = plt.subplots()
forest_importances.iloc[sort_index].plot.bar(yerr=perm_importances.importances_std[sort_index], ax=ax)
ax.set_title("Feature importances using permutation on full model")
ax.set_ylabel("Mean accuracy decrease")
fig.tight_layout()
plt.show()

## PERFORMANCE MEASUREMENTS
from sklearn.metrics import (
    accuracy_score,
    average_precision_score,
    confusion_matrix,
    f1_score,
    precision_recall_curve,
    precision_score,
    recall_score,
    roc_auc_score,
    roc_curve,
)

# Compare the predictions y_pred with the true labels y_test.
conf = confusion_matrix(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

## Confusion Matrix:
from sklearn.utils.multiclass import unique_labels

conf = confusion_matrix(y_test, y_pred)
print(conf)
# Sorted labels occurring in y_test or y_pred -- the same order that
# confusion_matrix uses for its rows and columns when no labels are passed.
# (An empty list here would remove all tick labels from the heatmap.)
labels = unique_labels(y_test, y_pred)
sns.heatmap(conf, 
            annot=True, 
            fmt='d', 
            cbar=False, 
            cmap="coolwarm_r", 
            xticklabels=labels, 
            yticklabels=labels, 
            linewidth = 1)
plt.title("Confusion Matrix")
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

## ROC and Precision-Recall:
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score

# Calculates Probabilities & Keep the Probabilities of the positive class only:
# predict_proba returns one column per class; column 1 belongs to the second
# entry of model.classes_.
probs = model.predict_proba(X_test)[:, 1]

# Function for plotting the ROC curve
def plot_roc_curve(fpr, tpr):
    """Draw a ROC curve plus the chance diagonal on the current axes and show it.

    fpr, tpr: false and true positive rates of equal length, e.g. the first
    two return values of sklearn's roc_curve.
    """
    axes = plt.gca()
    axes.plot(fpr, tpr, color='orange', label='ROC')
    # Diagonal from (0, 0) to (1, 1) as the reference line.
    axes.plot([0, 1], [0, 1], color='darkblue', linestyle='--', label = 'random classifier')
    axes.set_xlabel('False Positive Rate')
    axes.set_ylabel('True Positive Rate')
    axes.set_title('Receiver Operating Characteristic (ROC) Curve')
    axes.legend()
    plt.show()

# Plot ROC curve & Compute the Area Under the ROC Curve (AUC) - the ROC AUC score
# roc_curve yields the false/true positive rates for every score threshold.
fpr, tpr, thresholds = roc_curve(y_test, probs)
plot_roc_curve(fpr, tpr)
auc = roc_auc_score(y_test, probs)
print(f"AUC:  {round(auc, 3)}")

# Function for plotting the Precision-Recall curve
def plot_rpc(recall, precision):
    """Draw precision (y-axis) against recall (x-axis) on the current axes and show it.

    recall, precision: sequences of equal length, e.g. from sklearn's
    precision_recall_curve.
    """
    axes = plt.gca()
    axes.plot(recall, precision, color='orange', label='RPC')
    axes.set_ylabel('Precision')
    axes.set_xlabel('Recall = True Positive Rate')
    axes.set_title('Recall-Precision Curve')
    axes.legend()
    plt.show()

# Plot Precision-Recall curve  & Compute average precision - Precision-Recall AUC
# The third return value (the thresholds) is not needed for the plot.
precision, recall, _ = precision_recall_curve(y_test, probs)
plot_rpc(recall, precision)
average_precision = average_precision_score(y_test, probs)
print(f"Average Precision:  {round(average_precision, 3)}")


## Correlation Matrix
# Simple:
import seaborn as sns
# numeric_only=True: since pandas 2.0, corr() raises on non-numeric columns
# instead of silently skipping them.
corrmatrix = df.corr(numeric_only=True)
corrmatrix
sns.heatmap(corrmatrix)

## Advanced correlation Matrix (absolute)
# Get Feature names and column names
# All columns except the last one (assumed to be the target -- TODO confirm).
features = df.iloc[:, :-1].columns
corr = df[features].corr(numeric_only=True)
# Long format: one row per (feature, feature) pair with its correlation.
dcorr = corr.stack().reset_index()
dcorr.columns = ['F1', 'F2', 'Corr']
# Drop the mirrored pairs (A,B)/(B,A) and the self-correlations (A,A).
mask_dups = (dcorr[['F1', 'F2']].apply(frozenset, axis=1).duplicated()) | (dcorr['F1']==dcorr['F2'])
# .copy() so the new column below is added to an independent frame rather
# than to a slice of the original one.
dcorr = dcorr[~mask_dups].copy()
dcorr['Corr_abs'] = dcorr['Corr'].abs()
# Highest absolute:
dcorr.sort_values(by=['Corr_abs'], ascending=False).head(10)
# Highest Positive:
dcorr.sort_values(by=['Corr'], ascending=False).head(10)
# Highest Negative:
dcorr.sort_values(by=['Corr'], ascending=True).head(10)
# Plot it:
f, ax = plt.subplots(figsize=(10, 10))
cmap = sns.diverging_palette(220, 10, as_cmap=True)
_ = sns.heatmap(corr, cmap=cmap, square=True, linewidths=.5, cbar_kws={"shrink": .5})

## KNN 
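* `n_neighbors`: number of neighbours used for the vote, default=5
* `weights`: {'uniform', 'distance'}, default='uniform'. `uniform` = all neighbours have equal weight; `distance` = neighbours are weighted by the inverse of their distance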

In [None]:
## Imports:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix,f1_score,precision_score,recall_score,accuracy_score,average_precision_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

## KNN normal
model =  KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
pred = model.predict(X_test)

# Calculate Accurracy, Recall, Precision & F1-Score:
# `acc` is the name the rest of this file uses for the accuracy.
acc = accuracy_score(y_test, pred)
vacc = acc  # previous name of this variable, kept so existing references still work
# recall/precision/f1 use their default binary averaging here.
rec = recall_score(y_test, pred)
prec = precision_score(y_test, pred)
f1 = f1_score(y_test, pred)

## KNN with Validation dataset:
# k cannot exceed the number of training samples, so cap the search range.
max_k = min(100, len(X_train_val))
k_values = range(1, max_k + 1)
scores = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_val, y_train_val)
    # Accuracy on the validation set for this k. This is the value that has
    # to be collected; an unrelated `acc` variable must not be appended here.
    val_score = knn.score(X_val, y_val)
    scores.append(val_score)
# Plot and find best k:
best_idx = int(np.argmax(scores))  # first k with the highest validation accuracy
best_k = k_values[best_idx]
best_score = scores[best_idx]
plt.plot(k_values, scores)
plt.title("best k at {} with score of {}".format(best_k, round(best_score,3)))
plt.axvline(x=best_k, c="k",  ls="--")
plt.show()

## KNN with Cross Validation (k-fold) 10 (cross-val-score)
k_values = range(1, 101)
fold_means = []
fold_stds = []
for k in k_values:
    knn_cross = KNeighborsClassifier(n_neighbors=k)
    # 10 scores per k, one for each cross-validation fold.
    cv_scores = cross_val_score(knn_cross, X_train, y_train, cv=10)
    fold_means.append(np.mean(cv_scores))
    fold_stds.append(np.std(cv_scores))
# Arrays so the +/- one standard deviation band can be computed element-wise.
mean_scores = np.array(fold_means)
scores_std = np.array(fold_stds)
# Plot & find best k:
best_idx = int(np.argmax(mean_scores))
best_k = k_values[best_idx]
best_score = mean_scores[best_idx]
plt.plot(k_values, mean_scores)
plt.title("best k at {} with score of {}".format(best_k, round(best_score,3)))
# Use the same x values as the curve; a 0-based range would shift the band
# one step to the left of the line it belongs to.
plt.fill_between(k_values, mean_scores + scores_std, mean_scores - scores_std, alpha=0.15, color='blue')
plt.axvline(x=best_k, c="k",  ls="--")
plt.show()

## KNN with Hyperparameter Tuning Grid Search Cross-Validation:
# Search space: k = 1..99, Minkowski power p = 1 or 2, both weighting schemes.
grid = {
    'n_neighbors': np.arange(1, 100),
    'p': np.arange(1, 3),
    'weights': ['uniform', 'distance'],
}
knn = KNeighborsClassifier()
# Every combination is scored with 10-fold cross-validation.
knn_cv = GridSearchCV(knn, grid, cv=10)
knn_cv.fit(X_train, y_train)
print(f"Hyperparameters: {knn_cv.best_params_}")
print(f"CV Mean Accuracy Score: {round(knn_cv.best_score_, 4)}")
# Best estimator
knn_best = knn_cv.best_estimator_

## Confusion Matrix:
# Refit with the selected best_k on the training data and evaluate on the test set.
knn_cross = KNeighborsClassifier(n_neighbors=best_k)
knn_cross.fit(X_train, y_train)
pred = knn_cross.predict(X_test)
acc = knn_cross.score(X_test,y_test)
conf = confusion_matrix(y_test, pred)
# Heatmap options: integer annotations in every cell, no colour bar.
heatmap_style = dict(
    annot=True,
    fmt='d',
    cbar=False,
    cmap="coolwarm_r",
    linewidth=1,
)
sns.heatmap(conf, **heatmap_style)
plt.title("Confusion Matrix")
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

## Decision Trees

In [None]:
## Imports:
from sklearn.metrics import confusion_matrix, accuracy_score, ConfusionMatrixDisplay
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import GridSearchCV
from sklearn.dummy import DummyClassifier

## Train a Decision Tree:
# No depth limit is set here.
tree = DecisionTreeClassifier().fit(X_train, y_train)
y_pred = tree.predict(X_test)
# Accurracy on training set:
tree.score(X_train, y_train)

## Plotting the trees:
plt.figure(figsize=(12, 12))
plot_tree(tree, filled=True);
# Depth of the fitted tree.
tree.get_depth()

## Plotting the Confusion Matrix:
def accuracy_conf_mat(y_test, y_pred):
  """Print the accuracy (rounded to 4 decimals) and plot the confusion matrix.

  y_test: true labels; y_pred: predicted labels of the same length.
  """
  accuracy = round(accuracy_score(y_test, y_pred), 4)
  print("Accuracy score:", accuracy)
  ConfusionMatrixDisplay(confusion_matrix(y_test, y_pred)).plot()
accuracy_conf_mat(y_test, y_pred)

## Different Depths of Decision Trees:
# One tree per depth limit: print its test accuracy and draw it.
for depth in (2, 3, 4, 5):
    tree = DecisionTreeClassifier(max_depth=depth)
    tree.fit(X_train, y_train)
    y_pred = tree.predict(X_test)
    print(f"Depth: {depth}")
    print(round(accuracy_score(y_test, y_pred), 2))
    plt.figure(figsize=(6, 6))
    plot_tree(tree, filled=True)
    plt.show()
    # Blank lines as a visual separator between the depths.
    print("\n" * 4)

## Dummy Classifier as baseline:
# 'most_frequent' always predicts the majority class of y_train; the features
# are passed as None here.
dummy = DummyClassifier(strategy='most_frequent').fit(None, y_train)
# Accuracy of that constant prediction on the test labels.
baseline = dummy.score(None, y_test)
baseline


## Grid Search to find max Depth:
# Candidate depths 1..6, each evaluated with 5-fold cross-validation.
grid = {'max_depth': np.arange(1, 7)}
tree = DecisionTreeClassifier()
tree_cv = GridSearchCV(tree, grid, cv=5)
tree_cv.fit(X_train, y_train)
# The tree refitted with the best depth.
tree_best = tree_cv.best_estimator_
plot_tree(tree_best, filled=True)
print(f"Hyperparameters (best max_depth): {tree_cv.best_params_}")
print(f"Training CV Accuracy Score: {round(tree_cv.best_score_, 4)}")
print(f"Test Accuracy Score: {round(tree_cv.score(X_test, y_test), 4)}")

### Random Forest

In [None]:
## Imports:
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.metrics import confusion_matrix,f1_score,precision_score,recall_score,accuracy_score,average_precision_score
from sklearn.metrics import confusion_matrix, accuracy_score, ConfusionMatrixDisplay
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import GridSearchCV

## Train Random Forest:
# oob_score=True additionally computes the out-of-bag score during fit.
rfc = RandomForestClassifier(n_estimators=100, random_state = 42, oob_score=True)
rfc.fit(X_train,y_train)
y_pred_train = rfc.predict(X_train)
y_pred_test = rfc.predict(X_test)

## Calculate performance measures:
# sklearn metrics take (y_true, y_pred) in that order.
acc_test = accuracy_score(y_test, y_pred_test)
acc_train = accuracy_score(y_train, y_pred_train)
acc_oob = rfc.oob_score_
# Use this forest's own test predictions; `y_pred` would be a leftover from
# whichever model was evaluated before.
conf_mat = confusion_matrix(y_test, y_pred_test)
cm_display = ConfusionMatrixDisplay(conf_mat).plot()

## Random Forest with Grid Search:
# Search space: 100, 200, ..., 900 trees and both split criteria.
grid = {
    'n_estimators': np.arange(100, 1000, 100),
    'criterion': ['gini', 'entropy'],
}

rf = RandomForestClassifier(random_state=42)
# 5-fold cross-validation for every combination.
rf_cv = GridSearchCV(rf, grid, cv=5).fit(X_train, y_train)
print(f"Hyperparameters: {rf_cv.best_params_}")
print(f"Training CV Accuracy Score: {rf_cv.best_score_}")
print(f"Test Accuracy Score: {rf_cv.score(X_test,y_test)}")

## Permutation Importance to compute Feature Importance Plot
result = permutation_importance(
    rf_cv.best_estimator_, X_test, y_test, n_repeats=10, random_state=42, n_jobs=2
)
# Adjust Feature names:
# Use the column names of X_test when it is a DataFrame; otherwise fall back
# to generic names (replace them with the real feature names of your data).
feature_names = getattr(X_test, "columns", None)
if feature_names is None:
    feature_names = [f"feature {i}" for i in range(len(result.importances_mean))]
forest_importances = pd.Series(result.importances_mean, index=list(feature_names))
# Positions of the features from most to least important; applied to both the
# bars and their error bars so they stay aligned.
sort_index = np.argsort(forest_importances.to_numpy())[::-1]
fig, ax = plt.subplots()
forest_importances.iloc[sort_index].plot.bar(yerr=result.importances_std[sort_index], ax=ax)
ax.set_title("Feature importances using permutation on full model")
ax.set_ylabel("Mean accuracy decrease")
fig.tight_layout()
plt.show()

## Support Vector Machines 
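Parameters of `svm.SVC` (`from sklearn import svm`):
* `kernel`: one of `'linear'`, `'poly'`, `'rbf'`, `'sigmoid'`, `'precomputed'`
* `gamma`: kernel coefficient for the `'rbf'`, `'poly'` and `'sigmoid'` kernels
* `degree`: degree of the polynomial kernel (only used by `'poly'`), e.g. 2
* `C`: regularisation parameter, e.g. 1 (smaller values mean stronger regularisation)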

In [None]:
## Imports:
from sklearn import svm
from sklearn.metrics import confusion_matrix,f1_score,precision_score,recall_score,accuracy_score,average_precision_score

## Fit a Polynomial Kernel:
# Variable names follow the rest of this file (X_train / y_train / ...).
model = svm.SVC(kernel='poly', degree=2,C=1,gamma=0.10)
model.fit(X_train, y_train)

## Fit a Radial Basis Kernel:
# Train and predict on the *scaled* features so both steps see data in the
# same representation. The class is reached via the imported `svm` module.
model = svm.SVC(kernel='rbf', gamma=1, C=1)
model.fit(Xtrain_scale, y_train)
ypred = model.predict(Xtest_scale)
print('Accuracy: %f' % accuracy_score(y_test, ypred))

## For loop over changing C values 10^0 to 10^4 in 5 steps
gamma=0.001
CList=np.logspace(0,4,5)  # 1, 10, 100, 1000, 10000
for C in CList:
    model = svm.SVC(kernel='rbf',C=C,gamma=gamma)
    model.fit(X_train, y_train)
    # NOTE(review): PlotDecisionBoundary is not defined in the visible part of
    # this file; the call passes the features as a NumPy array, presumably
    # with exactly two columns -- confirm.
    PlotDecisionBoundary(model, X_train.values,y_train)

## SVM with Cross validation:
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit

# 7 x 7 grid: C and gamma from 1e-3 to 1e3 in powers of ten.
C_range = np.logspace(-3, 3, 7)
gamma_range = np.logspace(-3, 3, 7)
param_grid = dict(gamma=gamma_range, C=C_range)
model = svm.SVC()
# 5 random stratified splits, each holding out 20 % of the training data.
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
grid = GridSearchCV(model, param_grid=param_grid, cv=cv,n_jobs=-1)
grid.fit(X_train, y_train)
print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))
# Evaluate on the test split that matches the data used for fitting.
ypred = grid.best_estimator_.predict(X_test)
target_names = ['negative', 'positive']
print(classification_report(y_test, ypred, target_names=target_names))
print(confusion_matrix(y_test, ypred))

## Advanced Parameter tunning:
# Two separate grids: a linear kernel (C only) and an RBF kernel (C and gamma).
tuned_parameters = [{'kernel': ['linear'], 'C': [1, 10, 15, 100, 1000]},
                    {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]}]

score = 'accuracy'
# The class is reached through the imported `svm` module.
clf = GridSearchCV(svm.SVC(C=1), tuned_parameters, cv=5,
                   scoring=score)
clf.fit(X_train, y_train)
print(clf.best_params_)
results = clf.cv_results_
# Mean test score, twice its standard deviation across the folds, and the
# parameter combination it belongs to.
for mean, std, candidate in zip(results["mean_test_score"], results["std_test_score"], results["params"]):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, candidate))

## SVM with LeaveOneOut Cross Validation:
from sklearn import model_selection
C = 0.03
svc = svm.SVC(kernel='linear', C=C)
loo = model_selection.LeaveOneOut()
# cross_val_score does the fit/score loop over the splits itself and accepts
# DataFrames as well as arrays (indexing a DataFrame with X[train] would look
# up columns, not rows). Each split holds out one sample, so each score is 0 or 1.
res = model_selection.cross_val_score(svc, X, y, cv=loo)
# Average accuracy:
np.mean(res)
# Same with a for loop
Clist = np.logspace(-3,3,14)
for C in Clist:
    svc = svm.SVC(kernel='linear', C=C)
    res = model_selection.cross_val_score(svc, X, y, cv=loo)
    print('C: %f \t accuracy: %f' % (C,np.mean(res))) #The average accuracy



### Linear Regression & Regularization

In [None]:
## Imports:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV

## Simple Linear Regression:
LR = LinearRegression(fit_intercept=True).fit(X, y)
# R2 score (a,b): 
print("params: ", LR.coef_)
print("constant: ", LR.intercept_)
print("R^2 score: ", LR.score(X, y))
# Predictions:
# One row with two features (TV, Radio) per prediction.
for radio in (50, 30):
    print("TV: ", 200, "Radio: ", radio, "Sales: ", LR.predict(np.array([[200, radio]])))

## Polynomial Features with LR:
X = np.array(ad_df[["TV", "Radio"]])
# Adds all polynomial terms up to degree 2 of the two input columns.
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X)
X_poly.shape

## Different Degrees Polynomials: 
X = np.array(ad_df[["TV"]])
y = np.array(ad_df["Sales"])
degrees = range(1, 7)
train_err = []
test_err = []
for f in degrees:
    poly = PolynomialFeatures(f)
    X_poly = poly.fit_transform(X)
    # Same random_state for every degree, so only the features differ between the runs.
    X_train, X_test, y_train, y_test = train_test_split(
        X_poly, y, test_size=0.2, random_state=1
    )
    # No separate intercept is fitted for these polynomial features.
    LR = LinearRegression(fit_intercept=False).fit(X_train, y_train)
    train_err.append(mean_squared_error(y_train, LR.predict(X_train)))
    test_err.append(mean_squared_error(y_test, LR.predict(X_test)))
# plot the training and test errors for the different models used    
for errors, name in ((train_err, "train_error"), (test_err, "test_error")):
    plt.plot(degrees, errors, label=name)
plt.legend(fontsize=10)
plt.xlabel("polynomial degree")
plt.ylabel("error")

## Ridge Regression:
poly = PolynomialFeatures(2)
X_poly = poly.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_poly, y, test_size=0.2, random_state=1)
# One set of alphas for both fitting and plotting, so every plotted error
# belongs to the alpha it is drawn at (log-spaced to match the log x-axis).
alphas = np.logspace(-4, 2, num=10)
test_err = []
for alpha in alphas:
    ridge_regression = Ridge(alpha=alpha)
    ridge_regression.fit(X_train, y_train)
    test_err.append(mean_squared_error(y_test, ridge_regression.predict(X_test)))
plt.semilogx(alphas, test_err)
plt.xlabel("alpha")
plt.ylabel("test error");

## Cross Validation with Ridge:
# RidgeCV picks alpha from its default candidates using 5-fold cross-validation.
ridge = RidgeCV(fit_intercept=False, cv=5)
ridge.fit(X_train, y_train)
# print the fitted regression coefficients
ridge.coef_
ridge.alpha_
# fit the model with the best selected alpha
# Same intercept setting as the cross-validated model, so the selected alpha
# is applied to the model it was chosen for.
ridge_best = Ridge(alpha=ridge.alpha_, fit_intercept=False)
ridge_best.fit(X_train, y_train)
# compute the MSE on the test set
mean_squared_error(y_test, ridge_best.predict(X_test))

### Logistic Regression & Naive Bayes

In [None]:
## Imports:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, accuracy_score, ConfusionMatrixDisplay

from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams, followed by logistic regression.
# NOTE(review): spacy_tokenizer is not defined in the visible part of this
# file -- presumably a callable turning a text into a list of tokens; confirm.
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), tokenizer=spacy_tokenizer)
classifier = LogisticRegression(solver='lbfgs', max_iter=1000)
# The pipeline vectorises the raw input before it reaches the classifier, so
# X_train / X_test are expected to be raw text here -- TODO confirm.
pipe = Pipeline([('vectorizer', tfidf),
                 ('classifier', classifier)])
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
print(round(accuracy_score(y_test, y_pred), 4))
conf_mat = confusion_matrix(y_test, y_pred)
conf_mat = ConfusionMatrixDisplay(conf_mat)
conf_mat.plot()

## Multinomial Logistic Regression:
# With the lbfgs solver and more than two classes, LogisticRegression fits a
# multinomial model by default; the explicit `multi_class` argument is
# deprecated in recent scikit-learn releases and is therefore not passed.
log_clf = LogisticRegression(solver="lbfgs", random_state=42, max_iter=1000)
log_clf.fit(X_train, y_train)
y_pred = log_clf.predict(X_test)
accuracy_score(y_test, y_pred)


## Naive Bayes:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)

### Dimensionality Reduction PCA & Naive Bayes

In [None]:
## Imports
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
from sklearn.manifold import TSNE
from sklearn.manifold import LocallyLinearEmbedding
from sklearn.manifold import MDS
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

## Simple PCA fitting:
pca = PCA(n_components=2)
pca.fit(X)
print(pca.components_)          # principal axes, one row per component
print(pca.explained_variance_)  # variance explained by each component

## Transform a dataset into a lower dim.
pca = PCA(n_components=1)
pca.fit(X)
X_pca_reduced = pca.transform(X)
print("original shape:   ", X.shape)
print("transformed shape:", X_pca_reduced.shape)

## Visualise PCA: (Adjust accordingly)
# A 2-D scatter needs two components; X_pca_reduced above has a single
# column, so a separate two-component projection is computed for the plot.
# NOTE(review): child_mortality / gdp are presumably columns of a different
# dataset -- replace them with columns of your df.
X_pca_2d = PCA(n_components=2).fit_transform(X)
plt.scatter(X_pca_2d[:,0], X_pca_2d[:,1], c=df.child_mortality, s=df.gdp/100, cmap="jet")

## PCA with RandomForest (explained variance ratio of 95%)
# A float n_components keeps as many components as needed to explain that
# share of the variance.
pca = PCA(n_components=0.95)
X_train_reduced = pca.fit_transform(X_train)
# The test data is projected with the PCA fitted on the training data.
X_test_reduced = pca.transform(X_test)
forest = RandomForestClassifier(random_state=42).fit(X_train_reduced, y_train)
y_pred = forest.predict(X_test_reduced)
accuracy_score(y_test, y_pred)

## Inverse it and see the difference
# Self-contained: fit a one-component PCA on X, project, and map the
# projection back into the original space. (`pca` may have been refitted on
# other data by an earlier snippet, so it is not reused here.)
X_arr = np.asarray(X)  # positional [:, i] indexing below needs an array
pca_1d = PCA(n_components=1).fit(X_arr)
X_new = pca_1d.inverse_transform(pca_1d.transform(X_arr))
# Original points (faint) against their reconstruction (strong), using the
# first two features.
plt.scatter(X_arr[:, 0], X_arr[:, 1], alpha=0.2)
plt.scatter(X_new[:, 0], X_new[:, 1], alpha=0.8)
plt.axis('equal');

## Perform PCA with Naive Bayes: (Pipeline):
unscaled_clf = make_pipeline(PCA(n_components=2), GaussianNB())
unscaled_clf.fit(X_train, y_train)
pred_test = unscaled_clf.predict(X_test)
print('\nPrediction accuracy for the normal test dataset with PCA')
print('{:.2%}\n'.format(metrics.accuracy_score(y_test, pred_test)))

## PCA + NB + Standardisation
std_clf = make_pipeline(StandardScaler(), PCA(n_components=2), GaussianNB())
std_clf.fit(X_train, y_train)
pred_test_std = std_clf.predict(X_test)
# Report the predictions of the standardized pipeline (pred_test_std), not
# those of the unscaled one, and label the output accordingly.
print('\nPrediction accuracy for the standardized test dataset with PCA')
print('{:.2%}\n'.format(metrics.accuracy_score(y_test, pred_test_std)))

## Optional: Extract the Components from the Pipeline and scale:
# make_pipeline names each step after its lower-cased class name.
scaler = std_clf.named_steps['standardscaler']
pca_std = std_clf.named_steps['pca']
pca = unscaled_clf.named_steps['pca']
# Project the training data with each pipeline's own fitted steps.
X_train_std = pca_std.transform(scaler.transform(X_train))
X_train_unscaled = pca.transform(X_train)
# Show first principal componenets
print(f"\nPC 1 without scaling:\n {pca.components_[0]}")
print(f"\nPC 1 with scaling:\n {pca_std.components_[0]}")

## t-SNE Reduce dim to 2D:
tsne = TSNE(n_components=2, random_state=42)
X_reduced = tsne.fit_transform(X)
plt.figure(figsize=(13, 10))
# Transposing yields the two embedding columns as the x and y coordinates;
# the points are coloured by their integer label.
plt.scatter(*X_reduced.T, c=y.astype(int), cmap="jet")
plt.axis('off')
plt.colorbar()
plt.show()

## PCA and LocallyLinearEmbedding
import time
from sklearn.pipeline import Pipeline

# PCA keeping 95 % of the variance, followed by LLE down to two dimensions.
pca_lle = Pipeline([
    ("pca", PCA(n_components=0.95, random_state=42)),
    ("lle", LocallyLinearEmbedding(n_components=2, random_state=42)),
])
t0 = time.time()
X_pca_lle_reduced = pca_lle.fit_transform(X)
t1 = time.time()
print("PCA+LLE took {:.1f}s.".format(t1 - t0))
# NOTE(review): plot_digits is not defined in the visible part of this file;
# it is called with the 2-D embedding and the integer labels -- confirm.
plot_digits(X_pca_lle_reduced, y.astype(int))
plt.show()

## MDS
X_mds_reduced = MDS(n_components=2, random_state=42).fit_transform(X_train)

### KMeans

In [None]:
## Imports: 
from sklearn.cluster import KMeans

## K-Means:
# Settings shared by every K-Means run in this snippet.
km_settings = dict(
    init='random',
    n_init=10,
    max_iter=300,
    tol=1e-04,
    random_state=0,
)
km = KMeans(n_clusters=8, **km_settings)
# fit_predict returns the cluster index of every sample.
y_km = km.fit_predict(X)
km.score(X)
PlotClusters(X,y_km, km)

## For Loop Kmeans (Variation of the number of k-clusers):
for n_clusters in (4, 5, 6, 7, 8):
    km = KMeans(n_clusters=n_clusters, **km_settings)
    y_km = km.fit_predict(X)
    PlotClusters(X,y_km, km)


## Elbow Methods Plot: (k-means++) = Putting centroids far away
maxNumberOfClusters=15
cluster_counts = range(1, maxNumberOfClusters)
distortions = []
ScoreList   = []
for i in cluster_counts:
    km = KMeans(
        n_clusters=i,
        init='k-means++',
        n_init=10,
        max_iter=300,
        random_state=0,
    ).fit(X)
    # Two per-model values are recorded: the inertia and the negated score on X.
    distortions.append(km.inertia_)
    ScoreList.append(-km.score(X))
# Plot the ellbow plot:
for values, marker in ((distortions, 'o'), (ScoreList, '^')):
    plt.plot(cluster_counts, values, marker=marker)
plt.xlabel('Number of clusters')
plt.ylabel('Distortion')
plt.tight_layout()
plt.grid(True)
plt.show()

## Plot Clusters:
from matplotlib import colors as mcolors
# Base colour names first, followed by the CSS4 names.
colors = {**mcolors.BASE_COLORS, **mcolors.CSS4_COLORS}
ColorNames = list(colors)
HSV = colors.values()
def PlotClusters(X,y, km):
    """Scatter the first two columns of X per cluster and mark the centroids.

    X:  samples, indexed as X[mask, column] (presumably a 2-D NumPy array).
    y:  cluster index per sample, compared against 0 .. km.n_clusters - 1.
    km: fitted clustering model providing n_clusters and cluster_centers_.
    """
    for ClusterNumber in range(km.n_clusters):
        members = y == ClusterNumber
        plt.scatter(
            X[members, 0],
            X[members, 1],
            s=50,
            # Colour lookup is offset by one, so the first name in the list is never used.
            c=ColorNames[ClusterNumber+1],
            marker='s',
            edgecolor='black',
            label='cluster {0}'.format(ClusterNumber+1),
        )
    centers = km.cluster_centers_
    plt.scatter(
        centers[:, 0],
        centers[:, 1],
        s=250,
        marker='*',
        c='red',
        edgecolor='black',
        label='centroids',
    )
    plt.legend(scatterpoints=1)
    plt.grid()
    plt.tight_layout()
    plt.show()

## Kmeans on images:
from sklearn.utils import shuffle

# One row per pixel, one column per channel.
# NOTE(review): original_img, width, height and depth are not defined in the
# visible part of this file -- presumably the image array and its dimensions.
image_flattened = np.reshape(original_img, (width * height, depth))
# Fit on a random sample of 1000 pixels, then assign every pixel to a cluster.
image_array_sample = shuffle(image_flattened, random_state=0)[:1000]
estimator = KMeans(n_clusters=8, random_state=0)
estimator.fit(image_array_sample)
cluster_assignments = estimator.predict(image_flattened)

## Spectral Clustering
from sklearn.cluster import SpectralClustering
# Two clusters; affinity built from nearest neighbours, labels assigned with k-means.
model = SpectralClustering(
    n_clusters=2,
    affinity='nearest_neighbors',
    assign_labels='kmeans',
)

# NOTE(review): X_mn is not defined in the visible part of this file.
labelsS = model.fit_predict(X_mn)

### DBSCAN

In [None]:
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler

data = pd.read_csv('stores.csv')
# Cluster on the two coordinate columns only.
features = data[['latitude', 'longitude']]
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)
# eps and min_samples are applied to the standardized coordinates.
dbscan = DBSCAN(eps=0.1, min_samples=5).fit(scaled_features)
labels = dbscan.labels_
# Store the cluster label of every row next to its data.
data['cluster_label'] = labels
print(data)

import numpy as np

# NOTE(review): X_pca is not defined in the visible part of this file --
# presumably PCA-reduced features; confirm.
for eps in np.linspace(14,20,11):
    print("\neps={}".format(eps))
    dbscan = DBSCAN(eps=eps, min_samples=3)
    labels = dbscan.fit_predict(X_pca)
    # DBSCAN labels noise points with -1; they do not form a cluster and are
    # therefore excluded from the cluster count.
    n_clusters = len(set(labels) - {-1})
    print("Number of clusters: {}".format(n_clusters))
    # Shifted by one so that index 0 holds the number of noise points.
    print("Cluster sizes: {}".format(np.bincount(labels + 1)))

### Gaussian Mixture Model

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import train_test_split

n_components = np.arange(1, 21)
# One fitted mixture per candidate number of components. GaussianMixture is
# imported directly in this cell, so it is used without a module prefix.
models = [GaussianMixture(n, covariance_type='full',
                          random_state=42).fit(X_train)
          for n in n_components]
# Compute each criterion once and reuse it for the curves and the markers.
bic_scores = [m.bic(X_train) for m in models]
aic_scores = [m.aic(X_train) for m in models]

fig, ax = plt.subplots(figsize=(9,7))
ax.plot(n_components, bic_scores, label='BIC')
ax.plot(n_components, aic_scores, label='AIC')
# argmin is a 0-based position; map it back to the component count so the
# marker sits on the minimum of its curve.
ax.axvline(n_components[np.argmin(bic_scores)], color='blue')
ax.axvline(n_components[np.argmin(aic_scores)], color='green')

plt.legend(loc='best')
plt.xlabel('n_components')


## Visualisations

In [None]:
# Line Plot: Matplotlib
fig, ax = plt.subplots(figsize = (10, 6))
# Only y values are passed to plot(), so the x-axis is the index and the
# y-axis shows the calories.
ax.set(title="Calories",xlabel="index",ylabel="calories")
# Define (x,y) -> if not specified => Index
ax.plot(df.Calories)
plt.show()

# Line plot with range:
fig, ax = plt.subplots(figsize=(10, 6))
ax.set(title="Calories", xlabel="Index", ylabel="Calories")
# At most the first 100 values; x is derived from the number of values that
# are actually present, so shorter frames work as well.
calories = df["Calories"].values[:100]
x_values = np.arange(1, len(calories) + 1)  # Specify the desired range for the x-axis
ax.plot(x_values, calories)
plt.show()

# Scatter plot:
fig, ax = plt.subplots(figsize = (10, 6))
# Axis labels name the columns that are plotted: Duration (x) and Pulse (y).
ax.set(title="Scatterplot",xlabel="Duration",ylabel="Pulse")
ax.scatter(df.Duration,df.Pulse, c=df.Pulse, cmap="coolwarm_r")
plt.show()

# Scatterplot with SNS
fig, ax = plt.subplots(figsize = (10, 6))
ax.set(title="Scatterplot",xlabel="Duration",ylabel="Pulse")
# data= and ax= are passed by keyword so the call draws on the axes created above.
sns.scatterplot(data=df, x="Duration", y="Pulse", hue="Pulse", ax=ax)
plt.show()

# Vertical Barplot:
fig, ax = plt.subplots(figsize = (10, 6))
# bar(x, height): Duration on the x-axis, Pulse as the bar height.
ax.set(title="Barplot",xlabel="Duration",ylabel="Pulse")
ax.bar(df.Duration,df.Pulse, color = 'darkblue')
plt.show()

# Horizontal Barplot:
fig, ax = plt.subplots(figsize = (10, 6))
# barh(y, width): Duration on the y-axis, Pulse as the bar length.
ax.set(title="Barplot",xlabel="Pulse",ylabel="Duration")
ax.barh(df.Duration,df.Pulse, color = 'darkblue')
plt.show()

# Histograms:
fig, ax = plt.subplots(figsize = (10, 6))
# The x-axis shows the values of the plotted column, the y-axis their count per bin.
ax.set(title="Histogram",xlabel="Pulse",ylabel="Frequency")
ax.hist(df.Pulse, color = 'darkblue',bins=50)
plt.show()

## Hard one: Barplot with hue (dependencies):
# Number of rows per (Drug, Cholesterol) combination, as a flat table with a "Count" column.
df_CH_Drug = (
    df.groupby(["Drug", "Cholesterol"])
    .size()
    .reset_index(name="Count")
)
plt.figure(figsize=(9, 5))
# One group of bars per drug, one bar per cholesterol level.
sns.barplot(data=df_CH_Drug, x="Drug", y="Count", hue="Cholesterol")
plt.title("Cholesterol -- Drug")
plt.show()

# Multiple plots: (1 row 3 columns)
fig, ax = plt.subplots(1, 3, figsize=(18,5))
# One subplot per k; each shows two scatter plots: the data points and the
# predicted points p (marked with "x").
# NOTE(review): `p` and `data` are not defined in the visible part of this
# file -- presumably DataFrames with columns x1, x2 (and y for data).
for i, k in enumerate([1, 3, 5]):
  model = KNeighborsClassifier(n_neighbors=k).fit(X,y)
  pred = model.predict(p[["x1", "x2"]])
  ax[i].scatter(data.x1, data.x2, c=data.y, cmap="coolwarm_r")
  ax[i].scatter(p.x1, p.x2, c=pred, cmap="coolwarm_r", marker="x")
  ax[i].set_title("KNN with k = " + str(k))

# Markers = o = circle . = point

# A vertical line can be added with axvline (axhline(y=...) draws a horizontal one).
# `ax` is an array of Axes here, so pick the subplot to draw on:
ax[0].axvline(x=4, c="k",  ls="--")

## Pairplot: Dependencies between target = Drug and other:
# pairplot creates its own figure, so no plt.figure() call is made beforehand
# (it would only leave an empty figure behind). Use the `height` argument of
# pairplot to change the size of the individual panels.
sns.pairplot(df, hue="Drug")

## Countplot: Target and Categorical Input Features:
# Count the rows of every (Drug, Cholesterol) pair and name the result column "Count".
pair_sizes = df.groupby(["Drug", "Cholesterol"]).size()
df_CH_Drug = pair_sizes.reset_index(name="Count")
plt.figure(figsize=(9, 5))
sns.barplot(data=df_CH_Drug, x="Drug", y="Count", hue="Cholesterol")
plt.title("Cholesterol -- Drug")
plt.show()

# Multiple plot:
# DataFrame.hist creates its own figure with one histogram per numeric
# column, so no separate plt.figure() call is made (it would stay empty).
df.hist(figsize=(12,12),bins=37)
plt.show()

# Correlation Matrix:
import seaborn as sns
# numeric_only=True: since pandas 2.0, corr() raises on non-numeric columns
# instead of silently skipping them.
corrmatrix = df.corr(numeric_only=True)
corrmatrix
sns.heatmap(corrmatrix)

## Advanced correlation Matrix (absolute)
# Correlations between the feature columns (every column except the last).
features = df.iloc[:, :-1].columns
corr = df[features].corr()

# Stack into long format: one row per (F1, F2) pair with its correlation.
dcorr = corr.stack().reset_index()
dcorr.columns = ['F1', 'F2', 'Corr']
# Remove duplicate correlations: frozenset makes a pair order-independent, so
# the mirrored (F2, F1) row counts as a duplicate of (F1, F2); the second
# condition removes the self-correlations on the diagonal.
mask_dups = (dcorr[['F1', 'F2']].apply(frozenset, axis=1).duplicated()) | (dcorr['F1'] == dcorr['F2'])
# .copy() gives an independent frame, so the new column below is not written
# to a slice of the stacked frame (avoids SettingWithCopyWarning).
dcorr = dcorr[~mask_dups].copy()
dcorr['Corr_abs'] = dcorr['Corr'].abs()

# Highest absolute:
dcorr.sort_values(by=['Corr_abs'], ascending=False).head(10)
# Highest Positive:
dcorr.sort_values(by=['Corr'], ascending=False).head(10)
# Highest Negative:
dcorr.sort_values(by=['Corr'], ascending=True).head(10)

# Plot it:
f, ax = plt.subplots(figsize=(10, 10))
cmap = sns.diverging_palette(220, 10, as_cmap=True)
_ = sns.heatmap(corr, cmap=cmap, square=True, linewidths=.5, cbar_kws={"shrink": .5})

## KNN Scatter Plot (2 Classes):
# One panel per k: the points in `data` coloured by `data.y`, overlaid with
# the model's predictions for the points in `p` (drawn with "x" markers).
fig, ax = plt.subplots(1, 3, figsize=(18, 5))
for panel, k in zip(ax, [1, 3, 5]):
    model = KNeighborsClassifier(n_neighbors=k).fit(X, y)
    pred = model.predict(p[["x1", "x2"]])
    panel.scatter(data.x1, data.x2, c=data.y, cmap="coolwarm_r")
    panel.scatter(p.x1, p.x2, c=pred, cmap="coolwarm_r", marker="x")
    panel.set_title(f"KNN with k = {k}")

## Confusion Matrix:
conf = confusion_matrix(y_test, pred)
labels = []  # used for both the x and the y tick labels of the heatmap
heatmap_options = dict(
    annot=True,
    fmt='d',
    cbar=False,
    cmap="coolwarm_r",
    xticklabels=labels,
    yticklabels=labels,
    linewidth=1,
)
heatmap_axes = sns.heatmap(conf, **heatmap_options)
heatmap_axes.set_title("Confusion Matrix")
heatmap_axes.set_xlabel('Predicted Label')
heatmap_axes.set_ylabel('True Label')
plt.show()

## ROC and Precision-Recall:
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score

# Predicted probabilities for X_test, keeping only column 1 (the positive class):
probs = model.predict_proba(X_test)[:, 1]

# Function for plotting the ROC curve
def plot_roc_curve(fpr, tpr):
    """Draw tpr against fpr plus a dashed diagonal reference line, then show the figure.

    fpr -- values for the x-axis (labelled 'False Positive Rate')
    tpr -- values for the y-axis (labelled 'True Positive Rate')
    """
    axes = plt.gca()
    axes.plot(fpr, tpr, color='orange', label='ROC')
    # Diagonal from (0, 0) to (1, 1), shown in the legend as the random classifier.
    axes.plot([0, 1], [0, 1], color='darkblue', linestyle='--', label='random classifier')
    axes.set_xlabel('False Positive Rate')
    axes.set_ylabel('True Positive Rate')
    axes.set_title('Receiver Operating Characteristic (ROC) Curve')
    axes.legend()
    plt.show()

# Plot ROC curve & Compute the Area Under the ROC Curve (AUC) - the ROC AUC score
auc = roc_auc_score(y_test, probs)
roc_points = roc_curve(y_test, probs)
fpr, tpr, thresholds = roc_points
plot_roc_curve(fpr, tpr)
print("AUC: ", round(auc, 3))

# Function for plotting the Precision-Recall curve
def plot_rpc(recall, precision):
    """Draw precision (y-axis) against recall (x-axis) with a legend, then show the figure."""
    axes = plt.gca()
    axes.plot(recall, precision, color='orange', label='RPC')
    axes.set_ylabel('Precision')
    axes.set_xlabel('Recall = True Positive Rate')
    axes.set_title('Recall-Precision Curve')
    axes.legend()
    plt.show()

# Plot Precision-Recall curve  & Compute average precision - Precision-Recall AUC
average_precision = average_precision_score(y_test, probs)
# The third return value of precision_recall_curve is not needed here.
precision, recall, _ = precision_recall_curve(y_test, probs)
plot_rpc(recall, precision)
print("Average Precision: ", round(average_precision, 3))

## Pandas

In [None]:
import pandas as pd
# Read a file (pandas is imported as `pd` above):
df = pd.read_csv("filepath.csv")

# Head or Tail:
df.head(5), df.tail(5)

# Summary of the data (the column types live in the `dtypes` attribute):
df.info(), df.describe(), df.dtypes

# Column Names:
df.columns

# Shape of dataframe (rows, columns)
df.shape
df.shape[0]  # number of rows
df.shape[1]  # number of columns

# Select columns by name, or by position (positions 1 to 3; the stop 4 is excluded)
cols = ["CRASH DATE", "BOROUGH", "NUMBER OF PERSONS INJURED"]
df.loc[:, cols]
df.iloc[:, 1:4]

# Selecting rows (first 5)
df[:5]

# Selecting rows and columns: rows 1 & 4 and columns 0-2
row_positions = [1, 4]
df.iloc[row_positions, :3]

# Changing a value: by index label / column name, or by integer position
df.loc[0, "BOROUGH"] = "BROOKLYN"
df.iloc[0, 1] = "Brook"

# Boolean Indexing:
# Keep the rows with -74.5 < LONGITUDE < -50 and LATITUDE < 41.
lon_below_max = df.LONGITUDE < -50
lon_above_min = df.LONGITUDE > -74.5
lat_below_max = df.LATITUDE < 41
boolean_condition = lon_below_max & lon_above_min & lat_below_max
df_filtered = df.loc[boolean_condition]

# Counting Number of Values in Column Frequency
# (bracket and attribute access are equivalent; the method is value_counts):
df["BOROUGH"].value_counts()
df.BOROUGH.value_counts()

# Relative Frequency:
df.Drug.value_counts(normalize=True)

# Drop NaN values:
df = pd.read_csv('data.csv')
df.dropna(inplace=True)

# Fill NaN Values (All or column specific)
df.fillna(130, inplace=True)
# Assign the result back to the column: fillna(inplace=True) called on
# df["Calories"] is chained assignment and does not reliably update df.
df["Calories"] = df["Calories"].fillna(130)

# Fill NaN with the mean/median of the column (keep only one of the two x lines):
x = df["Calories"].mean()
x = df["Calories"].median()
df["Calories"] = df["Calories"].fillna(x)

# Convert to date: parse the Date column and store the result back in place of it.
parsed_dates = pd.to_datetime(df['Date'])
df['Date'] = parsed_dates

# Delete rows by condition: drop every row whose Duration is greater than 120.
# The boolean mask selects the index labels to remove in one vectorised call
# instead of testing and dropping row by row in a Python loop.
df.drop(df.index[df["Duration"] > 120], inplace=True)

# Duplicate rows: print the per-row duplicate flags, then remove the duplicates in place.
duplicate_flags = df.duplicated()
print(duplicate_flags)
df.drop_duplicates(inplace=True)

## Groupby:
# Sum up the number of all injured persons per borough for all different boroughs
df.groupby("BOROUGH", as_index=False)["NUMBER OF PERSONS INJURED"].sum()

# Multiple Aggregation functions (sum & max):
# Named aggregation maps output column name -> function. The names contain
# spaces, so they are passed with ** instead of as plain keyword arguments.
# (A renaming dict passed to agg() on a single column is not supported.)
df.groupby("BOROUGH", as_index=False)["NUMBER OF PERSONS INJURED"].agg(
    **{"SUM INJURED": "sum", "MAX INJURED": "max"}
)

# Apply an aggregating function on multiple variables
df.groupby(["BOROUGH", "VEHICLE TYPE CODE 1"])[["NUMBER OF PERSONS INJURED", "CONTRIBUTING FACTOR VEHICLE 1"]].sum()

## Examples:
# The row masks get descriptive names; calling one `bool` would shadow the
# builtin for the rest of the notebook.

# 1. Find the average age of the players for the year 2017.
in_2017 = (df.Year == 2017)
df[in_2017].Age.mean()

# 2. Plot the total number of points (`PTS`) per year since 2000
since_2000 = (df.Year >= 2000)
df[since_2000].groupby("Year", as_index=True).PTS.sum().plot(kind="barh")

# 3. Plot the number of players per year since 2010.
since_2010 = (df.Year >= 2010)
df[since_2010].Year.value_counts(sort=True).plot(kind="bar")

## Direct Plotting inside of pandas:

# Plots everything:
df.plot()
plt.show()  # the figure is displayed through pyplot; a DataFrame has no show()

# Scatterplot (Duration / Calories)
df.plot(kind='scatter', x='Duration', y='Calories')
plt.show()

# Histogram: with the default binning, then with 50 bins
injured = df["NUMBER OF PERSONS INJURED"]
injured.hist()
injured.hist(bins=50)

# Bar Charts (Vertical and Horizontal) of how often each borough occurs
df["BOROUGH"].value_counts().plot.bar()
df["BOROUGH"].value_counts().plot.barh()

# Line Plot:
injury_counts = df['NUMBER OF PERSONS INJURED'].value_counts()
line_options = {
    "kind": 'line',   # a line plot because the x-axis is numeric/continuous
    "marker": 'o',    # a marker shows where there are actual data points
    "logy": True,     # logarithmic y-axis
}
plot = injury_counts.plot(**line_options)
plot.set_xlabel("Number of injuries")
plot.set_ylabel("Number of collisions")

# KDE (Kernel Density Estimation)
# Density curve of the injury counts, drawn in black on a 15x5 figure with
# the x-axis limited to 0..5.
injured_column = df["NUMBER OF PERSONS INJURED"]
injured_column.plot.kde(color='Black', xlim=(0, 5), figsize=(15, 5))

## Text Analysis

In [None]:
!pip install -U spacy
!python -m spacy download en
!pip install -U gensim

import spacy
from spacy import displacy
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


# Load English Model:
sp = spacy.load('en_core_web_sm')
text = """When learning data science, you shouldn't get discouraged!
Challenges and setbacks aren't failures, they're just part of the journey. You've got this!"""
# Create document by running the loaded pipeline on the text:
my_doc = sp(text)
my_doc
# Tokenisation: collect the text of every token in the document.
token_list = [token.text for token in my_doc]

# POS Tags: print every token together with its part-of-speech tag.
for token in my_doc:
    print(token.text, token.pos_)
# The same word "fish" used in two different roles:
doc1 = sp("I like to fish") # verb
doc2 = sp("I eat a fish") # noun
for doc in (doc1, doc2):
    for token in doc:
        print(token.text, token.pos_)

# Split up into sentences: the text of every sentence span of the document.
sents_list = [sent.text for sent in my_doc.sents]

# Remove Stop-words:
# Import the stop-word set explicitly; reaching it as an attribute chain
# (spacy.lang.en.stop_words) only works if that submodule happens to have
# been imported already.
from spacy.lang.en.stop_words import STOP_WORDS

spacy_stopwords = STOP_WORDS
print('Number of stopwords: %d' % len(spacy_stopwords))
print('20 stopwords: %s' % list(spacy_stopwords)[:20])
# Keep the text of every token that is not flagged as a stop word.
filtered_sent = [word.text for word in my_doc if not word.is_stop]

# Remove stop words & punctuation & spaces:
# Tokens with any of the three flags go to removed_tokens, the rest is kept.
filtered_sent2 = []
removed_tokens = []
for word in my_doc:
    if word.is_stop or word.is_punct or word.is_space:
        removed_tokens.append(word.text)
    else:
        filtered_sent2.append(word.text)

# Lemmatisation: print each token next to its lemma.
lem = sp("run runs ran running runner runners")
for token in lem:
    surface, lemma = token.text, token.lemma_
    print(surface, lemma)

## Text Representations:
# Three short example documents used by the vectorizers.
s1 = """
President Donald Trump gets a lot of attention for using Twitter to attack American trading partners, political foes, and media companies."""
s2 = """Donald Trump is a great friend, and he has four or five Picassos on his plane. And that's where I would look at them.""" # from Shaquille O'Neal
s3 = """Donald Trump is a phony, a fraud. His promises are as worthless as a degree from Trump University.""" # from Mitt Romney
texts = [s1, s2, s3]

# Bag-of-words: counts of unigrams and bigrams, English stop words excluded.
count = CountVectorizer(stop_words="english", ngram_range=(1, 2))
bow = count.fit_transform(texts)
# Show matrix (dense array of the sparse result):
bow.toarray()
# Get Feature names & Show as Dataframe (one column per feature):
feature_names = count.get_feature_names_out()
pd.DataFrame(data=bow.todense(), columns=feature_names)

# TF-IDF: unigram weights, English stop words excluded.
tfidf = TfidfVectorizer(stop_words="english", ngram_range=(1, 1))
features = tfidf.fit_transform(texts)
unigram_names = tfidf.get_feature_names_out()
pd.DataFrame(data=features.todense(), columns=unigram_names)

# N-Grams (with TF-IDF bigram): only bigrams, with a custom stop-word list.
custom_stop_words = ["and", "a", "is"]
tfidf = TfidfVectorizer(stop_words=custom_stop_words, ngram_range=(2, 2))
features = tfidf.fit_transform(texts)
bigram_names = tfidf.get_feature_names_out()
pd.DataFrame(data=features.todense(), columns=bigram_names)

## CNN

In [None]:
# load required packages
import tensorflow as tf
from tensorflow.keras.datasets import fashion_mnist
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Conv2D, Dropout, MaxPooling2D, Activation, BatchNormalization
import numpy as np
import matplotlib.pyplot as plt
print(tf.__version__) #version should be at least 1.15.x

# Load the Fashion-MNIST train and test splits (images and labels).
train_split, test_split = fashion_mnist.load_data()
train_images, train_labels = train_split
test_images, test_labels = test_split
# names of class labels (we have ten classes)
class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
               'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']

# Arrange each split as (number of images, 28, 28).
train_images = train_images.reshape((train_images.shape[0], 28, 28))
test_images = test_images.reshape((test_images.shape[0], 28, 28))

# check the shapes of the training and test data
# expected: 60'000 training / 10'000 test images with 28x28 pixels each,
# and one label (out of 10 classes) per image
for caption, array in [
    ("shape for training (x) data : ", train_images),
    ("shape for training (y) data : ", train_labels),
    ("shape for test (x) data     : ", test_images),
    ("shape for test (y) data     : ", test_labels),
]:
    print(caption, array.shape)

# to give you an overview of the data plot first 25 images with corresponding labels
# (5x5 grid, ticks and grid lines hidden, class name shown below each image)
plt.figure(figsize=(10, 10))
first_images = zip(train_images[:25], train_labels[:25])
for position, (image, label) in enumerate(first_images, start=1):
    plt.subplot(5, 5, position)
    plt.xticks([])
    plt.yticks([])
    plt.grid(False)
    plt.imshow(image, cmap=plt.cm.binary)
    plt.xlabel(class_names[label])
plt.show()

model = Sequential()
# data reshaped for Convolution2D: add a trailing channel axis of size 1.
# -1 lets NumPy infer the number of images, so the code does not depend on
# the hard-coded split sizes 60000 / 10000.
train_images = train_images.reshape(-1, 28, 28, 1)
test_images = test_images.reshape(-1, 28, 28, 1)

# Three convolution stages (32 -> 64 -> 128 filters), each followed by batch
# normalisation, 2x2 max pooling and dropout (0.3 / 0.4 / 0.5).
model.add(Conv2D(filters=32, kernel_size=(3, 3), strides=1, padding='same', input_shape=(28, 28, 1), activation="relu"))
model.add(BatchNormalization())
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.3))
model.add(Conv2D(filters=64, kernel_size=(4, 4), strides=1, padding='same', activation="relu"))
model.add(BatchNormalization())
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.4))
model.add(Conv2D(filters=128, kernel_size=(5, 5), strides=1, padding='same', activation="relu"))
model.add(BatchNormalization())
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.5))
# Flatten the feature maps and feed them to a 128-unit dense layer.
model.add(Flatten())
model.add(Dense(128, activation='relu'))

# final output softmax layer for 10 classes (do not modify this layer)
model.add(Dense(10, activation='softmax'))

# print a summary of your model
model.summary()
# Compile with the Adam optimizer and a loss for integer class labels,
# tracking accuracy.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
# Train for 10 epochs in batches of 128; 10% of the training data is used
# for validation.
fit_options = dict(batch_size=128, epochs=10, validation_split=0.1)
log = model.fit(train_images, train_labels, **fit_options)

# Accuracy per epoch, for the training and the validation part.
for history_key, curve_label in (
    ('accuracy', 'Training Accuracy'),
    ('val_accuracy', 'Validation Accuracy'),
):
    plt.plot(log.history[history_key], label=curve_label)
plt.xlabel("Epochs")
plt.ylabel("Accuracy")
plt.legend()
plt.grid()

# evaluate() returns the loss first, then the compiled metric (accuracy).
test_scores = model.evaluate(test_images, test_labels, verbose=0)
test_loss, test_accuracy = test_scores[0], test_scores[1]
print("Test loss:", test_loss)
print("Test accuracy:", test_accuracy)

# you can also make predictions for the test data
predictions = model.predict(test_images)

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
# Predicted class = position of the largest value in each prediction row.
max_probability_predictions = np.argmax(predictions, axis=1)
# Build the confusion matrix and wrap it in a display object for plotting.
conf_mat = ConfusionMatrixDisplay(
    confusion_matrix(test_labels, max_probability_predictions)
)
conf_mat.plot()

