In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler

from keras.utils import np_utils
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Dropout, Dense, Activation
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.optimizers import Adam

# Step 1: Data Exploration

In [None]:
# Competition input directory and the two CSV files read from it.
MAIN_PATH = '../input/msbd5001-spring-2022/'
TRAIN_PATH = f'{MAIN_PATH}train.csv'
TEST_PATH = f'{MAIN_PATH}test.csv'

In [None]:
# Read the training CSV, using its first column as the DataFrame index.
df = pd.read_csv(TRAIN_PATH, index_col=0)
# Print (rows, columns), then show the first rows as the cell output.
print(df.shape)
df.head()

In [None]:
# Display pandas' summary statistics for the loaded table.
df.describe()

In [None]:
# Discard every row that contains at least one missing value, then show the
# remaining (rows, columns).
df = df.dropna()
df.shape

In [None]:
# Keep the target column as its own single-column frame for plotting/counting.
label = df.loc[:, ['label']].copy()

# Horizontal bar chart with one bar per class.
fig, ax = plt.subplots()
sns.countplot(y='label', data=label, ax=ax)
ax.set_title('Class Distribution')
plt.show()

# Exact number of rows per class, shown as the cell output.
label.value_counts()

# Step 2: Preprocessing

In [None]:
# Model inputs: every column except the target.
features = df.drop(columns=['label'])
# Take the target straight from `df` as a 1-D array. Deriving it here (instead
# of converting the existing `label` variable in place) keeps this cell
# re-runnable: after the first run `label` is already an ndarray, which has no
# `.to_numpy()` method, so the original statement failed on a second execution.
label = df['label'].to_numpy()

In [None]:
# Rescale every feature column with a min-max scaler; `scaler` is kept so the
# same fitted transform can be reused afterwards.
# NOTE(review): the scaler is fitted on all rows before the train/validation
# split below, so validation rows influence the scaling statistics.
scaler = MinMaxScaler()
features = scaler.fit_transform(features)

In [None]:
# train validation split for tensorflow
# One-hot encode the two classes, then hold out 20% of the rows for validation
# (fixed seed so the split is repeatable).
y = np_utils.to_categorical(label, num_classes=2)
X_train, X_val, y_train, y_val = train_test_split(
    features,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 3: Modeling

## Scikit Learn

### SVM

In [None]:
# RBF-kernel SVM scored with 5-fold cross-validation.
# The scaler lives inside the pipeline so it is re-fitted on the training part
# of every fold; the held-out fold no longer contributes to the min/max
# statistics, which previously made the scores optimistic.
clf = make_pipeline(
    MinMaxScaler(),
    svm.SVC(kernel='rbf', C=1, random_state=42),
)
scores = cross_val_score(clf, features, label, cv=5)
print(scores)
print(scores.mean())

In [None]:
# Degree-3 polynomial-kernel SVM scored with 5-fold cross-validation.
# The scaler lives inside the pipeline so it is re-fitted on the training part
# of every fold; the held-out fold no longer contributes to the min/max
# statistics, which previously made the scores optimistic.
clf = make_pipeline(
    MinMaxScaler(),
    svm.SVC(kernel='poly', degree=3, C=1, random_state=42),
)
scores = cross_val_score(clf, features, label, cv=5)
print(scores)
print(scores.mean())

### KNN

In [None]:
# 5-nearest-neighbours classifier scored with 5-fold cross-validation.
# Distances depend directly on feature scale, so the scaler is placed inside
# the pipeline and re-fitted on the training part of every fold; the held-out
# fold no longer contributes to the min/max statistics.
clf = make_pipeline(
    MinMaxScaler(),
    KNeighborsClassifier(n_neighbors=5),
)
scores = cross_val_score(clf, features, label, cv=5)
print(scores)
print(scores.mean())

### Random Forest

In [None]:
# 100-tree random forest scored with 5-fold cross-validation.
# The seed is fixed (matching the other models and the data split) so the
# reported scores are reproducible from one run to the next.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
scores = cross_val_score(clf, features, label, cv=5)
print(scores)
print(scores.mean())

## TensorFlow

In [None]:
# Number of passes over the training data for the Keras model.
NUM_EPOCHS = 1000
# Each mini-batch holds 80% of the training rows (truncated to an integer).
BATCH_SIZE = int(0.8 * X_train.shape[0])

### ANN

In [None]:
# cross validation for tensorflow
def kfold(features, label, K, model):
    """Placeholder for cross-validating a TensorFlow model.

    Not implemented: all four arguments are accepted but ignored, and the
    function always returns ``None``.
    """
    # TODO: implement the fold loop; until then there is nothing to compute.
    return None

In [None]:
# Inspect the (rows, columns) of the training split.
X_train.shape

In [None]:
# Scratch arithmetic. Presumably 86 is the training-row count shown by the
# previous cell and this mirrors the BATCH_SIZE formula — TODO confirm or remove.
86*0.8

In [None]:
# Small fully connected classifier: input -> 16 -> 8 -> 2 class probabilities.
ann_model = Sequential()
# Take the input width from the training data instead of hard-coding the
# number of feature columns.
ann_model.add(Dense(16, input_shape=(X_train.shape[1],), activation='relu'))
ann_model.add(Dense(8, activation='relu'))
# Softmax yields a proper probability distribution over the two one-hot encoded
# classes, which is what the categorical cross-entropy loss expects; the
# previous sigmoid output produced two independent, non-normalised scores.
ann_model.add(Dense(2, activation='softmax'))
ann_model.summary()

In [None]:
# Configure training: categorical cross-entropy on the one-hot targets,
# optimised with Adam (hyper-parameters spelled out explicitly), tracking accuracy.
ann_model.compile(
    loss='categorical_crossentropy',
    metrics=['accuracy'],
    optimizer=Adam(
        learning_rate=0.001,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-07,
    ),
)

# Save the model to 'ann_model.h5', overwriting it only when the monitored
# quantity improves; verbose=1 logs each save.
checkpointer = ModelCheckpoint(
    filepath='ann_model.h5',
    save_best_only=True,
    verbose=1,
)

In [None]:
# Train the network, evaluating on the held-out split after every epoch and
# letting the checkpoint callback persist the model. `hist` keeps the
# per-epoch metrics for the plots that follow.
hist = ann_model.fit(
    x=X_train,
    y=y_train,
    epochs=NUM_EPOCHS,
    batch_size=BATCH_SIZE,
    validation_data=(X_val, y_val),
    callbacks=[checkpointer],
    verbose=1,
)

In [None]:
# Plot the loss, then the accuracy, through the epochs: one wide figure per
# metric, each with a training curve and a validation curve.
curves = (
    ('loss', 'val_loss', 'Loss Function'),
    ('accuracy', 'val_accuracy', 'Accuracy'),
)
for train_key, valid_key, title in curves:
    plt.figure(figsize=(18, 4))
    plt.plot(hist.history[train_key], label='train')
    plt.plot(hist.history[valid_key], label='valid')
    plt.legend()
    plt.title(title)
    plt.show()