In [1]:
# Loading Packages

# EDA
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import numpy as np
from numpy import sort
from collections import Counter

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.feature_selection import RFE, SelectFromModel
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix, roc_curve, auc, log_loss, roc_auc_score 
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RepeatedStratifiedKFold, GridSearchCV
import xgboost

# Deep Learning
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras import layers
from keras import utils
from keras.callbacks import ReduceLROnPlateau, EarlyStopping
from keras.regularizers import l2
from keras.preprocessing.image import ImageDataGenerator

# Settings
import pickle
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the labelled training table and discard exact duplicate rows.
ds = pd.read_csv('train.csv').drop_duplicates()

# Target vector and feature frame (every column except 'label').
y = ds['label'].values
X = ds.drop(columns='label')

# Hold out 20% of the rows as the test set, then carve 10% of the remainder
# off as the validation set; both splits use the same fixed seed.
x_train_val, x_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)
x_train, x_val, y_train, y_val = train_test_split(
    x_train_val, y_train_val, test_size=0.10, random_state=42
)

# Divide every feature by 255 in all three partitions
# (presumably 8-bit pixel intensities -- TODO confirm against the data).
x_train, x_val, x_test = (part / 255 for part in (x_train, x_val, x_test))
#model

def model_score(model, x_train, y_train, x_val, y_val, x_test, y_test,
                epochs=128, batch_size=32):
    """Compile, train and evaluate a binary classifier.

    The model is compiled in place with the Adam optimizer and binary
    cross-entropy loss, trained on the training split while monitoring the
    validation split, and finally evaluated on the test split.

    Parameters
    ----------
    model : Keras model
        Un-compiled (or re-compilable) model; it is modified in place.
    x_train, y_train : training features and labels.
    x_val, y_val : validation features and labels, passed to ``fit`` as
        ``validation_data`` and therefore driving both callbacks.
    x_test, y_test : held-out features and labels used only for the final
        ``evaluate`` call.
    epochs : int, default 128
        Upper bound on training epochs; early stopping may end sooner.
    batch_size : int, default 32
        Batch size used for both training and evaluation.

    Returns
    -------
    list
        ``[model, history, score]`` where ``history`` is the object returned
        by ``model.fit`` and ``score`` is the result of ``model.evaluate``
        on the test split.
    """
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy', 'AUC', 'Recall', 'Precision']
    )

    # Both callbacks watch the validation loss: the learning rate is reduced
    # after 5 epochs without improvement, and training stops after 12 such
    # epochs, restoring the best weights seen.
    callbacks = [
        ReduceLROnPlateau(monitor='val_loss', patience=5, cooldown=0),
        EarlyStopping(monitor='val_loss', min_delta=1e-4, patience=12,
                      restore_best_weights=True),
    ]

    print("\n MODEL BUILDING ============================================================================ \n")

    history = model.fit(
        x_train, y_train,
        epochs=epochs,
        batch_size=batch_size,
        validation_data=(x_val, y_val),
        callbacks=callbacks
    )

    print("\n MODEL EVALUATION ON TEST SET ============================================================================ \n")

    score = model.evaluate(x_test, y_test, batch_size=batch_size)

    return [model, history, score]


def pred_test_to_csv(model, name, test_path="test.csv"):
    """Predict on an unlabelled CSV and write an ``Id,Predicted`` file.

    Parameters
    ----------
    model : object with a ``predict`` method
        Called once with the feature frame (all columns except ``Id``,
        divided by 255, matching the scaling applied to the training data).
    name : str
        Output file stem; predictions are written to ``f'{name}.csv'``.
    test_path : str, default "test.csv"
        CSV file to read. It must contain an ``Id`` column.

    Returns
    -------
    bool
        Always ``True`` once the file has been written.

    Raises
    ------
    ValueError
        If the model does not yield exactly one prediction per input row.
    """
    test_ds = pd.read_csv(test_path)
    ids = test_ds["Id"].values
    features = test_ds.drop(columns="Id") / 255

    # Collapse an (n, 1) prediction array into a flat vector of length n.
    pred = np.asarray(model.predict(features)).reshape(-1)
    if len(pred) != len(ids):
        raise ValueError(
            f"expected one prediction per row ({len(ids)}), got {len(pred)} values"
        )

    pred_df = pd.DataFrame({'Id': ids, 'Predicted': pred}, columns=['Id', 'Predicted'])
    print(pred_df)
    pred_df.to_csv(f'{name}.csv', index=False)
    return True


In [None]:
# Empty mapping; nothing in the visible cells reads or fills it
# (presumably meant to collect per-model results -- TODO confirm).
records = dict()

In [8]:
# L2 penalty factor shared by every convolutional and hidden dense layer.
L2_WEIGHT = 0.0000001


def _conv_stage(filters, downsample=True):
    """Return the layers of one convolutional stage.

    Each stage is a 3x3 'same'-padded ReLU convolution followed by batch
    normalisation. When ``downsample`` is true, spatial dropout (rate 0.4)
    and a 2x2 stride-2 max-pool are appended.
    """
    stage = [
        layers.Conv2D(filters, (3, 3), activation='relu',
                      kernel_regularizer=l2(L2_WEIGHT), padding='same'),
        layers.BatchNormalization(),
    ]
    if downsample:
        stage.append(layers.SpatialDropout2D(0.4))
        stage.append(layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='same'))
    return stage


# Flat 400-value rows are reshaped into 20x20 single-channel images.
cnn_layers = [layers.Reshape(target_shape=(20, 20, 1), input_shape=(400,))]

# Four down-sampling stages with a growing number of filters ...
for n_filters in (32, 64, 128, 256):
    cnn_layers.extend(_conv_stage(n_filters))

# ... followed by one last convolution without dropout or pooling.
cnn_layers.extend(_conv_stage(256, downsample=False))

# Classifier head: heavily regularised dense layers ending in one sigmoid unit.
cnn_layers.extend([
    layers.Flatten(),
    layers.Dropout(0.6),
    layers.Dense(128, activation='relu', kernel_regularizer=l2(L2_WEIGHT)),
    layers.Dropout(0.6),
    layers.Dense(16, activation='relu', kernel_regularizer=l2(L2_WEIGHT)),
    layers.Dense(1, activation='sigmoid'),
])

model = tf.keras.Sequential(cnn_layers)

# model_CNN is the list [model, history, score] returned by model_score.
model_CNN = model_score(model, x_train, y_train, x_val, y_val, x_test, y_test)



Epoch 1/128
Epoch 2/128
Epoch 3/128
Epoch 4/128
Epoch 5/128
Epoch 6/128
Epoch 7/128
Epoch 8/128
Epoch 9/128
Epoch 10/128
Epoch 11/128
Epoch 12/128
Epoch 13/128
Epoch 14/128
Epoch 15/128
Epoch 16/128
Epoch 17/128
Epoch 18/128
Epoch 19/128
Epoch 20/128
Epoch 21/128
Epoch 22/128
Epoch 23/128
Epoch 24/128
Epoch 25/128
Epoch 26/128
Epoch 27/128
Epoch 28/128
Epoch 29/128
Epoch 30/128
Epoch 31/128
Epoch 32/128
Epoch 33/128
Epoch 34/128
Epoch 35/128
Epoch 36/128
Epoch 37/128
Epoch 38/128
Epoch 39/128
Epoch 40/128
Epoch 41/128
Epoch 42/128
Epoch 43/128
Epoch 44/128
Epoch 45/128
Epoch 46/128
Epoch 47/128
Epoch 48/128
Epoch 49/128
Epoch 50/128
Epoch 51/128
Epoch 52/128
Epoch 53/128
Epoch 54/128
Epoch 55/128
Epoch 56/128
Epoch 57/128
Epoch 58/128




In [5]:
# model_CNN[0] is the fitted model (first element of model_score's return list);
# its predictions are written to "predictions_CNN_prob_6416.csv".
pred_test_to_csv(model=model_CNN[0], name="predictions_CNN_prob_6416")

          Id  Predicted
0          0   0.002504
1          1   0.004330
2          2   0.005245
3          3   0.000177
4          4   0.002328
...      ...        ...
30912  30912   0.008278
30913  30913   0.013641
30914  30914   0.399683
30915  30915   0.003009
30916  30916   0.001592

[30917 rows x 2 columns]


True