# Final Project - DSCI 619 Deep Learning
### Graham Bachman

## Part I - Image Classification

Data Source: https://www.kaggle.com/puneet6060/intel-image-classification

+ buildings
+ forest
+ glacier
+ mountain
+ sea
+ street

The dataset has been divided into folders for training, testing, and prediction. The training folder includes around 14,000 images and the testing folder has around 3,000 images. Finally, the prediction folder includes around 7,000 images. 

In [None]:
# Please put your codes for Q1 here and run this cell to produce the results.
import matplotlib.pyplot as plt
import numpy as np
import os
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential

import pathlib

In [None]:
# Load the Intel image folders as datasets of 128x128 images.
# `train` and `val` are the two sides of one 80/20 split of seg_train, so both
# calls must use the same directory, split fraction and seed.
_train_dir = '../input/intel-image-classification/seg_train/seg_train'
_test_dir = '../input/intel-image-classification/seg_test/seg_test'
_load_images = tf.keras.preprocessing.image_dataset_from_directory

train = _load_images(
    directory=_train_dir,
    subset='training',
    validation_split=0.2,
    seed=128,
    image_size=(128, 128),
)

val = _load_images(
    directory=_train_dir,
    subset='validation',
    validation_split=0.2,
    seed=128,
    image_size=(128, 128),
)

# The test folder is loaded whole (no validation split).
test = _load_images(
    directory=_test_dir,
    seed=128,
    image_size=(128, 128),
)

print()
print('Classes: ', train.class_names)

### Baseline CNN model: train on the training dataset and evaluate on the test dataset.




In [None]:
# Normalisation preview: multiply pixel values by 1/255 and pull one batch.
# NOTE(review): `normal` is not passed to any fit/evaluate call visible in this
# notebook; the models are fit on `train` directly.
rescale_to_unit = layers.experimental.preprocessing.Rescaling(1./255)
normal = train.map(lambda images, labels: (rescale_to_unit(images), labels))
image_batch, labels_batch = next(iter(normal))

In [None]:
# Baseline CNN: three Conv2D + MaxPooling2D stages (16, 32, 64 filters),
# followed by a 128-unit dense layer and a 6-unit output layer.
baseline_layers = [
    # Pixel scaling is part of the model, so it is applied to whatever is fed in.
    layers.experimental.preprocessing.Rescaling(1./255, input_shape=(128, 128, 3)),
]
for n_filters in (16, 32, 64):
    baseline_layers.append(
        layers.Conv2D(n_filters, 3, padding='same', activation='relu'))
    baseline_layers.append(layers.MaxPooling2D())
baseline_layers += [
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    # No activation here: the loss below is built with from_logits=True.
    layers.Dense(6),
]
model_1 = Sequential(baseline_layers)

model_1.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer='adam',
    metrics=['accuracy'],
)

model_1.summary()

In [None]:
# Train the baseline for 10 epochs, validating on `val`; the result's
# `.history` dict is what the plotting cell reads.
log_1 = model_1.fit(
    train,
    epochs=10,
    validation_data=val,
)

In [None]:
# Score the baseline on the test dataset (loss first, then accuracy).
test_metrics = model_1.evaluate(test)
loss, accuracy = test_metrics
print('Test accuracy :', accuracy)

In [None]:
# Learning curves for the baseline CNN: accuracy on the left, loss on the right.
acc = log_1.history['accuracy']
val_acc = log_1.history['val_accuracy']

loss = log_1.history['loss']
val_loss = log_1.history['val_loss']

# Take the x-axis from the recorded history instead of a literal 10, so the
# plot stays valid if the epoch count passed to fit() ever changes.
epochs_range = range(len(acc))

plt.figure(figsize=(8, 8))
plt.subplot(1, 2, 1)
plt.plot(epochs_range, acc, label='Training Accuracy')
plt.plot(epochs_range, val_acc, label='Validation Accuracy')
plt.legend(loc='lower right')
plt.title('Training and Validation Accuracy')

plt.subplot(1, 2, 2)
plt.plot(epochs_range, loss, label='Training Loss')
plt.plot(epochs_range, val_loss, label='Validation Loss')
plt.legend(loc='upper right')
plt.title('Training and Validation Loss')
plt.show()

### CNN model 2: add data augmentation and dropout, then evaluate on the test dataset.
 


In [None]:
# Augmentation pipeline reused by model_2 and model_3: random horizontal flip,
# then random rotation and random zoom (factor 0.1 each).
_preproc = layers.experimental.preprocessing
data_augmentation = keras.Sequential()
data_augmentation.add(_preproc.RandomFlip("horizontal", input_shape=(128, 128, 3)))
data_augmentation.add(_preproc.RandomRotation(0.1))
data_augmentation.add(_preproc.RandomZoom(0.1))

In [None]:
# Second CNN: same convolutional stages as the baseline, with the augmentation
# pipeline in front, Dropout(0.5) after every pooling stage, and an extra
# 64-unit dense layer (plus dropout) before the 6-unit output.
regularised_layers = [
    data_augmentation,
    layers.experimental.preprocessing.Rescaling(1./255, input_shape=(128, 128, 3)),
]
for n_filters in (16, 32, 64):
    regularised_layers.extend([
        layers.Conv2D(n_filters, 3, padding='same', activation='relu'),
        layers.MaxPooling2D(),
        layers.Dropout(0.5),
    ])
regularised_layers.extend([
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.5),
    # No activation here: the loss below is built with from_logits=True.
    layers.Dense(6),
])
model_2 = Sequential(regularised_layers)

model_2.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer='adam',
    metrics=['accuracy'],
)
model_2.summary()

In [None]:
# Train the augmented/dropout CNN for 10 epochs, validating on `val`.
log_2 = model_2.fit(
    train,
    epochs=10,
    validation_data=val,
)

In [None]:
# Score the second CNN on the test dataset (loss first, then accuracy).
test_metrics = model_2.evaluate(test)
loss, accuracy = test_metrics
print('Test accuracy :', accuracy)

In [None]:
# Learning curves for the second CNN: accuracy on the left, loss on the right.
acc = log_2.history['accuracy']
val_acc = log_2.history['val_accuracy']

loss = log_2.history['loss']
val_loss = log_2.history['val_loss']

# Take the x-axis from the recorded history instead of a literal 10, so the
# plot stays valid if the epoch count passed to fit() ever changes.
epochs_range = range(len(acc))

plt.figure(figsize=(8, 8))
plt.subplot(1, 2, 1)
plt.plot(epochs_range, acc, label='Training Accuracy')
plt.plot(epochs_range, val_acc, label='Validation Accuracy')
plt.legend(loc='lower right')
plt.title('Training and Validation Accuracy')

plt.subplot(1, 2, 2)
plt.plot(epochs_range, loss, label='Training Loss')
plt.plot(epochs_range, val_loss, label='Validation Loss')
plt.legend(loc='upper right')
plt.title('Training and Validation Loss')
plt.show()

### CNN model 3: transfer learning from a pre-trained model, evaluated on the test dataset.



In [None]:
# Input preprocessing for MobileNetV2.
# `rescale` computes x / 127.5 - 1 (0 -> -1, 255 -> 1).
# NOTE(review): `rescale` is not referenced by the cells visible here; the model
# below calls `preprocess_input` instead.
rescale = tf.keras.layers.experimental.preprocessing.Rescaling(
    1./127.5,
    offset=-1,
)
preprocess_input = tf.keras.applications.mobilenet_v2.preprocess_input

In [None]:
# Pre-trained MobileNetV2 backbone (ImageNet weights, classifier top removed).
BATCH_SIZE = 32  # not referenced elsewhere in this cell
IMG_SIZE = (128, 128)
IMG_SHAPE = (*IMG_SIZE, 3)

base_model = tf.keras.applications.MobileNetV2(
    weights='imagenet',
    include_top=False,
    input_shape=IMG_SHAPE,
)

# Push one training batch through the backbone to inspect the feature shape.
first_batch = next(iter(train))
image_batch, label_batch = first_batch
feature_batch = base_model(image_batch)
print(feature_batch.shape)

In [None]:
# Freeze the backbone: with trainable=False its weights are excluded from
# training, so only layers added on top of it are updated by fit().
base_model.trainable = False
base_model.summary()

In [None]:
# Classification head placed on top of the backbone features: global average
# pooling followed by a 6-unit dense layer. Both layers are applied to the
# sample feature batch so their output shapes can be checked.
global_average_layer = tf.keras.layers.GlobalAveragePooling2D()
prediction_layer = tf.keras.layers.Dense(6)

feature_batch_average = global_average_layer(feature_batch)
print(feature_batch_average.shape)

prediction_batch = prediction_layer(feature_batch_average)
print(prediction_batch.shape)

In [None]:
# Transfer-learning classifier assembled with the functional API:
# augmentation -> MobileNetV2 preprocessing -> backbone -> global average
# pooling -> dropout -> 6-unit output.
inputs = tf.keras.Input(shape=(128, 128, 3))
x = data_augmentation(inputs)
x = preprocess_input(x)
# training=False is passed explicitly so the backbone is always called in
# inference mode, even while the surrounding model is being fit.
x = base_model(x, training=False)
x = global_average_layer(x)
x = tf.keras.layers.Dropout(0.2)(x)
outputs = prediction_layer(x)
model_3 = tf.keras.Model(inputs, outputs)

base_learning_rate = 0.0001
# `learning_rate` is the supported keyword; `lr` is a deprecated alias that
# newer Keras releases warn about or reject.
model_3.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=base_learning_rate),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])
model_3.summary()

In [None]:
# Test-set metrics of model_3 before any training, kept as a reference point.
# NOTE(review): `initial_epochs` is not used by the fit call visible below,
# which passes epochs=20 directly.
initial_epochs = 10

untrained_metrics = model_3.evaluate(test)
loss0, accuracy0 = untrained_metrics

In [None]:
# Train the transfer-learning model for 20 epochs, validating on `val`.
log_3 = model_3.fit(
    train,
    validation_data=val,
    epochs=20,
)

In [None]:
# Score the transfer-learning model on the test dataset (loss, then accuracy).
test_metrics = model_3.evaluate(test)
loss, accuracy = test_metrics
print('Test accuracy :', accuracy)

In [None]:
# Learning curves for the transfer-learning model: accuracy left, loss right.
acc = log_3.history['accuracy']
val_acc = log_3.history['val_accuracy']

loss = log_3.history['loss']
val_loss = log_3.history['val_loss']

# model_3 is fit for 20 epochs, so a literal range(10) has a different length
# from the history lists and plt.plot raises ValueError. Derive the x-axis from
# the history itself.
epochs_range = range(len(acc))

plt.figure(figsize=(8, 8))
plt.subplot(1, 2, 1)
plt.plot(epochs_range, acc, label='Training Accuracy')
plt.plot(epochs_range, val_acc, label='Validation Accuracy')
plt.legend(loc='lower right')
plt.title('Training and Validation Accuracy')

plt.subplot(1, 2, 2)
plt.plot(epochs_range, loss, label='Training Loss')
plt.plot(epochs_range, val_loss, label='Validation Loss')
plt.legend(loc='upper right')
plt.title('Training and Validation Loss')
plt.show()

### Q5. Which of the models from Q2, Q3, and Q4 do you recommend? Justify your answer.


I recommend the third model, because it scored the best on the test and validation sets and showed little evidence of overfitting.

## Part II - Sentiment Analysis

Data Source: https://www.kaggle.com/hj5992/restaurantreviews

The dataset is a tab-separated file with two columns:

+ Review: Customers' review 
+ Liked : 0 or 1

In [1]:
# Please put your codes for Q1 here and run this cell to produce the results.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import io
import os
import re #regular expression, easier for us to clean texts
import shutil
import string
import tensorflow as tf

from datetime import datetime
from tensorflow.keras import Model, Sequential
from tensorflow.keras.layers import Activation, Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization # map text to ints

In [None]:
def plot_graphs(history, metric):
    """Plot a training metric and its validation counterpart per epoch.

    Draws on the current matplotlib axes; the caller is responsible for
    creating the figure/subplot and for showing it.

    Args:
        history: object with a ``history`` mapping from metric name to a
            sequence of per-epoch values (as read from the result of ``fit``).
        metric: name of the training metric; the validation series is looked
            up under ``'val_' + metric``.
    """
    val_metric = 'val_' + metric
    curves = history.history
    plt.plot(curves[metric])
    plt.plot(curves[val_metric], '')
    plt.xlabel("Epochs")
    plt.ylabel(metric)
    plt.legend([metric, val_metric])
In [None]:
# Load the tab-separated reviews file.
# quoting=3 is csv.QUOTE_NONE: quote characters inside reviews are kept as
# ordinary text instead of being treated as field delimiters.
reviews_path = '../input/restaurantreviews/Restaurant_Reviews.tsv'
df = pd.read_csv(
    reviews_path,
    sep='\t',
    quoting=3,
)
df

In [None]:
# Wrap the two columns as a tf.data dataset of (review text, label) pairs.
review_text = tf.cast(df['Review'].values, tf.string)
liked_label = tf.cast(df['Liked'].values, tf.int32)
dataset = tf.data.Dataset.from_tensor_slices((review_text, liked_label))

### Clean and preprocess the text data and split into training and test dataset.

In [None]:
# Please put your codes for Q2 here and run this cell to produce the results.


In [None]:
# Shuffle the reviews once with a fixed seed and cut them 60 % / 20 % / 20 %.
# Changes from the original cell:
#   * the stand-alone `df.sample(...)` statement was dropped; its result was
#     discarded, so it had no effect;
#   * positional `iloc` slices replace `np.split`, which on a DataFrame goes
#     through the deprecated `DataFrame.swapaxes`.
# NOTE(review): these DataFrames are not consumed by the tf.data pipeline in
# the following cells, and `test_dataset` is rebound there.
shuffled = df.sample(frac=1, random_state=42)
first_cut = int(.6*len(df))
second_cut = int(.8*len(df))

train_ds = shuffled.iloc[:first_cut]
val_ds = shuffled.iloc[first_cut:second_cut]
test_dataset = shuffled.iloc[second_cut:]
train_ds.head()

In [None]:
# 70/30 split of the tf.data dataset: the first TRAIN_SIZE elements go to
# training and the remainder to testing.
# NOTE(review): take/skip follow the dataset's existing order; no shuffle is
# applied in this cell.
TRAIN_SIZE = int(len(dataset)*0.7)

train_dataset, test_dataset = dataset.take(TRAIN_SIZE), dataset.skip(TRAIN_SIZE)

In [None]:
# Cache, batch and prefetch both splits with identical settings.
AUTOTUNE = tf.data.experimental.AUTOTUNE
BATCH_SIZE = 1


def _cache_batch_prefetch(ds):
    """Return `ds` cached, batched by BATCH_SIZE and prefetched with AUTOTUNE."""
    return ds.cache().batch(BATCH_SIZE).prefetch(buffer_size=AUTOTUNE)


train_dataset = _cache_batch_prefetch(train_dataset)
test_dataset = _cache_batch_prefetch(test_dataset)

### Baseline RNN model: embedding layer and GRU, trained on the training dataset and evaluated on the test dataset.


In [None]:
# Text encoder: maps raw review strings to integer token ids, with the
# vocabulary capped at VOCAB_SIZE and learned from the training texts only.
VOCAB_SIZE = 1000
encoder = TextVectorization(max_tokens=VOCAB_SIZE)

# Drop the labels so that adapt() sees only the review text.
training_text = train_dataset.map(lambda text, label: text)
encoder.adapt(training_text)

In [None]:
# Baseline RNN: text encoder -> 64-d embedding -> bidirectional GRU(64) ->
# Dense(64) -> dropout -> single output unit.
gru_layers = [
    encoder,
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=64,
        # mask_zero=True makes downstream layers treat token id 0 as padding.
        mask_zero=True,
    ),
    tf.keras.layers.Bidirectional(tf.keras.layers.GRU(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    # No activation here: the loss below is built with from_logits=True.
    tf.keras.layers.Dense(1),
]
model_1 = tf.keras.Sequential(gru_layers)

model_1.compile(
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train the GRU baseline for 10 epochs. Validation uses 30 batches drawn from
# `test_dataset` at the end of each epoch.
fit_options = dict(
    epochs=10,
    validation_data=test_dataset,
    validation_steps=30,
)
history_1 = model_1.fit(train_dataset, **fit_options)

In [None]:
# Print the layer-by-layer summary of the GRU baseline (called after fit).
model_1.summary()

In [None]:
# Evaluate the GRU baseline on the full test dataset and report both metrics.
test_loss, test_acc = model_1.evaluate(test_dataset)

for template, value in (('Test Loss: {}', test_loss),
                        ('Test Accuracy: {}', test_acc)):
    print(template.format(value))

In [None]:
# Learning curves for the GRU baseline: accuracy on the left, loss on the right.
# The fit result for this model is stored in `history_1`; the bare name
# `history` used previously is never defined and raised NameError.
plt.figure(figsize=(16,8))
plt.subplot(1,2,1)
plot_graphs(history_1, 'accuracy')
plt.ylim(None,1)
plt.subplot(1,2,2)
plot_graphs(history_1, 'loss')
plt.ylim(0,None)

### RNN model 2: embedding layer and LSTM, evaluated on the test dataset.


In [None]:
# Second RNN: same layout as the GRU baseline, with the recurrent layer
# swapped for a bidirectional LSTM(64).
model_2 = tf.keras.Sequential()
model_2.add(encoder)
model_2.add(tf.keras.layers.Embedding(
    input_dim=len(encoder.get_vocabulary()),
    output_dim=64,
    # mask_zero=True makes downstream layers treat token id 0 as padding.
    mask_zero=True,
))
model_2.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))
model_2.add(tf.keras.layers.Dense(64, activation='relu'))
model_2.add(tf.keras.layers.Dropout(0.5))
# No activation on the output: the loss below is built with from_logits=True.
model_2.add(tf.keras.layers.Dense(1))

model_2.compile(
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train the LSTM model for 10 epochs. Validation uses 30 batches drawn from
# `test_dataset` at the end of each epoch.
fit_options = dict(
    epochs=10,
    validation_data=test_dataset,
    validation_steps=30,
)
history_2 = model_2.fit(train_dataset, **fit_options)

In [None]:
# Print the layer-by-layer summary of the LSTM model (called after fit).
model_2.summary()

In [None]:
# Evaluate the LSTM model on the full test dataset and report both metrics.
test_loss, test_acc = model_2.evaluate(test_dataset)

for template, value in (('Test Loss: {}', test_loss),
                        ('Test Accuracy: {}', test_acc)):
    print(template.format(value))

In [None]:
# Learning curves for the LSTM model: accuracy on the left (y-axis capped at
# 1), loss on the right (y-axis starting at 0).
plt.figure(figsize=(16, 8))
panels = (('accuracy', (None, 1)), ('loss', (0, None)))
for position, (metric_name, y_limits) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plot_graphs(history_2, metric_name)
    plt.ylim(*y_limits)

### RNN model 3: embedding layer with stacked GRU and LSTM layers, evaluated on the test dataset.

In [None]:
# Third RNN: a bidirectional GRU(64) feeding a bidirectional LSTM(32), with
# dropout after the recurrent stack and again after the dense layer.
stacked_layers = [
    encoder,
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=64,
        # mask_zero=True makes downstream layers treat token id 0 as padding.
        mask_zero=True,
    ),
    # return_sequences=True hands the full sequence (not just the final
    # state) to the LSTM stacked on top.
    tf.keras.layers.Bidirectional(
        tf.keras.layers.GRU(64, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    # No activation here: the loss below is built with from_logits=True.
    tf.keras.layers.Dense(1),
]
model_3 = tf.keras.Sequential(stacked_layers)

model_3.compile(
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train the stacked GRU+LSTM model for 10 epochs. Validation uses 30 batches
# drawn from `test_dataset` at the end of each epoch.
fit_options = dict(
    epochs=10,
    validation_data=test_dataset,
    validation_steps=30,
)
history_3 = model_3.fit(train_dataset, **fit_options)

In [None]:
# Print the layer-by-layer summary of the GRU+LSTM model (called after fit).
model_3.summary()

In [None]:
# Evaluate the GRU+LSTM model on the full test dataset and report both metrics.
test_loss, test_acc = model_3.evaluate(test_dataset)

for template, value in (('Test Loss: {}', test_loss),
                        ('Test Accuracy: {}', test_acc)):
    print(template.format(value))

In [None]:
# Learning curves for the GRU+LSTM model: accuracy on the left (y-axis capped
# at 1), loss on the right (y-axis starting at 0).
plt.figure(figsize=(16, 8))
panels = (('accuracy', (None, 1)), ('loss', (0, None)))
for position, (metric_name, y_limits) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plot_graphs(history_3, metric_name)
    plt.ylim(*y_limits)

### Model Recommendation

I would recommend the first model because it had the best accuracy. All three models overfit significantly, and more work is needed to remedy this. My first approach would be to lower the learning rate and train for more epochs. It would also be beneficial to include more Dropout layers. For now, though, my recommendation is the first model, on the grounds of its performance on the test dataset.