In [None]:
# Part 1. Text Classification 

In [None]:
# Load data
# The workbook is opened in read-only mode, which keeps the underlying file
# open until close() is called, so all rows are read inside try/finally.
workbook = load_workbook(filename="Assignment 2 text.xlsx",  read_only=True, data_only=True)
try:
    worksheet = workbook["Sheet1"]

    # Skip the first row (presumably a header); column index 1 is collected
    # as the review text and column index 2 as its label.
    review_list = []
    label_list = []
    for row in worksheet.iter_rows(min_row=2):
        review_list.append(row[1].value)
        label_list.append(row[2].value)
finally:
    workbook.close()

# Separate data into training and test datasets:
# items 0-399 and 500-899 are used for training, 400-499 and 900-999 for testing.
training_x_origin = review_list[0:400] + review_list[500:900]
training_y = label_list[0:400] + label_list[500:900]
testing_x_origin = review_list[400:500] + review_list[900:1000]
testing_y = label_list[400:500] + label_list[900:1000]


In [None]:
# Lemmatization
class LemmaTokenizer:
    """Callable tokenizer that turns a document into lemmatized alphabetic tokens."""

    def __init__(self):
        # One lemmatizer instance is kept and reused for every document.
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Return the lemma of every purely alphabetic token found in ``doc``."""
        lemmas = []
        for token in word_tokenize(doc):
            if not token.isalpha():
                # Drop punctuation, numbers and tokens mixing letters with other characters.
                continue
            lemmas.append(self.wnl.lemmatize(token))
        return lemmas
# Stop words: NLTK's English list plus a few extra entries defined by hand.
my_stop_words = list(stopwords.words('english'))  # remove stopwords
# These are extra stop words I defined
# (they look like lemmatized fragments such as 'wa' / 'ha' / 'doe' -- presumably
# what the tokenizer produces for "was" / "has" / "does"; verify if the list changes).
self_defined_stop_words = ['could', 'doe', 'ha', 'might', 'must', 'need', 'sha', 'wa', 'wo', 'would']
my_stop_words.extend(self_defined_stop_words)

# TF-IDF over unigrams and bigrams, keeping only terms that occur in at least
# 5 training documents.
tfidf_vectorizer = TfidfVectorizer(stop_words=my_stop_words, tokenizer=LemmaTokenizer(), encoding='utf-8', min_df=5,
                             ngram_range=(1, 2))

# fit_transform learns the vocabulary/IDF weights and vectorizes the training
# reviews in a single pass, so the corpus is tokenized and lemmatized once
# instead of twice (fit followed by transform).
training_x = tfidf_vectorizer.fit_transform(training_x_origin)
# The test reviews are only transformed, using what was learned from the training set.
testing_x = tfidf_vectorizer.transform(testing_x_origin)

In [None]:
# Classical classifiers trained on the TF-IDF features.
# The import is placed before the first use of accuracy_score in this cell.
from sklearn.metrics import accuracy_score


def _fit_and_report(model, report_format):
    """Fit ``model`` on the training features and report its test accuracy.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``.
    report_format : str
        Format string with one placeholder that receives the accuracy in percent.

    Returns
    -------
    tuple
        ``(predictions, accuracy)`` where ``predictions`` are the predicted test
        labels and ``accuracy`` is the value returned by ``accuracy_score``.
    """
    # training
    model.fit(training_x, training_y)
    predictions = model.predict(testing_x)
    # evaluation
    accuracy = accuracy_score(testing_y, predictions)
    print(report_format.format(accuracy*100))
    return predictions, accuracy


# Naïve Bayes model
NBmodel = MultinomialNB()
y_pred_NB, acc_NB = _fit_and_report(NBmodel, "Naive Bayes model Accuracy::{:.2f}%")

# Logit model
Logitmodel = LogisticRegression()
y_pred_logit, acc_logit = _fit_and_report(Logitmodel, "Logit model Accuracy:: {:.2f}%")

# Random Forest model
# 50 bootstrapped trees; the tree depth is left at the library default.
RFmodel = RandomForestClassifier(n_estimators=50, bootstrap=True, random_state=0)
y_pred_RF, acc_RF = _fit_and_report(RFmodel, "Random Forest Model Accuracy: {:.2f}%")

# SVM model
# Seeded like the other stochastic models so repeated runs give the same result.
SVMmodel = LinearSVC(random_state=0)
y_pred_SVM, acc_SVM = _fit_and_report(SVMmodel, "SVM model Accuracy:{:.2f}%")

# simple ANN models
# One hidden layer with 4 units; (4,) is an actual tuple, whereas (4) is just the int 4.
DLmodel = MLPClassifier(solver='lbfgs' , hidden_layer_sizes=(4,), random_state=0)
y_pred_DL, acc_DL = _fit_and_report(DLmodel, "DL model Accuracy: {:.2f}%")


In [None]:
max_len = 100  # every review is padded/truncated to this many token ids

# Tokenize the first 1000 reviews (training and test alike) so that the encoder
# below knows every token it will later be asked to transform.
tokenized_reviews = [nltk.word_tokenize(review) for review in review_list[0:1000]]
all_tokenized_review_words = [word for tokenized_review in tokenized_reviews for word in tokenized_review]
# Create index encoder
index_encoder = LabelEncoder()
index_encoder.fit(all_tokenized_review_words)

# Encode every tokenized review as a list of integer ids.
# LabelEncoder ids start at 0, which is also the value pad_sequences pads with,
# so the ids are shifted by one to keep 0 reserved for padding.
encoded_review_vector = [(index_encoder.transform(tokenized_review) + 1).tolist()
                         for tokenized_review in tokenized_reviews]

# Same split as the TF-IDF features. The sequences have different lengths, so
# they are kept as plain lists: np.array() on ragged nested lists raises on
# recent NumPy versions, and pad_sequences accepts lists directly.
x_train = encoded_review_vector[0:400] + encoded_review_vector[500:900]
x_test = encoded_review_vector[400:500] + encoded_review_vector[900:1000]
y_train = np.array([1 if label =='restaurant' else 0 for label in training_y])  # Convert to 0 and 1
y_test = np.array([1 if label =='restaurant' else 0 for label in testing_y])  # Convert to 0 and 1
x_train_pad = sequence.pad_sequences(x_train, maxlen=max_len)  # padding to 100
x_test_pad = sequence.pad_sequences(x_test, maxlen=max_len)  # padding to 100

In [None]:
# Size of the embedding table: one row per distinct token known to the encoder,
# plus one spare row so that every id in 0..len(classes_) is a valid index.
# (The total token count was used before, which made the table far larger than needed.)
max_features = len(index_encoder.classes_) + 1
smodel = Sequential()
smodel.add(Embedding(max_features, 20, input_length=max_len))
smodel.add(LSTM(40, dropout=0.20, recurrent_dropout=0.20))
# Dropout regularizes the LSTM features; it must come before the output layer,
# otherwise the predicted probabilities themselves are zeroed/rescaled in training.
smodel.add(Dropout(0.1))
smodel.add(Dense(1, activation='sigmoid'))
smodel.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# NOTE(review): the test split doubles as validation data here.
his = smodel.fit(x_train_pad, y_train, batch_size=100, epochs=10, validation_data=(x_test_pad, y_test))
# evaluate() returns [loss, accuracy] given the metrics compiled above.
performance = smodel.evaluate(x_test_pad, y_test)
print('Test accuracy:', performance[1])


In [None]:
# Part 2. Image Recognition (total 5 points)

In [None]:
# Load the CIFAR-10 train/test splits.
(x_train, y_train), (x_test, y_test) = cifar10.load_data()

# Class names indexed by the integer label stored in y_train / y_test.
cifar_classes = ['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']
f, axarr = plt.subplots(4, 5)
f.set_size_inches(16, 6)
# Show the first 20 test images in the 4x5 grid (filled row by row), each
# titled with the name of its class.
for i, axis in enumerate(axarr.flat):
    axis.imshow(x_test[i])
    axis.title.set_text(cifar_classes[y_test[i][0]])
# Raw label entries of the same 20 images, printed below the figure.
label_list = [y_test[i] for i in range(20)]
plt.show()
print(label_list)

In [None]:
# Preprocessing.
# Pixel intensities are scaled from [0, 255] to [0, 1]; feeding raw 0-255
# values into the network makes training slow and unstable.
# NOTE: this cell overwrites x_train / y_train / x_test / y_test in place, so
# re-run the data-loading cell before running this one a second time.
x_train = x_train.astype('float32') / 255.0
x_test = x_test.astype('float32') / 255.0
# One-hot encode the integer class labels (10 classes).
y_train = keras.utils.to_categorical(y_train, num_classes=10)
y_test = keras.utils.to_categorical(y_test, num_classes=10)

# Two conv blocks (32 then 64 filters) followed by a dense classifier.
# Only the first layer declares input_shape; later layers infer their input.
CNNmodel = Sequential()
CNNmodel.add(Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=(32, 32, 3)))  # step a
CNNmodel.add(Dropout(0.2))  # step b
CNNmodel.add(Conv2D(32, kernel_size=(3, 3), activation='relu'))  # step c
CNNmodel.add(MaxPooling2D(pool_size=(2, 2)))  # step d
CNNmodel.add(Conv2D(64, kernel_size=(3, 3), activation='relu'))  # step e
CNNmodel.add(Dropout(0.2))  # step f
CNNmodel.add(Conv2D(64, kernel_size=(3, 3), activation='relu'))  # step g
CNNmodel.add(MaxPooling2D(pool_size=(2, 2)))  # step h
CNNmodel.add(Flatten())  # step i
CNNmodel.add(Dense(256, activation='relu'))  # step j
CNNmodel.add(Dropout(0.2))  # step k
CNNmodel.add(Dense(10, activation='softmax'))  # step l

# NOTE(review): depending on the Keras version, Adadelta's default learning
# rate may be very small (0.001), which converges slowly -- confirm the intended rate.
CNNmodel.compile(loss=keras.losses.categorical_crossentropy, optimizer=keras.optimizers.Adadelta(),metrics=['accuracy'])
#  model fit
CNNmodel.fit(x_train, y_train, batch_size=300, epochs=5, validation_data=(x_test, y_test))
#  model performance: evaluate() returns [loss, accuracy]
performance = CNNmodel.evaluate(x_test, y_test)
print('Test accuracy:', performance[1])


In [None]:
# Deeper CNN: three conv blocks (32, 64 and 128 filters), trained for 5 epochs.
# Only the first layer declares input_shape; it is redundant on later layers,
# which infer their input from the preceding layer.
CNNmodel_3 = Sequential()
CNNmodel_3.add(Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=(32, 32, 3))) # step a
CNNmodel_3.add(Dropout(0.2)) # step b
CNNmodel_3.add(Conv2D(32, kernel_size=(3, 3), activation='relu')) # step c
CNNmodel_3.add(MaxPooling2D(pool_size=(2, 2))) # step d
CNNmodel_3.add(Conv2D(64, kernel_size=(3, 3), activation='relu')) # step e
CNNmodel_3.add(Dropout(0.2)) # step f
CNNmodel_3.add(Conv2D(64, kernel_size=(3, 3), activation='relu')) # step g
CNNmodel_3.add(MaxPooling2D(pool_size=(2, 2))) # step h
# Additional third conv block (no pooling afterwards).
CNNmodel_3.add(Conv2D(128, kernel_size=(3, 3), activation='relu'))
CNNmodel_3.add(Dropout(0.2))
CNNmodel_3.add(Conv2D(128, kernel_size=(3, 3), activation='relu'))
CNNmodel_3.add(Flatten()) # step i
CNNmodel_3.add(Dense(256, activation='relu')) # step j
CNNmodel_3.add(Dropout(0.2)) # step k
CNNmodel_3.add(Dense(10, activation='softmax'))

CNNmodel_3.compile(loss=keras.losses.categorical_crossentropy, optimizer=keras.optimizers.Adadelta(),metrics=['accuracy'])
#  model fit
CNNmodel_3.fit(x_train, y_train, batch_size=300, epochs=5, validation_data=(x_test, y_test))
#  model performance: evaluate() returns [loss, accuracy]
performance = CNNmodel_3.evaluate(x_test, y_test)
print('Test accuracy:', performance[1])


In [None]:
# Three-conv-block CNN (32, 64 and 128 filters), trained for 20 epochs.
# NOTE: this rebinds CNNmodel_3 to a freshly initialized network, so any model
# previously stored under that name is replaced.
# Only the first layer declares input_shape; it is redundant on later layers.
CNNmodel_3 = Sequential()
CNNmodel_3.add(Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=(32, 32, 3))) # step a
CNNmodel_3.add(Dropout(0.2)) # step b
CNNmodel_3.add(Conv2D(32, kernel_size=(3, 3), activation='relu')) # step c
CNNmodel_3.add(MaxPooling2D(pool_size=(2, 2))) # step d
CNNmodel_3.add(Conv2D(64, kernel_size=(3, 3), activation='relu')) # step e
CNNmodel_3.add(Dropout(0.2)) # step f
CNNmodel_3.add(Conv2D(64, kernel_size=(3, 3), activation='relu')) # step g
CNNmodel_3.add(MaxPooling2D(pool_size=(2, 2))) # step h
# Third conv block (no pooling afterwards).
CNNmodel_3.add(Conv2D(128, kernel_size=(3, 3), activation='relu'))
CNNmodel_3.add(Dropout(0.2))
CNNmodel_3.add(Conv2D(128, kernel_size=(3, 3), activation='relu'))
CNNmodel_3.add(Flatten()) # step i
CNNmodel_3.add(Dense(256, activation='relu')) # step j
CNNmodel_3.add(Dropout(0.2)) # step k
CNNmodel_3.add(Dense(10, activation='softmax'))

CNNmodel_3.compile(loss=keras.losses.categorical_crossentropy, optimizer=keras.optimizers.Adadelta(),metrics=['accuracy'])
#  model fit
CNNmodel_3.fit(x_train, y_train, batch_size=300, epochs=20, validation_data=(x_test, y_test))
#  model performance: evaluate() returns [loss, accuracy]
performance = CNNmodel_3.evaluate(x_test, y_test)
print('Test accuracy:', performance[1])


In [None]:
# Baselines on raw pixels: Naive Bayes and Random Forest.
# The data is reloaded under its own names so this cell works on the original
# integer pixel values and labels regardless of earlier preprocessing.
(x_train_images, y_train_labels), (x_test_images, y_test_labels) = cifar10.load_data()

# Flatten every image into a single feature vector (one row per image).
# A single reshape replaces the per-image Python loop and yields the same rows.
x_train_flatten_image_list = x_train_images.reshape(len(x_train_images), -1)
x_test_flatten_image_list = x_test_images.reshape(len(x_test_images), -1)

# Each label is stored as a one-element row; take that element for every
# sample to get a flat label vector.
y_train_label_list = y_train_labels[:, 0]
y_test_label_list = y_test_labels[:, 0]

NBmodel = MultinomialNB()
# training
NBmodel.fit(x_train_flatten_image_list, y_train_label_list)
y_pred_NB = NBmodel.predict(x_test_flatten_image_list)
# evaluation
acc_NB = accuracy_score(y_test_label_list, y_pred_NB)
print("Naive Bayes model Accuracy::{:.2f}%".format(acc_NB*100))


# 100 bootstrapped trees, each limited to a maximum depth of 10.
RFmodel = RandomForestClassifier(n_estimators=100, bootstrap=True, max_depth=10, random_state=0)
# training
RFmodel.fit(x_train_flatten_image_list, y_train_label_list)
y_pred_RF = RFmodel.predict(x_test_flatten_image_list)
# evaluation
acc_RF = accuracy_score(y_test_label_list, y_pred_RF)
print("Random Forest Model Accuracy: {:.2f}%".format(acc_RF*100))
