# CS 5785 - Final
## Applied Machine Learning

Eric Nguyen *(en274)*

# This program was used for a Kaggle challenge whose goal is to match a text description to its corresponding photo as accurately as possible.

# This program uses word2vec to match text descriptions to the closest photo matching that description.  Both the training and test photos are tagged with keywords and descriptions.  

# We are given 10,000 training photos, each with 5 sentences describing the photo.  We are also given 2,000 test photos, also with 5 sentences describing the photo.

# word2vec is a neural network model from Google that maps words to their most common surrounding words, allowing you to incorporate the context, or order of words, in your prediction.  We will be using the gensim library to access the word2vec model, and train it with our own data.

# From a high level, word2vec works by taking a large corpus of sentences and learning the context in which words appear in those sentences.  It maps a word to other commonly associated words.  For example, "dog" might commonly occur with "pet", "animal" or even "cat".  

# After training with a corpus, a high dimensional space is created.  This allows you to take a word vector and project it into this high dimensional space.  What you might observe is that words that commonly occur together will project to the same region or cluster near each other in this "space".

# For our purposes, we take a given test sentence, average all of its word vectors, and then find its nearest neighbor word2vec in the training data, based on cosine similarity distance.  We then map to the nearest 20 neighbors in the test data and use their photo labels as the predictions to submit.

In [127]:
import setup
import string
import numpy as np
import pandas as pd
import nltk
import gensim
from gensim import corpora, models, similarities
from gensim.models import Word2Vec
from IPython.display import display, Markdown
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import NearestNeighbors as KNN

In [128]:
TRAIN = 0  # dataset selector passed to the setup helpers: training data
TEST = 1  # dataset selector passed to the setup helpers: test data
N_TRAIN = 10000  # number of training images
N_TEST = 2000  # number of test images
N_DESCRIPTIONS = 10000  # number of training descriptions (5 sentences each)

# Empty placeholders for the photo descriptions; both names are rebound
# to the data returned by setup.get_descriptions in the next cell.
descriptions_train = []
descriptions_test = []

# get the image features from the dataset
# NOTE(review): the meaning of the first argument (False) is defined in the
# setup module and is not visible here -- confirm against setup.get_features.
features_train = setup.get_features(False, TRAIN)
features_test = setup.get_features(False, TEST)

In [129]:
# Using our own utility module (setup), load the sentence descriptions
# for the training and test data; this rebinds the placeholders above.
descriptions_train = setup.get_descriptions(N_TRAIN,TRAIN)
descriptions_test = setup.get_descriptions(N_TEST,TEST)

In [130]:
def preprocess(descriptions, dataset):
    """
    Tokenize and normalize every sentence of every description.

    Each cell of ``descriptions`` is treated as one sentence.  A sentence
    is:
    1. split on whitespace,
    2. lowercased,
    3. stripped of punctuation characters,
    4. filtered for English stopwords,
    5. stemmed with the Porter stemmer,
    6. cleared of tokens that ended up empty.

    Sentences are emitted row by row, so the sentences belonging to one
    description stay adjacent (and in column order) in the output.

    Side effect: the result is also written to
    ``preprocess_description_EN<dataset>.csv`` so it can be re-read with
    ``get_preprocess`` instead of being recomputed.

    :param descriptions: Pandas dataframe, one row per description and
        one sentence string per column
    :param dataset: identifier appended to the csv file name
        (TRAIN or TEST)

    :return: preproc_words, list of lists of str -- one inner list of
        preprocessed words per sentence

    """
    # Build the loop invariants once.  The original re-read the stopword
    # corpus for every single word and scanned it as a list; a set gives
    # constant-time membership tests.
    stop_words = set(stopwords.words('english'))
    strip_punctuation = str.maketrans(dict.fromkeys(string.punctuation))
    ps = PorterStemmer()

    preproc_words = []  # one list of preprocessed words per sentence

    # iterate through every description (row) and every sentence (cell)
    for _, row in descriptions.iterrows():
        for sentence in row.tolist():
            # lowercase and remove punctuation, word by word
            words = (
                word.lower().translate(strip_punctuation)
                for word in sentence.split()
            )

            # drop stopwords, then stem what is left
            words = [ps.stem(word) for word in words if word not in stop_words]

            # drop tokens that became empty (e.g. pure punctuation)
            preproc_words.append([word for word in words if word])

    # save into a csv file to reduce having to repeat this step when
    # testing
    output = pd.DataFrame(preproc_words)
    output.to_csv("preprocess_description_EN" + str(dataset) + ".csv")
    return preproc_words

In [131]:
def get_preprocess(dataset):
    """
    Load previously saved preprocessed descriptions from disk.

    Reads ``preprocess_description_EN<dataset>.csv`` from the current
    working directory, using the first csv column as the row index.

    :param: dataset, identifier that forms the file name suffix
        (TRAIN or TEST)
    :return: Pandas dataframe of preprocessed words read from the file

    """
    csv_path = "preprocess_description_EN" + str(dataset) + ".csv"
    return pd.read_csv(csv_path, index_col=0)

In [132]:
# preprocess both the training and testing (data) sentence descriptions;
# each call returns a list of word lists (one per sentence) and also
# writes that result to a csv file
pre_pro_train = preprocess(descriptions_train,TRAIN)
pre_pro_test = preprocess(descriptions_test,TEST)

In [135]:
# train word2vec with our own training data corpus
# using our preprocessed sentences
# (size=500, window=5, min_count=1 are passed straight to gensim)
# NOTE(review): `size` is the keyword used by gensim releases before 4.0;
# gensim 4 renamed it to `vector_size` -- confirm the installed version.
model = gensim.models.Word2Vec(pre_pro_train, size=500,window=5, min_count=1)

In [137]:
def get_avg_word2vec(sentence):
    '''
    Average the word2vec vectors of the words in a sentence.

    Every word of ``sentence`` is looked up in the trained module-level
    ``model``; words that are not in its vocabulary are skipped.  The
    vectors that were found are averaged element-wise into one vector
    for the sentence.

    :param: sentence, list, preprocessed words describing a photo

    :return: avg_w2v, np.array, the mean word vector.  If none of the
    words is in the vocabulary (or the sentence is empty) a zero vector
    of the model's dimensionality is returned, instead of the NaN scalar
    that averaging an empty array would produce.

    '''
    # for testing purposes
    if sentence == "" or sentence == " ":
        print('found error')

    # Test membership and look vectors up on the same object; membership
    # tests on the model itself are deprecated in gensim.
    word_vectors = model.wv

    # collect the vector of every word the model knows
    known = [
        np.array(word_vectors[word])
        for word in sentence
        if word in word_vectors
    ]

    if not known:
        # nothing to average: keep the output shape stable for callers
        return np.zeros(word_vectors.vector_size, dtype=np.float32)

    return np.mean(np.array(known), axis=0)  # average over each word

In [138]:
def get_w2v_list(sentence_list):
    '''
    Compute one averaged word2vec vector for each sentence in a list.
    Uses get_avg_word2vec() as a helper.

    A sentence with no words left after preprocessing is replaced by the
    single placeholder word 'car' so that every sentence still produces
    a vector and the output stays aligned with the input.

    :param: sentence_list, list of sentence descriptions, each a list
    of words

    :return: w2v_list, list, word2vec for every sentence in the list,
    in input order

    '''
    w2v_list = []  # declare an output list

    # iterate through every sentence list
    for sentence in sentence_list:
        if not sentence:
            # Placeholder for empty sentences.  It must be a one-word
            # list: passing the bare string 'car' makes the helper
            # iterate over the characters 'c', 'a', 'r' instead.
            sentence = ['car']
        w2v_list.append(get_avg_word2vec(sentence))  # get word2vec for sentence
    return w2v_list

In [139]:
def get_w2v_list_by_description(sentence_list, sentences_per_description=5):
    '''
    Compute one word2vec vector per description (photo).

    Each sentence is first turned into an averaged word vector with
    get_avg_word2vec().  Consecutive sentences are then grouped
    ``sentences_per_description`` at a time and their vectors averaged.
    This relies on sentence_list being in order, ie, the first 5
    sentences belong to photo 1, the next 5 to photo 2, and so on.
    If the number of sentences is not a multiple of the group size, the
    last group is averaged over the sentences that remain.

    :param: sentence_list, list, list of all preprocessed sentences,
    ordered by description
    :param: sentences_per_description, int, number of consecutive
    sentences that make up one description (default 5)

    :return: grouped_w2v, list of np arrays, one averaged word2vec per
    description

    :raises: ValueError if sentences_per_description is less than 1

    '''
    if sentences_per_description < 1:
        raise ValueError("sentences_per_description must be at least 1")

    # one averaged word2vec per sentence, in input order
    sentence_w2v_list = [
        np.array(get_avg_word2vec(sentence)) for sentence in sentence_list
    ]

    grouped_w2v = []  # one averaged vector per description

    # walk the sentence vectors one description at a time
    for start in range(0, len(sentence_w2v_list), sentences_per_description):
        group = np.array(
            sentence_w2v_list[start:start + sentences_per_description]
        )
        grouped_w2v.append(np.mean(group, axis=0))  # average the group
    return grouped_w2v  # returned the aggregated w2v

In [142]:
# -------------------   # testing starts here. ----------------------------

In [143]:
# get one averaged word2vec vector per preprocessed training sentence,
# to use as inputs to our classifier
train_x_vals = get_w2v_list(pre_pro_train)

In [None]:
# create label vectors for training data.  The photo label
# is in the features file in the training data, and has the form
# images_train/5373.jpg, where the 4 digit number varies. Each label
# is in order, so we know which descriptions match which labels
# (supervised learning)
# NOTE: this repeats the setup.get_features(False, TRAIN) call made
# earlier in the file and rebinds features_train to its result.
features_train = setup.get_features(False,TRAIN)

In [144]:
def make_labels(n_elements):
    '''
    Build the training labels, one per description sentence.

    For each of the first ``n_elements`` photos, the photo's label is
    read from the index of the module-level ``features_train`` table and
    repeated 5 times, once for each sentence describing that photo.
    (We do not know which labels correspond to which description in the
    test data; that is what we are predicting.)

    :param: n_elements, int, number of photos to create labels for

    :return: y_vals, list, 5 * n_elements labels, ordered photo by photo
    '''
    y_vals = []
    for i in range(n_elements):
        # Look the label up BEFORE using it: appending first would fail
        # on the very first iteration, where no label is bound yet.
        pic_index = features_train.index[i]
        # assign the photo label to 5 descriptions at a time
        y_vals.extend([pic_index] * 5)
    return y_vals

In [145]:
# get labels for training data: 5 labels per description, one per sentence
# output is 50,000 labels (50000 sentences for 10000 descriptions)
train_y_vals = make_labels(N_DESCRIPTIONS)  # number of descriptions

In [159]:
# build a logistic regression model that maps a sentence word2vec
# to its photo label
lr = LogisticRegression()
lr.fit(train_x_vals, train_y_vals)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [161]:
# get one word2vec per test description (the average of the vectors of
# its sentences)
test_x_vals = get_w2v_list_by_description(pre_pro_test)  # pass in preprocessed test data

In [162]:
# print(np.array(test_x_vals).shape)  # for testing

(2000, 500)


In [163]:
# use the model fitted on the training data to predict, for every test
# description vector, the label of a training photo
estimated_train_indexes = lr.predict(test_x_vals)

In [164]:
# len(estimated_train_indexes)  # for testing

2000

In [165]:
# using these estimated training indexes, pull up their image feature vectors.
# The predicted values are index *labels* (make_labels takes them from
# features_train.index), so each row is looked up by label with .loc
# rather than by position with .iloc.
estimates_features = [
    features_train.loc[elem]  # retrieve feature vector
    for elem in estimated_train_indexes
]

# Finally, now that we mapped a test description to a predicted feature vector in the training data, we find the nearest 20 neighbors in the test data as our prediction.

# kNN - for each test data, find the nearest 20 neighbors as our prediction

In [None]:
# index labels of the test feature table, passed to fit() as the targets
test_labels = list(features_test.index.values)

# nearest-neighbour model over the test image features, k = 20
# (required for submission)
knn = KNN(n_neighbors = 20).fit(features_test,test_labels)

# For every estimated feature vector, find its 20 nearest test images,
# prepend a running row number as the first column, and wrap the result
# in a pandas dataframe for the Kaggle contest format
neighbor_indexes = knn.kneighbors(estimates_features,return_distance=False)
predictions = pd.DataFrame(
    np.insert(neighbor_indexes,0,np.arange(N_TEST),axis=1)
)

In [169]:
# save predictions to csv using the save_output helper from the setup
# module we built
setup.save_output(predictions,'eric-300pm.csv')

Unnamed: 0,Descritpion_ID,Top_20_Image_IDs
0,0.txt,669.jpg 231.jpg 46.jpg 1992.jpg 1480.jpg 92.jp...
1,1.txt,589.jpg 1714.jpg 818.jpg 1753.jpg 1199.jpg 149...
2,2.txt,1183.jpg 838.jpg 1471.jpg 634.jpg 42.jpg 598.j...
3,3.txt,1514.jpg 469.jpg 1765.jpg 26.jpg 235.jpg 1126....
4,4.txt,305.jpg 1243.jpg 1980.jpg 870.jpg 105.jpg 1380...
5,5.txt,846.jpg 1913.jpg 1118.jpg 413.jpg 1882.jpg 170...
6,6.txt,1944.jpg 1855.jpg 1377.jpg 1940.jpg 897.jpg 11...
7,7.txt,1904.jpg 364.jpg 20.jpg 1203.jpg 1400.jpg 1258...
8,8.txt,1004.jpg 923.jpg 1261.jpg 1031.jpg 1896.jpg 71...
9,9.txt,351.jpg 1321.jpg 1700.jpg 1145.jpg 249.jpg 600...
