# BOW + Models

# Imports

In [183]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from os import listdir
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
import csv
import gensim
from gensim.models import Word2Vec
import sklearn.preprocessing
from sklearn.model_selection import train_test_split
from scipy.spatial.distance import cdist


from sklearn.ensemble import RandomForestRegressor, BaggingRegressor

# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)
simplefilter(action='ignore', category=DeprecationWarning)

# Definitions and Data Load

In [2]:
# Root directory of the data set; every folder below is a sub-directory of it.
dataFolder = "cs5785-fall19-final"

# Description folders (train / test).
descTrainFolder = "{}/descriptions_train".format(dataFolder)
descTestFolder = "{}/descriptions_test".format(dataFolder)

# Feature folders (train / test).
featTrainFolder = "{}/features_train".format(dataFolder)
featTestFolder = "{}/features_test".format(dataFolder)

# Image folders (train / test).
imagesTrainFolder = "{}/images_train".format(dataFolder)
imagesTestFolder = "{}/images_test".format(dataFolder)

# Tag folders (train / test).
tagsTrainFolder = "{}/tags_train".format(dataFolder)
tagsTestFolder = "{}/tags_test".format(dataFolder)

# All data folders in one list, train before test within each kind.
folders = [
    descTrainFolder, descTestFolder,
    featTrainFolder, featTestFolder,
    imagesTrainFolder, imagesTestFolder,
    tagsTrainFolder, tagsTestFolder,
]

# Load word vectors from the gzip-compressed word2vec binary expected in the working directory.
# NOTE(review): `word2vec` is not referenced by any other code visible in this file — confirm
# the load is still needed before paying for it on every run.
word2vec = gensim.models.KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin.gz", binary=True)

In [242]:
def getFilesFromFolder(folder):
    """Return the names of every entry found directly inside `folder`.

    Thin wrapper around `os.listdir`; the order of the returned names is
    whatever the operating system reports.
    """
    entries = listdir(folder)
    return entries

In [243]:
def dist_matrix(x1, x2):
    """Return the pairwise cosine distances between rows of `x1` and rows of `x2`.

    Entry (i, j) of the result is the cosine distance between `x1[i]` and
    `x2[j]`, as computed by `scipy.spatial.distance.cdist`.
    """
    pairwise = cdist(x1, x2, metric='cosine')
    return pairwise

In [244]:
# Formatting predictions for csv output
def output_format(dm, top_k=20):
    """Turn a distance matrix into per-row lists of the closest image file names.

    Args:
        dm: iterable of rows; `dm[i][j]` is the distance between query `i`
            and image `j`. Each row must support `len()` and integer indexing.
        top_k: how many image names to keep per row (default 20, the value
            that was previously hard-coded).

    Returns:
        A list with one entry per row of `dm`; each entry is a list of at
        most `top_k` strings of the form "<column index>.jpg", ordered by
        increasing absolute distance. Ties keep the lower column index first
        because the sort is stable.
    """
    ranked_rows = []
    for dists in dm:
        # Rank column indices by absolute distance instead of sorting
        # (index, distance) pairs and discarding the distance afterwards.
        order = sorted(range(len(dists)), key=lambda col: abs(dists[col]))
        ranked_rows.append(["{}.jpg".format(col) for col in order[:top_k]])
    return ranked_rows

In [245]:
def outputCSV(predictions, filename="image_prediction.csv"):
    """Write the predictions as a two-column CSV submission file.

    Args:
        predictions: sequence of rows; row `i` is a list of image file names
            predicted for description `i`.
        filename: path of the CSV to create or overwrite. Defaults to
            "image_prediction.csv" in the current working directory, which is
            where the file was always written before.

    The file has the header "Description_ID,Top_20_Image_IDs" followed by one
    line per row: "<i>.txt" and the row's image names joined by single spaces.
    """
    headers = ["Description_ID", "Top_20_Image_IDs"]
    # newline="" lets the csv module control line endings itself; without it
    # the "\r\n" it emits is translated again on Windows, producing blank rows.
    with open(filename, "w", newline="") as outputFile:
        fileWriter = csv.DictWriter(outputFile, fieldnames=headers)
        fileWriter.writeheader()
        for index, pred in enumerate(predictions):
            fileWriter.writerow({
                headers[0]: "{}.txt".format(index),
                headers[1]: ' '.join(pred),
            })

In [246]:
def get_acc_from_dm(dm):
    """Print ranking metrics for a validation distance matrix.

    Row `i` of `dm` holds the distances from prediction `i` to every
    candidate, and candidate `i` is treated as the correct match for row `i`.
    For each row the candidates are ranked by increasing distance; the row
    scores 1 / (rank + 1) when the correct candidate is within the first 20
    positions and 0.0 otherwise.

    Args:
        dm: 2-D array-like of distances with one row per validation sample.

    Prints the mean score ("Development MAP@20") and the mean and median
    0-based rank of the correct candidate. Returns None.

    Raises:
        ValueError: if a row has no column with the row's own index.
    """
    val_scores = []
    val_pos_list = []

    # Iterate over the rows of `dm` itself; the length used to come from the
    # module-level `yval`, which tied this function to one notebook cell.
    for i, row in enumerate(dm):
        ranking = np.argsort(row).tolist()
        val_pos = ranking.index(i)
        val_pos_list.append(val_pos)
        val_scores.append(1 / (val_pos + 1) if val_pos < 20 else 0.0)

    print("Development MAP@20:", np.mean(val_scores))
    print("Mean index of true image", np.mean(val_pos_list))
    print("Median index of true image", np.median(val_pos_list))

# Preprocess data

In [247]:
# function to preprocess data
def preprocessing(data):
    """Clean one description and return its unique tokens as a single string.

    Pipeline: remove ASCII punctuation, tokenize, lowercase, drop English
    stop words and one-character tokens, lemmatize each token as noun, then
    verb, then adjective, drop tokens ending in "nt", and de-duplicate.

    Args:
        data: a one-element list holding the raw description text. The list
            is not modified.

    Returns:
        The unique surviving tokens joined by single spaces, in order of
        first occurrence (deterministic across runs).

    Raises:
        ValueError: if `data` does not contain exactly one element.
    """
    # Validate up front with a real exception: an `assert` disappears under
    # `python -O` and previously only fired after all the work was done.
    if len(data) != 1:
        raise ValueError(
            "preprocessing expects a one-element list, got {} elements".format(len(data))
        )

    stop_words = set(stopwords.words('english'))  # English stop words
    lemmatizer = WordNetLemmatizer()

    # Remove punctuation in a single pass.
    text = data[0].translate(str.maketrans('', '', string.punctuation))

    # Case is irrelevant for matching words, so lowercase every token.
    lowered = [token.lower() for token in word_tokenize(text)]

    tokens = []
    for word in lowered:
        if word in stop_words or len(word) <= 1:
            continue
        lemma = lemmatizer.lemmatize(word)        # noun form
        lemma = lemmatizer.lemmatize(lemma, 'v')  # verb form
        lemma = lemmatizer.lemmatize(lemma, 'a')  # adjective form
        # Drop contractions whose apostrophe was stripped above ("dont", "cant").
        # NOTE(review): this suffix test also removes ordinary words ending in
        # "nt" (e.g. "elephant", "plant"); kept as-is so existing features do
        # not change — consider an explicit contraction list instead.
        if lemma[-2:] != 'nt':
            tokens.append(lemma)

    # dict.fromkeys de-duplicates while keeping first-occurrence order, unlike
    # set(), whose iteration order for strings varies between interpreter runs.
    return " ".join(dict.fromkeys(tokens))

In [248]:
def get_flat_descriptions_from_folder(folder):
    """Read the numbered description files of `folder` into a list.

    The folder is expected to hold files named "0.txt", "1.txt", ... up to
    one less than the number of ".txt" entries it contains.

    Args:
        folder: path of the directory holding the description files.

    Returns:
        A list with one single-element list per file, in numeric file order.
        Each element is the file's lines joined by a space, with newline
        characters removed.

    Raises:
        FileNotFoundError: if one of the expected numbered files is missing.
    """
    # Count only the description files: any other directory entry (for
    # example an OS metadata file) would inflate the count and make the loop
    # ask for an index that does not exist.
    amtFiles = sum(1 for name in getFilesFromFolder(folder) if name.endswith(".txt"))

    flat_descriptions = []
    for index in range(amtFiles):
        filename = folder + "/{}.txt".format(index)
        with open(filename, 'r') as d_file:
            text = ' '.join(d_file.readlines()).replace("\n", "")
        flat_descriptions.append([text])

    return flat_descriptions

In [6]:
# Raw descriptions read from disk, one single-element list per description file:
# [  [ unprocessed description of image 1  ] , [ unprocessed description of image 2  ]   , ...  ]
train_flat_descs = get_flat_descriptions_from_folder(descTrainFolder)
test_flat_descs  = get_flat_descriptions_from_folder(descTestFolder)

In [253]:
def unique_words_in_desc(desc):
    """Apply `preprocessing` to every entry of `desc` and return the results in order."""
    return list(map(preprocessing, desc))

In [8]:
# One string per description, holding the space-separated unique tokens returned by preprocessing():
# [  "processed description of image 1" , "processed description of image 2"   , ...  ]
train_descs = unique_words_in_desc(train_flat_descs)
test_descs = unique_words_in_desc(test_flat_descs)

## Bag of Words

In [254]:
# CREATE THE BAG OF WORDS DICTIONARY
def create_bow(train_descs):
    """Build the bag-of-words template dictionary.

    Args:
        train_descs: iterable of description strings whose words are
            separated by single spaces.

    Returns:
        A dict mapping 'null' (the bucket for unknown words) and every word
        seen in `train_descs` to 0, with 'null' first and the words in order
        of first appearance.
    """
    vocabulary = {'null': 0}
    vocabulary.update(
        (token, 0)
        for text in train_descs
        for token in text.split(' ')
    )
    return vocabulary

def create_bow_vectors(data, BOW):
    """Turn descriptions into L2-normalized bag-of-words count vectors.

    Args:
        data: sequence of description strings whose words are separated by
            single spaces.
        BOW: template dictionary mapping each vocabulary word (and 'null')
            to its starting count; its key order defines the column order.

    Returns:
        The matrix returned by `sklearn.preprocessing.normalize`, with one
        row per description and one column per key of `BOW`. Words missing
        from `BOW` are counted in the 'null' column.

    Raises:
        KeyError: if a word is missing from `BOW` and `BOW` has no 'null' key.
    """
    # Column position of every vocabulary word, following BOW's key order.
    column_of = {word: col for col, word in enumerate(BOW)}

    # Fill one numeric matrix directly instead of copying the whole
    # vocabulary dict for every description and converting to lists later.
    # Each row starts from the template counts stored in BOW.
    template = np.array(list(BOW.values()), dtype=float)
    counts = np.tile(template, (len(data), 1))

    for row, description in enumerate(data):
        for word in description.split(' '):
            col = column_of.get(word)
            if col is None:
                col = column_of['null']  # unknown word
            counts[row, col] += 1

    # NORMALIZE THE FEATURES (default is L2 norm)
    return sklearn.preprocessing.normalize(counts)

In [9]:
# Vocabulary template built from the training descriptions only; every count starts at 0.
BOW = create_bow(train_descs)

In [16]:
# Normalized bag-of-words matrices. Both splits use the training vocabulary, so
# test words that are absent from it are counted under the 'null' entry.
train_bow = create_bow_vectors(train_descs, BOW)
test_bow  = create_bow_vectors(test_descs, BOW)

## Feature Extraction

In [250]:
def get_file_num(filename):
    """Extract the integer image number from an image file name.

    Removes every occurrence of ".jpg", "images_train/" and "images_test/"
    from `filename` and converts what is left to an int, e.g.
    "images_train/12.jpg" -> 12.

    Raises:
        ValueError: if the remaining text is not a valid integer.
    """
    stem = filename
    for fragment in (".jpg", "images_train/", "images_test/"):
        stem = stem.replace(fragment, "")
    return int(stem)

def get_feat_from_file(filename):
    """Load image features from a CSV file, ordered by image number.

    Each CSV row is expected to start with the image file name, followed by
    that image's feature values.

    Args:
        filename: path of the CSV file to read.

    Returns:
        The matrix returned by `sklearn.preprocessing.normalize` (default L2
        norm), with one row per CSV row, sorted by the number extracted from
        the first column with `get_file_num`.

    Raises:
        ValueError: if a feature value cannot be parsed as a float.
    """
    # newline='' is what the csv module asks for when reading files.
    with open(filename, 'r', newline='') as f:
        rows = list(csv.reader(f))
    rows.sort(key=lambda row: get_file_num(row[0]))

    # Convert the text cells to floats explicitly instead of relying on
    # normalize() to coerce a list of strings.
    features = np.array([row[1:] for row in rows], dtype=float)
    return sklearn.preprocessing.normalize(features)

In [24]:
# Image feature CSVs for the train and test splits. get_feat_from_file sorts the
# rows by image number and normalizes each feature row.
resTrainFile = featTrainFolder + "/features_resnet1000_train.csv"
train_feat = get_feat_from_file(resTrainFile)

resTestFile = featTestFolder + "/features_resnet1000_test.csv"
test_feat = get_feat_from_file(resTestFile)

# PCA 

In [251]:
from sklearn.decomposition import PCA
def _PCA(train_feat, n):
    """Fit a PCA on `train_feat` and return the fitted estimator.

    Args:
        train_feat: feature matrix the PCA is fitted on.
        n: value passed straight through as `n_components`.
    """
    return PCA(n_components=n).fit(train_feat)

In [193]:
# Fit the PCA on the training image features only, then project both splits with it.
# A fractional n_components (0.96) asks scikit-learn to keep as many components as
# are needed to explain that share of the variance.
pca = _PCA(train_feat, 0.96)
train_feat_pca = pca.transform(train_feat)
test_feat_pca  = pca.transform(test_feat)

In [119]:
# Number of components the fitted PCA kept (the recorded cell output below is 158).
pca.n_components_

158

# Prediction Models

## Kernel Ridge Regression

In [252]:
from sklearn.kernel_ridge import KernelRidge
def kernel_ridge(xtrain, ytrain):
    """Fit a KernelRidge model with default settings and return it.

    Args:
        xtrain: training inputs.
        ytrain: training targets.
    """
    model = KernelRidge()
    return model.fit(xtrain, ytrain)

In [179]:
# kr = kernel_ridge(train_bow, train_feat_pca)
# preds = kr.predict(test_bow)
# dm = dist_matrix(preds, test_feat_pca)
# top_images = output_format(dm)
# outputCSV(top_images)

## Multi-Layer Perceptron (MLP)

# Train Test Split

In [236]:
# Re-fit the PCA and re-project both splits before the train/validation split.
# NOTE(review): identical to the earlier PCA cell; redundant if that cell already
# ran with the same 0.96 setting.
pca = _PCA(train_feat, 0.96)
train_feat_pca = pca.transform(train_feat)
test_feat_pca  = pca.transform(test_feat)

In [237]:
# Hold out part of the training pairs (bag-of-words vector -> PCA image feature) for validation.
# No test_size or random_state is passed, so scikit-learn's defaults apply and the split
# changes from run to run.
# NOTE(review): consider fixing random_state so scores are comparable across runs.
xtrain, xval, ytrain, yval = train_test_split(train_bow, train_feat_pca)

## Testing Kernel Ridge

In [238]:
# Fit on the training part, predict PCA image features for the held-out descriptions,
# then score using the cosine distance between each prediction and every held-out image.
kr2 = kernel_ridge(xtrain, ytrain)
yval_pred = kr2.predict(xval)
dm2 = dist_matrix(yval_pred, yval)
# top_images2 = output_format(dm2)
get_acc_from_dm(dm2)

In [None]:
'''
pca(0.97) - [20.8, 21]%
pca(0.96) ~ 22%
pca(0.95) - [20.3, 20.5]%
pca(0.94) - [21.5, 21.6]%
'''

## Testing MLP

In [None]:
'''


'''