# BOW + Models

# Imports

In [71]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from os import listdir
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
import csv
import gensim
from gensim.models import Word2Vec
import sklearn.preprocessing
from sklearn.model_selection import train_test_split
from scipy.spatial.distance import cdist
from sklearn.neural_network import MLPRegressor

# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)
simplefilter(action='ignore', category=DeprecationWarning)

# Definitions and Data Load

In [20]:
dataFolder = "cs5785-fall19-final"

descTrainFolder = dataFolder + "/descriptions_train"
descTestFolder = dataFolder + "/descriptions_test"

featTrainFolder = dataFolder + "/features_train"
featTestFolder = dataFolder + "/features_test"

imagesTrainFolder = dataFolder + "/images_train"
imagesTestFolder = dataFolder + "/images_test"

tagsTrainFolder = dataFolder + "/tags_train"
tagsTestFolder = dataFolder + "/tags_test"

folders = [descTrainFolder,   descTestFolder,   featTrainFolder, featTestFolder, 
           imagesTrainFolder, imagesTestFolder, tagsTrainFolder, tagsTestFolder]

word2vec = gensim.models.KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin.gz", binary=True)

In [21]:
def getFilesFromFolder(folder):
    return listdir(folder)

In [22]:
def dist_matrix(x1, x2):
    return cdist(x1,x2,'cosine')

In [23]:
# Formatting predictions for csv output
def output_format(dm):
    id_dists = []
    for i, d in enumerate(dm):
        temp_dists = [(j, d[j]) for j in range(len(d))]
        id_dists.append(list(sorted(temp_dists, key= lambda x: abs(x[1])))[:20])
    return [["{}.jpg".format(id) for id, _ in row] for row in id_dists]

In [69]:
def outputCSV(predictions):
    with open("image_prediction_kernel_pca_bow_concat.csv", "w") as outputFile:
        headers = ["Descritpion_ID", "Top_20_Image_IDs"]
        fileWriter = csv.DictWriter(outputFile, fieldnames=headers)
        fileWriter.writeheader()
        for index, pred in enumerate(predictions):
            fileWriter.writerow({headers[0]: "{}.txt".format(index), headers[1]: ' '.join(predictions[index])})

In [62]:
# [IRENE] I have changed the yval for yval_conc so I could use it for the features concatenated. 
# Change it back if 

def get_acc_from_dm(dm):
    val_scores = []
    val_pos_list = []

    for i in range(len(yval_concat)):
        pred_dist_idx = list(np.argsort(dm[i]))
        val_pos = pred_dist_idx.index(i)
        val_pos_list.append(val_pos)
        if val_pos < 20:
            val_scores.append(1 / (val_pos + 1))
        else:
            val_scores.append(0.0)

    print("Development MAP@20:", np.mean(val_scores))
    print("Mean index of true image", np.mean(val_pos_list))
    print("Median index of true image", np.median(val_pos_list))

# Preprocess data

In [72]:
# function to preprocess data
def preprocessing(data):
    stop_words = set(stopwords.words('english')) # find stop words in English language
    lemmatizer = WordNetLemmatizer() # declare nltk lemmatizer

    # iterate through every sentence and replace it by itself lemmatized, without punctuation and without stop words
    for i in range(len(data)):
        sentence_no_punct = ''
        
        # remove punctuation
        for char in data[i]:
            if char not in string.punctuation:
                sentence_no_punct = sentence_no_punct + char
        data[i] = sentence_no_punct

        """
        Analyzing if words are upper/lower case is more for analyzing the intensity of the sentiment rather than classifying it. 
        """
        word_tokens = list(map(lambda r: r.lower(), word_tokenize(data[i])))
        
    
        # remove stop words and lemmatize
        word_tokens = [lemmatizer.lemmatize(word) for word in word_tokens if word not in stop_words and len(word) > 1]
        word_tokens = [lemmatizer.lemmatize(word, 'v') for word in word_tokens]
        word_tokens = [lemmatizer.lemmatize(word, 'a') for word in word_tokens]
        
        # remove conjunction words
        word_tokens = [word for word in word_tokens if word[-2:] != 'nt']
        data[i] = ' '.join(word_tokens)
    
    assert(len(data) == 1)
    return " ".join(set(data[0].split()))

In [73]:
def get_flat_descriptions_from_folder(folder):
    flat_descriptions = []
    amtFiles = len(getFilesFromFolder(folder))
    indexes = list(sorted([str(i) for i in range(amtFiles)], key= lambda x: int(x)))
    
    for index in indexes:
        filename = folder + "/{}.txt".format(index)
        with open(filename, 'r') as d_file:
            flat_descriptions += [[' '.join(d_file.readlines()).replace("\n", "")]]
            
    return flat_descriptions

In [75]:
# [  [ unprocessed description of image 1  ] , [ unprocessed description of image 2  ]   , ...  ]
train_flat_descs = get_flat_descriptions_from_folder(descTrainFolder)
test_flat_descs  = get_flat_descriptions_from_folder(descTestFolder)

In [76]:
def unique_words_in_desc(desc):
    return [preprocessing(d) for d in desc]

In [77]:
# [  { processed description of image 1  } , { processed description of image 2  }   , ...  ]
train_descs = unique_words_in_desc(train_flat_descs)
test_descs = unique_words_in_desc(test_flat_descs)

## Bag of Words

In [78]:
# CREATE THE BAG OF WORDS DICTIONARY
def create_bow(train_descs):
    BOW = {'null' : 0}
    for description in train_descs:
        sentence_lst = description.split(' ')
        for word in sentence_lst:
            BOW[word] = 0
    return BOW

def create_bow_vectors(data, BOW):
    feature_vectors = []
    for description in data:
        feat_vec = BOW.copy()
        sentence_lst = description.split(' ')
        for word in sentence_lst:
            if word in feat_vec:
                feat_vec[word] += 1
            else:
                feat_vec['null'] += 1       
        feature_vectors.append(feat_vec)

    # TURN DICTIONARIES INTO A MATRIX with each row as one description
    feature_vector_matrix = []
    for feature_vec in feature_vectors:
        feature_vector_matrix.append(list(feature_vec.values()))

    # NORMALIZE THE FEATURES
    feature_vector_matrix = sklearn.preprocessing.normalize(feature_vector_matrix) # default is L2 norm
    return feature_vector_matrix

In [79]:
BOW = create_bow(train_descs)

In [80]:
train_bow = create_bow_vectors(train_descs, BOW)
test_bow  = create_bow_vectors(test_descs, BOW)

## Feature Extraction

In [84]:
def get_file_num(filename):
    return int(filename.replace(".jpg", "").replace("images_train/", "").replace("images_test/", ""))

def get_feat_from_file(filename):
    with open(filename, 'r') as f:
        feats = list(csv.reader(f))
    sorted_feats = list(sorted(feats, key= lambda l: get_file_num(l[0])))
    
    return sklearn.preprocessing.normalize([s[1:] for s in sorted_feats])

In [85]:
resTrainFile = featTrainFolder + "/features_resnet1000_train.csv"
train_feat = get_feat_from_file(resTrainFile)

resTrainInterFile = featTrainFolder + "/features_resnet1000intermediate_train.csv"
train_inter_feat = get_feat_from_file(resTrainInterFile)

resTestFile = featTestFolder + "/features_resnet1000_test.csv"
test_feat = get_feat_from_file(resTestFile)

resTestInterFile = featTestFolder +  "/features_resnet1000intermediate_test.csv"
test_inter_feat = get_feat_from_file(resTestInterFile)

In [86]:
# CONCATENATE INTERMEDIATE + FINAL FEATURES
train_feat_conc = np.concatenate((train_feat, train_inter_feat), axis = 1)
test_feat_conc = np.concatenate((test_feat, test_inter_feat), axis = 1)

# PCA 

In [87]:
from sklearn.decomposition import PCA
def _PCA(train_feat, n):
    pca = PCA(n_components = n)
    pca.fit(train_feat)
    return pca

In [None]:
pca = _PCA(train_inter_feat, 0.97)
train_feat_pca_inter = pca.transform(train_inter_feat)
test_feat_pca_inter  = pca.transform(test_inter_feat)

In [88]:
# PCA IN CONCATENATION:
pca = _PCA(train_feat_conc, 0.97)
train_feat_conc_pca = pca.transform(train_feat_conc)
test_feat_conc_pca = pca.transform(test_feat_conc)

In [89]:
pca.n_components_

252

# Predicion Models

## Kernel Ridge Regression

In [65]:
from sklearn.kernel_ridge import KernelRidge
def kernel_ridge(xtrain, ytrain):
    kr = KernelRidge()
#     kr.fit(train_bow, train_feat_pca)
    kr.fit(xtrain, ytrain)
    return kr

### Uncomment when using actual test data

In [67]:
kr = kernel_ridge(train_bow, train_feat_conc_pca)
preds = kr.predict(test_bow)
dm = dist_matrix(preds, test_feat_conc_pca)
top_images = output_format(dm)

In [70]:
outputCSV(top_images)

## Multi Layer Perception (MLP)

In [23]:
def MLP(xtrain, ytrain):
    mlp = MLPRegressor(solver='adam', alpha=1e-5, hidden_layer_sizes=(200,), random_state=1, max_iter = 1000)
    mlp.fit(xtrain, ytrain)
    return mlp

### Uncomment when using actual test data

In [24]:
# mlp = MLP(train_bow, train_feat_pca)
# preds = mlp.predict(test_bow)
# dm = dist_matrix(preds, test_feat_pca)
# top_images = output_format(dm)
# outputCSV(top_images)

## Ridge

In [57]:
from sklearn.linear_model import Ridge
def ridge(xtrain, ytrain):
    rid = Ridge()
    rid.fit(xtrain, ytrain)
    return rid

In [None]:
#

## Keras

In [102]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras import losses


model = Sequential()
model.add(Dense(252, input_dim=6830, activation='relu'))
model.add(Dense(252, activation='relu'))
model.add(Dense(252, activation='softmax'))
model.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])

In [None]:
'''
Development MAP@20: 0.04275898772022611 -->

model.add(Dense(252, input_dim=6830, activation='relu'))
model.add(Dense(252, activation='relu'))
model.add(Dense(252, activation='softmax'))

model.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])
model.fit(xtrain_conc, ytrain_conc, epochs=15, batch_size=10)

//



'''

# PCA / Train Test Split

In [59]:
#n_comp = 0.90
xdata = train_bow

### Normal features

In [213]:
pca = _PCA(train_feat, n_comp)
train_feat_pca = pca.transform(train_feat)
test_feat_pca  = pca.transform(test_feat)

In [214]:
xtrain, xval, ytrain, yval = train_test_split(xdata, train_feat_pca)

### Intermediate features

In [215]:
pca = _PCA(train_inter_feat, n_comp)
train_feat_pca_inter = pca.transform(train_inter_feat)
test_feat_pca_inter  = pca.transform(test_inter_feat)

In [216]:
xtrain_inter, xval_inter, ytrain_inter, yval_inter = train_test_split(xdata, train_feat_pca)

### Concatenate features

In [100]:
xtrain_conc, xval_conc, ytrain_conc, yval_conc = train_test_split(xdata, train_feat_conc_pca)

In [101]:
train_feat_conc_pca.shape

(10000, 252)

## Testing Kernel Ridge

### Normal features

In [217]:
kr2 = kernel_ridge(xtrain, ytrain)
yval_pred = kr2.predict(xval)
dm2 = dist_matrix(yval_pred, yval)
get_acc_from_dm(dm2)

Development MAP@20: 0.20247492240130008
Mean index of true image 50.5556
Median index of true image 13.0


In [218]:
''' Yeaaaaaaa
pca(0.97) - [20.8, 21]%
pca(0.96) ~ 22%
pca(0.95) - [20.3, 20.5]%
pca(0.94) - [21.5, 21.6]%
'''

' Yeaaaaaaa\npca(0.97) - [20.8, 21]%\npca(0.96) ~ 22%\npca(0.95) - [20.3, 20.5]%\npca(0.94) - [21.5, 21.6]%\n'

### Intermediate features

In [219]:
kr2 = kernel_ridge(xtrain_inter, ytrain_inter)
yval_pred_inter = kr2.predict(xval_inter)
dm2 = dist_matrix(yval_pred_inter, yval_inter)
get_acc_from_dm(dm2)

Development MAP@20: 0.2163893492609901
Mean index of true image 48.5432
Median index of true image 13.0


In [220]:
'''
pca(0.97) - 24.9


'''

'\npca(0.97) - 24.9\n\n\n'

### Concatenated features

In [66]:
kr2 = kernel_ridge(xtrain_conc, ytrain_conc)
yval_pred_conc = kr2.predict(xval_conc)
dm2 = dist_matrix(yval_pred_conc, yval_conc)
get_acc_from_dm(dm2)

Development MAP@20: 0.2389050227601776
Mean index of true image 37.7732
Median index of true image 11.0


In [220]:
'''
pca(0.97) 0.2389

'''

'\npca(0.97) - 24.9\n\n\n'

## Testing MLP

### Normal features

In [221]:
mlp = MLP(xtrain, ytrain)
yval_pred = mlp.predict(xval)
dm2 = dist_matrix(yval_pred, yval)
get_acc_from_dm(dm2)

Development MAP@20: 0.18574217424130737
Mean index of true image 64.8976
Median index of true image 16.0


In [222]:
''' Not convinvced
pca(0.97) - [17.0]%, 100 layers
pca(0.97) - [18.3]%, 200 layers
pca(0.96) ~ [17.8, 18.2]%, 100 layers
pca(0.96) - [19.1, 19.1]%, 200 layers

'''

' Not convinvced\npca(0.97) - [17.0]%, 100 layers\npca(0.97) - [18.3]%, 200 layers\npca(0.96) ~ [17.8, 18.2]%, 100 layers\npca(0.96) - [19.1, 19.1]%, 200 layers\n\n'

### Intermediate features

In [223]:
mlp = MLP(xtrain_inter, ytrain_inter)
yval_pred_inter = mlp.predict(xval_inter)
dm2 = dist_matrix(yval_pred_inter, yval_inter)
get_acc_from_dm(dm2)

Development MAP@20: 0.18903845659500457
Mean index of true image 57.7076
Median index of true image 16.0


In [224]:
'''



'''

'\n\n\n\n'

## Testing Ridge

### Normal features

In [None]:
rid2 = ridge(xtrain, ytrain)
yval_pred = rid2.predict(xval)
dm2 = dist_matrix(yval_pred, yval)
get_acc_from_dm(dm2)

In [None]:
''' Yeaaaaaaa 2.0
pca(0.96) - [21.5, 21.8]%


'''

### Intermediate features

In [None]:
rid2 = ridge(xtrain_inter, ytrain_inter)
yval_pred_inter = rid2.predict(xval_inter)
dm2 = dist_matrix(yval_pred_inter, yval_inter)
get_acc_from_dm(dm2)

In [None]:
'''
PCA(0.97), 24.9, 25.2


'''

### Concatenate features

In [63]:
rid2 = ridge(xtrain_conc, ytrain_conc)
yval_pred_conc = rid2.predict(xval_conc)
dm2 = dist_matrix(yval_pred_conc, yval_conc)
get_acc_from_dm(dm2)

Development MAP@20: 0.23954887504136732
Mean index of true image 37.628
Median index of true image 11.0


In [None]:
'''
PCA(0.97), 0.239


'''

## Testing Keras

### Concatenated features

In [112]:
model = Sequential()
model.add(Dense(252, input_dim=6830, activation='relu'))
model.add(Dense(252, activation='relu'))
model.add(Dense(252, activation='softmax'))

model.compile(loss='mean_squared_error', optimizer='sgd', metrics=['accuracy'])
model.fit(xtrain_conc, ytrain_conc, epochs=15, batch_size=10)

yval_pred_conc = model.predict(xval_conc)
dm2 = dist_matrix(yval_pred_conc, yval_conc)
get_acc_from_dm(dm2)


#_, accuracy = model.evaluate(X, y)
#print('Accuracy: %.2f' % (accuracy*100))

Train on 7500 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Development MAP@20: 0.0014416711066711067
Mean index of true image 1249.5284
Median index of true image 1248.5
