In [1]:
import pandas as pd
import numpy as np
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk import pos_tag
import string
import re
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestNeighbors

In [2]:
descriptionTrainFolder = "/Users/francescoperera/Desktop/data/descriptions_train"
featuresTrainFolder = "/Users/francescoperera/Desktop/data/features_train"
imagesTrainFolder = "/Users/francescoperera/Desktop/data/images_train"
tagsTrainFolder = "/Users/francescoperera/Desktop/data/tags_train"

In [3]:
descriptionTestFolder = "/Users/francescoperera/Desktop/data/descriptions_test"
featuresTestFolder = "/Users/francescoperera/Desktop/data/features_test"
imagesTestFolder = "/Users/francescoperera/Desktop/data/images_test"
tagsTestFolder = "/Users/francescoperera/Desktop/data/tags_test"

In [4]:
numTrain = 10000
numTest = 2000

## Read Files

In [5]:
def readTagsFile(fileName):
    tags = []
    f = open(fileName,"r")
    for line in f:
        line = line.split(":") # possibly also consider using the keys in each line(vehicle,outdoor etc..)
        tag = line[-1].replace("\n","")
        tags.append(tag)
    return tags

def readDescriptionFile(fileName):
    desc = []
    f = open(fileName,"r")
    for line in f:
        noPuncSentence = stripPunctuation(line.replace("\n",""))
        desc.append(noPuncSentence)
    return desc


## Process Descriptions ( POS & Stemming)

In [17]:
def stemmedDescriptions(lst):
    stemmedDescriptions = []
    for line in lst:
        posLine = pos(line.split(" "))
        #print posLine
        stemmedDescriptions.append(stemmer(posLine))
    return stemmedDescriptions

def stemmer(line):
    stemmer = PorterStemmer()
    stemmedLine = []
    for word,pos in line:
        if pos == "NN": #only stem nouns
            stem = stemmer.stem(word)
            stemmedLine.append(stem)
    return stemmedLine 

def pos(line):
    return pos_tag(line)

def stripPunctuation(s):
    regex = re.compile('[%s]' % re.escape(string.punctuation))
    return regex.sub(' ', s)

## Process Tags ( Stemming)

In [7]:
def stemTags(lst):
    newTags = []
    for word in lst:
        stem = PorterStemmer().stem(word)
        newTags.append(stem)
    return newTags 

## BoT

In [21]:
def createBag(num,tagFolder):
    bag = {}
    for n in range(num):
        f = tagFolder +"/" + str(n) + ".txt"
        fileTags = readTagsFile(f)
        newTags = stemTags(fileTags)
        for tag in newTags:
            if str(tag) not in bag.keys():
                bag[str(tag)] = 0
    return bag

def indexBag(bag):
    idx = 0
    for tag in bag.keys():
        bag[tag] = idx
        idx+=1
    return bag

def tagFrequency(num,descFolder,bag):
    featureVectorList = []
    for n in range(num):
        imageTagVec = [0.0 for tag in range(len(bag.keys()))]
        f = descFolder +"/" + str(n) + ".txt"
        fileDescriptions = readDescriptionFile(f) #2D Array
        stemDescriptions = stemmedDescriptions(fileDescriptions)
        for desc in stemDescriptions:
            for word in desc:
                if str(word) in bag.keys() and str(word) != "":
                    imageTagVec[bag[str(word)]] +=1.0
        featureVectorList.append(imageTagVec)
    return pd.DataFrame(featureVectorList,columns = bag.keys())

In [13]:
bagOfTags = createBag(numTrain,tagsTrainFolder)

In [14]:
print len(bagOfTags)

80


In [15]:
indexedBag = indexBag(bagOfTags)

In [22]:
vectorizedTrain = tagFrequency(numTrain,descriptionTrainFolder,indexedBag)

In [23]:
vectorizedTrain.head(1)

Unnamed: 0,toilet,teddy bear,kite,train,laptop,tennis racket,sports bal,donut,snowboard,carrot,...,bed,cat,hot dog,person,dining t,truck,car,cake,wine glass,traffic light
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [32]:
vectorizedTest = tagFrequency(numTest,descriptionTestFolder,indexedBag)

In [33]:
vectorizedTest.head(1)

Unnamed: 0,toilet,teddy bear,kite,train,laptop,tennis racket,sports bal,donut,snowboard,carrot,...,bed,cat,hot dog,person,dining t,truck,car,cake,wine glass,traffic light
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [52]:
def getHighestOccuringObj(arr,n):
    zerosCount = 0
    for i in range(n):
        col = vectorizedTest.iloc[i].argmax()
        freq = vectorizedTest.iloc[i][col]
        if int(freq) == 0:
            zerosCount +=1
        print col,freq
    print "Zeros occured " + str(zerosCount) + " times."

In [53]:
getHighestOccuringObj(vectorizedTest,vectorizedTest.shape[0])

toilet 0.0
cake 5.0
airplan 2.0
toilet 0.0
toilet 0.0
skateboard 5.0
person 3.0
zebra 5.0
truck 3.0
toilet 0.0
toilet 0.0
skateboard 4.0
car 4.0
toilet 0.0
ski 1.0
pizza 6.0
frisbe 5.0
toilet 0.0
eleph 1.0
surfboard 4.0
car 5.0
orang 4.0
toilet 0.0
toilet 4.0
toilet 0.0
toilet 0.0
pizza 7.0
toilet 0.0
pizza 3.0
bench 5.0
hors 4.0
toilet 0.0
bu 2.0
toilet 0.0
train 6.0
toilet 0.0
toilet 0.0
umbrella 4.0
toilet 0.0
cat 4.0
tv 1.0
clock 4.0
toilet 0.0
sheep 2.0
cup 1.0
banana 5.0
skateboard 3.0
toilet 0.0
toilet 0.0
clock 1.0
giraff 1.0
kite 1.0
dog 5.0
airplan 3.0
toilet 2.0
handbag 1.0
pizza 5.0
bench 2.0
person 4.0
toilet 0.0
toilet 0.0
microwav 2.0
motorcycl 4.0
dog 5.0
giraff 4.0
bowl 3.0
toilet 0.0
toilet 0.0
toilet 0.0
train 6.0
train 5.0
airplan 2.0
person 3.0
train 6.0
bowl 6.0
couch 1.0
boat 3.0
zebra 2.0
bowl 5.0
kite 1.0
eleph 5.0
laptop 3.0
truck 3.0
donut 1.0
toothbrush 2.0
refriger 4.0
truck 4.0
toilet 0.0
toilet 0.0
toilet 0.0
bicycl 4.0
chair 2.0
suitcas 3.0
bird 8.0
sand

## kNN

In [58]:
def kNN(arr,k,test):
    model = NearestNeighbors(n_neighbors= k , algorithm='kd_tree').fit(arr)
    dist,idxs = model.kneighbors(test)
    return dist,idxs

In [59]:
dist,ind = kNN(np.array(vectorizedTrain),20,np.array(vectorizedTest))

In [60]:
print ind

[[  11   52   47 ...,   13   30  167]
 [  87 6645 4670 ...,  792 3519 1609]
 [3630 5841 3867 ..., 2955 3098 2287]
 ..., 
 [9977 6746 1951 ..., 2535 2563 5582]
 [6722 4821 9077 ...,   59   58   47]
 [ 996 6905 5131 ..., 2967  450 2250]]


In [61]:
print ind.shape

(2000, 20)


In [62]:
print ind[0]

[  11   52   47   29   42   46 9997   24   12   32   16   15    4   10   22
    8  330   13   30  167]


In [68]:
def saveResults(res,name):
    f = open(name,'w')
    f.write('Descritpion_ID,Top_20_Image_IDs\n')
    for i in xrange(2000):
        f.write(str(i)+'.txt,')
        for j in xrange(20):
            if j == 19:
                f.write(str(res[i,j])+'.jpg\n')
            else:
                f.write(str(res[i,j])+'.jpg ')
    f.close()

In [70]:
saveResults(ind,'bot_baseline.csv')