## Library

In [1]:
import pandas as pd
import numpy as np
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk import pos_tag
import string
import re
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestNeighbors

## Folder Locations

### Train

In [2]:
# Input folders for the training split (descriptions, features, images, tags).
# NOTE(review): these are absolute paths on one specific machine; they must be
# edited before the notebook can run anywhere else.
descriptionTrainFolder = "/Users/francescoperera/Desktop/data/descriptions_train"
featuresTrainFolder = "/Users/francescoperera/Desktop/data/features_train"
imagesTrainFolder = "/Users/francescoperera/Desktop/data/images_train"
tagsTrainFolder = "/Users/francescoperera/Desktop/data/tags_train"

### Test

In [3]:
# Input folders for the test split (descriptions, features, images, tags).
# NOTE(review): these are absolute paths on one specific machine; they must be
# edited before the notebook can run anywhere else.
descriptionTestFolder = "/Users/francescoperera/Desktop/data/descriptions_test"
featuresTestFolder = "/Users/francescoperera/Desktop/data/features_test"
imagesTestFolder = "/Users/francescoperera/Desktop/data/images_test"
tagsTestFolder = "/Users/francescoperera/Desktop/data/tags_test"

## Number of Files

In [4]:
# Number of files processed per split. The functions that take a `num` argument
# read "<folder>/<n>.txt" for every n in range(num).
numTrain = 10000
numTest = 2000

## Read Files

In [5]:
def readTagsFile(fileName):
    """Read a tags file and return one tag per line.

    Every line is split on ":" and only the last field is kept, with the
    newline character removed. Anything before the last ":" is discarded.

    Parameters
    ----------
    fileName : str
        Path of the tags file to read.

    Returns
    -------
    list of str
        The tags, in file order (one entry per line of the file).
    """
    tags = []
    # The context manager closes the handle even if reading raises.
    with open(fileName,"r") as f:
        for line in f:
            # possibly also consider using the keys in each line(vehicle,outdoor etc..)
            tag = line.split(":")[-1].replace("\n","")
            tags.append(tag)
    return tags

def readDescriptionFile(fileName):
    """Read a description file and return its lines with punctuation removed.

    The newline is removed from each line, then the line is passed through
    stripPunctuation, which replaces every punctuation character with a space.

    Parameters
    ----------
    fileName : str
        Path of the description file to read.

    Returns
    -------
    list of str
        One cleaned sentence per line of the file, in file order.
    """
    desc = []
    # The context manager closes the handle even if reading raises.
    with open(fileName,"r") as f:
        for line in f:
            noPuncSentence = stripPunctuation(line.replace("\n",""))
            desc.append(noPuncSentence)
    return desc
    
def stripPunctuation(s):
    regex = re.compile('[%s]' % re.escape(string.punctuation))
    return regex.sub(' ', s)

In [8]:
#tags = readTagsFile(tagsTrainFolder +"/0.txt")

In [9]:
#descriptions = readDescriptionFile(descriptionTrainFolder + "/0.txt")

### Process Descriptions with stemming and POS

In [6]:
def stemmedDescriptions(lst):
    """POS-tag and stem every description in `lst`.

    Each sentence is split on single spaces, tagged with pos() and reduced by
    stemmer() to a list of stems. Returns one list of stems per input
    sentence, in input order.
    """
    return [stemmer(pos(sentence.split(" "))) for sentence in lst]

def stemmer(line):
    """Return the Porter stems of the words in `line` that are tagged "NN".

    `line` is an iterable of (word, tag) pairs; pairs with any other tag are
    dropped.
    NOTE(review): only the exact tag "NN" is kept, so other noun tags are
    discarded as well -- confirm this is intended.
    """
    porter = PorterStemmer()
    # only stem nouns
    return [porter.stem(token) for token, tag in line if tag == "NN"]

def pos(line):
    """Tag a list of tokens with nltk's pos_tag and return its result unchanged."""
    taggedTokens = pos_tag(line)
    return taggedTokens
    

In [11]:
# stemDescriptions = stemmedDescriptions(descriptions)
# print stemDescriptions
# print descriptions

In [7]:
def stemTags(lst):
    """Return the Porter stem of every tag in `lst`, in input order.

    Each tag is passed to the stemmer as one string, so a multi-word tag is
    stemmed as a single unit.

    Parameters
    ----------
    lst : iterable of str
        The tags to stem.

    Returns
    -------
    list
        One stem per input tag.
    """
    # Build the stemmer once instead of once per tag.
    porter = PorterStemmer()
    return [porter.stem(word) for word in lst]

In [13]:
#print stemTags(tags)

### Create bag of words

In [8]:
def createBag(num,descFolder):
    """Collect the vocabulary of the first `num` description files.

    Reads "<descFolder>/<n>.txt" for n in range(num), runs every sentence
    through stemmedDescriptions, and records each distinct non-empty stem.

    Parameters
    ----------
    num : int
        Number of description files to read.
    descFolder : str
        Folder containing the numbered description files.

    Returns
    -------
    dict
        Maps each stem (converted with str()) to the placeholder value 0.
        indexBag() assigns the real column indices afterwards.
    """
    bag = {}
    for n in range(num):
        f = descFolder +"/" + str(n) + ".txt"
        fileDesc = readDescriptionFile(f)
        for desc in stemmedDescriptions(fileDesc):
            for word in desc:
                key = str(word) # word is in unicode, convert it to string with str()
                # Membership is tested on the dict itself: a hash lookup, not
                # a scan over a freshly built list of keys.
                if key != "" and key not in bag:
                    bag[key] = 0
    return bag

def indexBag(bag):
    """Number the keys of `bag` 0..len(bag)-1 in the dict's iteration order.

    The dict is modified in place and the same object is returned.
    """
    for position, key in enumerate(bag.keys()):
        bag[key] = position
    return bag

def wordFrequency(num,descFolder,bag):
    """Build a bag-of-words count matrix for the first `num` description files.

    Reads "<descFolder>/<n>.txt" for n in range(num), stems every sentence with
    stemmedDescriptions and counts, per file, how often each word of `bag`
    occurs.

    Parameters
    ----------
    num : int
        Number of description files to read.
    descFolder : str
        Folder containing the numbered description files.
    bag : dict
        Maps each vocabulary word to its column index, as produced by
        indexBag().

    Returns
    -------
    pandas.DataFrame
        One row per file and one column per vocabulary word, holding float
        counts.

    Side effects
    ------------
    Prints the number of stem occurrences that were not counted. This includes
    empty-string stems as well as words missing from `bag`.
    """
    # Label the columns by stored index, so column i is always the word whose
    # index is i, independent of the dict's iteration order.
    columns = sorted(bag, key=bag.get)
    vocabSize = len(bag)
    count = 0
    featureVectorList = []
    for n in range(num):
        imageFeatureVec = [0.0] * vocabSize
        f = descFolder +"/" + str(n) + ".txt"
        fileDescriptions = readDescriptionFile(f)
        for desc in stemmedDescriptions(fileDescriptions):
            for word in desc:
                key = str(word)
                # Dict membership is a hash lookup; no key list is rebuilt per word.
                if key != "" and key in bag:
                    imageFeatureVec[bag[key]] += 1.0
                else:
                    count += 1
        featureVectorList.append(imageFeatureVec)

    print(count) # count of words in descriptions not present in bag
    return pd.DataFrame(featureVectorList,columns = columns)

        

In [9]:
# Build the vocabulary from the training descriptions (every value is still the placeholder 0).
bagOfWords = createBag(numTrain,descriptionTrainFolder)

In [10]:
# Vocabulary size, i.e. the number of distinct stems collected by createBag.
print len(bagOfWords.keys())

4463


In [11]:
# indexBag numbers the words in place and returns the same dict, so indexedBOW
# and bagOfWords refer to one object from here on.
indexedBOW = indexBag(bagOfWords)
# Per-file word counts for the training split; the number printed below the
# cell is the count of uncounted stem occurrences reported by wordFrequency.
vectorizedTrain = wordFrequency(numTrain,descriptionTrainFolder,indexedBOW)

33793


In [12]:
# Expect (number of training files, vocabulary size).
print vectorizedTrain.shape

(10000, 4463)


In [43]:
# def normalize(df,bag):
#     normalizedDf = preprocessing.normalize(df, norm='l1')
#     return pd.DataFrame(normalizedDf,columns = bag.keys()).reset_index().drop("index",axis =1)

In [44]:
# normalizedTrain = normalize(vectorizedTrain,bagOfWords)

In [45]:
# print normalizedTrain.shape

(10000, 4463)


In [46]:
# normalizedTrain.head(1)['skateboard']

0    0.25
Name: skateboard, dtype: float64

In [13]:
# Spot check: raw count of the stem 'skateboard' in the first training file.
vectorizedTrain.head(1)['skateboard']

0    4
Name: skateboard, dtype: float64

In [14]:
# Dump the whole vocabulary for inspection (very long output).
print bagOfWords.keys()

['disarray', 'mit', 'birdfeed', 'videogam', 'woodi', 'Night', 'all', 'yellow', 'sleek', 'gag', 'herslf', 'sleep', 'upsid', 'captain', 'hate', 'sore', 'trolley', 'educ', 'vandal', 'tricycl', 'sorri', 'swan', 'sunlit', 'illustr', 'bike', 'ziploc', 'under', 'hedg', 'worth', 'merchant', 'swivel', 'wreath', 'deli', 'blanket', 'rise', 'piti', 'vase', 'hardwar', 'ident', 'lunchmeat', 'jack', 'african', 'rec', 'confetti', 'affect', 'showroom', 'special', 'chapel', 'school', 'gown', 'prize', 'motorcyclist', 'kitchenett', 'wooden', 'showcas', 'kithen', 'formica', 'satchel', 'huddl', 'crotch', 'cutout', 'cheeto', 'barbecu', 'fogi', 'MAN', 'enjoy', 'chew', 'rusti', 'bill', 'hord', 'quilt', 'franc', 'heliport', 'acacia', 'snowwi', 'bacon', 'direct', 'horn', 'chef', 'hors', 'street', 'tether', 'air', 'panda', 'blue', 'blud', 'hide', 'parkland', 'amus', 'asid', 'lightn', 'near', 'asia', 'launch', 'lakesid', 'haze', 'blur', 'trindl', 'net', 'hazi', 'abou', 'crunch', 'Photograph', 'crouch', 'wakeboard'

## Use the tag-to-index mapping (`tagVectorIdxs`) to create a label vector of tags for each image

In [15]:
def createTagVectorIdxs(num,tagFolder):
    """Assign a position in the label vector to every distinct stemmed tag.

    Reads "<tagFolder>/<n>.txt" for n in range(num), stems the tags with
    stemTags and numbers each new tag in order of first appearance.

    Parameters
    ----------
    num : int
        Number of tag files to read.
    tagFolder : str
        Folder containing the numbered tag files.

    Returns
    -------
    dict
        Maps each stemmed tag (converted with str()) to its index, starting
        at 0.
    """
    tagIdx = {}
    for n in range(num):
        f = tagFolder +"/" + str(n) + ".txt"
        for tag in stemTags(readTagsFile(f)):
            key = str(tag) # tag is unicode, use str() to remove unicode string
            # Hash lookup on the dict rather than a scan over a rebuilt key list.
            if key not in tagIdx:
                # The next free index equals the number of tags seen so far.
                tagIdx[key] = len(tagIdx)
    return tagIdx

def createLabelVectors(tagVectorIdxs,tagFolder,num,trainData):
    """Build one weighted label vector per file from its tags.

    For file n, every stemmed tag gets the value found in row n of `trainData`
    under the column of the same name, or 0 when `trainData` has no such
    column. Positions of tags the file does not have stay 0.0.

    Parameters
    ----------
    tagVectorIdxs : dict
        Maps each stemmed tag to its position in the label vector, as produced
        by createTagVectorIdxs(). Any number of tags is supported.
    tagFolder : str
        Folder containing the numbered tag files "<n>.txt".
    num : int
        Number of tag files to read.
    trainData : pandas.DataFrame
        Word-count matrix whose row n belongs to file n.

    Returns
    -------
    pandas.DataFrame
        One row per file and one column per tag, with columns ordered by tag
        index.

    Raises
    ------
    KeyError
        If a file contains a tag that is missing from `tagVectorIdxs`.
    """
    # Column labels in index order; this replaces a lookup that was hard-coded
    # to exactly 80 tags.
    orderedTags = sorted(tagVectorIdxs, key=tagVectorIdxs.get)
    # Iterating a DataFrame yields its column names; build the set once.
    trainColumns = set(trainData)
    labelVec = []
    for n in range(num):
        tagVector = [0.0] * len(tagVectorIdxs)
        f = tagFolder +"/" + str(n) + ".txt"
        for tag in stemTags(readTagsFile(f)):
            key = str(tag) # tag is unicode, use str() to remove unicode string
            # Weight each tag by its frequency in this file's description,
            # if the tag is a column of trainData.
            if key in trainColumns:
                tagFreq = trainData.iloc[n][key]
            else:
                tagFreq = 0
            tagVector[tagVectorIdxs[key]] = tagFreq
        labelVec.append(tagVector)
    return pd.DataFrame(labelVec,columns = orderedTags)

In [16]:
# Map every stemmed training tag to its position in the label vector.
tagIdxs = createTagVectorIdxs(numTrain,tagsTrainFolder)

In [17]:
# Inspect the tag -> index mapping (the keys are stemmed tags).
print tagIdxs

{'toilet': 59, 'teddy bear': 45, 'kite': 66, 'train': 35, 'laptop': 22, 'tennis racket': 47, 'sports bal': 43, 'donut': 23, 'snowboard': 33, 'carrot': 9, 'zebra': 65, 'oven': 53, 'keyboard': 18, 'chair': 16, 'couch': 57, 'appl': 75, 'microwav': 52, 'sheep': 74, 'bicycl': 24, 'hors': 51, 'cup': 20, 'tv': 21, 'backpack': 5, 'toaster': 78, 'bowl': 8, 'cell phon': 46, 'bench': 1, 'eleph': 67, 'book': 19, 'boat': 62, 'toothbrush': 73, 'tie': 25, 'airplan': 0, 'ski': 70, 'stop sign': 29, 'knife': 61, 'pizza': 68, 'fork': 55, 'hair drier': 79, 'bottl': 31, 'sandwich': 48, 'umbrella': 56, 'banana': 72, 'parking met': 69, 'orang': 76, 'motorcycl': 39, 'frisbe': 13, 'bear': 63, 'giraff': 38, 'mous': 17, 'bu': 26, 'spoon': 10, 'baseball glov': 28, 'sink': 60, 'refriger': 49, 'handbag': 6, 'suitcas': 27, 'scissor': 77, 'vase': 50, 'bird': 40, 'broccoli': 54, 'potted pl': 36, 'surfboard': 42, 'cow': 64, 'remot': 58, 'baseball bat': 14, 'fire hydr': 44, 'clock': 32, 'skateboard': 2, 'dog': 41, 'bed'

In [18]:
# Training labels: each tag of a file is weighted by the count stored in
# vectorizedTrain for that file under the column of the same name (0 if none).
labelTrainDF = createLabelVectors(tagIdxs,tagsTrainFolder,numTrain,vectorizedTrain)

In [19]:
# Preview the label vectors of the first two training files.
labelTrainDF.head(2)

Unnamed: 0,airplan,bench,skateboard,person,truck,backpack,handbag,dining t,bowl,carrot,...,ski,hot dog,banana,toothbrush,sheep,appl,orang,scissor,toaster,hair drier
0,0,0,4,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,3,0,...,0,0,0,0,0,0,0,0,0,0


## Test DataFrames

In [20]:
# Vectorize the test descriptions with the training vocabulary, so train and
# test share the same columns. The number printed below the cell is the count
# of uncounted stem occurrences reported by wordFrequency.
vectorizedTest = wordFrequency(numTest,descriptionTestFolder,indexedBOW)

7263


The test descriptions contain 392 words that are not present in the bag of words built from the training set.

## KNN to get labels for the test

In [21]:
def kNN(arr,k,test):
    """Find the `k` nearest rows of `arr` for every row of `test`.

    Fits a kd-tree NearestNeighbors model on `arr` and queries it with `test`.
    Returns the pair (distances, indices) produced by kneighbors.
    """
    finder = NearestNeighbors(algorithm='kd_tree', n_neighbors=k)
    finder.fit(arr)
    distances, neighbourIdxs = finder.kneighbors(test)
    return distances, neighbourIdxs

In [22]:
# For every test row, find the 20 nearest training rows in bag-of-words space.
dist,inds = kNN(vectorizedTrain,20,vectorizedTest)

In [23]:
# Training-row indices of the neighbours found for the first two test rows.
print inds[:2]

[[1185  974  429 6714 9289 6019 6091 1539 4698  457 2731  492 1856 5266
  4506 3565   32  549 3037  842]
 [6650  747 8242 4532  283 7994  726 3782 9383 8294 8901 5723 3235 5181
  5982 5560 8411 2614 2735 2229]]


In [30]:
def getPredictionLabelVec(preds,trainLabelDF):
    """Look up the training label vectors of each test row's neighbours.

    `preds` holds, per test row, a sequence of training-row indices.
    `trainLabelDF` is converted with np.array and indexed by those positions.
    Returns a list with one entry per test row; each entry is a list of the
    label rows of that test row's neighbours, in neighbour order.
    """
    labelMatrix = np.array(trainLabelDF)
    return [[labelMatrix[rowIdx] for rowIdx in neighbours] for neighbours in preds]

In [34]:
# NOTE: despite its name this is a nested Python list (one list of neighbour
# label arrays per test row), not a DataFrame.
predictionLabelDF = getPredictionLabelVec(inds,labelTrainDF)

In [41]:
# Label vectors of the neighbours of the first test row.
print predictionLabelDF[0]

[array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.]), array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.]), array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.

In [39]:
# Training label vector of row 1185, the first neighbour index listed for the first test row.
labelTrainDF.iloc[1185]

airplan          0
bench            0
skateboard       0
person           0
truck            0
backpack         0
handbag          0
dining t         0
bowl             0
carrot           0
spoon            0
car              0
traffic light    0
frisbe           0
baseball bat     0
bed              0
chair            0
mous             0
keyboard         0
book             0
cup              0
tv               0
laptop           0
donut            0
bicycl           0
tie              0
bu               0
suitcas          0
baseball glov    0
stop sign        0
                ..
vase             0
hors             0
microwav         0
oven             0
broccoli         0
fork             0
umbrella         0
couch            0
remot            0
toilet           0
sink             0
knife            0
boat             0
bear             0
cow              0
zebra            0
kite             0
eleph            0
pizza            0
parking met      0
ski              0
hot dog     