## Library

In [174]:
import pandas as pd
import numpy as np
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk import pos_tag
import string
import re
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier

## Folder Locations

### Train

In [2]:
descriptionTrainFolder = "/Users/francescoperera/Desktop/data/descriptions_train"
featuresTrainFolder = "/Users/francescoperera/Desktop/data/features_train"
imagesTrainFolder = "/Users/francescoperera/Desktop/data/images_train"
tagsTrainFolder = "/Users/francescoperera/Desktop/data/tags_train"

### Test

In [3]:
descriptionTestFolder = "/Users/francescoperera/Desktop/data/descriptions_test"
featuresTestFolder = "/Users/francescoperera/Desktop/data/features_test"
imagesTestFolder = "/Users/francescoperera/Desktop/data/images_test"
tagsTestFolder = "/Users/francescoperera/Desktop/data/tags_test"

## Number of Files

In [88]:
numTrain = 10000
numTest = 2000

## Read Files

In [31]:
def readTagsFile(fileName):
    tags = []
    f = open(fileName,"r")
    for line in f:
        line = line.split(":") # possibly also consider using the keys in each line(vehicle,outdoor etc..)
        tag = line[-1].replace("\n","")
        tags.append(tag)
    return tags

def readDescriptionFile(fileName):
    desc = []
    f = open(fileName,"r")
    for line in f:
        noPuncSentence = stripPunctuation(line.replace("\n",""))
        desc.append(noPuncSentence)
    return desc
    
def stripPunctuation(s):
    regex = re.compile('[%s]' % re.escape(string.punctuation))
    return regex.sub(' ', s)

In [82]:
tags = readTagsFile(tagsTrainFolder +"/0.txt")

In [59]:
descriptions = readDescriptionFile(descriptionTrainFolder + "/0.txt")

### Process Descriptions with stemming and POS

In [78]:
def stemmedDescriptions(lst):
    stemmedDescriptions = []
    for line in lst:
        posLine = pos(line.split(" "))
        #print posLine
        stemmedDescriptions.append(stemmer(posLine))
    return stemmedDescriptions

def stemmer(line):
    stemmer = PorterStemmer()
    stemmedLine = []
    for word,pos in line:
        if pos == "NN": # only stem nouns
            stem = stemmer.stem(word)
            stemmedLine.append(stem)
    return stemmedLine 

def pos(line):
    return pos_tag(line)
    

In [80]:
stemDescriptions = stemmedDescriptions(descriptions)
print stemDescriptions
print descriptions

[[u'skateboard', u'show', u'tabl', u'stage'], [u'skateboard', u'tabl', u''], [u'man', u'skateboard', u''], [u'skate', u'boarder', u'trick', u'tabl', u''], [u'person', u'skateboard', u'tabl', u'crowd', u'']]
['The skateboarder is putting on a show using the picnic table as his stage ', 'A skateboarder pulling tricks on top of a picnic table ', 'A man riding on a skateboard on top of a table ', 'A skate boarder doing a trick on a picnic table ', 'A person is riding a skateboard on a picnic table with a crowd watching ']


In [86]:
def stemTags(lst):
    newTags = []
    for word in lst:
        stem = PorterStemmer().stem(word)
        newTags.append(stem)
    return newTags 

In [87]:
print stemTags(tags)

[u'airplan', u'bench', u'skateboard', u'person', u'truck', u'backpack', u'handbag', u'dining t']


### Create bag of words

In [167]:
def createBag(num,descFolder):
    bag = {}
    for n in range(num):
        f = descFolder +"/" + str(n) + ".txt"
        fileDesc = readDescriptionFile(f)
        stemDescriptions = stemmedDescriptions(fileDesc)
        for desc in stemDescriptions:
            for word in desc: #word is in unicode, convert it to string with str()
                if str(word) not in bag.keys():
                    bag[str(word)] = 0
    return bag

def indexBag(bag):
    idx = 0
    for tag in bag.keys():
        bag[tag] = idx
        idx+=1
    return bag

def wordFrequency(num,descFolder,bag):
    #for test purposes
    nonBagWords = []
    count = 0
    featureVectorList = []
    for n in range(num):
        imageFeatureVec = [0.0 for tag in range(len(bag.keys()))]
        f = descFolder +"/" + str(n) + ".txt"
        fileDescriptions = readDescriptionFile(f) #2D Array
        stemDescriptions = stemmedDescriptions(fileDescriptions)
        for desc in stemDescriptions:
            for word in desc:
                if str(word) in bag.keys():
                    imageFeatureVec[bag[str(word)]] +=1.0
                else:
                    nonBagWords.append(str(word))
                    count +=1
        #print max(imageFeatureVec) # test
        featureVectorList.append(imageFeatureVec)
        
        
    print count # count of words in descriptions not present in bag
    return pd.DataFrame(featureVectorList,columns = bag.keys())

        

In [131]:
bagOfWords = createBag(numTrain,descriptionTrainFolder)

In [132]:
print len(bagOfWords.keys())

4464


In [135]:
indexedBOW = indexBag(bagOfWords)
vectorizedTrain = wordFrequency(numTrain,descriptionTrainFolder,indexedBOW)

file 0
file 100
file 200
file 300
file 400
file 500
file 600
file 700
file 800
file 900
file 1000
file 1100
file 1200
file 1300
file 1400
file 1500
file 1600
file 1700
file 1800
file 1900
file 2000
file 2100
file 2200
file 2300
file 2400
file 2500
file 2600
file 2700
file 2800
file 2900
file 3000
file 3100
file 3200
file 3300
file 3400
file 3500
file 3600
file 3700
file 3800
file 3900
file 4000
file 4100
file 4200
file 4300
file 4400
file 4500
file 4600
file 4700
file 4800
file 4900
file 5000
file 5100
file 5200
file 5300
file 5400
file 5500
file 5600
file 5700
file 5800
file 5900
file 6000
file 6100
file 6200
file 6300
file 6400
file 6500
file 6600
file 6700
file 6800
file 6900
file 7000
file 7100
file 7200
file 7300
file 7400
file 7500
file 7600
file 7700
file 7800
file 7900
file 8000
file 8100
file 8200
file 8300
file 8400
file 8500
file 8600
file 8700
file 8800
file 8900
file 9000
file 9100
file 9200
file 9300
file 9400
file 9500
file 9600
file 9700
file 9800
file 9900


In [136]:
print vectorizedTrain.shape

(10000, 4464)


In [143]:
def normalize(df,bag):
    normalizedDf = preprocessing.normalize(df, norm='l1')
    return pd.DataFrame(normalizedDf,columns = bag.keys())

In [145]:
normalizedTrain = normalize(vectorizedTrain,bagOfWords)

In [146]:
print normalizedTrain.shape

(10000, 4464)


In [150]:
normalizedTrain.head(1)

Unnamed: 0,Unnamed: 1,disarray,mit,birdfeed,videogam,woodi,Night,all,yellow,sleek,...,baker,bathrob,fest,hiker,tankini,inset,emerg,fifti,sash,buoy
0,0.2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [148]:
vectorizedTrain.head(1)

Unnamed: 0,Unnamed: 1,disarray,mit,birdfeed,videogam,woodi,Night,all,yellow,sleek,...,baker,bathrob,fest,hiker,tankini,inset,emerg,fifti,sash,buoy
0,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Use the targetVectorIdx to create a label vector of tags for each image

In [164]:
def createTagVectorIdxs(num,tagFolder):
    tagIdx = {}
    idx = 0
    for n in range(num):
        f = tagFolder +"/" + str(n) + ".txt"
        fileTags = readTagsFile(f)
        stemmedTags = stemTags(fileTags)
        for tag in stemmedTags: #tag is unicode, use str() to remove unicode string
            if str(tag) not in tagIdx.keys():
                tagIdx[str(tag)] = idx
                idx+=1
    return tagIdx

def createLabelVectors(tagVectorIdxs,tagFolder,num,trainData):
    labelVec = []
    for n in range(num):
        tagVector = [0.0 for tag in range(len(tagVectorIdxs.keys()))]
        f = tagFolder +"/" + str(n) + ".txt"
        fileTags = readTagsFile(f)
        stemmedTags = stemTags(fileTags)
        for tag in stemmedTags: #tag is unicode, use str() to remove unicode string
            #maybe check that tag is in tagVectorIdxs?
            #give weights to each tag in a file based on frequency in trainData (if tag in  columns of trainData)
            if tag in list(trainData): #list(trainData) returns list of column names
                tagFreq = trainData.iloc[n][tag]
            else:
                tagFreq = 0
            tagVector[tagVectorIdxs[str(tag)]] = tagFreq
        labelVec.append(tagVector)
    return pd.DataFrame(labelVec,columns = tagVectorIdxs.keys())

In [156]:
tagIdxs = createTagVectorIdxs(numTrain,tagsTrainFolder)

In [157]:
print tagIdxs

{'toilet': 59, 'teddy bear': 45, 'kite': 66, 'train': 35, 'laptop': 22, 'tennis racket': 47, 'sports bal': 43, 'donut': 23, 'snowboard': 33, 'carrot': 9, 'zebra': 65, 'oven': 53, 'keyboard': 18, 'chair': 16, 'couch': 57, 'appl': 75, 'microwav': 52, 'sheep': 74, 'bicycl': 24, 'hors': 51, 'cup': 20, 'tv': 21, 'backpack': 5, 'toaster': 78, 'bowl': 8, 'cell phon': 46, 'bench': 1, 'eleph': 67, 'book': 19, 'boat': 62, 'toothbrush': 73, 'tie': 25, 'airplan': 0, 'ski': 70, 'stop sign': 29, 'knife': 61, 'pizza': 68, 'fork': 55, 'hair drier': 79, 'bottl': 31, 'sandwich': 48, 'umbrella': 56, 'banana': 72, 'parking met': 69, 'orang': 76, 'motorcycl': 39, 'frisbe': 13, 'bear': 63, 'giraff': 38, 'mous': 17, 'bu': 26, 'spoon': 10, 'baseball glov': 28, 'sink': 60, 'refriger': 49, 'handbag': 6, 'suitcas': 27, 'scissor': 77, 'vase': 50, 'bird': 40, 'broccoli': 54, 'potted pl': 36, 'surfboard': 42, 'cow': 64, 'remot': 58, 'baseball bat': 14, 'fire hydr': 44, 'clock': 32, 'skateboard': 2, 'dog': 41, 'bed'

In [165]:
labelDF = createLabelVectors(tagIdxs,tagsTrainFolder,numTrain,vectorizedTrain)

In [166]:
labelDF.head(2)

Unnamed: 0,toilet,teddy bear,kite,train,laptop,tennis racket,sports bal,donut,snowboard,carrot,...,bed,cat,hot dog,person,dining t,truck,car,cake,wine glass,traffic light
0,0,0,4,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,3,0,...,0,0,0,0,0,0,0,0,0,0


In [193]:
row0 =labelDF.iloc[0]
print row0

2.0


In [187]:
print normalizedTrain.iloc[0]['train']

0.0


## Test DataFrames

In [169]:
vectorizedTest = wordFrequency(numTest,descriptionTestFolder,indexedBOW)
normalizedTest = normalize(vectorizedTest,bagOfWords)

392


the test descriptions have 392 words that are not present in the bag of words from the train.

## KNN to get labels for the test

In [175]:
def kNN(xTrain,yTrainLabel,xTest):
    model = KNeighborsClassifier(n_neighbors=20)
    model.fit(xTrain,yTrainLabel)
    pred = model.predict(xTest)
    return pred

In [176]:
testLabels = kNN(vectorizedTrain,labelDF,vectorizedTest)

In [177]:
print len(testLabels)

2000


In [178]:
print testLabels[0]

[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.]


In [198]:
print vectorizedTrain.iloc[0]['']

4.0
