## Library

In [66]:
import pandas as pd
import numpy as np
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk import pos_tag
import string
import re

## Folder Locations

### Train

In [2]:
descriptionTrainFolder = "/Users/francescoperera/Desktop/data/descriptions_train"
featuresTrainFolder = "/Users/francescoperera/Desktop/data/features_train"
imagesTrainFolder = "/Users/francescoperera/Desktop/data/images_train"
tagsTrainFolder = "/Users/francescoperera/Desktop/data/tags_train"

### Test

In [3]:
descriptionTestFolder = "/Users/francescoperera/Desktop/data/descriptions_test"
featuresTestFolder = "/Users/francescoperera/Desktop/data/features_test"
imagesTestFolder = "/Users/francescoperera/Desktop/data/images_test"
tagsTestFolder = "/Users/francescoperera/Desktop/data/tags_test"

## Number of Files

In [88]:
numTrain = 10000
numTest = 2000

## Read Files

In [31]:
def readTagsFile(fileName):
    tags = []
    f = open(fileName,"r")
    for line in f:
        line = line.split(":") # possibly also consider using the keys in each line(vehicle,outdoor etc..)
        tag = line[-1].replace("\n","")
        tags.append(tag)
    return tags

def readDescriptionFile(fileName):
    desc = []
    f = open(fileName,"r")
    for line in f:
        noPuncSentence = stripPunctuation(line.replace("\n",""))
        desc.append(noPuncSentence)
    return desc
    
def stripPunctuation(s):
    regex = re.compile('[%s]' % re.escape(string.punctuation))
    return regex.sub(' ', s)

In [82]:
tags = readTagsFile(tagsTrainFolder +"/0.txt")

In [59]:
descriptions = readDescriptionFile(descriptionTrainFolder + "/0.txt")

## Process Descriptions

In [78]:
def stemmedDescriptions(lst):
    stemmedDescriptions = []
    for line in lst:
        posLine = pos(line.split(" "))
        #print posLine
        stemmedDescriptions.append(stemmer(posLine))
    return stemmedDescriptions

def stemmer(line):
    stemmer = PorterStemmer()
    stemmedLine = []
    for word,pos in line:
        if pos == "NN": # only stem nouns
            stem = stemmer.stem(word)
            stemmedLine.append(stem)
    return stemmedLine 

def pos(line):
    return pos_tag(line)
    

In [80]:
stemDescriptions = stemmedDescriptions(descriptions)
print stemDescriptions
print descriptions

[[u'skateboard', u'show', u'tabl', u'stage'], [u'skateboard', u'tabl', u''], [u'man', u'skateboard', u''], [u'skate', u'boarder', u'trick', u'tabl', u''], [u'person', u'skateboard', u'tabl', u'crowd', u'']]
['The skateboarder is putting on a show using the picnic table as his stage ', 'A skateboarder pulling tricks on top of a picnic table ', 'A man riding on a skateboard on top of a table ', 'A skate boarder doing a trick on a picnic table ', 'A person is riding a skateboard on a picnic table with a crowd watching ']


In [86]:
def stemTags(lst):
    newTags = []
    for word in lst:
        stem = PorterStemmer().stem(word)
        newTags.append(stem)
    return newTags 

In [87]:
print stemTags(tags)

[u'airplan', u'bench', u'skateboard', u'person', u'truck', u'backpack', u'handbag', u'dining t']


### Create bag of tags

In [111]:
def createBag(num,tagFolder):
    bag = {}
    for n in range(num):
        f = tagFolder +"/" + str(n) + ".txt"
        fileTags = readTagsFile(f)
        stemmedTags = stemTags(fileTags)
        for tag in stemmedTags: #tag is unicode, use str() to remove unicode string
            if str(tag) not in bag.keys():
                bag[str(tag)] = 0
    return bag

def indexBag(bag):
    idx = 0
    for tag in bag.keys():
        bag[tag] = idx
        idx+=1
    return bag

def wordFrequency(num,descFolder,bag):
    featureVectorList = []
    for n in range(num):
        imageFeatureVec = [0.0 for tag in range(len(bag.keys()))]
        f = descFolder +"/" + str(n) + ".txt"
        fileDescriptions = readDescriptionFile(f) #2D Array
        stemDescriptions = stemmedDescriptions(fileDescriptions)
        for desc in stemDescriptions:
            for stemTag in desc:
                if str(stemTag) in bag.keys():
                    imageFeatureVec[bag[str(stemTag)]] +=1.0
        featureVectorList.append(imageFeatureVec)
    return pd.DataFrame(featureVectorList,columns = bag.keys())
        
        

In [104]:
bagOfTags = createBag(numTrain,tagsTrainFolder)

In [105]:
print len(bagOfTags.keys())

80


In [112]:
indexedBOT = indexBag(bagOfTags)
vectorizedTrain = wordFrequency(numTrain,descriptionTrainFolder,indexedBOT)

In [114]:
print vectorizedTrain.shape

(10000, 80)
