In [3]:
import numpy as np
import pandas as pd       
import json
from os import listdir
from os.path import isfile, join

In [4]:
def category_word_df(path, category_num):
    files = [(path + '/' + f) for f in listdir(path) if isfile(join(path, f))]
    
    if path == 'sporty_tag_results':
        files.remove(path + '/' + '.DS_Store')

    all_words = []
    for file in files:
        data = json.load(open(file))
        image_concepts = []
        for concept in data:
            image_concepts.append(concept['name'])
        all_words.append(image_concepts)
        
    all_category = pd.Series(all_words).as_matrix()
    category_tags = pd.DataFrame(all_category)
    category_tags.columns = ['words']
    category_tags['category'] = np.repeat([category_num], len(category_tags))
    
    return category_tags

In [5]:
food = category_word_df('food_tag_results', 1)

In [6]:
family = category_word_df('family_tag_results', 2)

In [7]:
sporty = category_word_df('sporty_tag_results', 3)

In [8]:
outdoorsy = category_word_df('outdoorsy_tag_results', 4)

In [9]:
train = pd.concat([food, family, sporty, outdoorsy], axis = 0)
train = train.reset_index().drop(['index'], axis = 1)

In [10]:
train.head(10)

Unnamed: 0,words,category
0,"[woman, people, indoors, girl, family, shoppin...",1
1,"[food, dinner, dish, vegetable, meal, lunch, m...",1
2,"[food, meal, dinner, lunch, dish, vegetable, r...",1
3,"[shop, stock, shopping, restaurant, table, ins...",1
4,"[stock, architecture, no person, travel, shop,...",1
5,"[meal, food, plate, dinner, restaurant, dish, ...",1
6,"[table, furniture, dining, chair, dining room,...",1
7,"[sandwich, food, refreshment, bread, bun, burg...",1
8,"[table, chair, furniture, inside, seat, cafete...",1
9,"[indoors, horizontal plane, horizontal, shop, ...",1


In [11]:

keywords = train['words']
# Set values for various parameters
num_features = 300   # Word vector dimensionality                      
min_word_count = 1   # ignore all words with total frequency lower than this                       
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size                                                                                    


# Initialize and train the model (this will take some time)
from gensim.models import word2vec


print("Training word2vec model... ")
model = word2vec.Word2Vec(keywords, workers = num_workers, \
           size=num_features, min_count = min_word_count, \
            window = context)


# save the model for later use. You can load it later using Word2Vec.load()
model_name = "300features_40minwords_10context"
model.save(model_name)

Training word2vec model... 


In [12]:
vocab_tmp = list(model.wv.vocab)
print('Vocab length:',len(vocab_tmp))

Vocab length: 1313


In [36]:
from sklearn.metrics.pairwise import cosine_similarity

model.most_similar('outdoors')

[('freedom', 0.9975003004074097),
 ('nature', 0.9970559477806091),
 ('hike', 0.996548056602478),
 ('relaxation', 0.9958226084709167),
 ('fun', 0.9940911531448364),
 ('bikini', 0.9933178424835205),
 ('enjoyment', 0.991752564907074),
 ('sky', 0.9915896654129028),
 ('sun', 0.990017294883728),
 ('summer', 0.9886194467544556)]

In [15]:
def makeFeatureVec(keywords, model):
    # Function to average all of the word vectors in a given paragraph
    featureVec =[]
    
    # Index2word is a list that contains the names of the words in 
    # the model's vocabulary. Convert it to a set, for speed 
    index2word_set = set(model.wv.index2word)
    
    # Loop over each word in the review and, if it is in the model's
    # vocaublary, add its feature vector to the total
    for n,word in enumerate(keywords):
        if word in index2word_set: 
            featureVec.append(model[word])
            
    # Average the word vectors for a 
    featureVec = np.mean(featureVec,axis=0)
    return featureVec


def getAvgFeatureVecs(keywords, model):
    # Given a set of reviews (each one a list of words), calculate 
    # the average feature vector for each one 
    
    reviewFeatureVecs = []
    # Loop through the reviews
    for counter,keyword in enumerate(keywords):
        
        # Print a status message every 5000th review
        if counter%5000. == 0.:
            print("Keywords %d of %d" % (counter, len(keywords)))

        # Call the function (defined above) that makes average feature vectors
        vector= makeFeatureVec(keyword, model)
        reviewFeatureVecs.append(vector)
            
    return reviewFeatureVecs

In [71]:
from sklearn.ensemble import RandomForestClassifier
# # CountVectorizer can actucally handle a lot of the preprocessing for us
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics # for confusion matrix, accuracy score etc
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix


np.random.seed(0)

In [72]:
print("1.Creating Feature vectors using word2vec...\n")
trainDataVecs = getAvgFeatureVecs(keywords, model)

print("\n2.Splitting dataset into train and test sets...\n")
X_train, X_test, y_train, y_test = train_test_split(trainDataVecs, train["category"], random_state=0, test_size=0.1)

print("3. Training the random forest classifier...\n")
forest = RandomForestClassifier(n_estimators = 50)
# Fit the forest to the training set, word2vecfeatures 
    # and the sentiment labels as the target variable
forest = forest.fit(X_train, y_train)

1.Creating Feature vectors using word2vec...

Keywords 0 of 1668

2.Splitting dataset into train and test sets...

3. Training the random forest classifier...



In [77]:
train_predictions = forest.predict(X_train)

In [78]:
train_acc = metrics.accuracy_score(y_train, train_predictions)
print(train_acc)

1.0


In [79]:
def word_df(path):
    files = [(path + '/' + f) for f in listdir(path) if isfile(join(path, f))]
    
    if path == 'sporty_tag_results':
        files.remove(path + '/' + '.DS_Store')

    all_words = []
    for file in files:
        data = json.load(open(file))
        image_concepts = []
        for concept in data:
            image_concepts.append(concept['name'])
        all_words.append(image_concepts)
        
    all_words = pd.Series(all_words).as_matrix()
    words_df = pd.DataFrame(all_words)
    words_df.columns = ['words']
    
    return words_df

In [80]:
def predict_pics(path, model, keywords, forest):
    wdf = word_df(path)
    keywords = wdf['words']
    
    trainDataVecs = getAvgFeatureVecs(keywords, model)
    predictions = forest.predict(trainDataVecs)
    return predictions

In [81]:
word_df('luna_tag_results')

Unnamed: 0,words
0,"[no person, outdoors, landscape, seashore, tra..."
1,"[people, recreation, water, one, beach, man, a..."
2,"[people, one, vehicle, adult, two, transportat..."
3,"[beach, people, monochrome, sea, one, ocean, w..."
4,"[no person, landscape, water, nature, monochro..."
5,"[surf, ocean, water, spray, wave, splash, acti..."
6,"[people, one, adult, no person, rock, outdoors..."
7,"[people, child, adult, monochrome, woman, grou..."
8,"[people, adult, group, group together, woman, ..."
9,"[people, beach, seashore, landscape, sea, ocea..."


In [82]:
luna_predictions = pd.Series(predict_pics('luna_tag_results', model, keywords, forest))

Keywords 0 of 31


In [83]:
(luna_predictions.value_counts()/len(luna_predictions)).to_dict()

{1: 0.064516129032258063,
 2: 0.064516129032258063,
 3: 0.29032258064516131,
 4: 0.58064516129032262}