In [1]:
import pandas as pd

In [2]:
X, y, mlbx, mlby, val_y, test_y = pd.read_pickle('test_tag_dataset.pkl')

## reverse the multi-label-binarization

In [3]:
user_tags = mlby.inverse_transform(y)
autotags = mlbx.inverse_transform(X)

In [4]:
train_set = pd.DataFrame(data={'autotags': pd.Series(autotags), 'user_tags': pd.Series(user_tags)})

In [5]:
train_set['user'] = train_set.index

In [6]:
train_set.head()

Unnamed: 0,autotags,user_tags,user
0,"(concert, dark red, ensemble, indoor, light, m...","(chris, dave, jack, march)",0
1,"(amusement park, carousel, display window, flo...","(asia, breakfast, thailand)",1
2,"(cosplay, fashion, outerwear, outfit, parade, ...","(anaheim, cosplay, show)",2
3,"(almond tree, animal, bird, bird of prey, blos...","(asia, bloom, blue, city, february, spring, tr...",3
4,"(animal, blue, bright, depth of field, flower,...","(blue, denmark, flower, flowers, spring, winter)",4


In [7]:
# save the train set for later use
orig_train_set = train_set.copy()

### For each user-tag, add 4 negative training examples.

In [8]:
import numpy as np

In [9]:
import scipy.sparse as sp

In [10]:
np.random.seed(0)
num_negatives = 4
num_tags = y.shape[1]
items, u_tags, labels = [],[],[]
for i, u in y.keys():
    for t in range(num_negatives):
        j = np.random.randint(num_tags)
        while (i, j) in y:
            j = np.random.randint(num_tags)
        items.append(i)
        u_tags.append(j)
        labels.append(0)

In [11]:
y_neg = sp.dok_matrix(y.shape)

In [12]:
for i, u in zip(items, u_tags):
    y_neg[i, u] = 1

In [13]:
neg_user_tags = mlby.inverse_transform(y_neg)

In [37]:
# make function of it
def gen_negative_user_tags(num_negatives, binary_data, mlb):
    """generates <num_negatives> negative user-tag examples
    per positivt user-tag in <binary_data>. Transforms the
    negative examples to a list of tuples of user-tag strings
    using <mlb>.<"""
    np.random.seed(0)
    num_tags = binary_data.shape[1]
    items, u_tags, labels = [],[],[]
    for i, u in binary_data.keys():
        for t in range(num_negatives):
            j = np.random.randint(num_tags)
            while (i, j) in binary_data:
                j = np.random.randint(num_tags)
            items.append(i)
            u_tags.append(j)
            labels.append(0)
            
    negatives = sp.dok_matrix(binary_data.shape)
    for i, u in zip(items, u_tags):
        negatives[i, u] = 1
    return mlb.inverse_transform(negatives)

## Add to train set

In [14]:
train_set.head()

Unnamed: 0,autotags,user_tags,user
0,"(concert, dark red, ensemble, indoor, light, m...","(chris, dave, jack, march)",0
1,"(amusement park, carousel, display window, flo...","(asia, breakfast, thailand)",1
2,"(cosplay, fashion, outerwear, outfit, parade, ...","(anaheim, cosplay, show)",2
3,"(almond tree, animal, bird, bird of prey, blos...","(asia, bloom, blue, city, february, spring, tr...",3
4,"(animal, blue, bright, depth of field, flower,...","(blue, denmark, flower, flowers, spring, winter)",4


In [15]:
train_set['label'] = 1
train_set.sample(1)

Unnamed: 0,autotags,user_tags,user,label
18885,"(ancient, art, ceramic, food, indoor, kitchen ...","(leaves, nature, park)",18885,1


In [16]:
neg_u_set = pd.DataFrame({"user_tags": pd.Series(neg_user_tags)})
neg_u_set.head(3)

Unnamed: 0,user_tags
0,"(ottawa, neighborhood, cliff, blue, ontario, b..."
1,"(xsi, crowd, splash, tx, hands, bristol, wing,..."
2,"(scenery, taxi, sxsw, daniel, 550d, houses, sp..."


In [17]:
neg_u_set['user'] = neg_u_set.index

In [18]:
neg_u_set['autotags'] = train_set.autotags
neg_u_set['label'] = 0

In [19]:
neg_u_set.head(3)

Unnamed: 0,user_tags,user,autotags,label
0,"(ottawa, neighborhood, cliff, blue, ontario, b...",0,"(concert, dark red, ensemble, indoor, light, m...",0
1,"(xsi, crowd, splash, tx, hands, bristol, wing,...",1,"(amusement park, carousel, display window, flo...",0
2,"(scenery, taxi, sxsw, daniel, 550d, houses, sp...",2,"(cosplay, fashion, outerwear, outfit, parade, ...",0


In [20]:
train_set = pd.concat([train_set, neg_u_set], axis = 0, sort=False, ignore_index=True)

In [21]:
train_set.shape

(40000, 4)

In [22]:
train_set.head()

Unnamed: 0,autotags,user_tags,user,label
0,"(concert, dark red, ensemble, indoor, light, m...","(chris, dave, jack, march)",0,1
1,"(amusement park, carousel, display window, flo...","(asia, breakfast, thailand)",1,1
2,"(cosplay, fashion, outerwear, outfit, parade, ...","(anaheim, cosplay, show)",2,1
3,"(almond tree, animal, bird, bird of prey, blos...","(asia, bloom, blue, city, february, spring, tr...",3,1
4,"(animal, blue, bright, depth of field, flower,...","(blue, denmark, flower, flowers, spring, winter)",4,1


Expand user_tag list

In [23]:
expanded = pd.DataFrame(train_set.user_tags.tolist(), index=train_set.index)\
           .stack().reset_index(level=1, drop=True).rename('user_tag')
expanded_train = train_set.join(expanded)

In [24]:
expanded_train.head()

Unnamed: 0,autotags,user_tags,user,label,user_tag
0,"(concert, dark red, ensemble, indoor, light, m...","(chris, dave, jack, march)",0,1,chris
0,"(concert, dark red, ensemble, indoor, light, m...","(chris, dave, jack, march)",0,1,dave
0,"(concert, dark red, ensemble, indoor, light, m...","(chris, dave, jack, march)",0,1,jack
0,"(concert, dark red, ensemble, indoor, light, m...","(chris, dave, jack, march)",0,1,march
1,"(amusement park, carousel, display window, flo...","(asia, breakfast, thailand)",1,1,asia


In [25]:
expanded_train = expanded_train.drop('user_tags', axis=1)

In [26]:
expanded_train.head()

Unnamed: 0,autotags,user,label,user_tag
0,"(concert, dark red, ensemble, indoor, light, m...",0,1,chris
0,"(concert, dark red, ensemble, indoor, light, m...",0,1,dave
0,"(concert, dark red, ensemble, indoor, light, m...",0,1,jack
0,"(concert, dark red, ensemble, indoor, light, m...",0,1,march
1,"(amusement park, carousel, display window, flo...",1,1,asia


Make function of it:

In [27]:
def expand_user_tag_lists(df):
    expand = pd.DataFrame(df.user_tags.tolist(), index=df.index)\
           .stack().reset_index(level=1, drop=True).rename('user_tag')
    df = df.join(expand)
    df = df.drop('user_tags', axis=1)
    return df

In [82]:
asd = expand_user_tag_lists(train_set)

In [95]:
type(y)

scipy.sparse.dok.dok_matrix

In [94]:
y.shape[0]

20000

## Get and prepare val set

In [66]:
val_user_tags = mlby.inverse_transform(val_y)

In [67]:
val_set = pd.DataFrame(data={'user_tags': val_user_tags})
val_set['user'] = pd.Series(range(18000, 20000))
val_set.head()

Unnamed: 0,user_tags,user
0,"(model, portrait, street)",18000
1,"(airplane, aviation, boeing)",18001
2,"(film, graffiti, london)",18002
3,"(florida, kayak, surfing, usa)",18003
4,"(fountain, heritage, history, monument, ruins)",18004


In [68]:
val_set = pd.merge(val_set, orig_train_set[['autotags', 'user']], on='user')
val_set.head()

Unnamed: 0,user_tags,user,autotags
0,"(model, portrait, street)",18000,"(beachwear, bikini, bokeh, cosplay, depth of f..."
1,"(airplane, aviation, boeing)",18001,"(aircraft, airframe, airliner, airplane, airpl..."
2,"(film, graffiti, london)",18002,"(alligator, animal, art, crocodile, painting, ..."
3,"(florida, kayak, surfing, usa)",18003,"(animal, boat, bokeh, depth of field, gymnasti..."
4,"(fountain, heritage, history, monument, ruins)",18004,"(architecture, art, ethereal, fountain, light ..."


In [69]:
val_set['label'] = 1

In [70]:
neg_val = gen_negative_user_tags(4, val_y, mlby)

In [71]:
neg_val_set = pd.DataFrame({'user_tags': pd.Series(neg_val)})
neg_val_set['user'] = val_set.user

In [72]:
neg_val_set['autotags'] = val_set.autotags
neg_val_set['label'] = 0
neg_val_set.head()

Unnamed: 0,user_tags,user,autotags,label
0,"(femme, diesel, sp, national, guy, fujifilm, s...",18000,"(beachwear, bikini, bokeh, cosplay, depth of f...",0
1,"(truck, dutch, male, river, dynamic, portraits...",18001,"(aircraft, airframe, airliner, airplane, airpl...",0
2,"(slide, hawaii, street, del, harbor, africa, g...",18002,"(alligator, animal, art, crocodile, painting, ...",0
3,"(valencia, rabbit, concierto, hdr, american, m...",18003,"(animal, boat, bokeh, depth of field, gymnasti...",0
4,"(euskadi, deporte, department, floor, bleu, at...",18004,"(architecture, art, ethereal, fountain, light ...",0


In [73]:
total_val_set = pd.concat([val_set, neg_val_set], sort=False)
total_val_set.head()

Unnamed: 0,user_tags,user,autotags,label
0,"(model, portrait, street)",18000,"(beachwear, bikini, bokeh, cosplay, depth of f...",1
1,"(airplane, aviation, boeing)",18001,"(aircraft, airframe, airliner, airplane, airpl...",1
2,"(film, graffiti, london)",18002,"(alligator, animal, art, crocodile, painting, ...",1
3,"(florida, kayak, surfing, usa)",18003,"(animal, boat, bokeh, depth of field, gymnasti...",1
4,"(fountain, heritage, history, monument, ruins)",18004,"(architecture, art, ethereal, fountain, light ...",1


In [114]:
dataset = total_val_set.copy()

In [90]:
pd.Series([tag for user_tags in total_val_set.user_tags for tag in user_tags]).value_counts().sum()

43613

In [93]:
expand = pd.DataFrame(dataset.user_tags.tolist(), index=dataset.index)\
           .stack().reset_index(level=1, drop=True).rename('user_tag')


In [104]:
dataset = dataset.join(expand)

In [109]:
expand.head()

0       model
0    portrait
0      street
1    airplane
1    aviation
Name: user_tag, dtype: object

In [108]:
dataset.head()

Unnamed: 0,user_tags,user,autotags,label
0,"(model, portrait, street)",18000,"(beachwear, bikini, bokeh, cosplay, depth of f...",1
1,"(airplane, aviation, boeing)",18001,"(aircraft, airframe, airliner, airplane, airpl...",1
2,"(film, graffiti, london)",18002,"(alligator, animal, art, crocodile, painting, ...",1
3,"(florida, kayak, surfing, usa)",18003,"(animal, boat, bokeh, depth of field, gymnasti...",1
4,"(fountain, heritage, history, monument, ruins)",18004,"(architecture, art, ethereal, fountain, light ...",1


In [110]:
dataset = dataset.join(expand)
dataset = dataset.drop('user_tags', axis=1)

In [118]:
dataset.iloc[2000]

user_tags    (femme, diesel, sp, national, guy, fujifilm, s...
user                                                     18000
autotags     (beachwear, bikini, bokeh, cosplay, depth of f...
label                                                        0
Name: 0, dtype: object

In [74]:
expanded_val_set = expand_user_tag_lists(total_val_set)
expanded_val_set.shape

(87226, 4)

## Prepare test set

asdasd

In [28]:
test_user_tags = mlby.inverse_transform(test_y)
test_set = pd.DataFrame(data={'user_tags': test_user_tags})
test_set['user'] = pd.Series(range(2000))
test_set = pd.merge(test_set, orig_train_set[['autotags', 'user']], on='user')
test_set.head()

Unnamed: 0,user_tags,user,autotags
0,"(ball, david, fl, joe)",0,"(concert, dark red, ensemble, indoor, light, m..."
1,"(island, resort, travel)",1,"(amusement park, carousel, display window, flo..."
2,"(convention, costume, world)",2,"(cosplay, fashion, outerwear, outfit, parade, ..."
3,"(cold, flower, japan, pink, river, sky, sunny,...",3,"(almond tree, animal, bird, bird of prey, blos..."
4,"(garden, green, leaves, march, purple, white, ...",4,"(animal, blue, bright, depth of field, flower,..."


In [30]:
test_set = expand_user_tag_lists(test_set)

In [33]:
test_set['label'] = 1

In [35]:
test_set.head()

Unnamed: 0,user,autotags,user_tag,label
0,0,"(concert, dark red, ensemble, indoor, light, m...",ball,1
0,0,"(concert, dark red, ensemble, indoor, light, m...",david,1
0,0,"(concert, dark red, ensemble, indoor, light, m...",fl,1
0,0,"(concert, dark red, ensemble, indoor, light, m...",joe,1
1,1,"(amusement park, carousel, display window, flo...",island,1


In [62]:
test_set.shape

(8962, 4)

In [65]:
test_set.user_tag[12:33]

3          japan
3           pink
3          river
3            sky
3          sunny
3          tokyo
3           walk
4         garden
4          green
4         leaves
4          march
4         purple
4          white
4         yellow
5           arte
5         design
5       interior
5    m%C3%A9xico
5     university
6       arkansas
6          b%26w
Name: user_tag, dtype: object

In [14]:
test_y

<2000x2000 sparse matrix of type '<class 'numpy.int64'>'
	with 8962 stored elements in Dictionary Of Keys format>

In [3]:
data = pd.read_pickle('preprocessed_user_auto_tags.pkl')

In [16]:
data.head()

Unnamed: 0_level_0,User_tags,autotags
Line_number,Unnamed: 1_level_1,Unnamed: 2_level_1
31,"[america, arab, child, children, facebook, fre...","[arch, architecture, fort, fortification, goth..."
34,"[2004, august, canal, church, grand, health, i...","[architecture, belfry, building, campanile, ca..."
39,"[ancient, architecture, buildings, cambridge, ...","[friends, outdoor, people]"
126,"[%E9%A6%99%E6%B8%AF, farm, garden, hong+kong, ...","[blossom, calliandra, flower, geranium, hibisc..."
143,"[color, film, fujifilm, home, life, minolta]","[drawer, furniture, indoor]"


In [10]:
data['User_tags']

Line_number
31        [america, arab, child, children, facebook, fre...
34        [2004, august, canal, church, grand, health, i...
39        [ancient, architecture, buildings, cambridge, ...
126       [%E9%A6%99%E6%B8%AF, farm, garden, hong+kong, ...
143            [color, film, fujifilm, home, life, minolta]
145       [2012, april, bali, elephant, fall, indonesia,...
198       [europe, germany, music, musicians, singing, t...
201       [hdr, landscape, landschaft, natur, nature, vo...
220       [2, barcelona, best, club, dance, drinks, firs...
227       [ben, castle, day, lakewood, seattle, special,...
248                 [bc, bird, canada, delta, male, winter]
254       [blue, blues, glass, house, japan, japanese, l...
297       [d80, fountain, holiday, hotel, spain, water, ...
307       [cars, chevy, chrome, custom, mustang, wisconsin]
310       [canon, deutschland, germany, hamburg, powersh...
315       [2009, bridge, gothamist, island, manhattan, n...
317       [concert, d90, fes

In [14]:
tag_counts = pd.Series([user_tag for user_tags in data['User_tags'] for user_tag in user_tags]).value_counts()

In [20]:
tag_counts[tag_counts > 80].shape

(1486,)

In [21]:
autotag_counts = pd.Series([autotag for autotags in data['autotags'] for autotag in autotags]).value_counts()

In [24]:
autotag_counts[autotag_counts > 50].shape

(909,)

In [None]:
data['User_tags'].value_counts()

In [20]:
[user_tag for user_tag in data['User_tags']]

50297

In [25]:
data.head()

Unnamed: 0_level_0,User_tags,autotags
Line_number,Unnamed: 1_level_1,Unnamed: 2_level_1
31,"[america, arab, child, children, facebook, fre...","[arch, architecture, fort, fortification, goth..."
34,"[2004, august, canal, church, grand, health, i...","[architecture, belfry, building, campanile, ca..."
39,"[ancient, architecture, buildings, cambridge, ...","[friends, outdoor, people]"
126,"[%E9%A6%99%E6%B8%AF, farm, garden, hong+kong, ...","[blossom, calliandra, flower, geranium, hibisc..."
143,"[color, film, fujifilm, home, life, minolta]","[drawer, furniture, indoor]"


In [None]:
data