In [0]:
import pickle
import requests
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook

import matplotlib.pylab as plt
import seaborn as sns
%matplotlib inline


from sklearn.metrics.pairwise import cosine_similarity
from google_drive_downloader import GoogleDriveDownloader

In [0]:
!rm /tmp/*; wget -q https://gist.githubusercontent.com/Puzer/33c64edd7973a60f2040058a57ea1596/raw/078b2075e356ab9bb557733be80a6c0a82c47913/dataset.py

# Loading pre-trained publics embeddings

In [0]:
# Star-import everything exported by the downloaded helper module dataset.py.
# NOTE(review): load_data is the only name from this import used in the visible
# cells; consider `from dataset import load_data` once dataset.py is checked.
from dataset import *
# publics_emb is used below as a DataFrame indexed by public id (one embedding per row);
# users_profile is used as a dict of per-user records with 'sex', 'date' and
# 'subscriptions' keys — presumably keyed by user id, verify against dataset.py.
publics_emb, users_profile = load_data()
# Set of public ids that have an embedding, for fast membership tests.
publics_ids = set(publics_emb.index)

# Preview the first rows of the embedding table.
publics_emb.head()

In [0]:
# Embeddings helper functions

def get_publics_embedings(user_subscriptions):
    """Sum the embedding vectors of the publics a user is subscribed to.

    Parameters
    ----------
    user_subscriptions : iterable
        Public ids. Ids that have no row in ``publics_emb`` are ignored.

    Returns
    -------
    pandas.Series
        Column-wise sum of the matching ``publics_emb`` rows (zeros when
        nothing matches).

    Notes
    -----
    Rows are located with ``Index.get_indexer`` (exact label lookup) instead
    of ``Index.searchsorted``. ``searchsorted`` only returns insertion points:
    for an unknown id it silently selects a neighbouring public or an
    out-of-range position, and on an unsorted index its result is meaningless.
    ``get_indexer`` requires the index labels to be unique.
    """
    positions = publics_emb.index.get_indexer(list(user_subscriptions))
    # get_indexer marks labels that are absent from the index with -1.
    positions = positions[positions >= 0]
    return publics_emb.iloc[positions].sum(axis=0)

def get_public_vector(public_id):
    """Return the embedding stored for ``public_id`` as a single-row 2-D array.

    The row is looked up by label in ``publics_emb`` (a missing id raises
    ``KeyError``) and reshaped to ``(1, n)`` so that it can be passed directly
    to ``cosine_similarity``.
    """
    embedding_row = publics_emb.loc[public_id]
    return embedding_row.values.reshape(1, -1)

def get_recommendations(vector, top_n=5, except_subscriptions=None):
    """Return the ids of the publics most similar to ``vector``.

    Parameters
    ----------
    vector : array-like of shape (1, n_features)
        Query embedding (e.g. a public vector or a summed user vector).
    top_n : int
        Number of recommendations to return.
    except_subscriptions : collection, optional
        Public ids that must not be recommended (typically what the user
        already follows). ``None`` or an empty collection excludes nothing.

    Returns
    -------
    list
        Up to ``top_n`` public ids ordered by decreasing cosine similarity.

    Notes
    -----
    Exclusions are applied *before* the ``top_n`` cut. Filtering after the cut
    returned fewer than ``top_n`` items whenever an excluded public ranked in
    the top — which is the common case, since a user's own subscriptions are
    the closest publics to the user vector.
    """
    excluded = set(except_subscriptions) if except_subscriptions else set()
    similarity = cosine_similarity(publics_emb.values, vector)
    # Row positions of publics_emb ordered from most to least similar.
    ranked_positions = similarity.argsort(axis=0)[::-1].reshape(-1)

    recommendations = []
    for candidate_id in publics_emb.index[ranked_positions]:
        if len(recommendations) >= top_n:
            break
        if candidate_id not in excluded:
            recommendations.append(candidate_id)
    return recommendations

In [0]:
# VK API helper functions
# You can get your access_token using https://vkhost.github.io/
# or the official guideline https://vk.com/dev/access_token
# Set the VK_ACCESS_TOKEN environment variable to use your own token without
# editing the notebook.
# NOTE(review): the fallback below is a token committed in plain text; treat it
# as public and replace or revoke it instead of relying on it.
import os

__ACCESS_TOKEN__ = os.environ.get('VK_ACCESS_TOKEN') or '5a36e8db5a36e8db5a36e8db165a51cfb055a365a36e8db073056cebd7737a42f4e7ed1'

def chunks(l, n):
    """Yield consecutive slices of ``l`` holding at most ``n`` items each.

    The last slice is shorter when ``len(l)`` is not a multiple of ``n``;
    an empty ``l`` yields nothing.
    """
    yield from (l[start:start + n] for start in range(0, len(l), n))
        
def search_groups(ids):
    """Fetch VK group info for one id or for any iterable of ids.

    Parameters
    ----------
    ids : int, str or iterable
        A single group id / screen name, or an iterable of them (list, tuple,
        set, pandas Index, numpy array, ...). The previous exact ``list`` type
        check turned every other container into one bogus stringified id.

    Returns
    -------
    list
        Concatenated results of ``search_groups_`` for all batches.

    Raises
    ------
    RuntimeError
        If ``search_groups_`` returns ``None`` for a batch (failed request).
    """
    # Strings are iterable but denote a single id, so they are wrapped too.
    if isinstance(ids, (str, bytes)) or not hasattr(ids, '__iter__'):
        ids = [ids]
    ids = [str(group_id) for group_id in ids]

    result = []
    # Ids are sent in batches of 400 per request.
    for batch in chunks(ids, 400):
        batch_info = search_groups_(batch)
        if batch_info is None:
            raise RuntimeError('VK API request failed for a batch of %d group ids' % len(batch))
        result.extend(batch_info)
    return result
    
def search_groups_(ids):
    """Call VK ``groups.getById`` for one batch of ids.

    Parameters
    ----------
    ids : list of str
        Group ids or screen names; they are sent comma-joined in one request.

    Returns
    -------
    list or None
        The ``'response'`` payload of the reply, or ``None`` when the HTTP
        status is not 200.

    Raises
    ------
    RuntimeError
        If a 200 reply carries no ``'response'`` key (presumably how VK reports
        API-level failures such as a bad token — TODO confirm); the reply's
        ``'error'`` entry is included in the message.
    """
    # Passing the values through `params` lets requests URL-encode them, so a
    # screen name containing '&', '?' or '#' cannot alter the query string.
    response = requests.get(
        'https://api.vk.com/method/groups.getById',
        params={
            'fields': 'members_count',
            'group_ids': ','.join(ids),
            'v': '5.61',
            'access_token': __ACCESS_TOKEN__,
        },
        # Same timeout as get_user_subscriptions: never hang the kernel forever.
        timeout=30,
    )
    if response.status_code != 200:
        return None

    json_result = response.json()
    if 'response' not in json_result:
        raise RuntimeError('VK API groups.getById failed: %s' % (json_result.get('error'),))
    return json_result['response']

def get_public_id(public_url):
    """Resolve a VK public reference to its numeric id.

    Parameters
    ----------
    public_url : int or str
        A numeric id (``123`` / ``'123'``), a ``public<digits>`` name, a screen
        name (``'datascience'``), or a URL with or without scheme
        (``'https://vk.com/datascience'``, ``'vk.com/public123?w=wall'``).

    Returns
    -------
    int
        The numeric id; screen names are resolved through ``search_groups_``.

    Raises
    ------
    ValueError
        If no name can be extracted or the lookup returns nothing.

    Notes
    -----
    Only a leading ``public`` followed by digits is treated as an id. The
    previous ``replace('public', '')`` removed the substring anywhere, turning
    e.g. ``republic_news`` into ``re_news``, and ``split('/')[3]`` raised
    ``IndexError`` for URLs written without ``https://``.
    """
    if isinstance(public_url, int):
        return int(public_url)

    reference = public_url.strip()
    if reference.isdigit():
        return int(reference)

    # Drop scheme, query string and fragment, then keep the first path segment
    # after the host (or the bare name when there is no host part).
    reference = reference.split('://', 1)[-1].split('?', 1)[0].split('#', 1)[0]
    segments = [segment for segment in reference.split('/') if segment]
    if not segments:
        raise ValueError('Could not extract a VK public name from %r' % (public_url,))
    name = segments[1] if len(segments) > 1 else segments[0]

    prefix = 'public'
    if name.startswith(prefix) and name[len(prefix):].isdigit():
        return int(name[len(prefix):])

    found = search_groups_([name])
    if not found:
        raise ValueError('Could not resolve VK public %r' % (public_url,))
    return found[0]['id']

def show_groups_info(responses):
    """Print one ``<public URL> <name>`` line per group record.

    Each record must provide an integer ``'id'`` and a ``'name'``.
    """
    line_template = 'http://vk.com/public%d %s'
    for group in responses:
        group_id, group_name = group['id'], group['name']
        print(line_template % (group_id, group_name))

def get_user_subscriptions(user_id):
    """Call VK ``users.getSubscriptions`` for ``user_id``.

    Returns the ``'response'`` payload of the reply, or ``None`` when the HTTP
    status is not 200. The request is made with a 30 second timeout.
    """
    endpoint = 'https://api.vk.com/method/users.getSubscriptions?user_id={0}&v=5.63&access_token={1}'
    response = requests.get(endpoint.format(user_id, __ACCESS_TOKEN__), timeout=30)
    if response.status_code != 200:
        return None
    payload = response.json()
    return payload['response']

# Measuring similarity between VK publics

In [0]:
TEST_PUBLIC = 'https://vk.com/datascience'
SHOW_TOP_N = 20

# retrieving public ID from URL
public_id = get_public_id(TEST_PUBLIC)

# extracting already learned 128D vector
# (get_public_vector raises KeyError when TEST_PUBLIC is not in the index)
public_vector = get_public_vector(public_id)

# calculating similarity between TEST public and ALL other publics;
# the result has one row per public and a single column
publics_similarity = cosine_similarity(publics_emb.values, public_vector)

# sort ALL publics by similarity and take only TOP_N similar publics
most_similar_indexes = publics_similarity.argsort(axis=0)[::-1].reshape(-1)[:SHOW_TOP_N]

# take exact similarity values, flattened to 1-D so that each one is a scalar:
# formatting a 1-element array with %f relies on implicit array -> float
# conversion, which NumPy deprecated in 1.25
most_similar_values = publics_similarity[most_similar_indexes].reshape(-1)

# convert internal indexes into VK indexes
most_similar_publics_ids = publics_emb.iloc[most_similar_indexes].index

# retrieval of public information (like public name)
most_similar_publics_info = search_groups(most_similar_publics_ids.tolist())


# show most similar publics
# NOTE(review): pairing by position assumes the API returns one record per
# requested id, in request order — confirm.
for public_info, public_similarity in zip(most_similar_publics_info, most_similar_values):
    print('[%f] https://vk.com/public%d %s'%(public_similarity, public_info['id'], public_info['name']))

# Making recommendations for a user

In [0]:
TEST_USER_ID = 16184332

# Fetch the ids of the publics the test user follows.
subscriptions_response = get_user_subscriptions(TEST_USER_ID)
user_subscriptions_ids = subscriptions_response['groups']['items']

# Keep only the publics for which an embedding has been learned.
user_subscriptions_ids = [pid for pid in user_subscriptions_ids if pid in publics_ids]
user_subscriptions_name = [info['name'] for info in search_groups(user_subscriptions_ids)]

if not user_subscriptions_ids:
    raise Exception("There are no subsriptions")

# Stack the embedding of every remaining subscription, one row per public.
user_subscriptions_embeddings = np.vstack(list(map(get_public_vector, user_subscriptions_ids)))

### Calculating closest publics to the user's vector representation

In [0]:
# getting *user vector* by summing all embeddings
user_representation = np.sum(user_subscriptions_embeddings, axis=0).reshape(1, -1)

# the same steps as for a single public
# calculating similarity between the user vector and ALL publics;
# the result has one row per public and a single column
publics_similarity = cosine_similarity(publics_emb.values, user_representation)

# sort ALL publics by similarity and take only TOP_N similar publics
most_similar_indexes = publics_similarity.argsort(axis=0)[::-1].reshape(-1)[:SHOW_TOP_N]

# take exact similarity values, flattened to 1-D so that each one is a scalar:
# formatting a 1-element array with %f relies on implicit array -> float
# conversion, which NumPy deprecated in 1.25
most_similar_values = publics_similarity[most_similar_indexes].reshape(-1)

# convert internal indexes into VK indexes
most_similar_publics_ids = publics_emb.iloc[most_similar_indexes].index

# retrieval of public information (like public name)
most_similar_publics_info = search_groups(most_similar_publics_ids.tolist())


# show most similar publics
# NOTE(review): pairing by position assumes the API returns one record per
# requested id, in request order — confirm.
for public_info, public_similarity in zip(most_similar_publics_info, most_similar_values):
    print('[%f] https://vk.com/public%d %s'%(public_similarity, public_info['id'], public_info['name']))

# Visualization of semantic space for top 10000 VK publics 

Visualization of semantic space for top 10000 VK publics <br>
https://puzer.github.io/projector/

# Predicting user gender based on subscriptions

### Loading a random sample of users

In [0]:
# Report the sample size, then display the first stored profile as an example record.
print('Users in sample:', len(users_profile))
print('One example:')
example_profile = next(iter(users_profile.values()))
example_profile

### EDA (exploratory data analysis)

In [0]:
# Overlaid histograms of the number of subscriptions per user, one per 'sex' code
# (1 is labelled female, 2 male).
for sex_code, sex_label in ((1, 'female'), (2, 'male')):
    subscription_counts = [
        len(profile['subscriptions'])
        for profile in users_profile.values()
        if profile['sex'] == sex_code
    ]
    plt.hist(subscription_counts, bins=50, label=sex_label, alpha=0.5)

plt.legend()
plt.title('Distribution of subscriptions')
plt.xlabel('Number of subscriptions')
plt.ylabel('Number of people')
plt.show()

In [0]:
# Overlaid histograms of the 'date' field per 'sex' code (1 is labelled female, 2 male).
# NOTE(review): the axis is labelled "Age" — presumably 'date' already holds an age
# rather than a birth date; verify against dataset.py.
for sex_code, sex_label in ((1, 'female'), (2, 'male')):
    date_values = [
        profile['date']
        for profile in users_profile.values()
        if profile['sex'] == sex_code
    ]
    plt.hist(date_values, bins=50, label=sex_label, alpha=0.5)

plt.title('Distribution of age in the sample')
plt.xlabel('Age')
plt.ylabel('Number of people')
plt.legend()
plt.show()

### Most popular publics

In [0]:
from itertools import chain
from collections import Counter

# Flatten every user's subscription list into one list of public ids.
# chain.from_iterable consumes the per-user lists lazily instead of unpacking
# one argument per user into a single call.
all_subscriptions = list(chain.from_iterable(
    profile['subscriptions'] for profile in users_profile.values()
))
publics_frequency = Counter(all_subscriptions)

# Ids of the 10 most frequent publics. Unpacking the (id, count) pairs directly
# also works for an empty sample, where list(zip(*...))[0] raised IndexError.
top_public_ids = [pid for pid, _count in publics_frequency.most_common(10)]
show_groups_info(search_groups(top_public_ids))

### Preprocessing training data

In [0]:
from sklearn.feature_extraction import DictVectorizer

target = []
users_subscriptions = []

for profile in users_profile.values():
    # Subscriptions for which an embedding exists.
    known_publics = publics_ids & set(profile['subscriptions'])
    # Users with fewer than 5 known publics are left out of the training data.
    if len(known_publics) < 5:
        continue
    target.append(profile['sex'] == 2)  # True for male
    users_subscriptions.append(dict.fromkeys(known_publics, 1))

# One binary indicator column per public id.
dict_vectorizer = DictVectorizer()
X_data = dict_vectorizer.fit_transform(users_subscriptions)
# Reverse lookup (column index -> public id), stored on the vectorizer for later cells.
dict_vectorizer.inverse_vocabulary_ = {
    column: public for public, column in dict_vectorizer.vocabulary_.items()
}

y_data = np.array(target)

X_data.shape

### Training model

In [0]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score

# Seeded split so that the same users end up in train and test on every run.
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, random_state=42)

# Logistic regression fitted with SGD on the sparse subscription indicators.
# random_state seeds the per-epoch shuffling; without it the reported metrics
# change between runs even though the split above is seeded.
# NOTE(review): scikit-learn 1.1 renamed loss='log' to 'log_loss' and 1.3
# removed the old name — switch when running on a recent version.
clf = SGDClassifier(loss='log', penalty='l2', class_weight='balanced', max_iter=5, random_state=42)
clf.fit(X_train, y_train)

### Evaluating model performance

In [0]:
# Accuracy on the held-out split.
test_accuracy = clf.score(X_test, y_test)
print('Accuracy:', test_accuracy)
# ROC AUC computed from the predicted probability of the positive (male) class.
test_auc = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
print('AUC-ROC:', test_auc)

### Analysis of model performance

In [0]:
# Number of known publics per test user: row sums of the sparse indicator matrix,
# flattened to a 1-D array.
nb_of_subscriptions = X_test.sum(axis=1).A1

thresholds = list(range(5, 100, 5))
accuracy_scores = []

# Accuracy restricted to users with at least `min_subscriptions` known publics.
for min_subscriptions in thresholds:
    selected = nb_of_subscriptions >= min_subscriptions
    accuracy_scores.append(clf.score(X_test[selected], y_test[selected]))

plt.plot(thresholds, accuracy_scores)
plt.ylabel('Accuracy')
plt.xlabel('Number of subscriptions')
plt.title('Influence of number of subscriptions on accuracy')
plt.xticks(thresholds)
plt.show()

In [0]:
# 100 training-set sizes, evenly spaced from 100 samples up to the full training set.
dataset_slices = np.linspace(100, X_train.shape[0], 100).astype('int32')

scores = list()
for x_slice in tqdm_notebook(dataset_slices):
    # A fresh, seeded model per size: random_state fixes SGD's shuffling so the
    # curve is reproducible. The classifier left in `clf` after the loop is the
    # one trained on the full training set.
    # NOTE(review): scikit-learn 1.1 renamed loss='log' to 'log_loss' and 1.3
    # removed the old name — switch when running on a recent version.
    clf = SGDClassifier(loss='log', penalty='l2', class_weight='balanced', max_iter=5, random_state=42)
    clf.fit(X_train[:x_slice], y_train[:x_slice])
    scores.append(clf.score(X_test, y_test))

plt.plot(dataset_slices, scores)
plt.title('Influence of number of training data on accuracy \n for a linear model trained on sparse data')
plt.xlabel('Number of training points (log)')
plt.xscale('log')
plt.ylabel('Accuracy')
plt.show()

### Analysing model parameters

In [0]:
# Feature columns ordered from the most negative to the most positive coefficient.
# The target is True for male users, so the lowest coefficients point towards
# "female" and the highest towards "male".
model_params_args = np.argsort(clf.coef_[0])

column_to_public = dict_vectorizer.inverse_vocabulary_
most_feminine_publics = [column_to_public[column] for column in model_params_args[:10]]
most_masculine_publics = [column_to_public[column] for column in model_params_args[-10:]]

print('The most masculine publics')
masculine_info = search_groups(most_masculine_publics)
show_groups_info(masculine_info)

print('\n\nThe most feminine publics')
feminine_info = search_groups(most_feminine_publics)
show_groups_info(feminine_info)

# Training model using pretrained publics embedding

### On a limited number of training examples

In [0]:
# One dense vector per user: the sum of the embeddings of the user's known publics.
users_vectors = np.array([
    get_publics_embedings(list(subscr.keys()))
    for subscr in tqdm_notebook(users_subscriptions)
])

In [0]:
from sklearn.linear_model import LogisticRegression

# Same seeded split as for the sparse model, now over the dense user vectors.
split = train_test_split(users_vectors, y_data, random_state=42)
X_train, X_test, y_train, y_test = split

logreg_params = dict(penalty='l2', class_weight='balanced', solver='lbfgs')
clf = LogisticRegression(**logreg_params)
clf.fit(X_train, y_train)

In [0]:
# Accuracy on the held-out split.
test_accuracy = clf.score(X_test, y_test)
print('Accuracy:', test_accuracy)
# ROC AUC computed from the predicted probability of the positive (male) class.
test_auc = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
print('AUC-ROC:', test_auc)

In [0]:
# 100 training-set sizes, evenly spaced from 100 samples up to the full training set.
dataset_slices = np.linspace(100, X_train.shape[0], 100).astype('int32')

scores = []
for n_train in tqdm_notebook(dataset_slices):
    # Fit a fresh model on the first n_train samples and score it on the full test split.
    clf = LogisticRegression(penalty='l2', class_weight='balanced', solver='lbfgs')
    clf.fit(X_train[:n_train], y_train[:n_train])
    scores.append(clf.score(X_test, y_test))

plt.plot(dataset_slices, scores)
plt.title('Influence of number of training data on accuracy \n for a linear model trained on pre-trained embeddings')
plt.xlabel('Number of training points (log)')
plt.xscale('log')
plt.ylabel('Accuracy')
plt.show()