## VK post stats collector
ferluht, 2019

In [None]:
!pip install -r requirements.txt

In [None]:
import vk
import re
import json
from tqdm import tqdm_notebook as tqdm
from time import sleep
import pandas as pd
from datetime import datetime, date, timedelta
from collections import Counter

In [None]:
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
# Set up plotly's offline notebook mode before any iplot() call further down.
# NOTE(review): connected=True presumably loads plotly.js from the CDN instead
# of embedding it in the notebook — confirm against the plotly docs.
init_notebook_mode(connected=True)

In [None]:
# Authenticate against VK and build the `api` client that get_reposts and
# get_like_stats use as a global. Fill in user_login / user_password before
# running this cell.
# NOTE(review): scope='scope' looks like a placeholder — confirm which
# permission list the `vk` package expects here.
sess = vk.AuthSession(user_login='', user_password='', app_id=6808809, scope='scope')
api = vk.API(sess, v='5.35')
# NOTE(review): this writes the access token into the notebook output; clear
# the cell output before sharing or committing the notebook.
print(sess.access_token)

In [None]:
# Wall posts to analyse. Each URL ends in wall<owner_id>_<post_id>; the
# functions below extract both ids from that suffix.
posts_urls = [
    'https://vk.com/wall-39687507_32353', 
    'https://vk.com/wall-54566161_518165',
    'https://vk.com/wall-83415396_141392',
    'https://vk.com/wall-69563163_989262',
    'https://vk.com/wall-71960446_175792',
]

## Repost stats

In [None]:
def get_reposts(post_urls, group_id=None, debug=False, waittime=0.4):
    """Find reposts of the given wall posts among the users who liked them.

    For every post URL the likers are fetched with ``likes.getList``; each
    liker's wall is then scanned for entries whose JSON dump contains both the
    original owner id and the original post id. Every hit becomes one row of
    the result.

    Args:
        post_urls: A single post URL (str) or an iterable of URLs. Each URL
            must contain ``wall<owner_id>_<post_id>``.
        group_id: Optional group id / short name. When given, every reposter
            is checked with ``groups.isMember`` and the boolean result is
            stored in the 'group member' column; otherwise (or when the check
            fails) that column holds 'N/A'.
        debug: When true, print every matched wall post and every per-user
            error that was skipped.
        waittime: Seconds to sleep after each per-user API call (throttling).

    Returns:
        pandas.DataFrame with the columns 'group', 'name', 'profile',
        'repost text', 'repost likes', 'repost can like', 'repost comments',
        'repost link' and 'group member', one row per detected repost.

    Raises:
        ValueError: If a URL does not contain ``wall<owner_id>_<post_id>``.

    Notes:
        * At most 1000 likers are requested per post (``count=1000``, no
          paging) and only what a single ``wall.get`` call returns is scanned.
        * The repost test is a plain substring match on the serialized post,
          so it is a heuristic.
          NOTE(review): a post that merely contains both digit strings would
          also match — consider comparing structured fields instead.
        * Errors while reading one user's wall are skipped on purpose
          (best effort); they are only reported when ``debug`` is true.
    """
    # Accept a bare URL as well as any iterable of URLs.
    if isinstance(post_urls, str):
        post_urls = [post_urls]

    columns = (
        'group',
        'name',
        'profile',
        'repost text',
        'repost likes',
        'repost can like',
        'repost comments',
        'repost link',
        'group member',
    )
    reposts = {column: [] for column in columns}

    for post_url in post_urls:
        match = re.search(r'wall(-?\d+)_(\d+)', post_url)
        if match is None:
            raise ValueError(
                'cannot find wall<owner_id>_<post_id> in {!r}'.format(post_url))
        owner_id = int(match.group(1))
        item_id = int(match.group(2))
        # Loop-invariant needles for the substring test below.
        owner_token = str(owner_id)
        item_token = str(item_id)

        likes_list = api.likes.getList(type='post', owner_id=owner_id, item_id=item_id, count=1000)
        group_name = api.groups.getById(group_ids=str(abs(owner_id)))[0]['name']
        print(group_name)

        for user in tqdm(likes_list['items']):
            user_info = api.users.get(user_ids=int(user))
            sleep(waittime)
            if isinstance(user_info, list):
                user_info = user_info[0]
            try:
                wall = api.wall.get(owner_id=user)
                sleep(waittime)
                for post in wall['items']:
                    pts = json.dumps(post)
                    if owner_token not in pts or item_token not in pts:
                        continue

                    profile = 'https://vk.com/id' + str(user_info['id'])
                    # Build the complete row first: if any field is missing the
                    # KeyError fires before anything is appended, so all
                    # columns keep the same length.
                    row = {
                        'group': group_name,
                        'name': user_info['first_name'] + ' ' + user_info['last_name'],
                        'profile': profile,
                        'repost text': post['text'],
                        'repost likes': post['likes']['count'],
                        'repost can like': post['likes']['can_like'],
                        'repost comments': post['comments']['count'],
                        'repost link': '{}?w=wall{}_{}'.format(profile, post['owner_id'], post['id']),
                        'group member': 'N/A',
                    }

                    if group_id is not None:
                        try:
                            ismember = api.groups.isMember(group_id=group_id, user_id=user_info['id'])
                            row['group member'] = bool(ismember)
                        except Exception as error:
                            # Best effort: keep 'N/A', but do not swallow
                            # KeyboardInterrupt/SystemExit like a bare except.
                            if debug:
                                print('isMember failed for id{}: {}'.format(user_info['id'], error))
                        sleep(waittime)

                    for column in columns:
                        reposts[column].append(row[column])

                    if debug:
                        print(post)

            except Exception as error:
                # Best effort per user: skip this user and go on with the next.
                if debug:
                    print(user_info['first_name'] + ' ' + user_info['last_name'] + ' ' + str(error))

    return pd.DataFrame(reposts)

In [None]:
# Collect reposts for every post in posts_urls; 'ed9m_8' is the group whose
# membership is checked for each reposter.
res = get_reposts(posts_urls, group_id='ed9m_8')

In [None]:
# Report how many reposts were found and preview the first 20 rows.
print('overall found reposts {}'.format(len(res)))
res.head(20)

In [None]:
# Save the repost table as reposts.csv (relative path, i.e. next to wherever
# the notebook is running).
res.to_csv('reposts.csv')

## Like stats

In [None]:
def get_like_stats(post_urls, group_id=None, debug=False, waittime=0.4):
    """Collect age, sex, city and university of the users who liked the posts.

    For every post URL the likers are fetched with ``likes.getList`` and each
    liker's profile is requested with ``users.get``; whatever profile fields
    are present are appended to the corresponding result list.

    Args:
        post_urls: A single post URL (str) or an iterable of URLs. Each URL
            must contain ``wall<owner_id>_<post_id>``.
        group_id: Unused by this function; kept so the signature stays as it
            was.
        debug: When true, print every fetched user record and every birth
            date that could not be parsed.
        waittime: Seconds to sleep after each user lookup (throttling).

    Returns:
        Tuple ``(ages, sexes, cities, universities)`` of four independent
        lists (they are NOT aligned per user, because a user missing a field
        is simply skipped for that list):
          * ages: float years, computed as days since birth / 365, only for
            users whose ``bdate`` has day, month and year;
          * sexes: the raw ``sex`` value from the API;
          * cities: city titles;
          * universities: non-empty ``university_name`` values.

    Raises:
        ValueError: If a URL does not contain ``wall<owner_id>_<post_id>``.

    Notes:
        At most 1000 likers are requested per post (``count=1000``, no paging).
    """
    # Accept a bare URL as well as any iterable of URLs.
    if isinstance(post_urls, str):
        post_urls = [post_urls]

    ages = []
    sexes = []
    cities = []
    universities = []
    today = date.today()  # loop-invariant reference date for the age

    for post_url in post_urls:
        match = re.search(r'wall(-?\d+)_(\d+)', post_url)
        if match is None:
            raise ValueError(
                'cannot find wall<owner_id>_<post_id> in {!r}'.format(post_url))
        owner_id = int(match.group(1))
        item_id = int(match.group(2))

        group_name = api.groups.getById(group_ids=str(abs(owner_id)))[0]['name']
        print(group_name)
        likes_list = api.likes.getList(type='post', owner_id=owner_id, item_id=item_id, count=1000)

        for user in tqdm(likes_list['items']):
            user_info = api.users.get(user_ids=int(user), fields='sex,bdate,city,country,education,followers_count,is_friend')[0]
            if debug:
                print(user_info)

            # Only full dates (two dots: day.month.year) can yield an age.
            if 'bdate' in user_info and user_info['bdate'].count('.') == 2:
                try:
                    # %m is the month; the former %M parsed it as minutes and
                    # silently placed every birthday in January.
                    born = datetime.strptime(user_info['bdate'], '%d.%m.%Y').date()
                except ValueError as error:
                    # A malformed date must not abort the whole collection.
                    if debug:
                        print('skipping bdate {!r}: {}'.format(user_info['bdate'], error))
                else:
                    ages.append((today - born).days / 365)
            if 'city' in user_info:
                cities.append(user_info['city']['title'])
            if 'sex' in user_info:
                sexes.append(user_info['sex'])
            if 'university_name' in user_info and user_info['university_name'] != '':
                universities.append(user_info['university_name'])
            sleep(waittime)

    return ages, sexes, cities, universities

In [None]:
# Gather liker demographics for every post in posts_urls; the four lists feed
# the charts in the cells below.
ages, sexes, cities, universities = get_like_stats(posts_urls)

### Age distribution

In [None]:
# Histogram of the liker ages collected by get_like_stats (nbinsx=50).
data = [go.Histogram(x=ages, nbinsx = 50)]

# Wrap the trace in a titled figure and render it with plotly offline.
layout = go.Layout(title='Likes by age',)
fig = go.Figure(data=data, layout=layout)
iplot(fig)

### Cities

In [None]:
# Pie chart of likes per city: the topN most frequent cities get their own
# slice, everything else is summed into a single 'others' slice.
topN = 10
# (city, likes) pairs in ascending order of likes, so the most frequent
# cities sit at the end of the list.
counts = sorted(Counter(cities).items(), key=lambda pair: pair[1])
others = dict(counts[:-topN])
counts = dict(counts[-topN:])
counts['others'] = sum(others.values())
trace = go.Pie(labels=list(counts), values=list(counts.values()))

data = [trace]
layout = go.Layout(title='Likes by city')
fig = go.Figure(data=data, layout=layout)
iplot(fig)

### Universities

In [None]:
# Pie chart of likes per university: the topN most frequent universities get
# their own slice, everything else is summed into a single 'others' slice.
topN = 10
# (university, likes) pairs in ascending order of likes, so the most frequent
# universities sit at the end of the list.
counts = sorted(Counter(universities).items(), key=lambda pair: pair[1])
others = dict(counts[:-topN])
counts = dict(counts[-topN:])
counts['others'] = sum(others.values())
trace = go.Pie(labels=list(counts), values=list(counts.values()))

data = [trace]
layout = go.Layout(title='Likes by university')
fig = go.Figure(data=data, layout=layout)
iplot(fig)