# 1. Loading Data

VK library quiet installation and import into the notebook.

In [1]:
# !pip install vk # makes it quiet
import vk

Starting new vk session in order to parse data

In [2]:
vk_session = vk.Session() # starting new session
vk_api = vk.API(vk_session)

Getting number of posts in selected vk group.

In [3]:
selected_group = 'overhearhse' # no other ideas :c
posts_number = vk_api.wall.get(domain=selected_group)[0] # number of posts is stored in first element
print('Number of posts in selected group: ', posts_number - 1)

Number of posts in selected group:  13124


Writing a function to parse more, than 100 posts from group.

In [4]:
def load_all_posts(page, n_posts, api):
    all_posts = api.wall.get(domain=page, count=n_posts)
    n_loaded = len(all_posts)
    while n_loaded < n_posts: # loop to load more, than 100 posts
        s = api.wall.get(domain=page, offset=n_loaded, count=(n_posts - n_loaded)) # update offset
        all_posts += s[1:] # no need for first element
        n_loaded += len(s) - 1 # update n_loaded
    return all_posts

Loading all posts from group for future analysis

In [10]:
try:
    loaded_posts = load_all_posts(page=selected_group, n_posts=posts_number, api=vk_api)[1:] # no need for posts number element
    print('Number of loaded posts: ', len(loaded_posts))
except: # timout errors are often to occur
    print('Error occured! Try again.')

Number of loaded posts:  13124


# 2. Data preprocessing

Loading required libs to preprocess data.

In [11]:
# !pip install pymorphy2 -q # silent install again
import pymorphy2 # need this one to convert words to normal time
import datetime # needed to convert response date 
import string # needed to work with strings
from nltk.tokenize import TweetTokenizer # needed to split text
import pandas as pd # required to work with dataframes
from ipywidgets import IntProgress # progressbar
from IPython.display import display # progressbar

Writing functions to process text data. Converting words to normal form and removing punctuation here.

In [15]:
def split_text(text):
    tokenizer = TweetTokenizer()
    return tokenizer.tokenize(text) # spliting text into words

def convert_to_normal_form(words_list):
    morph = pymorphy2.MorphAnalyzer()
    normal_forms_list = []
    for word in words_list:
        if word not in string.punctuation and word[0] != "<":
            norm_form = morph.parse(word)[0].normal_form #getting normal form of a word
            normal_forms_list.append(norm_form) #adding it to list
    return normal_forms_list

def convert_text(text):
    words_list = split_text(text) # spliting text into words
    norm_words_list = convert_to_normal_form(words_list) # words into normal form
    return norm_words_list

Writing a function to convert received list into another with another data.

In [16]:
def convert_posts(posts_list):
    progress = IntProgress() 
    progress.max = len(posts_list) # initializing progressbar
    progress.description = 'Processing data convertion'
    display(progress)
    
    updated_posts = [] # list of new posts' list structure
    for i, post in enumerate(posts_list): 
        tmp_dict = {} # creating empty dictionary for each post
        tmp_dict['likes_number'] = post['likes']['count'] # getting likes count
        tmp_dict['text'] = convert_text(post['text']) # converting text into normal form
        tmp_dict['text_length'] = len(post['text']) # calculating text length
        tmp_dict['post_hour'] = datetime.datetime.fromtimestamp(post['date']).strftime('%H') # parsing only post hour
        tmp_dict['post_month'] = datetime.datetime.fromtimestamp(post['date']).strftime('%m') # and post month
        tmp_dict['signed'] = int(post['from_id'] != -57354358) # checking whether post is signed or not
        # checking if any attacment exists
        tmp_dict['attachment_type'] = post['attachment']['type'] if 'is_pinned' in post.keys() else None
        tmp_dict['pinned'] = 1 if 'is_pinned' in post.keys() else 0 # cheking if post is pinned
        tmp_dict['repost'] = 1 if post['post_type'] == 'copy' else 0 # cheking if repost
        updated_posts.append(tmp_dict)
        progress.value += 1 # increasing progressbar value
    progress.description = 'Done convertion!'
    return updated_posts

Converting list of posts into new more convenient one.

In [17]:
converted_posts = convert_posts(loaded_posts)

In [21]:
converted_posts[4]

{'attachment_type': None,
 'likes_number': 1,
 'pinned': 0,
 'post_hour': '13',
 'post_month': 'April',
 'repost': 0,
 'signed': 0,
 'text': ['почитать',
  'комментарий',
  'наш',
  'студент',
  'в',
  'различный',
  'группа',
  'стыдно',
  'становиться',
  'за',
  'то',
  'что',
  'вшэ',
  'плодоносить',
  'такой',
  'даун',
  'сколько',
  'абитуриент',
  'отогнать',
  'свой',
  'выебон',
  'этот',
  'долбануть',
  'первокурсник',
  'и',
  'второкурсник',
  'что-то',
  'пытаться',
  'доказывать',
  'писать',
  'что',
  'наш',
  'днищенский',
  'вшм',
  'хороший',
  'чем',
  'питерский',
  'что',
  'наш',
  'вуз',
  'есть',
  'в',
  'весь',
  'рейтинг',
  'на',
  'лидировать',
  'позиция',
  'зачем',
  'позорить-то',
  'мы',
  'будто',
  'намеренно',
  'это',
  'писать',
  'когда',
  'знать',
  'что',
  'в',
  'рейтинг',
  'мы',
  'далеко',
  'кроме',
  'экономика',
  'вообще',
  'бесить'],
 'text_length': 467}

# 3. Creating object-feature matrix

Loading pandas

In [22]:
import pandas as pd

In [23]:
pd.DataFrame(converted_posts)

Unnamed: 0,attachment_type,likes_number,pinned,post_hour,post_month,repost,signed,text,text_length
0,photo,71,1,13,April,0,0,[],0
1,,7,0,20,April,1,0,"[восхищать, удивительный, недееспособность, бо...",410
2,,20,0,16,April,0,0,"[иногда, я, жалеть, что, не, поступить, на, би...",163
3,,1,0,13,April,0,0,"[можно, ли, ходить, на, лекция, другой, факуль...",79
4,,1,0,13,April,0,0,"[почитать, комментарий, наш, студент, в, разли...",467
5,,5,0,22,April,0,0,"[сегодня, в, здание, вышка, на, курский, в, ак...",215
6,,2,0,21,April,0,0,"[хэй, вейпереть, а, знаешь, ли, ты, что, в, но...",780
7,,2,0,13,April,0,0,"[в, чем, разница, между, вышкинский, олимпиада...",87
8,,21,0,13,April,0,0,"[анонимно, добрый, вечер, господин, помнить, н...",1320
9,,9,0,19,April,0,0,"[ребята, мгту, мгу, вшэ, собраться, вместе, чт...",349
