# Data analysis project

_In this project I am going to write a program that predicts the number of likes a post is going to get on VK._

__Выполнил__: Булгаков Дмитрий (ИАД16)

__Дедлайн__: 23:59 19.06.16

# 1. Loading Data

VK library quiet installation and import into the notebook.

In [1]:
# pip install vk # makes it quiet
import vk

Starting new vk session in order to parse data

In [2]:
# Create a VK session (no arguments are passed) and the API object used by the cells below.
vk_session = vk.Session() # starting new session
vk_api = vk.API(vk_session)

Getting number of posts in selected vk group.

In [3]:
selected_group = 'hse_overheard' # no other ideas :c
# Only element [0] of the wall.get response is used here.
posts_number = vk_api.wall.get(domain=selected_group)[0] # number of posts is stored in first element
# NOTE(review): the printed value is posts_number - 1; the reason for the -1 is not visible here — confirm.
print('Number of posts in selected group: ', posts_number - 1)

Number of posts in selected group:  14415


Writing a function to load more than 100 posts from a group (a single request returns at most 100).

In [4]:
def load_all_posts(page, n_posts, api):
    """Load posts from a VK wall, paging until enough elements are collected.

    Args:
        page: Short name (domain) of the wall to read.
        n_posts: Target length of the returned list, *including* the header
            element at index 0 (so paging stops after ``n_posts - 1`` posts).
        api: Object exposing ``wall.get(domain=..., count=..., offset=...)``
            whose response is a list with a header element at index 0
            followed by the post items.

    Returns:
        The header element of the first response followed by the loaded
        posts. If the first response already has at least ``n_posts``
        elements it is returned unchanged; the list is shorter than
        ``n_posts`` when the wall runs out of posts.
    """
    all_posts = api.wall.get(domain=page, count=n_posts)
    while len(all_posts) < n_posts:  # one request only returns a limited page
        # Index 0 is the header, so the number of posts already loaded (and
        # therefore the offset of the next unseen post) is len(all_posts) - 1.
        batch = api.wall.get(
            domain=page,
            offset=len(all_posts) - 1,
            count=n_posts - len(all_posts),
        )
        new_posts = batch[1:]  # every follow-up response repeats the header
        if not new_posts:
            break  # wall exhausted: stop instead of looping forever
        all_posts += new_posts
    return all_posts

Loading all posts from group for future analysis

In [5]:
# 1251 list elements are requested because the first element of the result is
# the header, which is sliced off here, leaving 1250 posts.
try:
    loaded_posts = load_all_posts(page=selected_group, n_posts=1251, api=vk_api)[1:]
    print('Number of loaded posts: ', len(loaded_posts))
except Exception as error: # timeout errors occur often; Exception (not a bare except) still lets Ctrl-C through
    print('Error occured! Try again.')
    print('Reason: ', repr(error)) # show what actually failed instead of hiding it

Number of loaded posts:  1250


# 2. Data preprocessing

Loading required libs to preprocess data.

In [6]:
#  !pip install pymorphy2 -q # silent install again
# !pip install stop_words -q # needed to remove stop words
from stop_words import get_stop_words
import pymorphy2 # need this one to convert words to normal time
import datetime # needed to convert response date 
import string # needed to work with strings
from nltk.tokenize import TweetTokenizer # needed to split text
import pandas as pd # required to work with dataframes
from ipywidgets import IntProgress # progressbar
from IPython.display import display # progressbar

Writing functions to process text data. Converting words to normal form and removing punctuation here.

In [7]:
def split_text(text):
    """Split ``text`` into tokens with a fresh ``TweetTokenizer`` and return them."""
    return TweetTokenizer().tokenize(text)

_MORPH_ANALYZER = None  # shared pymorphy2 analyzer, created on first use


def _get_morph_analyzer():
    """Return the shared ``pymorphy2.MorphAnalyzer``, building it only once."""
    global _MORPH_ANALYZER
    if _MORPH_ANALYZER is None:
        _MORPH_ANALYZER = pymorphy2.MorphAnalyzer()
    return _MORPH_ANALYZER


def convert_to_normal_form(words_list):
    """Return the normal (dictionary) form of every kept token.

    Tokens that are a substring of ``string.punctuation`` (single punctuation
    marks, the empty string) and tokens starting with ``"<"`` are dropped.

    Args:
        words_list: Iterable of string tokens.

    Returns:
        List with the ``normal_form`` of the first pymorphy2 parse of each
        remaining token, in input order.
    """
    kept_words = [
        word for word in words_list
        if word not in string.punctuation and word[0] != "<"
    ]
    if not kept_words:
        return []
    # Reuse one analyzer instead of constructing a new one for every call.
    morph = _get_morph_analyzer()
    return [morph.parse(word)[0].normal_form for word in kept_words]

def convert_text(text):
    """Normalize a post text for vectorization.

    The text is tokenized with ``split_text``, converted with
    ``convert_to_normal_form`` and stripped of the Russian stop words
    returned by ``get_stop_words('russian')``.

    Args:
        text: Raw post text.

    Returns:
        The remaining normal forms joined by single spaces.
    """
    words_list = split_text(text)
    norm_words_list = convert_to_normal_form(words_list)
    # Fetch the stop words once per call (not once per word) and use a set
    # so each membership test is O(1).
    stop_words = set(get_stop_words('russian'))
    filtered_words = [w for w in norm_words_list if w not in stop_words]
    return " ".join(filtered_words)

Writing functions to convert the received list of raw posts into a list of feature dictionaries.

In [8]:
# Current local time. The standard-library class is used directly because the
# ``pd.datetime`` alias is not available in current pandas releases.
a = datetime.datetime.today()

In [9]:
def check_posttime(post, cur_date):
    """Bucket the age of a post into an ordinal category from 0 to 10.

    Args:
        post: Mapping whose ``'date'`` entry is a Unix timestamp.
        cur_date: Naive ``datetime`` the age is measured against (compared
            with ``datetime.fromtimestamp(post['date'])``).

    Returns:
        0 (< 1 hour), 1 (< 5 hours), 2 (< 1 day), 3 (< 5 days),
        4 (< 10 days), 5 (< 30 days), 6 (< 90 days), 7 (< 180 days),
        8 (< 365 days), 9 (< 730 days) or 10 (older).
    """
    post_time = datetime.datetime.fromtimestamp(post['date'])
    elapsed = cur_date - post_time
    # total_seconds() covers the whole interval. timedelta.seconds is only the
    # sub-day remainder and would put e.g. a 3-day-old post into bucket 0 or 1.
    elapsed_hours = elapsed.total_seconds() / 3600
    elapsed_days = elapsed.days

    if elapsed_hours < 1:
        return 0
    if elapsed_hours < 5:
        return 1
    # Upper bounds (in days) of buckets 2..9, checked in ascending order.
    day_limits = (1, 5, 10, 30, 90, 180, 365, 730)
    for bucket, limit in enumerate(day_limits, start=2):
        if elapsed_days < limit:
            return bucket
    return 10

In [10]:
def check_attachment(post):
    """Return 0/1 flags telling whether a post has a photo, poll or link attached.

    Args:
        post: Post dictionary; its optional ``'attachments'`` entry is a
            sequence of dicts that each carry a ``'type'`` key.

    Returns:
        Dict with the keys ``photo_attachment``, ``poll_attachment`` and
        ``link_attachment``; a flag is 1 when at least one attachment of
        that type is present, otherwise 0.
    """
    flags = {'photo_attachment': 0, 'poll_attachment': 0, 'link_attachment': 0}
    for attachment in post.get('attachments', ()):
        kind = attachment['type']
        if kind in ('photo', 'poll', 'link'):
            flags[kind + '_attachment'] = 1
    return flags

In [11]:
def convert_posts(posts_list):
    """Turn raw post dicts into flat feature dicts, showing a progress bar.

    Args:
        posts_list: Sequence of post dicts; each must provide
            ``post['likes']['count']``, ``post['text']`` and ``post['date']``
            (Unix timestamp) and may provide ``post['attachments']``.

    Returns:
        List with one dict per post containing ``likes_number``, the
        normalized ``post_text`` and the engineered features
        (``long_text``, ``post_hour``, ``post_month``, ``time_elapsed``,
        the three attachment flags and ``near_exam_week``).
    """
    progress = IntProgress()
    progress.max = len(posts_list) # one tick per post
    progress.description = 'Processing data convertion'
    display(progress)
    # Standard-library call; the pd.datetime alias is not available in
    # current pandas releases.
    current_date = datetime.datetime.today()

    updated_posts = []
    for post in posts_list:
        post_date = datetime.datetime.fromtimestamp(post['date'])
        features = {
            'likes_number': int(post['likes']['count']),
            'post_text': convert_text(post['text']), # normalized text
            'long_text': 1 if len(post['text']) > 400 else 0, # raw text longer than 400 characters
            'post_hour': post_date.hour,
            'post_month': post_date.month,
            'time_elapsed': check_posttime(post, current_date), # age bucket
        }
        features.update(check_attachment(post)) # photo / poll / link flags
        features['near_exam_week'] = 1 if post_date.month in (10, 12, 3, 6) else 0
        # Disabled features, kept for reference:
        #   'signed': int(post['from_id'] != -57354358)
        #   'pinned': 1 if 'is_pinned' in post else 0
        updated_posts.append(features)
        progress.value += 1
    progress.description = 'Done convertion!'
    return updated_posts

Converting list of posts into new more convenient one.

In [12]:
# Build the feature dictionaries for every loaded post (a progress bar is displayed while this runs).
converted_posts = convert_posts(loaded_posts)

# 3. Creating object-feature matrix

Loading pandas

In [13]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

Creating dataframe from parsed data

In [14]:
# Build a DataFrame from the list of per-post feature dictionaries.
posts_frame = pd.DataFrame(converted_posts)
posts_frame.head() # preview of the first rows

Unnamed: 0,likes_number,link_attachment,long_text,near_exam_week,photo_attachment,poll_attachment,post_hour,post_month,post_text,time_elapsed
0,0,0,0,0,0,0,10,7,результат пгаснуть :(,0
1,2,0,0,0,0,0,1,7,посоветовать хороший фильм поднятие айкью моти...,2
2,0,0,0,0,0,0,1,7,айда кататься самокат парк горький,2
3,10,0,0,0,0,0,0,7,god tier миэф эконом пи матфак good tier соцфа...,2
4,1,0,0,0,0,0,0,7,вышка ежегодно вылетать куча народ сей появить...,2


And describing posts data

In [15]:
# Summary statistics of the numeric columns of posts_frame.
posts_frame.describe()

Unnamed: 0,likes_number,link_attachment,long_text,near_exam_week,photo_attachment,poll_attachment,post_hour,post_month,time_elapsed
count,1250.0,1250.0,1250.0,1250.0,1250.0,1250.0,1250.0,1250.0,1250.0
mean,8.7408,0.0112,0.1296,0.276,0.084,0.028,15.4144,5.6504,5.3392
std,18.776082,0.105278,0.335997,0.447196,0.277499,0.165039,6.736808,0.933422,1.451147
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,12.0,5.0,5.0
50%,3.0,0.0,0.0,0.0,0.0,0.0,17.0,6.0,6.0
75%,9.0,0.0,0.0,1.0,0.0,0.0,20.0,6.0,6.0
max,332.0,1.0,1.0,1.0,1.0,1.0,23.0,7.0,7.0


Creating object-feature matrix

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer # loading the TF-IDF vectorizer

# At most 200 word features are kept; norm='l1' and binary=True are passed to the vectorizer.
cv = TfidfVectorizer(norm='l1', max_features = 200, analyzer = 'word', strip_accents='unicode', binary=True)
train_features = cv.fit_transform(posts_frame['post_text']).toarray() # vectorizing texts
train_frame = posts_frame.join(pd.DataFrame(train_features, columns=cv.get_feature_names())) # appending the word columns to the post features
all_data = train_frame.copy() # snapshot that still contains likes_number and post_text (used for the plots later)

value_frame = train_frame['likes_number'] # regression target
train_frame.drop(['likes_number','post_text'],inplace=True,axis=1,errors='ignore') # removing the target and the raw text from the feature matrix

In [17]:
# Summary statistics of the feature matrix (post features plus the word columns).
train_frame.describe()

Unnamed: 0,link_attachment,long_text,near_exam_week,photo_attachment,poll_attachment,post_hour,post_month,time_elapsed,10,100,...,ходить,хорошии,хотеться,читать,что,школа,экзамен,эконом,экономика,язык
count,1250.0,1250.0,1250.0,1250.0,1250.0,1250.0,1250.0,1250.0,1250.0,1250.0,...,1250.0,1250.0,1250.0,1250.0,1250.0,1250.0,1250.0,1250.0,1250.0,1250.0
mean,0.0112,0.1296,0.276,0.084,0.028,15.4144,5.6504,5.3392,0.003947,0.001787,...,0.005485,0.012997,0.004203,0.002788,0.003371,0.003999,0.00489,0.004829,0.004653,0.004103
std,0.105278,0.335997,0.447196,0.277499,0.165039,6.736808,0.933422,1.451147,0.036752,0.019275,...,0.046919,0.076672,0.027937,0.0309,0.027092,0.029601,0.038003,0.039838,0.031124,0.040699
min,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,12.0,5.0,5.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,17.0,6.0,6.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,1.0,0.0,0.0,20.0,6.0,6.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,23.0,7.0,7.0,1.0,0.39308,...,1.0,1.0,0.355244,0.623662,0.582935,0.454604,0.512249,0.57938,0.466193,1.0


Scaling our features

In [18]:
# Standardize the feature columns with StandardScaler.
# NOTE(review): the scaler is fitted on all rows before the train/test split performed later,
# so test rows contribute to the scaling statistics — consider fitting on the training part only.
scaler = StandardScaler()
train_frame = scaler.fit_transform(train_frame) # train_frame is rebound to the scaler's output

Saving train frame to file

In [19]:
# train_frame.to_csv('traindata.csv')

# 4. Comparing different methods

Splitting into train and test samples

In [20]:
# sklearn.cross_validation was superseded by sklearn.model_selection and later
# removed; try the current location first and fall back for old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Splitting it into test and train samples (30% test, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(train_frame, value_frame, test_size=0.3, random_state=42)

Importing libs

In [21]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.grid_search import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor
import numpy as np


In [22]:
def compare(est, param, est_name):
    """Grid-search ``est`` over ``param`` and print the best result.

    The search is fitted on the module-level ``X_train`` / ``y_train`` with
    R^2 scoring and two parallel jobs. The best cross-validated score and the
    parameter combination that produced it are printed; nothing is returned.

    Args:
        est: Estimator instance to tune.
        param: Parameter grid passed to ``GridSearchCV``.
        est_name: Human-readable estimator name used in the printed report.
    """
    search = GridSearchCV(est, param, n_jobs=2, scoring='r2')
    search.fit(X_train, y_train)

    print('CV best score for', est_name, ': ', search.best_score_, '. (R^2)')
    # Held-out evaluation on X_test / y_test is not performed in this function.
    print('Getting best params: ')
    print(search.best_params_)

## 4.1 Linear regression

### 4.1.1 Simple linear regression

In [23]:
# Grid for plain linear regression: every combination of the fit_intercept and normalize flags.
parameters = {'fit_intercept':[True, False],'normalize':[True, False]}
compare(LinearRegression(), parameters, 'simple linear regression')

CV best score for simple linear regression :  -0.222990987117 . (R^2)
Getting best params: 
{'normalize': True, 'fit_intercept': True}


### 4.1.2 Linear regression with L1 regularization

In [24]:
# Lasso grid: alpha in 1, 6, ..., 96 combined with the positive / normalize / selection options.
parameters = {'alpha':np.arange(1, 100, 5), 'positive':[True, False],'normalize':[True, False], 
              'selection':['cyclic', 'random']}
compare(Lasso(), parameters, 'linear regression with L1 regularization')

CV best score for linear regression with L1 regularization :  0.151516939175 . (R^2)
Getting best params: 
{'alpha': 1, 'normalize': False, 'selection': 'random', 'positive': True}


### 4.1.3 Linear regression with L2 regularization

In [25]:
# Ridge grid: 101 alphas log-spaced between 1e1 and 1e10, combined with the
# fit_intercept / normalize / solver options.
# The number of samples is passed as an int because np.logspace expects an
# integer `num` (a float such as 101.00 is rejected by current NumPy).
parameters = {'alpha':np.logspace(1.0, 10.0, 101), 'fit_intercept':[True, False],'normalize':[True, False],
              'solver':['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'auto']}
compare(Ridge(), parameters, 'linear regression with L2 regularization')

CV best score for linear regression with L2 regularization :  0.149686083993 . (R^2)
Getting best params: 
{'alpha': 512.86138399136485, 'normalize': False, 'solver': 'lsqr', 'fit_intercept': True}


## 4.2 Decision trees and random forests

### 4.2.1 DecisionTreeRegressor

In [26]:
# Decision tree grid: depth 1..19 combined with the presort / splitter / max_features options.
# The report label names the estimator actually used (a regressor, not a classifier).
parameters = {'presort':[True, False],'max_depth': np.arange(1, 20), 'splitter':['random', 'best'],
             'max_features':['auto', 'sqrt', 'log2', None]}
compare(tree.DecisionTreeRegressor(), parameters, 'decision tree regressor')

CV best score for decision tree classifier :  0.201048731004 . (R^2)
Getting best params: 
{'presort': True, 'max_features': 'auto', 'splitter': 'random', 'max_depth': 1}


### 4.2.2 RandomForestRegressor

In [27]:
# Random forest grid: 10 / 20 / 30 trees combined with the max_features options.
# The report label names the estimator actually used (a regressor, not a classifier).
parameters = {'n_estimators':[10, 20, 30],
             'max_features':['auto', 'sqrt', 'log2', None]}
compare(RandomForestRegressor(), parameters, 'random forest regressor')

CV best score for random forest classifier :  0.143505898604 . (R^2)
Getting best params: 
{'max_features': 'log2', 'n_estimators': 30}


### 4.3 kNN

In [28]:
# kNN grid: leaf_size in 30, 40, ..., 90, n_neighbors in 5..19 and every listed search algorithm.
parameters = {'leaf_size':np.arange(30, 100, 10),'n_neighbors': np.arange(5, 20), 
              'algorithm':['auto', 'ball_tree', 'kd_tree', 'brute']}
compare(KNeighborsRegressor(), parameters, 'kNN')

CV best score for kNN :  0.119745072808 . (R^2)
Getting best params: 
{'leaf_size': 40, 'algorithm': 'auto', 'n_neighbors': 17}


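Among the linear models, the best cross-validated R^2 is achieved by linear regression with L1 regularization (Lasso, ≈ 0.152), narrowly ahead of L2 regularization (≈ 0.150); the depth-1 decision tree scores highest overall (≈ 0.201). Lasso is examined further below.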

# 5. Working with Lasso

Fitting Lasso

In [None]:
# Fit Lasso on the training split and predict the held-out test split.
# NOTE(review): the grid search output above reports 'positive': True as best, while positive=False is used here — confirm this is intended.
ls = Lasso(selection='random', alpha=1, normalize=False, positive=False)
ls.fit(X_train, y_train);
y_pred = ls.predict(X_test) # predictions for the test split

In [None]:
import matplotlib.pyplot as plt
import sklearn.metrics as metr

In [None]:
# Scatter plot of predicted vs. real like counts on the test split.
plt.scatter(y_test, y_pred)
# xlabel / ylabel are functions and must be called; assigning strings to them
# overwrites the pyplot functions and leaves the axes unlabeled.
plt.xlabel('Real values')
plt.ylabel('Predict values')
plt.show()

print('MAE: ', metr.mean_absolute_error(y_test, y_pred))

In [None]:
# First nine columns of the unscaled snapshot, selected by position.
# NOTE(review): which columns these are depends on all_data's column order — verify that the intended features are included.
data = all_data.iloc[:,0:9]

In [None]:
# 2x4 grid of scatter plots: every column of `data` except the last one is
# plotted against likes_number.
fig, axes = plt.subplots(nrows=2, ncols=4, figsize=(15, 10))
all_data.drop(['post_text'], inplace=True, axis=1, errors='ignore')
for idx, feature in enumerate(data.columns[:-1]):
    # divmod gives integer row/column positions; `idx / 4` would be a float
    # in Python 3, which is not a valid index into the axes array.
    row, col = divmod(idx, 4)
    data.plot(feature, 'likes_number', subplots=True, kind='scatter', ax=axes[row, col])
plt.show()