# Response to article

Goal: predict how readers respond to an article — upvotes, downvotes, number of comments, length of comments, ...  
An interpretable model would be preferable.

See <a href="#summary">summary</a> at the end.

In [118]:
%matplotlib inline
import re
from time import time
from datetime import datetime
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

import pandas as pd
# Load the article table and the comment table from their CSV files
# (articles first, then comments).
df_art, df_com = (
    pd.read_csv(csv_path)
    for csv_path in ('articles_2017_09.csv', 'comments_2017_09.csv')
)


In [2]:
from datetime import datetime

def get_dt_obj(time):
    """Parse a timestamp string into a ``datetime`` object.

    Every occurrence of the substring ``'am '`` is stripped first; what is
    left must match ``'%d.%m.%Y %H:%M'``, otherwise ``strptime`` raises
    ``ValueError``.
    """
    cleaned = time.replace('am ', '')
    return datetime.strptime(cleaned, '%d.%m.%Y %H:%M')

def time_since_epoch(time):
    """Return the number of seconds between 1970-01-01 00:00 and ``time``.

    ``time`` is subtracted from a naive ``datetime``; no timezone handling
    is performed here.
    """
    epoch = datetime(1970, 1, 1)
    elapsed = time - epoch
    return elapsed.total_seconds()

def get_hour_of_day(time):
    """Return the ``hour`` attribute of the datetime-like ``time``."""
    hour = time.hour
    return hour

def get_weekday(time):
    """Return ``time.weekday()``: the day of the week as an int, 0 = Monday."""
    day_index = time.weekday()
    return day_index

# --- Comment-level features ---
# Parse the timestamp string once, then derive the time features from it.
df_com['time_dt'] = df_com['time'].apply(get_dt_obj)
df_com['time_since_epoch'] = df_com['time_dt'].apply(time_since_epoch)
df_com['hour'] = df_com['time_dt'].apply(get_hour_of_day)
df_com['weekday'] = df_com['time_dt'].apply(get_weekday) # 0 = Monday

# A comment is flagged as an answer when its title starts with '@'.
# str(x) keeps this working for missing titles (NaN -> 'nan' -> False).
df_com['is_answer'] = df_com['tit'].apply(lambda x: str(x).startswith('@'))

# Length features via the vectorised .str accessor. Unlike len(x) / x.split(),
# these do not raise TypeError on a missing comment body: the result is NaN
# for such rows instead.
df_com['con_len'] = df_com['con'].str.len()
df_com['con_num_words'] = df_com['con'].str.split().str.len()

# Vote-based features: net score and total number of votes.
df_com['score'] = df_com['vup'] - df_com['vdo']
df_com['activity'] = df_com['vup'] + df_com['vdo']

# Lower-case title and content (done after the length features are computed).
df_com['tit'] = df_com['tit'].str.lower()
df_com['con'] = df_com['con'].str.lower()

def get_category(link):
    """Return the second ``'/'``-separated piece of ``link``.

    If ``link`` contains no ``'/'`` (so there is no second piece), the empty
    string is returned.
    """
    parts = link.split('/')
    return parts[1] if len(parts) > 1 else ''

# Category taken from the link via get_category(). A plain copy is kept in
# 'cat_copy' because get_dummies replaces 'cat' with one indicator column
# per category value.
df_art['cat'] = df_art['link'].map(get_category)
df_art['cat_copy'] = df_art['cat']
df_art = pd.get_dummies(df_art, columns=['cat'])

# Size features of the header and the article text. The text is passed
# through str() first, so a missing text is measured as the string 'nan'.
df_art['header_len'] = df_art['header'].map(len)
article_text = df_art['text'].map(str)
df_art['text_len'] = article_text.map(len)
df_art['text_num_words'] = article_text.map(lambda body: len(body.split()))
df_art['text_n_periods'] = article_text.map(lambda body: len(body.split('.')))

# Inner join on the article id: only comments whose tId also occurs in
# df_art (and vice versa) are kept.
df_merge = df_com.merge(df_art, on='tId')

# Discard every row that has a missing value in any column
# (use .count() to check for missing values)
df_merge = df_merge.dropna(axis=0, how='any')

# Sort the comments chronologically within each article, then group by
# article. From here on df_merge is a GroupBy object, not a DataFrame.
df_merge = df_merge.sort_values(['tId', 'time_since_epoch']).groupby('tId')
# Get article specific statistics, group = article
def get_art_statistics(group):
    """Attach article-level statistics to the comments of one article.

    Parameters
    ----------
    group : pandas.DataFrame
        Comments of a single article with the columns ``time_since_epoch``,
        ``weekday``, ``hour``, ``score`` and ``activity``. The first row is
        treated as the first comment, so the caller must sort by time.

    Returns
    -------
    pandas.DataFrame
        A copy of ``group`` with these columns added:

        * ``art_first_weekday`` / ``art_first_hour`` -- weekday and hour of
          the first comment, repeated on every row
        * ``time_since_first`` -- hours between each comment and the first
        * ``art_score_sum`` / ``art_activity_sum`` -- group sums of ``score``
          and ``activity``, repeated on every row
    """
    # Work on a copy: the frame handed in by groupby.apply must not be mutated.
    group = group.copy()

    # Scalars, not length-1 Series: assigning a one-row Series aligns on the
    # index and would leave NaN in every row except the first.
    first_time = group['time_since_epoch'].iloc[0]
    group['art_first_weekday'] = group['weekday'].iloc[0]
    group['art_first_hour'] = group['hour'].iloc[0]

    # Seconds -> hours since the first comment, computed vectorised.
    group['time_since_first'] = (group['time_since_epoch'] - first_time) / 3600

    group['art_score_sum'] = group['score'].sum()
    group['art_activity_sum'] = group['activity'].sum()

    return group

# Add the article statistics to every comment row. The index is reset because
# pandas >= 2.0 prepends the group key 'tId' to the index of the apply()
# result; 'tId' would then be both an index level and a column, which makes
# the groupby('tId') below ambiguous.
df_merge = df_merge.apply(get_art_statistics).reset_index(drop=True)
# Collapse to one row per article by averaging the numeric columns.
# numeric_only=True is explicit: pandas >= 2.0 raises on the string columns
# instead of silently dropping them.
df_merge = df_merge.groupby('tId').mean(numeric_only=True).reset_index()
# Re-attach the article text fields.
df_merge = df_merge.merge(df_art[['tId', 'header', 'sub', 'text']], on='tId')
df_merge.head(2)[['score', 'num_comments', 'header', 'con_len']]

Unnamed: 0,score,num_comments,header,con_len
0,110.904762,21.0,Was steckt hinter der Black-Death-Gruppe?,201.238095
1,49.123288,73.0,Tunesier lesen Aeschi nach Töff-Panne auf,181.479452


In [9]:
# One-hot encode art_first_weekday (one indicator column per value).
temp2 = pd.get_dummies(df_merge[['tId', 'art_first_weekday']], prefix='art_wd_', columns=['art_first_weekday'])
# Merge here first because of index!
df = df_merge.merge(temp2, on='tId') 

df = df.drop(['cat_playview', 'cat_play'], axis=1)

# Memory optimization
# Technical stuff, contributes nothing to analysis
conv = df.select_dtypes(include=['int']).apply(pd.to_numeric,downcast='unsigned')
df[conv.columns] = conv
    
# Get pearson co-efficients
# df still holds the string columns header/sub/text, and corr() raises on
# non-numeric data in pandas >= 2.0, so the numeric/bool columns are
# selected explicitly.
corr_matrix = df.select_dtypes(include=['number', 'bool']).corr()
corr_matrix
#colormap = plt.cm.viridis
#plt.figure(figsize=(12,12))
#plt.title('Pearson Correlation of Features', y=1.05, size=15)
#sns.heatmap(corr_matrix,linewidths=0.1,vmax=1.0, square=True, cmap=colormap, linecolor='white', annot=True)


Unnamed: 0,tId,mob,vup,vdo,time_since_epoch,hour,weekday,is_answer,con_len,con_num_words,...,time_since_first,art_score_sum,art_activity_sum,art_wd__0.0,art_wd__1.0,art_wd__2.0,art_wd__3.0,art_wd__4.0,art_wd__5.0,art_wd__6.0
tId,1.0,-0.018542,-0.007073,-0.022425,-0.02805,-0.020984,0.00675,-0.018922,-0.031802,-0.035247,...,-0.003031,-0.017488,-0.018494,0.008798,-0.000994,-0.019,0.007399,-0.006669,0.016245,-0.001915
mob,-0.018542,1.0,0.054791,0.04938,0.078052,0.069247,0.153564,0.40639,-0.228826,-0.225733,...,-0.062814,0.039142,0.048852,-0.045789,-0.03988,-0.045749,-0.058237,0.009414,0.141889,0.089604
vup,-0.007073,0.054791,1.0,0.61888,0.04319,0.081894,0.016693,-0.105767,0.063202,0.061404,...,-0.018275,0.400877,0.32415,0.011513,-0.001765,-0.038816,-0.010468,0.04104,-0.011043,0.014769
vdo,-0.022425,0.04938,0.61888,1.0,0.039784,0.070275,0.047313,0.015809,0.049966,0.051331,...,-0.003892,0.153751,0.217773,-0.002987,-0.014093,-0.04837,-0.010233,0.053777,0.001759,0.031396
time_since_epoch,-0.02805,0.078052,0.04319,0.039784,1.0,0.028211,0.019473,0.199747,-0.085076,-0.080685,...,-0.144166,0.062181,0.067308,0.00192,-0.024126,-0.011238,0.005448,0.015219,0.005996,0.011393
hour,-0.020984,0.069247,0.081894,0.070275,0.028211,1.0,0.061987,0.035059,-0.143962,-0.145367,...,-0.003809,-0.038671,-0.035141,-0.008612,-0.022406,-0.034495,0.004666,-0.036202,0.06175,0.059101
weekday,0.00675,0.153564,0.016693,0.047313,0.019473,0.061987,1.0,0.111448,-0.034601,-0.037037,...,0.009879,0.008323,0.022944,-0.596753,-0.406757,-0.141742,0.140547,0.359214,0.46067,0.32375
is_answer,-0.018922,0.40639,-0.105767,0.015809,0.199747,0.035059,0.111448,1.0,0.078033,0.078224,...,0.000976,0.137872,0.227306,-0.012877,-0.036953,-0.048042,-0.043075,-0.005886,0.119166,0.07111
con_len,-0.031802,-0.228826,0.063202,0.049966,-0.085076,-0.143962,-0.034601,0.078033,1.0,0.992499,...,0.032836,0.184196,0.189821,-0.009154,0.02376,0.013582,0.040204,-0.011074,-0.043809,-0.033885
con_num_words,-0.035247,-0.225733,0.061404,0.051331,-0.080685,-0.145367,-0.037037,0.078224,0.992499,1.0,...,0.031099,0.183538,0.19156,-0.007212,0.018611,0.018952,0.042909,-0.010403,-0.044945,-0.039834


In [110]:
# https://de.wikipedia.org/wiki/Liste_der_h%C3%A4ufigsten_W%C3%B6rter_der_deutschen_Sprache
#stop_words = "die, der, und, in, zu, den, das, nicht, von, sie, ist, des, sich, mit, dem, dass, er, es, ein, ich, auf, so, eine, auch, als, an, nach, wie, im, für, "
#stop_words += "man, aber, aus, durch, wenn, nur, war, noch, werden, bei, hat, wir, was, wird, sein, einen, welche, sind, oder, zur, um, haben, einer, mir, über, ihm, diese, einem, ihr, uns, "
#stop_words += "da, zum, kann, doch, vor, dieser, mich, ihn, du, hatte, seine, mehr, am, denn, nun, unter, sehr, selbst, schon, hier, im,"
#stop_words += "bis, habe, ihre, dann, ihnen, seiner, alle, wieder, meine, Zeit, gegen, vom, ganz, einzelnen, wo, muss, ohne, eines, können, sei"
#stop_words = stop_words.lower()
#stop_words = stop_words.split(', ')

# Feature matrix: drop the comment-derived columns (including the target
# num_comments) and the raw text columns.
comment_derived = ['score', 'activity', 'vup', 'vdo', 'num_comments', 'mob',
                   'art_score_sum', 'art_activity_sum', 'con_len', 'con_num_words', 'is_answer']
# remove text stuff?
raw_text = ['text', 'header', 'sub']
df2 = df.drop(comment_derived + raw_text, axis=1)
X = df2
y = df['num_comments']
#y = df['con_len']
#y = df['vup']

from sklearn.model_selection import train_test_split, learning_curve
# Hold out 30% of the rows for testing; fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=0,
)
print("total data: ", len(X))

# Class to extend the Sklearn classifier
class SklearnHelper(object):
    """Uniform train/predict wrapper around a scikit-learn style estimator.

    Parameters
    ----------
    clf : callable
        Estimator class (or factory); instantiated as ``clf(**params)``.
    seed : int, default 0
        Passed as ``random_state`` when ``clf`` accepts that keyword. It is
        skipped for estimators that have no such parameter, which would
        otherwise raise ``TypeError`` on construction.
    params : dict or None, default None
        Keyword arguments for ``clf``. The dict is copied, so the caller's
        object is never modified. ``None`` means no extra arguments.
    """

    def __init__(self, clf, seed=0, params=None):
        # Copy: do not inject random_state into the caller's dict, and
        # tolerate the params=None default.
        params = dict(params or {})
        if self._accepts_random_state(clf):
            params['random_state'] = seed
        self.clf = clf(**params)

    @staticmethod
    def _accepts_random_state(clf):
        """Return True if ``clf`` can be called with a ``random_state`` keyword."""
        import inspect  # local import keeps the class self-contained

        try:
            accepted = inspect.signature(clf).parameters.values()
        except (TypeError, ValueError):
            # Signature not introspectable: keep the original behaviour
            # and always pass the seed.
            return True
        return any(
            p.name == 'random_state' or p.kind is p.VAR_KEYWORD
            for p in accepted
        )

    def train(self, x_train, y_train):
        """Fit the wrapped estimator; returns ``None``."""
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        """Return the wrapped estimator's predictions for ``x``."""
        return self.clf.predict(x)
    
    def fit(self,x,y):
        """Fit the wrapped estimator and return whatever its ``fit`` returns."""
        return self.clf.fit(x,y)
    
    def feature_importances(self,x,y):
        """Refit on ``(x, y)`` and return the estimator's ``feature_importances_``."""
        return (self.clf.fit(x,y).feature_importances_)

def get_oof(clf):
    """Compute out-of-fold predictions for one base model.

    Uses the notebook globals ``X_train``, ``y_train``, ``X_test``,
    ``ntrain``, ``ntest``, ``NFOLDS`` and ``kf``.

    For every fold, ``clf`` is trained on the fold's training rows of
    ``X_train`` and then predicts (a) the held-out rows of ``X_train`` and
    (b) all of ``X_test``. The test predictions are averaged over the folds.

    Parameters
    ----------
    clf : object
        Model exposing ``train(x, y)`` and ``predict(x)``.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Column vectors of shape ``(ntrain, 1)`` and ``(ntest, 1)``.
    """
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))

    x_all = X_train.values
    y_all = y_train.values
    # Plain array, like the training data, so the model sees the same input
    # type at fit and predict time.
    x_holdout = X_test.values

    # The folds must be drawn from X_train: the indices are used to slice
    # X_train / y_train. Splitting X_test would only ever touch the first
    # ntest training rows and leave the rest of oof_train at zero.
    for i, (train_index, test_index) in enumerate(kf.split(X_train)):
        x_tr = x_all[train_index]
        y_tr = y_all[train_index]
        x_te = x_all[test_index]

        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_holdout)

    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

# KFold is otherwise only imported in a later cell, so this cell would fail
# with NameError on a fresh kernel.
from sklearn.model_selection import KFold

ntrain = X_train.shape[0]
ntest = X_test.shape[0]
SEED = 0 # for reproducibility
NFOLDS = 3 # set folds for out-of-fold prediction
# random_state only has an effect together with shuffle=True; current
# scikit-learn raises ValueError when it is set while shuffle is False.
kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)

#X.info()

total data:  2661


In [111]:
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.svm import SVR
from sklearn.model_selection import KFold;

# Hyper-parameters for the base regressors
# Random Forest parameters
# NOTE: deliberately no 'warm_start'. get_oof() calls fit() on the same
# estimator once per fold; with warm_start=True and an unchanged
# n_estimators, scikit-learn keeps the trees from the first fit and does not
# retrain, so the later folds would reuse a model that has already seen
# their held-out rows.
rf_params = {
    'n_jobs': -1,
    'n_estimators': 500,
     #'max_features': 0.2,
    'max_depth': 6,
    'min_samples_leaf': 2,
    'max_features' : 'sqrt',
    'verbose': 0
}

# Extra Trees Parameters
et_params = {
    'n_jobs': -1,
    'n_estimators':500,
    #'max_features': 0.5,
    'max_depth': 8,
    'min_samples_leaf': 2,
    'verbose': 0
}

# AdaBoost parameters
ada_params = {
    'n_estimators': 500,
    'learning_rate' : 0.75
}

# Gradient Boosting parameters
gb_params = {
    'n_estimators': 500,
     #'max_features': 0.2,
    'max_depth': 5,
    'min_samples_leaf': 2,
    'verbose': 0
}

# Support Vector Regressor parameters
svr_params = {
    'kernel' : 'linear',
    'C' : 0.025
    }

# Create 5 objects that represent our 5 base models
rf = SklearnHelper(clf=RandomForestRegressor, seed=SEED, params=rf_params)
et = SklearnHelper(clf=ExtraTreesRegressor, seed=SEED, params=et_params)
ada = SklearnHelper(clf=AdaBoostRegressor, seed=SEED, params=ada_params)
gb = SklearnHelper(clf=GradientBoostingRegressor, seed=SEED, params=gb_params)
svr = SklearnHelper(clf=SVR, seed=SEED, params=svr_params)

# Create our OOF train and test predictions. These base results will be used as new features
et_oof_train, et_oof_test = get_oof(et) # Extra Trees
rf_oof_train, rf_oof_test = get_oof(rf) # Random Forest
ada_oof_train, ada_oof_test = get_oof(ada) # AdaBoost 
gb_oof_train, gb_oof_test = get_oof(gb) # Gradient Boost
svr_oof_train, svr_oof_test = get_oof(svr) # Support Vector Regressor

print("Training is complete.")

Training is complete.


In [112]:
# Refit each tree-based model on the full training split and collect its
# feature importances (same order as before: rf, et, ada, gb).
rf_feature, et_feature, ada_feature, gb_feature = (
    model.feature_importances(X_train, y_train)
    for model in (rf, et, ada, gb)
)

In [113]:
cols = X_train.columns.values
# Create a dataframe with features: one row per feature, one column per model
feature_dataframe = pd.DataFrame( {'features': cols,
     'Random Forest feature importances': rf_feature,
     'Extra Trees  feature importances': et_feature,
      'AdaBoost feature importances': ada_feature,
    'Gradient Boost feature importances': gb_feature
    })

# Create the new column containing the row-wise average of the importances.
# The string column 'features' is excluded explicitly: pandas >= 2.0 raises
# when a mean is taken over non-numeric columns.
importance_cols = [c for c in feature_dataframe.columns if c != 'features']
feature_dataframe['mean'] = feature_dataframe[importance_cols].mean(axis=1)
#print(feature_dataframe.head(3))

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
# Dedicated names for the plot data, so the notebook-level target `y` is not
# overwritten by this cell.
mean_importance = feature_dataframe['mean'].values
feature_names = feature_dataframe['features'].values
data = [go.Bar(
            x= feature_names,
             y= mean_importance,
            width = 0.5,
            marker=dict(
               color = mean_importance,
            colorscale='Portland',
            showscale=True,
            reversescale = False
            ),
            opacity=0.6
        )]

layout= go.Layout(
    autosize= True,
    title= 'Barplots of Mean Feature Importance',
    hovermode= 'closest',
#     xaxis= dict(
#         title= 'Pop',
#         ticklen= 5,
#         zeroline= False,
#         gridwidth= 2,
#     ),
    yaxis=dict(
        title= 'Feature Importance',
        ticklen= 5,
        gridwidth= 2
    ),
    showlegend= False
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='bar-direct-labels')

In [115]:
# Out-of-fold training predictions of the four tree-based models,
# flattened to one column per model.
oof_by_model = {
    'RandomForest': rf_oof_train,
    'ExtraTrees': et_oof_train,
    'AdaBoost': ada_oof_train,
    'GradientBoost': gb_oof_train,
}
base_predictions_train = pd.DataFrame(
    {model_name: oof.ravel() for model_name, oof in oof_by_model.items()}
)
#print(base_predictions_train.head())

# Pairwise correlation between the models' predictions, drawn as a heatmap.
prediction_corr = base_predictions_train.astype(float).corr()
model_names = base_predictions_train.columns.values
data = [
    go.Heatmap(
        z=prediction_corr.values,
        x=model_names,
        y=model_names,
        colorscale='Portland',
        showscale=True,
        reversescale=True,
    )
]
py.iplot(data, filename='labelled-heatmap')

# Less correlation is better

In [None]:
# Second-level features: the out-of-fold predictions of the five base models,
# one column per model.
x_train = np.concatenate(( et_oof_train, rf_oof_train, ada_oof_train, gb_oof_train, svr_oof_train), axis=1)
x_test = np.concatenate(( et_oof_test, rf_oof_test, ada_oof_test, gb_oof_test, svr_oof_test), axis=1)

# The target (y_train) is a continuous count and the result is scored with
# regression metrics, so the meta-model must be a regressor. The previous
# XGBClassifier with objective='binary:logistic' treated every distinct
# target value as a class label. The classification-only settings
# (objective, scale_pos_weight) are dropped; XGBRegressor's default
# objective is used.
gbm = xgb.XGBRegressor(
    #learning_rate = 0.02,
 n_estimators= 100, # was 2000
 max_depth= 4,
 min_child_weight= 2,
 #gamma=1,
 gamma=0.9,                        
 subsample=0.8,
 colsample_bytree=0.8,
 nthread= -1).fit(x_train, y_train)
predictions = gbm.predict(x_test)

In [None]:
from sklearn.metrics import r2_score, explained_variance_score, mean_squared_error, mean_absolute_error

# Score the stacked model's predictions on the held-out test split.
y_pred = predictions
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print("R^2: %1.2f" % r2)
#print("Explained var: {:3f}".format(explained_variance_score(y_test, y_pred)))
print("Mean absolute err: {:.2f}".format(mae))
print("Mean squared err: {:.2f}".format(mse))

# Actual vs. predicted target, both plotted against the text_len feature
# (this is not a residual plot: no differences are computed).
X_res = X_test['text_len']
plt.scatter(X_res, y_test, color='black', label='test data')
plt.scatter(X_res, y_pred, color='red', label='predicted')
plt.xlabel('text_len')
plt.legend()
plt.show()

predictions