# Prediction of User Reactions
We want to predict:
* How to get high score/consensus (upvotes - downvotes)
* How to be controversial (upvotes + downvotes)  
in the comments.

### Import data

In [17]:
%matplotlib inline
import re
import numpy as np
import matplotlib.pyplot as plt
# Not necessary, but I like the ggplot style better
plt.style.use('ggplot')

import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

import pandas as pd
# Seed for the random subsample, so every number derived from it is
# reproducible after "Restart Kernel -> Run All".
SAMPLE_SEED = 0
# Upper bound on the number of comments kept. The dump is cropped to keep the
# run time (and battery drain) low; note that subsampling skews the data.
COMMENT_SAMPLE_SIZE = 20000

df_art = pd.read_csv('articles_2017_08.csv')
_df_com_full = pd.read_csv('comments_2017_08.csv')
# DataFrame.sample raises when asked for more rows than exist (without
# replacement), so cap the sample size at the number of available comments.
df_com = _df_com_full.sample(
    n=min(COMMENT_SAMPLE_SIZE, len(_df_com_full)),
    random_state=SAMPLE_SEED,
)
del _df_com_full  # the full dump is not needed any more
# Make floats more readable
pd.options.display.float_format = '{:.3f}'.format

### Prepare features

In [18]:
from datetime import datetime

def get_dt_obj(time):
    """Parse a timestamp string into a ``datetime`` object.

    Every occurrence of the substring ``'am '`` is removed first; what is left
    has to match ``'%d.%m.%Y %H:%M'``, otherwise ``strptime`` raises
    ``ValueError``.
    """
    cleaned = time.replace('am ', '')
    return datetime.strptime(cleaned, '%d.%m.%Y %H:%M')

def time_since_epoch(time):
    """Return the seconds (float) between 1970-01-01 00:00 and the timestamp.

    The timestamp string is parsed with ``get_dt_obj``; both datetimes are
    naive, so no time-zone conversion takes place.
    """
    epoch = datetime(1970, 1, 1)
    elapsed = get_dt_obj(time) - epoch
    return elapsed.total_seconds()

def get_hour_of_day(time):
    """Return the hour (0-23) of the timestamp string."""
    moment = get_dt_obj(time)
    return moment.hour

def get_weekday(time):
    """Return the weekday of the timestamp string (0 = Monday ... 6 = Sunday)."""
    moment = get_dt_obj(time)
    return moment.weekday()

# Basically the same as "the hour of week" or "weekday_hourOfDay"
def get_weekday_float(time):
    """Return weekday plus the fraction of the day given by the hour.

    The integer part is the weekday (0 = Monday), the fractional part is
    ``hour / 24``; minutes are ignored.
    """
    moment = get_dt_obj(time)
    day_fraction = float(moment.hour) / 24
    return float(moment.weekday()) + day_fraction

def get_weekday_hour(time):
    """Return a ``'<weekday>_<hour>'`` label for the timestamp string."""
    moment = get_dt_obj(time)
    return '{}_{}'.format(moment.weekday(), moment.hour)

# Parse every timestamp exactly once; all time features are derived from this
# single datetime column instead of re-parsing the string for each feature.
comment_dt = pd.to_datetime(df_com['time'].apply(get_dt_obj))

df_com['time_since_epoch'] = (comment_dt - pd.Timestamp('1970-01-01')).dt.total_seconds()
df_com['hour'] = comment_dt.dt.hour
df_com['weekday'] = comment_dt.dt.weekday  # 0 = Monday
# "Hour of week" as a float: weekday + fraction of the day (minutes ignored)
df_com['weekday_fl'] = df_com['weekday'] + df_com['hour'] / 24
df_com['weekday_hour'] = df_com['weekday'].astype(str) + '_' + df_com['hour'].astype(str)

# A title starting with '@' marks the comment as an answer to another comment
df_com['is_answer'] = df_com['tit'].astype(str).str.startswith('@')

# Treat a missing comment body as empty text so the length features do not
# raise on NaN values.
comment_text = df_com['con'].fillna('').astype(str)
df_com['con_len'] = comment_text.str.len()
df_com['con_num_words'] = comment_text.str.split().str.len()

df_com['score'] = df_com['vup'] - df_com['vdo']
df_com['contr'] = df_com['vup'] + df_com['vdo']

df_com['tit'] = df_com['tit'].str.lower()
df_com['con'] = df_com['con'].str.lower()

def get_category(link):
    """Return the second ``'/'``-separated piece of ``link``.

    For a link with a leading slash such as ``'/schweiz/basel/story/1'`` this
    is the first path segment (``'schweiz'``). A link without any ``'/'``
    yields the empty string.
    """
    parts = link.split('/')
    return parts[1] if len(parts) > 1 else ''

# Category of every article, taken from its link. 'cat' is consumed by the
# one-hot encoding below, so a plain copy is kept in 'cat_copy'.
df_art['cat'] = df_art['link'].map(get_category)
df_art['cat_copy'] = df_art['cat']
df_art = pd.get_dummies(df_art, columns=['cat'])

# Length features of header and body text. The text is passed through str()
# first; the header is measured as-is.
df_art['header_len'] = df_art['header'].apply(len)
article_text = df_art['text'].apply(str)
df_art['text_len'] = article_text.apply(len)
df_art['text_num_words'] = article_text.apply(lambda body: len(body.split()))

# Inner join (the pandas default): only comments with a matching article
# row survive the merge.
df_merge = pd.merge(left=df_com, right=df_art, left_on='tId', right_on='tId')

# is empty: (= all comments correspond to an article)
#print(df_merge[pd.isnull(df_merge['link'])])
#df_merge[df_merge['vup']>1000]

# Get order of comments per article.
# group_keys=False keeps the original row index when a function is applied
# per group; without it, pandas >= 2.0 prepends 'tId' as an extra index level
# to the applied result, so the result index would depend on the pandas version.
df_merge_art = (
    df_merge
    .sort_values(['tId', 'time_since_epoch'])
    .groupby('tId', group_keys=False)
)
# Get time since the first comment
def get_time_since_first(group):
    """Add a ``time_since_first`` column (in hours) to one article's comments.

    Parameters
    ----------
    group : pandas.DataFrame
        Comments of a single article; must contain ``time_since_epoch``
        (seconds).

    Returns
    -------
    pandas.DataFrame
        A copy of ``group`` (same index) with the extra float column
        ``time_since_first``: hours elapsed since the earliest comment of
        the group. The input frame is not modified.
    """
    # Scalar timestamp of the earliest comment. Using min() instead of the
    # first row gives the same value for sorted groups and stays correct if
    # the group arrives unsorted.
    first = group['time_since_epoch'].min()
    # Vectorised subtraction; 3600 seconds per hour.
    hours_since_first = (group['time_since_epoch'] - first) / 3600
    # Remove those very late comments, after x hours
    #group = group[group['time_since_first'] < 36]
    return group.assign(time_since_first=hours_since_first)

# Run the helper once per article group. Note that the name is rebound here:
# it held the GroupBy object before and holds the applied result afterwards.
df_merge_art = df_merge_art.apply(get_time_since_first)
print(df_art.head(n=5))

        tId  article_id        updated  num_comments  \
0  18602624    18602624 1502366469.553             0   
1  23276166    23276166 1502975307.473             0   
2  17735228    17735228 1502796186.031            29   
3  16318631    16318631 1502461203.665            40   
4  22118017    22118017 1502796053.100            20   

                                                link  \
0  /schweiz/ostschweiz/story/-Er-rannte-ihm-mit-d...   
1                      /schweiz/basel/story/23276166   
2  /schweiz/zuerich/story/Limmatschwimmen-findet-...   
3     /community/dossier/geldratgeber/story/16318631   
4  /schweiz/news/story/Zivis-sollen-bei-Fluechtli...   

                                             header  \
0       «Er rannte ihm mit der Mistgabel hinterher»   
1  Unter Drogen und Alkohol Zugbegleiter verprügelt   
2     Limmatschwimmen findet am Samstag nicht statt   
3                Muss ich bei Vertragsbruch zahlen?   
4        Zivis sollen bei Flüchtlingsansturm helfen

### Trying to remove skew
__Rescaling__: Add or subtract a constant and then multiply or divide by a constant.  
__Normalizing__: Dividing by a norm of the vector, e.g. make Euclidean length equal to one. Sometimes make all elements lie in [0, 1].  
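__Standardizing__: Subtracting a measure of location and dividing by a measure of scale. E.g. subtract the mean and divide by the standard deviation, which gives the data zero mean and unit variance. The shape of the distribution does not change, so the result is only standard normal if the data was normally distributed to begin with.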

These terms are sometimes used interchangeably.

It's usually better to have the input values centered around zero, unless the output activation function has a range of [0, 1] (neural networks).

In [19]:
import math
def show_hist(col, frame=None):
    """Plot a 100-bin histogram of one column as a bar chart.

    Parameters
    ----------
    col : str
        Name of the column to plot; it is also used as the plot title.
    frame : pandas.DataFrame, optional
        Frame to read the column from. Defaults to the global
        ``df_merge_art``; pass another frame (e.g. ``df``) to plot columns
        that only exist there.

    The frame is only read: no helper column is written back into it.
    """
    data = df_merge_art if frame is None else frame
    # Count rows per equal-width bin; sort=False plus sort_index keeps the
    # bins in ascending order and retains empty bins.
    bin_counts = pd.cut(data[col], 100).value_counts(sort=False).sort_index()
    bin_counts.plot.bar(title=col)
    # 100 interval labels are unreadable, so hide the x axis
    plt.gca().get_xaxis().set_visible(False)
    plt.show()
    
# These category boundaries are arbitrary choices
bins = [-np.inf, 2, 25, np.inf]
labels = ['negative', 'small', 'big']
df_merge_art['score_cat'] = pd.cut(df_merge_art['score'], bins=bins, labels=labels)
score_cat_sizes = df_merge_art.groupby('score_cat').size()
print(score_cat_sizes)

# This doesn't change anything, which is interesting
#df = df[(df['score']<2) | (df['score']>=25)]

# Independent working copy; the transformed columns are added to this frame
df = df_merge_art.copy()
def sgn(x):
    """Sign of ``x``: ``0`` for zero, otherwise ``x / abs(x)`` (+1.0 or -1.0)."""
    return 0 if x == 0 else x / abs(x)
# Removes left/right skew
def transformation(x):
    """Signed log transform: ``sgn(x) * ln(|x| + 1)``.

    Keeps the sign of ``x`` and maps 0 to 0, so it can also be used on
    columns with negative values such as the score.
    """
    return sgn(x) * math.log(abs(x) + 1)

# Columns that receive a '<name>_trans' counterpart in df
skewed_cols = ['weekday_fl', 'con_len', 'text_len', 'time_since_first',
               'score', 'contr', 'header_len', 'con_num_words', 'text_num_words']
for col in skewed_cols:
    df[col + '_trans'] = df_merge_art[col].apply(transformation)
    #show_hist(col + '_trans')

# I removed very weak correlations to "score" and "contr"
cols = [
    'score', 'score_trans', 'vup', 'vdo', 'weekday', 'hour', 'weekday_fl',
    'con_len_trans', 'text_len_trans', 'con_num_words_trans',
    'text_num_words_trans',
    'time_since_first_trans', 'header_len_trans',
    'cat_schweiz', 'cat_finance', 'cat_sport', 'cat_wissen', 'cat_ausland',
]
# Get pearson co-efficients
df[cols].corr()

#for c in cols:
#    print(c)
#    print(df[np.isnan(df[col])].head(3))

# TODO transform data for learners (non-linear probably)

score_cat
negative    6615
small       7651
big         5983
dtype: int64


Unnamed: 0,score,score_trans,vup,vdo,weekday,hour,weekday_fl,con_len_trans,text_len_trans,con_num_words_trans,text_num_words_trans,time_since_first_trans,header_len_trans,cat_schweiz,cat_finance,cat_sport,cat_wissen,cat_ausland
score,1.0,0.496,0.955,0.061,-0.018,-0.02,-0.02,0.006,0.003,0.003,0.004,-0.184,0.011,0.044,-0.03,-0.023,-0.014,-0.001
score_trans,0.496,1.0,0.368,-0.325,-0.013,-0.017,-0.015,0.036,0.0,0.032,0.001,-0.173,-0.0,0.035,-0.02,-0.025,-0.01,-0.016
vup,0.955,0.368,1.0,0.356,-0.018,-0.018,-0.02,0.013,0.001,0.01,0.001,-0.223,0.01,0.046,-0.043,-0.013,-0.019,0.01
vdo,0.061,-0.325,0.356,1.0,-0.004,0.001,-0.003,0.025,-0.007,0.025,-0.006,-0.17,0.001,0.018,-0.048,0.029,-0.02,0.037
weekday,-0.018,-0.013,-0.018,-0.004,1.0,0.008,0.993,0.002,-0.015,0.003,-0.023,0.099,-0.049,0.0,-0.015,0.01,0.012,-0.024
hour,-0.02,-0.017,-0.018,0.001,0.008,1.0,0.126,-0.048,-0.025,-0.049,-0.027,-0.068,0.022,-0.055,-0.007,0.004,0.032,0.006
weekday_fl,-0.02,-0.015,-0.02,-0.003,0.993,0.126,1.0,-0.004,-0.018,-0.003,-0.026,0.09,-0.046,-0.006,-0.016,0.01,0.015,-0.024
con_len_trans,0.006,0.036,0.013,0.025,0.002,-0.048,-0.004,1.0,0.073,0.989,0.072,0.05,0.02,0.048,0.046,-0.054,-0.013,0.014
text_len_trans,0.003,0.0,0.001,-0.007,-0.015,-0.025,-0.018,0.073,1.0,0.068,0.994,-0.053,0.066,0.165,0.074,-0.008,0.015,0.062
con_num_words_trans,0.003,0.032,0.01,0.025,0.003,-0.049,-0.003,0.989,0.068,1.0,0.068,0.051,0.017,0.045,0.042,-0.05,-0.012,0.009


### Prepare and split for model

In [20]:
# https://de.wikipedia.org/wiki/Liste_der_h%C3%A4ufigsten_W%C3%B6rter_der_deutschen_Sprache
# The 30 most frequent German words, as one ', '-separated string.
stop_words = "die, der, und, in, zu, den, das, nicht, von, sie, ist, des, sich, mit, dem, dass, er, es, ein, ich, auf, so, eine, auch, als, an, nach, wie, im, für"
# Further candidates (disabled). NOTE: each chunk would need a leading ", "
# before it is appended, otherwise it fuses with the previous last word.
#stop_words += "man, aber, aus, durch, wenn, nur, war, noch, werden, bei, hat, wir, was, wird, sein, einen, welche, sind, oder, zur, um, haben, einer, mir, über, ihm, diese, einem, ihr, uns"
#stop_words += "da, zum, kann, doch, vor, dieser, mich, ihn, du, hatte, seine, mehr, am, denn, nun, unter, sehr, selbst, schon, hier"
#stop_words += "bis, habe, ihre, dann, ihnen, seiner, alle, wieder, meine, Zeit, gegen, vom, ganz, einzelnen, wo, muss, ohne, eines, können, sei"
# Lower-case the string and turn it into a list of words
stop_words = stop_words.lower().split(', ')

def classify(val):
    """Binary label: ``1`` when ``val`` is greater than 10, else ``0``."""
    return 1 if val > 10 else 0

# The vote counts and the two targets must not be available as features.
target_cols = ['score', 'contr', 'vup', 'vdo']
# Columns computed from the targets earlier in the notebook. Leaving them in X
# would leak the target as soon as a feature selector picks them up.
# errors='ignore' keeps this cell working if one of them was never created.
target_derived_cols = ['score_trans', 'contr_trans', 'score_cat']
X = df.drop(target_cols, axis=1).drop(target_derived_cols, axis=1, errors='ignore')

#y = df_merge_art['score'].apply(classify)
y = df['score']

from sklearn.model_selection import train_test_split
# Fixed random_state so the 80/20 split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

### Model 

In [None]:
from sklearn.preprocessing import StandardScaler, Normalizer, PolynomialFeatures
from sklearn.metrics import r2_score, explained_variance_score
from sklearn.pipeline import make_pipeline, Pipeline, FeatureUnion
from sklearn.feature_selection import SelectPercentile, f_regression
from sklearn.linear_model import Ridge, ElasticNet
#from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeRegressor

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import BaseEstimator, TransformerMixin

class ItemSelector(BaseEstimator, TransformerMixin):
    '''
    Stateless transformer that selects a subset of data at a provided key.

    Used as the first step of a pipeline branch so that the following steps
    only see the selected part of the input (e.g. a list of column names of
    a DataFrame).

    key: hashable or list, required
        The key corresponding to the desired value in a mappable; it is
        passed unchanged to ``data[key]`` in ``transform``.
    '''
    def __init__(self, key):
        # Stored verbatim under the same name as the argument
        self.key = key
        
    def fit(self, x, y=None):
        '''Nothing is learned from the data; returns the transformer itself.'''
        return self
    
    def transform(self, df):
        '''Return ``df[self.key]``.'''
        return df[self.key]
    
def no_numbers(x):
    """Lower-case ``x`` and remove all numbers from it.

    A number is a run of digits, optionally continued by dot-separated digit
    groups (``2017``, ``1.500``, ``12.08.2017``). A dot that merely follows a
    number, such as a sentence-ending period, is kept.

    The previous pattern ``(\\d[\\d\\.])+`` consumed characters in pairs, so
    single digits survived and odd-length numbers left their last digit behind.
    """
    return re.sub(r'\d+(?:\.\d+)*', '', x.lower())
# Columns fed into the 'statistics' branch of the feature union
statistics_columns = [
    'weekday', 'hour', 'con_len_trans',
    'time_since_first_trans',
    'cat_schweiz', 'cat_finance', 'cat_sport', 'cat_wissen', 'cat_ausland',
    'cat_panorama', 'cat_community', 'cat_people', 'cat_digital',
]

# select columns -> standardize -> polynomial terms up to degree 3
statistics_branch = Pipeline([
    ('selector', ItemSelector(key=statistics_columns)),
    ('scaler', StandardScaler()),
    ('polynomialfeatures', PolynomialFeatures(degree=3)),
])

feature_union = FeatureUnion([
    ('statistics', statistics_branch),
    # add more features here
])

model = Pipeline([
    ('union', feature_union),

    # Keep the better-scoring half of the features (univariate F-test)
    ('anova', SelectPercentile(f_regression, percentile=50)),

    # Params not optimized yet. Alternatives that were tried:
    #('model', ElasticNet(alpha=1e-1, l1_ratio=0.3, max_iter=50))
    #('model', Ridge(alpha=1e3, max_iter=10, solver='svd'))
    #('model', SVC(kernel='linear', n_jobs=-1))
    ('model', DecisionTreeRegressor(max_depth=3)),
])


model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("R^2: %1.3f" % r2_score(y_test, y_pred))
# '{:.3f}' = three decimals, matching the R^2 line ('{:3f}' only set the width)
print("Explained var: {:.3f}".format(explained_variance_score(y_test, y_pred)))
print(y_pred)
#print(classification_report(y_test, y_pred))

# Actual vs. predicted score over one feature (not a residual plot: the
# points are the raw values, not the differences between them)
X_res = X_test['time_since_first_trans']
plt.scatter(X_res, y_test, color='black', label='actual')
plt.scatter(X_res, y_pred, color='red', label='predicted')
plt.xlabel('time_since_first_trans')
plt.ylabel('score')
plt.legend()
plt.show()

# The hyper-parameter search is switched off by default; set to True to run it
RUN_GRID_SEARCH = False

if RUN_GRID_SEARCH:
    from sklearn.model_selection import GridSearchCV

    # Keys follow the '<pipeline step>__<parameter>' convention
    param_grid = {
        'model__max_depth': list(range(1, 20)),
        # Other grids that were tried:
        #'polynomialfeatures__degree': [2, 3, 4, 5]
        #'model__criterion': ['mse', 'friedman_mse', 'mae']
        #'model__alpha': np.logspace(-1, 4, 6),
        #'model__l1_ratio': [0.2, 0.3, 0.4, 0.5, 0.6],
        #'model__max_iter': [10, 50, 100, 500],
        #'model__solver': ['auto', 'svd', 'cholesky', 'lsqr']
        #'anova__percentile': [5, 10, 20, 40, 60]
    }
    grid = GridSearchCV(model, param_grid, cv=3)
    grid.fit(X_train, y_train)

    best_model_step = grid.best_estimator_.named_steps['model']
    print("Best estimator:\n{}".format(best_model_step))

### Summary so far
The decision tree regressor depends a lot on the max_depth parameter: depending on its value, the model over- or underfits. I reached the best R^2 score with max_depth=3, but the model still underfits.

So either the model can't learn the features, or the data is not good enough.