## Data Source / Following Along

https://archive.org/details/stackexchange

https://github.com/data-skeptic/feature-engineering-training

https://s3.amazonaws.com/dataskeptic-static/farcon/Posts.xml

# Loading our data to a dataframe

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import xmltodict
import pandas as pd
import datetime
import bleach
import itertools
import math
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
from scipy.stats.stats import pearsonr

In [None]:
fname = 'stats.stackexchange.com/Posts.xml'

In [None]:
import xml.etree.ElementTree
# Parse the dump named by `fname` rather than repeating the path literal, so
# this loader and the xmltodict loader below cannot drift apart.
# NOTE(review): no later cell visible in this notebook reads `e`.
e = xml.etree.ElementTree.parse(fname).getroot()

In [None]:
# Read the whole dump inside a context manager so the handle is closed even
# if reading fails, and pin the encoding so the result does not depend on the
# platform default.
# NOTE(review): UTF-8 is assumed for the Stack Exchange dump — confirm.
with open(fname, "r", encoding="utf-8") as document_file:
    original_doc = document_file.read()
document = xmltodict.parse(original_doc)

In [None]:
df = pd.DataFrame(document['posts']['row'])

In [None]:
# Remove every '@' from the column labels (presumably the attribute prefix
# added by xmltodict — verify).
df.columns = [label.replace('@', '') for label in df.columns]

In [None]:
pd.options.display.max_columns = 999

In [None]:
df.head()

## Data Cleaning

In [None]:
# Assign the filled column back instead of calling fillna(inplace=True) on
# the df['ViewCount'] selection: the in-place form acts on an intermediate
# object and is not guaranteed to update `df` (pandas warns about it, and it
# has no effect under Copy-on-Write).
df['ViewCount'] = df['ViewCount'].fillna(0)
df['ViewCount'] = df['ViewCount'].astype(int)
df['Score'] = df['Score'].astype(int)
df['CreationDate'] = pd.to_datetime(df['CreationDate'])

In [None]:
df.groupby(['PostTypeId'])['ViewCount'].median()

This field seems useless, let's get rid of it

In [None]:
del df['PostTypeId']

## Understanding Our Objective
We should expect that our result can be highly biased by right censoring: recent posts have not yet had time to accumulate views.

In [None]:
# Calendar date (time of day dropped) of each post's creation timestamp.
df['cdt'] = df['CreationDate'].dt.date

In [None]:
# Mean ViewCount per creation date, as a two-column frame sorted by date.
g = (
    pd.DataFrame(df.groupby(['cdt'])['ViewCount'].mean())
    .reset_index()
    .sort_values('cdt')
)

In [None]:
plt.figure(figsize=(15,5))
plt.plot(g['cdt'], g['ViewCount'])
plt.show()

In [None]:
# Number of posts observed at each distinct ViewCount value.
g = pd.DataFrame(df.groupby(['ViewCount'])['Id'].count()).reset_index()
g['ViewCount'] = g['ViewCount'].astype(int)
g = g.sort_values('ViewCount')
g.columns = ['ViewCount', 'frequency']

In [None]:
plt.figure(figsize=(15,5))
plt.plot(g['ViewCount'][1:1000], g['frequency'][1:1000])
plt.ylabel('Frequency')
plt.xlabel('ViewCount')
plt.title('Histogram of ViewCount')
plt.show()

In [None]:
del df['cdt']

What's up with the "score"?

In [None]:
# pearsonr returns the correlation coefficient as its first element; despite
# its name, `r2` holds that coefficient itself (r), not its square.
r2 = pearsonr(df['Score'], df['ViewCount'])[0]
plt.scatter(df['Score'], df['ViewCount'], alpha=0.05)
# The coefficient is shown as the plot title.
plt.title(r2)
plt.show()

## Remove possible Leakage Features
Let's be specific about what we want to predict.  Since our data is a snapshot in time, we can't predict "ViewCountTomorrow".  We should therefore predict given the merits of the question itself at the time of creation.

In [None]:
leakage_features = [
'Id','FavoriteCount','AnswerCount','CommentCount','AcceptedAnswerId'
,'LastEditDate','CommunityOwnedDate','ParentId','ClosedDate','LastEditorDisplayName'
,'LastActivityDate','LastEditorUserId','Score']

In [None]:
# Remove all leakage columns from df in one in-place call.
df.drop(leakage_features, axis=1, inplace=True)

## Tags

In [None]:
df['Tags'].head()

In [None]:
# Assign the result back: fillna(inplace=True) on the df['Tags'] selection
# acts on an intermediate object and is not guaranteed to update `df`.
df['Tags'] = df['Tags'].fillna('')

How many types of Tags are there?

In [None]:
def tag2arr(s):
    """Split a tag string such as '<r><regression>' into a list of tag names.

    Strings shorter than two characters yield an empty list.  Otherwise the
    first and last characters are dropped and the remainder is split on '><'.
    """
    if len(s) >= 2:
        return s[1:-1].split('><')
    return []

# One list of tag names per post.
tags = df['Tags'].apply(tag2arr)

# Flatten the per-post lists into a single list of every tag occurrence.
alltags = [tag for post_tags in tags for tag in post_tags]

# Number of distinct tags.
len(set(alltags))

Let's make one quick feature from the metadata

In [None]:
# Number of tags attached to each post.
df['tag_count'] = tags.apply(len)

*What is the distribution of usage over these tags?*

In [None]:
# One row per tag occurrence, each carrying a unit weight to be summed.
dfx = pd.DataFrame(pd.Series(alltags), columns=['tag'])
dfx['c'] = 1
# Total uses per tag, most used first, re-indexed 0..n-1 so that the index
# serves as the popularity rank on the x axis.
g = pd.DataFrame(dfx.groupby(['tag'])['c'].sum()).reset_index()
g = g.sort_values('c', ascending=False)
g.index = np.arange(g.shape[0])
plt.plot(g['c'])
plt.title('Histogram of tag use')
plt.xlabel('Popularity rank')
plt.ylabel('Uses')
plt.show()

*Can we focus on just the popular ones and lump the long tail into "Other"?*

In [None]:
# First tag of each post, or '' when the post has no tags.
df['first_tag'] = tags.apply(lambda x: '' if len(x) == 0 else x[0])
# Per first tag: median ViewCount and number of rows (len of the Title group).
ft = df.groupby(['first_tag']).aggregate({'ViewCount': 'median', 'Title': len})
# Put the row count first and the median second, so that the positional
# rename to ['uses', 'MedianViewCount'] in the next cell labels the columns
# correctly instead of depending on the dict's key order.
ft = ft[['Title', 'ViewCount']]

In [None]:
ft.columns = ['uses', 'MedianViewCount']

In [None]:
dfx = ft[ft['uses'] > 20].copy()

In [None]:
dfx.sort_values('MedianViewCount', inplace=True)
dfx.reset_index(inplace=True)

In [None]:
plt.plot(dfx['MedianViewCount'])
plt.xlabel('Ordered tags')
plt.ylabel('Median View Count')
plt.title('First tag popularity')
plt.show()

In [None]:
dfx.tail(20)

In [None]:
dfx.sort_values('uses', inplace=True, ascending=False)
dfx.head(20)

No tags == no views!

In [None]:
df[df['first_tag']=='']['ViewCount'].max()

Let's make a simple metadata feature

In [None]:
df.groupby(['tag_count'])['ViewCount'].median().plot(kind='bar')
plt.ylabel('Median ViewCount')
plt.show()

In [None]:
# One indicator column per distinct tag_count value, named tc_<value>.
tc_dummies = pd.get_dummies(df['tag_count'])
tc_dummies.columns = ['tc_' + str(level) for level in tc_dummies.columns]
df = pd.concat([df, tc_dummies], axis=1)

In [None]:
del df['tag_count']
del df['Tags']

There's more to be done with tags, but let's move along and possibly come back to it.

## Handling long tailed categorical data
Let's create dummy variables for the top 20 features

In [None]:
dfx.sort_values('uses', inplace=True, ascending=False)
# Take up to 20 of the most-used first tags.  The empty tag (posts without
# tags) is excluded by value rather than by skipping whatever row happens to
# be first, and the selection now yields 20 tags as the heading says (the
# previous [1:20] slice produced only 19).
top_tags = dfx.loc[dfx['first_tag'] != '', 'first_tag'].head(20)

In [None]:
for tag in top_tags:
    df['tag_' + tag] = df['first_tag'] == tag

In [None]:
del df['first_tag']

## Time features

In [None]:
# Hour of day (0-23) at which the post was created.
df['hod'] = df['CreationDate'].dt.hour

Naturally, there are trends in posting times, but is it a useful predictor?

In [None]:
df.groupby(['hod'])['CreationDate'].count().plot(kind='bar')
plt.show()

The mean value says no!

In [None]:
df.groupby(['hod'])['ViewCount'].mean().plot(kind='bar')
plt.show()

But the median says yes!

In [None]:
df.groupby(['hod'])['ViewCount'].median().plot(kind='bar')
plt.show()

In [None]:
# Day of week of the creation timestamp (pandas numbering, Monday = 0).
df['dow'] = df['CreationDate'].dt.dayofweek

In [None]:
df.groupby(['dow'])['CreationDate'].count().plot(kind='bar')
plt.show()

In [None]:
df.groupby(['dow'])['ViewCount'].mean().plot(kind='bar')
plt.ylim(400,700)
plt.show()

In [None]:
# Calendar month (1-12) of the creation timestamp.
df['m'] = df['CreationDate'].dt.month

Month is a puzzling feature that we'll end up dropping

In [None]:
df.groupby(['m'])['CreationDate'].count().plot(kind='bar')
plt.show()

In [None]:
df.groupby(['m'])['ViewCount'].mean().plot(kind='bar')
plt.show()

In [None]:
df.groupby(['m'])['ViewCount'].median().plot(kind='bar')
plt.show()

In [None]:
# One indicator column per day-of-week value, named dow_<value>.
tc_dummies = pd.get_dummies(df['dow'])
tc_dummies.columns = ['dow_' + str(level) for level in tc_dummies.columns]
df = pd.concat([df, tc_dummies], axis=1)

In [None]:
# One indicator column per hour-of-day value, named hod_<value>.
tc_dummies = pd.get_dummies(df['hod'])
tc_dummies.columns = ['hod_' + str(level) for level in tc_dummies.columns]
df = pd.concat([df, tc_dummies], axis=1)

In [None]:
del df['dow']
del df['hod']
del df['m']

# Exploring the age of the post

In [None]:
last = df['CreationDate'].max()

In [None]:
# Whole days between each post's creation and the newest post in the data.
df['days_since'] = (last - df['CreationDate']).dt.days

In [None]:
df.groupby(['days_since'])['ViewCount'].mean().plot()
plt.show()

In [None]:
# Age in 30-day blocks; `last` is the maximum CreationDate, so the day counts
# are non-negative and floor division matches truncation.
df['months_since'] = (last - df['CreationDate']).dt.days // 30

In [None]:
df.groupby(['months_since'])['ViewCount'].mean().plot()
plt.show()

In [None]:
df.groupby(['months_since'])['ViewCount'].median().plot()
plt.show()

In [None]:
df['lvc'] = df['ViewCount'].apply(lambda x: math.log(x+1))

In [None]:
df.groupby(['months_since'])['lvc'].mean().plot()
plt.show()

In [None]:
# Boolean age buckets over days_since: [0, 7), [7, 30), [30, 90) and 90+.
df['d7'] = (df['days_since'] >= 0) & (df['days_since'] < 7)
df['d30'] = (df['days_since'] >= 7) & (df['days_since'] < 30)
df['d60'] = (df['days_since'] >= 30) & (df['days_since'] < 90)
df['d90p'] = df['days_since'] >= 90

In [None]:
df.groupby(['d7', 'd30', 'd60', 'd90p'])['ViewCount'].mean()

In [None]:
del df['CreationDate']
del df['lvc']
del df['months_since']
del df['days_since']

## Textual meta-data

In [None]:
# Character length of the string form of each title and body (non-string
# values are measured by the length of their str() representation).
df['title_len'] = df['Title'].map(str).map(len)
df['body_len'] = df['Body'].map(str).map(len)

In [None]:
plt.scatter(df['body_len'], df['ViewCount'], alpha=0.05)
plt.show()

In [None]:
# Be patient!
# Strip all markup: no tags or attributes are allowed, and strip=True removes
# disallowed tags instead of escaping them.  The `styles` argument is omitted
# because bleach 5.0 removed it from clean(); with no attributes allowed it
# could not affect the output anyway.
df['Body2'] = df['Body'].apply(lambda x: bleach.clean(x, tags=[], attributes={}, strip=True))

In [None]:
for bod in df['Body2'][0:10]:
    print(bod)
    print('------------------------------')

## Audience question: Should we really be focusing on the Body?  Any problems with this strategy?

In [None]:
def substringCount(haystack, needle):
    """Count occurrences of `needle` in `haystack` after stripping whitespace.

    Leading and trailing whitespace is removed from `haystack` first.  Every
    starting position is considered, so overlapping occurrences are each
    counted (e.g. 'aa' occurs twice in 'aaa').

    Returns the number of occurrences as an int.
    """
    haystack = haystack.strip()
    c = 0
    # Begin at index 0: the previous loop started searching at index 1 and so
    # never counted a match at the very start of the text.
    i = haystack.find(needle)
    while i != -1:
        c += 1
        # Resume one character past the last hit.
        i = haystack.find(needle, i + 1)
    return c

In [None]:
df['body_newline_count'] = df['Body2'].apply(lambda x: substringCount(x, '\n'))

In [None]:
df['body_newline_count'] = df['body_newline_count'].apply(lambda x: math.log(x+1))

In [None]:
df['body_newline_count'].hist()

In [None]:
# Share of the raw body's length that is absent from the cleaned text:
# 1 - len(Body2) / len(Body).
df['amt_html'] = 1.0 - df['Body2'].apply(len) / df['Body'].apply(len)

In [None]:
df['amt_html'].hist()
plt.show()

In [None]:
df['amt_html'] = df['amt_html'].apply(lambda x: math.log(x+1))

In [None]:
df['amt_html'].hist()
plt.show()

In [None]:
# True when the cleaned body contains 'http:' or 'https:'.  Membership tests
# are used because find(...) > 0 missed a URL at the very start of the text,
# where find returns 0.
df['has_link'] = df['Body2'].apply(lambda x: 'http:' in x or 'https:' in x)

In [None]:
df.groupby(['has_link'])['Body2'].count()

Under some circumstances, I'd add `has_latex` as a feature, but it seems too much like leakage here.

In [None]:
del df['Body']

In [None]:
del df['Body2']

## Crude User Modeling

In [None]:
len(set(df['OwnerDisplayName']))

In [None]:
print(list(set(df['OwnerDisplayName']))[0:100])

In [None]:
# Per display name: mean ViewCount ('m') and number of posts ('c').  A flat
# list of aggregations followed by a rename replaces the nested-dict renaming
# form, which pandas deprecated and later removed (SpecificationError).
g = df.groupby(['OwnerDisplayName'])['ViewCount'].agg(['mean', 'size'])
g = g.rename(columns={'mean': 'm', 'size': 'c'})
g = g.sort_values('m', ascending=False)
g.reset_index(inplace=True)
g.head(10)

In [None]:
df[df['OwnerDisplayName']=='user3636']

In [None]:
df[df['OwnerDisplayName']=='ritho']

In [None]:
g.sort_values('m', ascending=False, inplace=True)
g.head(10)

In [None]:
# Flag display names whose string form begins with 'user'.
df['default_username'] = df['OwnerDisplayName'].apply(lambda name: str(name).startswith('user'))

Other ideas

* Some sort of TF-IDF type score

* The number of previous questions from the user at the time a new question is asked

* External parsing of Title with API services (for example, see https://dataskeptic.com/blog/tools-and-techniques/2017/google-cloud-natural-language-api)

## Natural Language Processing

In [None]:
def ends_in_question(s):
    """Return True when the last '?' in str(s) is among the final four characters.

    The argument is converted with str() first; text with no '?' gives False.
    """
    text = str(s)
    last_qmark = text.rfind('?')
    return last_qmark != -1 and len(text) - last_qmark < 5

In [None]:
df['ends_in_qmark'] = df['Title'].apply(ends_in_question)

In [None]:
df.head()

In [None]:
del df['Title']
del df['OwnerDisplayName']
del df['OwnerUserId']

# Further work

* More tag based features

* Accounting for Google organic results


## Model Building and Evaluation

In [None]:
# Number of posts at each distinct ViewCount value (rows counted via 'd7').
g = pd.DataFrame(df.groupby(['ViewCount'])['d7'].count()).reset_index()
g['ViewCount'] = g['ViewCount'].astype(int)
g = g.sort_values('ViewCount')
g.columns = ['ViewCount', 'frequency']

# Plot positions 1..999 of the sorted table, i.e. skipping the first row.
plt.figure(figsize=(15,5))
plt.plot(g['ViewCount'][1:1000], g['frequency'][1:1000])
plt.ylabel('Frequency')
plt.xlabel('ViewCount')
plt.title('Histogram of ViewCount')
plt.show()

In [None]:
df['v100'] = df['ViewCount'] > 100

In [None]:
kf = KFold(n_splits=5, random_state=None, shuffle=False)

In [None]:
features = df.columns.tolist()

In [None]:
features.remove('ViewCount')

In [None]:
features.remove('v100')

In [None]:
print(features)

In [None]:
df = df.fillna(0)

In [None]:
X = df[features]
y = df['v100']

In [None]:
df.shape

In [None]:
# Train and evaluate one random forest per fold.  The accuracy of every fold
# is kept so the evaluation does not rest on the last split alone.
fold_accuracies = []
for train_index, test_index in kf.split(df):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    # KFold yields positions, so y is indexed positionally like X; plain
    # y[...] would be a label lookup and is only right for a 0..n-1 index.
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    rf = RandomForestClassifier()
    rf.fit(X_train, y_train)
    preds = rf.predict(X_test)
    fold_accuracies.append((preds == y_test.values).mean())
# rf, preds and y_test still refer to the final fold, which the cells below
# use for the confusion matrix and the feature importances.
print('Accuracy per fold:', fold_accuracies)

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it as an annotated heat map.

    With `normalize=True` each row of `cm` is divided by its row sum before
    printing and plotting.  `classes` supplies the tick labels for both axes,
    `title` the plot title and `cmap` the colour map passed to imshow.
    The y axis is labelled 'True label' and the x axis 'Predicted label'.
    """
    if not normalize:
        print('Confusion matrix, without normalization')
    else:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)

    if normalize:
        cell_format = '.2f'
    else:
        cell_format = 'd'
    # Cells above half the maximum get white text, the others black.
    half_max = cm.max() / 2.
    n_rows, n_cols = cm.shape[0], cm.shape[1]
    for row in range(n_rows):
        for col in range(n_cols):
            value = cm[row, col]
            text_color = "white" if value > half_max else "black"
            plt.text(col, row, format(value, cell_format),
                     horizontalalignment="center",
                     color=text_color)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
# confusion_matrix orders classes by sorted label value, so for the boolean
# target the first row/column is False (not above 100 views) and the second
# is True (above 100 views).  The display names follow that same order; they
# were previously reversed, which mislabelled both axes.
class_names = ['less', '100views']

cnf_matrix = confusion_matrix(y_test, preds, labels=None, sample_weight=None)
np.set_printoptions(precision=2)

# Raw counts.
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names,
                      title='Confusion matrix, without normalization')

# Row-normalised version.
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
                      title='Normalized confusion matrix')

plt.show()

In [None]:
fi = rf.feature_importances_
# Keep each importance paired with its feature name so the tick labels still
# match their bars after sorting; labelling with the unsorted `features` list
# attached names to the wrong bars.
dr = pd.DataFrame({'feature': features, 'importance': fi})
dr.sort_values('importance', inplace=True, ascending=True)
dr.index = np.arange(dr.shape[0])
plt.figure(figsize=(5,15))
plt.barh(dr.index, dr['importance'])
x = np.arange(len(features))
plt.yticks(x, dr['feature'])
plt.show()