In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import nltk, collections, re, string
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

In [2]:
data = pd.read_csv('Eluvio_DS_Challenge.csv')

## Feature Set 

features to include:
- `time_created`
- `over_18`

engineered features:
- `day_of_the_week`
- `author` mean upvotes
- `autor` standard deviation upvotes
- `title_length`
- 50 dimensional `title` GloVe embeddings

feature to predict:
- `up_votes`

## 57 dimensional feature array

can impute missing values or remove entries with any missing values.

### make `over_18` 1 or 0 

In [56]:
data['nsfw'] = data['over_18'].values*1

### add day of the week

In [7]:
from datetime import datetime
creation_dates = np.array([datetime.utcfromtimestamp(time_created) for time_created in data['time_created']])
day_of_the_week = np.array([date.weekday() for date in creation_dates])

In [9]:
data['day_of_the_week'] = day_of_the_week

### add author data

In [22]:
author_upvotes_counts = pd.concat([data.groupby('author', sort=False).mean()['up_votes'], data.groupby('author', sort=False).std()['up_votes'],data['author'].value_counts()], axis=1, keys=['mean_upvotes', 'std_upvotes','post_count'])
author_mean = np.array([author_upvotes_counts.loc[author]['mean_upvotes'] for author in data['author']])
author_std = np.array([author_upvotes_counts.loc[author]['std_upvotes'] for author in data['author']])

In [26]:
data['author_mean'] = author_mean
data['author_std'] = author_std

### add title data

In [28]:
title_length = np.array([len(title.split()) for title in data['title']])
data['title_length'] = title_length

#### GloVe embeddings 

In [31]:
punctuation_numbers_to_exclude = r"[\d+{}’‘“”…£]".format(string.punctuation)
stop_words = [re.sub(punctuation_numbers_to_exclude, "", stop) for stop in stopwords.words('english')]
ss = SnowballStemmer('english')

def clean_text(text, remove_punctuation=True, remove_stopwords=True, stem=True):
    
    text=text.lower()
    
    if remove_punctuation:
        #remove punctuation and numbers
        text = re.sub(punctuation_numbers_to_exclude, "", text)
    
    if remove_stopwords:
        #remove stopwords
        text = [word for word in text.split() if word not in stop_words]
   
    if stem:
        #stem
        text = [ss.stem(word) for word in text]
    
    return ' '.join(text)


In [32]:
cleaned_titles_no_stemming = [clean_text(title, stem=False) for title in data['title'].values]

In [33]:
#load pretrained model
embeddings_dict = {}
dimensions = 50
with open("glove.6b/glove.6B.50d.txt", 'r') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        embeddings_dict[word] = vector

In [34]:
titles_vectors = []

#using nonstemming because word variants with the same stem often appear with separate embeddings.
for title in cleaned_titles_no_stemming:
    title_words_vectors = []
    for word in title.lower().split():
        try:
            title_words_vectors.append(embeddings_dict[word])
        except KeyError: #if word not in model, then pass
            pass
    
    #if there are no words from title in GloVe model, return array of nans, else return mean of embedding values for all words in title
    if len(title_words_vectors) == 0:
        titles_vectors.append(np.full(dimensions, np.nan))
    else:
        titles_vectors.append(np.mean(title_words_vectors, axis=0))

titles_vectors = np.array(titles_vectors)


In [65]:
glove_columns = ['glove_{0}'.format(j) for j in range(titles_vectors.shape[1])]

for j in range(titles_vectors.shape[1]):
    data[glove_columns[j]] = titles_vectors[:,j]

## Take care of missing values 

### for simplicity, rather than impute missing values, we will simply eliminate those entries 

In [95]:
missing_values_mask = np.all([np.isfinite(data['author_std']).values, np.isfinite(data['glove_0']).values], axis=0)

In [100]:
missing_n = len(data) - sum(missing_values_mask)

print('We lose {0} entries ({1}%)'.format(missing_n, 100*round(missing_n/len(data),1)))

We lose 54700 entries (10.0%)


It's an important loss of data. Mostly from Authors that only have a single post and therefore no standard deviation. But we will continue with that loss for now.

## select columns and convert to numpy array 

In [102]:
columns = ['up_votes','time_created','day_of_the_week', 'nsfw', 'author_mean', 'author_std', 'title_length']
data[columns+glove_columns][missing_values_mask]

Unnamed: 0,up_votes,time_created,day_of_the_week,nsfw,author_mean,author_std,title_length,glove_0,glove_1,glove_2,...,glove_40,glove_41,glove_42,glove_43,glove_44,glove_45,glove_46,glove_47,glove_48,glove_49
0,3,1201232046,4,0,23.020000,89.754197,5,0.395450,-0.487710,0.577686,...,-0.686205,0.539595,0.111257,-0.336770,0.807028,0.184083,-0.514305,0.448858,0.001137,-1.198285
1,2,1201232075,4,0,23.020000,89.754197,4,0.763765,-0.113493,-0.059670,...,-0.244318,-0.520625,0.478451,-0.645957,0.018015,0.170771,0.314402,0.883963,-0.354881,-0.007585
2,3,1201232523,4,0,23.020000,89.754197,6,0.281622,-0.062788,0.422594,...,0.083637,0.331160,0.742914,-0.773070,0.297838,0.002664,-0.239062,0.610976,-0.358832,-0.767040
3,1,1201233290,4,0,0.500000,0.707107,7,0.308578,-0.056740,0.350345,...,-0.777184,-0.411264,0.185698,-0.054106,0.373954,0.360538,0.035844,0.658474,0.394973,0.554069
5,15,1201287889,4,0,9.000000,8.485281,16,0.185331,0.032454,0.275928,...,0.088249,0.084278,0.154433,-0.096411,0.427682,-0.375497,-0.155194,-0.597045,0.388277,-0.164167
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
509230,2,1479816582,1,0,5.111111,7.406829,9,-0.316480,-0.071289,-0.248686,...,-0.032479,0.260980,0.324924,-0.194732,0.248947,0.298520,-0.630963,0.677534,0.236023,0.067901
509231,5,1479816764,1,0,21.000000,22.461077,13,-0.233507,0.100422,-0.006792,...,0.069630,0.321455,-0.187375,0.177768,0.234017,-0.162837,-0.628947,-0.402305,-0.022119,-0.053674
509233,1,1479817056,1,0,5.333333,7.505553,5,-0.248053,0.981899,-0.253868,...,0.124997,0.200728,-0.087928,-0.083809,0.055016,-0.059511,-0.142616,0.557844,-0.404922,0.123416
509234,1,1479817157,1,0,8.250000,16.209263,8,0.060415,-0.141901,0.293550,...,0.284009,0.297247,-0.232101,-0.387104,0.508659,-0.164194,-0.322761,0.277495,0.265465,0.067527


In [103]:
feature_data = data[columns+glove_columns][missing_values_mask].to_numpy()

In [104]:
np.save('feature_data.npy', feature_data)