In [1]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np
import os
import sys
from utils import split_inds, mrr, shuffle_3

In [46]:
SPLIT_MODE = "time"
SPLIT = 0.7
train_data = "data" # "data_14_1-14_5" # "data/"
test_data = "data_2016" # "data_2016_3" #{ }"data_15_1-15_5" # "data_later" # "data_2016"
# data_2016: wrong processig - no user features
# data_2016_2: with user features, but some do not have user features because to early
# data 2016_3: the ones without the ones with no user features

# Load data
dataframes = []
for file in os.listdir(train_data):
    if file[0]=="." or "example" in file:
        continue
    df_read = pd.read_csv(os.path.join(train_data,file), index_col="id")
    # print("Successfully loaded ", file)
    dataframes.append(df_read)

if SPLIT_MODE=="user":
    lengths = [len(d) for d in dataframes]
    nr_data = sum(lengths)*SPLIT
    inds = np.random.permutation(len(lengths))
    dataframes_train = []
    summed_data = 0
    k=0
    while summed_data < nr_data:
        dataframes_train.append(dataframes[inds[k]])
        k+=1
        summed_data += lengths[inds[k]]
    dataframes_test = [dataframes[j] for j in inds[k:]]
    df_train = pd.concat(dataframes_train)
    df_test = pd.concat(dataframes_test)
    print("Number of users in train set:", len(dataframes_train))
    print("Number of users in test set:", len(dataframes_test))
    print("Number of samples including all answer-question pairs: Train:", len(df_train), " Test:", len(df_test))
elif SPLIT_MODE=="mixed":
    # Take completely random sample (same user might be in test and train set, for different answers)
    df = pd.concat(dataframes)
    ## Test what values are in question age for ground truth --> mostly 0 or 1 days old, largest 100
    # print(len(df))
    # gt_df = df.loc[df["label"]==1]
    # print(len(gt_df))
    # print(np.around(gt_df["questionage"].values, 2))
    # Split in train and tests - split by group (one answer-open_questiosn block must be in same part)
    df_grouped = df.groupby("decision_time")
    # df_train, df_test = split_groups(df_grouped)
    nr_groups = len(df_grouped)
    train_inds, test_inds = split_inds(nr_groups, split=SPLIT)
    df_train = pd.concat([ df_grouped.get_group(group) for i,group in enumerate(df_grouped.groups) if i in train_inds])
    df_test = pd.concat([ df_grouped.get_group(group) for i,group in enumerate(df_grouped.groups) if i in test_inds])
elif SPLIT_MODE=="time":
    df_train = pd.concat(dataframes)
    dataframes_test = []
    for file in os.listdir(test_data):
        if file[0]=="." or "example" in file:
            continue
        df_read = pd.read_csv(os.path.join(test_data,file), index_col="id")
        # print("Successfully loaded ", file)
        dataframes_test.append(df_read)
    df_test = pd.concat(dataframes_test)
    print("Number of users in train set:", len(dataframes))
    print("Number of users in test set:", len(dataframes_test))
    print("Number of samples including all answer-question pairs: Train:", len(df_train), " Test:", len(df_test))
else:
    print("ERROR: SPLIT MODE DOES NOT EXIST")
    sys.exit()

Number of users in train set: 200
Number of users in test set: 30
Number of samples including all answer-question pairs: Train: 192162  Test: 26663


### Get question body 

In [3]:
import psycopg2
import sqlalchemy
from sqlalchemy import create_engine

postgres_str = ('postgresql://localhost/crossvalidated')
cnx = create_engine(postgres_str)

In [4]:
post_body = pd.read_sql_query('''SELECT Id, body FROM Posts WHERE PostTypeId=1''', cnx)

In [5]:
post_body.head()

Unnamed: 0,id,body
0,1,<p>How should I elicit prior distributions fro...
1,2,<p>In many different statistical methods there...
2,3,<p>What are some valuable Statistical Analysis...
3,4,<p>I have two groups of data. Each with a dif...
4,6,"<p>Last year, I read a blog post from <a href=..."


# prepare train and test set

In [47]:
df_train = pd.merge(df_train, post_body, how="left", on="id")
print(len(df_test))
df_test = pd.merge(df_test, post_body, how="left", on="id")
print(len(df_test))

26663
26663


In [48]:
# Prepare training set
X_train = df_train.drop(['label', 'decision_time', 'reputation_user', 'reputation_asker', 'body'], axis=1)
features = X_train.columns.tolist()
X_train = np.asarray(X_train)
X_train = np.asarray(X_train)[:, 1:]
Y_train = df_train['label'].values
G_train = df_train['decision_time'].values
B_train = df_train["body"]
# print(sorted(np.unique(G_train//100)))

# Prepare testing set
X_test = df_test.drop(['label', 'decision_time', 'reputation_user', 'reputation_asker', 'body'], axis=1)
X_test = np.asarray(X_test)[:, 1:]
Y_test = df_test['label'].values
G_test = df_test['decision_time'].values
B_test = df_test["body"]
# print(sorted(np.unique(G_test//100)))
assert(len(X_train)==len(Y_train))

print("Size of training set: ", len(Y_train), " Test set:", len(Y_test))
class_counts = np.unique(Y_train, return_counts=True)[1]
print("Class imbalance: 1:", class_counts[0]//class_counts[1])

Size of training set:  192162  Test set: 26663
Class imbalance: 1: 99


In [39]:
def shuffle_4(X,Y,G,B):
    assert(len(X)==len(Y))
    assert(len(X)==len(G))
    assert(len(X)==len(B))
    randinds = np.random.permutation(len(Y))
    return X[randinds], Y[randinds], G[randinds], B[randinds]

X_train, Y_train, G_train, B_train = shuffle_4(X_train, Y_train, G_train, B_train)

In [49]:
means = np.mean(X_train, axis=0)
stds = np.std(X_train, axis=0)
X_train_norm = (X_train-means)/stds
X_test_norm = (X_test-means)/stds

# Deep learning model for text and features

In [10]:
from tensorflow.keras.preprocessing.text import Tokenizer
import tensorflow as tf
import tensorflow_hub as hub

In [11]:
from tensorflow.keras.optimizers import Adam, Adagrad
from tensorflow.keras.layers import Dense, Input, Concatenate, concatenate
from tensorflow.keras.models import Model, Sequential

### hub layer

In [24]:
embedding = "https://tfhub.dev/google/tf2-preview/nnlm-es-dim50-with-normalization/1" # "https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1"
hub_layer = hub.KerasLayer(embedding,output_shape=[50], input_shape=[], 
                           dtype=tf.string)


### only text: MRR 0.05

In [13]:
model = tf.keras.Sequential()
model.add(hub_layer)
model.add(tf.keras.layers.Dense(16, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
keras_layer_1 (KerasLayer)   (None, 20)                400020    
_________________________________________________________________
dense (Dense)                (None, 16)                336       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 400,373
Trainable params: 400,373
Non-trainable params: 0
_________________________________________________________________


In [19]:
opt = Adam(lr=0.001)
model.compile(loss='mean_squared_error',
              optimizer=opt,
              weighted_metrics=['accuracy'])

In [22]:
model.fit(B_train, Y_train, validation_data=(B_test, Y_test), class_weight={0: 1.,1: 50.})

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 192162 samples, validate on 39243 samples


<tensorflow.python.keras.callbacks.History at 0x1a1e5841d0>

In [25]:
out = model.predict(B_test)

In [26]:
score, ranks = mrr(out, G_test, Y_test)

In [27]:
score

0.058730711053754

## only feature

In [61]:
learning_rate = 0.001
batch_size = 30
epochs = 3

In [44]:
X_test[0,0]

144354.0

In [73]:
ip = Input(shape=(23))
# ip2 = Input(shape=(801,))
x = Dense(1024, activation="relu")(ip)
# x = Dropout(0.5)(x)
x = Dense(256,  activation="relu")(x)
# x = Dropout(0.5)(x)
out = Dense(1,  activation="relu")(x)

model = Model(ip , out)

opt = Adam(lr=learning_rate) # , epsilon=None, decay=0.0)
# rmsprop = RMSprop(lr=learning_rate)
model.compile(loss='mean_squared_error',
              optimizer=opt,
              weighted_metrics=['accuracy'])

In [59]:
def balanced_generator(X_data, Y_data, batch_size):
    inds0 = np.where(Y_data==0)[0]
    sample0 = np.random.choice(inds0, batch_size//2)
    inds1 = np.where(Y_data==1)[0]
    sample1 = np.random.choice(inds1, batch_size//2)   
    x_batch = np.concatenate((X_data[sample0], X_data[sample1]),axis=0)
    y_batch = np.concatenate((Y_data[sample0], Y_data[sample1]),axis=0)
    return x_batch, y_batch

# x,y = balanced_generator(X_test, Y_test, 10)

In [None]:
history = model.fit(X_train, Y_train, batch_size=batch_size, epochs=epochs,
          shuffle=True, verbose=1, validation_data=(X_test, Y_test)) # , class_weight={0:1, 1:50})
#does not work so far: history = model.fit_generator(balanced_generator(X_train, Y_train,batch_size), epochs=epochs,
          # shuffle=True, verbose=1, validation_data=(X_test, Y_test)) # , class_weight={0:1, 1:50})

In [49]:
out = model.predict(X_test)
score, ranks = mrr(out, G_test, Y_test)
score

0.019623742864469614

## Network together

### preprocess test: embedding

In [50]:
B_train_hub = hub_layer(B_train)
B_test_hub = hub_layer(B_test)
print(B_train_hub.shape, B_test_hub.shape)

(192162, 50) (26663, 50)


### model

In [42]:
first_input = Input(shape=(B_train_hub.shape[1], ))
first_dense = Dense(128, activation='relu')(first_input)

second_input = Input(shape=(X_train.shape[1], ))
second_dense = Dense(128, activation='relu')(second_input)

merge_one = concatenate([first_dense, second_dense])
merge_one = Dense(1024, activation='relu')(merge_one)
merge_one = Dense(256, activation='relu')(merge_one)
out = Dense(1, activation='sigmoid')(merge_one)

model = Model(inputs=[first_input, second_input], outputs=out)
ada_grad = Adagrad(lr=0.1, epsilon=1e-08, decay=0.0)
model.compile(optimizer=ada_grad, loss='binary_crossentropy',
               metrics=['accuracy'])
model.summary()

Model: "model_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_11 (InputLayer)           [(None, 50)]         0                                            
__________________________________________________________________________________________________
input_12 (InputLayer)           [(None, 23)]         0                                            
__________________________________________________________________________________________________
dense_25 (Dense)                (None, 128)          6528        input_11[0][0]                   
__________________________________________________________________________________________________
dense_26 (Dense)                (None, 128)          3072        input_12[0][0]                   
____________________________________________________________________________________________

In [45]:
model.fit([B_train_hub, X_train_norm], Y_train, validation_data=([B_test_hub, X_test_norm], Y_test), epochs=2, class_weight={0: 1.,1: 100.}) # 


Train on 192162 samples, validate on 39243 samples
Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x1a21f99f98>

In [51]:
out = model.predict([B_test_hub, X_test_norm])
score, ranks = mrr(out, G_test, Y_test)
score

0.24506347603378517

## TODO:
- other embeddings

## Results

Embedding https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1 

* Train model with 128 / 128 --> concat --> 1024 - 256 - 1 (class weight 1:50) on all features: 0.27
* Train model with 128 / 128 --> concat --> 1024 - 256 - 1 (class weight 1:50) on features WITHOUT question age: 0.1 after 10 epochs, 0.11 after 1 epoch

Embedding https://tfhub.dev/google/tf2-preview/nnlm-es-dim50-with-normalization/1

* Train model with 128 / 128 --> concat --> 1024 - 256 - 1 (class weight 1:50) on features without question age: 0.09 after 1 epoch, 0.11 after 10
* Train model with 128 / 128 --> concat --> 1024 - 256 - 1 (CLASS WEIGHT 1:100) on features without question age: 0.12 after 10

* Train model with 128 / 128 --> concat --> 1024 - 256 - 1 (CLASS WEIGHT 1:100) on features WITH question age: 0.2755 after 10

Use this trained model to compute the score for 2016 data --> 0.245 MRR