In [1]:
import sys
import os
import numpy as np
import pandas as pd
import tensorflow_hub as hub
import tensorflow_text as text

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt

# Show the installed TensorFlow version as this cell's output.
tf.__version__

'2.4.0-rc3'

In [2]:
# Load every CSV file found directly under `datapath` into one DataFrame.
datapath = "./data"

# os.listdir() returns entries in arbitrary, filesystem-dependent order.
# Sorting pins the row order of the concatenated frame, which in turn keeps
# the seeded train/test split reproducible across machines and re-runs.
csv_files = sorted(p for p in os.listdir(datapath) if p.endswith(".csv"))
if not csv_files:
    # pd.concat([]) would raise a ValueError anyway; fail with a clearer message.
    raise ValueError("No .csv files found in %r" % datapath)

df = pd.concat(
    [pd.read_csv(os.path.join(datapath, p)) for p in csv_files]
).reset_index(drop=True)

# Lower-case both text columns before they are fed to the text model.
df["body"] = df["body"].str.lower()
df["title"] = df["title"].str.lower()

print(df.shape)

(800469, 22)


In [3]:
# Hand-crafted numeric columns passed to the model alongside the two text fields.
FEATURES = [
    "wh_word_count",
    "sentence_count",
    "word_count",
    "example_count",
    "n_linebreaks",
    "title_word_count",
    "title_question_marks",
    "num_question_marks",
    "n_links",
    # "n_tags",  (disabled)
    "n_lists",
]

# Binary target: True when a row's score is at or above the median score.
median_score = df["score"].median()
print("Score median: %0.4f" % median_score)
df["target"] = df["score"].ge(median_score)

# Fraction of rows labelled True.
print("Target Mean: %0.4f" % df["target"].mean())

# Model inputs (numeric features + raw text columns) and labels.
x = df[[*FEATURES, "body", "title"]]
y = df["target"]

Score median: 2.0000
Target Mean: 0.5932


In [4]:
# Count how many rows fall into each value of the target column.
y.value_counts()

True     474809
False    325660
Name: target, dtype: int64

In [5]:
# Hold out 20% of the rows, stratified on the target, with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Pull the two text columns out of each split for convenient access later.
train_titles, train_bodies = x_train["title"], x_train["body"]
test_titles, test_bodies = x_test["title"], x_test["body"]

print(x_train.shape, x_test.shape)

(640375, 12) (160094, 12)


In [17]:
# Reset Keras' global state before (re)building the model graph.
keras.backend.clear_session()

# Symbolic model inputs: two scalar string tensors plus the numeric feature vector.
body_input = keras.layers.Input(shape=(), dtype=tf.string, name="body_text")
title_input = keras.layers.Input(shape=(), dtype=tf.string, name="title_text")
features_input = keras.layers.Input(
    shape=(len(FEATURES),),
    name="features_input",
)

preprocessor = hub.load("https://tfhub.dev/tensorflow/albert_en_preprocess/1")

# Step 1: tokenize each text segment with one shared tokenizer layer.
# This SavedModel accepts up to 2 text inputs; title goes first, body second.
text_inputs = [title_input, body_input]
tokenize = hub.KerasLayer(preprocessor.tokenize)
tokenized_inputs = list(map(tokenize, text_inputs))

# Step 2 (optional): modify tokenized inputs. Nothing is done here.

# Step 3: pack the tokenized segments into fixed-length encoder inputs.
seq_length = 128  # Chosen sequence length passed to the packing function.
bert_pack_inputs = hub.KerasLayer(
    preprocessor.bert_pack_inputs,
    arguments={"seq_length": seq_length},
)
encoder_inputs = bert_pack_inputs(tokenized_inputs)









In [18]:
# ALBERT encoder loaded from TF Hub; its weights are frozen (trainable=False).
albert_encoder = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/albert_en_base/2",
    trainable=False,
)

# Keep only the 'pooled_output' entry of the encoder's output dict.
albert_output = albert_encoder(encoder_inputs)['pooled_output']

In [19]:
# Text-only sub-model: maps (title, body) strings to the ALBERT pooled output.
alb = keras.Model(
    inputs=[title_input, body_input],
    outputs=albert_output,
)

In [20]:
# Run the text-only sub-model on the first 512 training rows.
res = alb([x_train['title'].iloc[:512], x_train['body'].iloc[:512]])













In [8]:
# Classification head: a small MLP ending in a single unit with no activation,
# so its output is a raw logit.
dense = keras.Sequential([
    keras.layers.Dense(16, activation='elu'),
    keras.layers.Dense(64, activation='elu'),
    keras.layers.Dense(1)
])

# Join the ALBERT pooled output with the numeric features along the feature axis.
# A Keras Concatenate layer is used instead of a raw tf.concat op so the merge
# is a regular, named layer in the functional graph.
# NOTE(review): the numeric features are concatenated unscaled; consider
# normalizing them before this point. Confirm against the feature ranges.
dense_input = keras.layers.Concatenate(axis=1)([albert_output, features_input])

output = dense(dense_input)

# Full model: (title, body, numeric features) -> logit.
model = keras.Model(inputs=[title_input, body_input, features_input], outputs=output)

In [9]:
# The loss is configured with from_logits=True, i.e. the model emits raw logits.
# The string metric 'accuracy' resolves to binary accuracy with its default 0.5
# threshold, which is only correct for probabilities. On logits the decision
# boundary is 0.0 (sigmoid(0) == 0.5), so the threshold is set explicitly.
# The metric keeps the name 'accuracy' so logs and history keys are unchanged.
model.compile(
    optimizer='adam',
    loss=keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=[keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0)],
)

In [10]:
# Print the layer-by-layer summary of the assembled model.
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
title_text (InputLayer)         [(None,)]            0                                            
__________________________________________________________________________________________________
body_text (InputLayer)          [(None,)]            0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        (None, None)         0           title_text[0][0]                 
                                                                 body_text[0][0]                  
__________________________________________________________________________________________________
keras_layer_1 (KerasLayer)      {'input_word_ids': ( 0           keras_layer[0][0]            

In [None]:
# Train for one epoch; the held-out test split is passed as validation data.
# Input order matches the model's inputs: title, body, numeric features.
model.fit(
    x=[train_titles, train_bodies, x_train[FEATURES]],
    y=y_train,
    batch_size=512,
    epochs=1,
    validation_data=(
        [test_titles, test_bodies, x_test[FEATURES]],
        y_test,
    ),
)

   6/1251 [..............................] - ETA: 34:42:05 - loss: 0.9374 - accuracy: 0.4457