# Importing Packages 

In [1]:
#============ Importing Packages ============# 

#--------- Drawing Packages ---------#

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import (MultipleLocator, FormatStrFormatter, AutoMinorLocator, NullFormatter, LogLocator)
from set_size import set_size
from collections import Counter

#--------- Tensorflow Packages ---------#
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_datasets as tfds
from sklearn.model_selection import train_test_split
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from sklearn.metrics import f1_score
from sklearn.utils import class_weight
from tensorflow.keras.metrics import Metric
from sklearn.metrics import balanced_accuracy_score

#============== Packages for word2vec ==============#
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

#============== Packages for classification ==============#
from sklearn.linear_model import LinearRegression, PoissonRegressor
from sklearn.preprocessing import normalize
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score
from sklearn.svm import SVC

#--------- Utilities Packages ---------#

import sys
print(sys.executable)
import os
import re
import pdb
import shelve
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix, hstack
from tqdm import tqdm
from sklearn.utils import class_weight
import enchant

import nltk
import obspy

#nltk.download('wordnet')
#nltk.download('stopwords')
#nltk.download('word_tokenize')
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk.corpus import words as dict_w
from nltk.stem.porter import PorterStemmer

# Scipy Signal
from scipy import signal

# Detrend the Signal
from obspy.signal.detrend import polynomial

#--------- Remove Warnings ---------#
import warnings
warnings.filterwarnings("ignore")


/home/chiangwe/anaconda3/envs/NetHawkes/bin/python


In [2]:
#========= Read in =========#
df = pd.read_csv('Eluvio_DS_Challenge_processes.csv')
#display( df.sort_values('up_votes', ascending=False).head(5)['title'].values )

class_weights = class_weight.compute_class_weight('balanced',  np.unique(df['label']), df['label'])
class_weights = dict(zip( np.unique(df['label']), class_weights))


df = df[ df['title_clean'].apply(lambda x: type(x)==str) ] 
y_true = df['label']


In [3]:
#========= TDIDF =========#

#print(  df['title_clean'].apply(lambda x: type(x)!=str ).sum()  )
bow_converter = CountVectorizer()
x = bow_converter.fit_transform(df['title_clean'])

words = bow_converter.get_feature_names()

bigram_converter = CountVectorizer(tokenizer=lambda doc: doc, ngram_range=[2,2]) 
trigram_converter = CountVectorizer(tokenizer=lambda doc: doc, ngram_range=[3,3])

tfidf_transform = TfidfTransformer(norm=None)
X_tfidf = tfidf_transform.fit_transform(x)

X_tfidf = normalize(X_tfidf,axis=1)

#========= ===  =========#


In [9]:
#========== PCS ===================#
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

from sklearn.decomposition import PCA

X_new = SelectKBest(chi2, k=np.round(X_tfidf.shape[1]/4).astype(int) ).fit_transform(X_tfidf, y_true)

pca = PCA(n_components=10)
X_tfidf_pca = pca.fit_transform(X_new.toarray())
X_new = np.concatenate((X_new.toarray(), X_tfidf_pca), axis=1)

(504717, 2811)
(504717, 10)


In [49]:
#========= Define Binary Labels =========#
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

from sklearn.decomposition import PCA

pairs_true_pred = []
for each_seed in [42, 50, 123]:
    X_train, X_test, y_train, y_test = \
        train_test_split(X_tfidf, y_true, test_size=0.33, random_state=each_seed)
    
    skb = SelectKBest(chi2, k=np.round(X_train.shape[1]/4).astype(int) ).fit(X_train, y_train)
    X_train = skb.transform(X_train)
    X_test = skb.transform(X_test)
    
    pca = PCA(n_components=10)
    pcscs = pca.fit(X_train.toarray())
    X_tfidf_pca = pcscs.transform(X_train.toarray())
    X_train = np.concatenate((X_train.toarray(), X_tfidf_pca), axis=1)
    
    X_test_pca = pcscs.transform(X_test.toarray())
    X_test = np.concatenate((X_test.toarray(), X_test_pca), axis=1)

    clf = make_pipeline(StandardScaler(), SGDClassifier(max_iter=1000, tol=1e-3, verbose=1, class_weight=class_weights))
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    #pred_prob = clf.predict_proba(X_test)
    pairs_true_pred.append([y_test, pred])


-- Epoch 1
Norm: 744.00, NNZs: 2821, Bias: 52.910881, T: 338160, Avg. loss: 195.550404
Total training time: 1.80 seconds.
-- Epoch 2
Norm: 390.17, NNZs: 2821, Bias: 26.410292, T: 676320, Avg. loss: 39.461048
Total training time: 3.55 seconds.
-- Epoch 3
Norm: 255.24, NNZs: 2821, Bias: 16.735255, T: 1014480, Avg. loss: 22.657354
Total training time: 5.32 seconds.
-- Epoch 4
Norm: 182.81, NNZs: 2821, Bias: 12.209250, T: 1352640, Avg. loss: 16.064742
Total training time: 7.10 seconds.
-- Epoch 5
Norm: 147.22, NNZs: 2821, Bias: 8.744431, T: 1690800, Avg. loss: 12.091957
Total training time: 8.88 seconds.
-- Epoch 6
Norm: 123.30, NNZs: 2821, Bias: 7.901758, T: 2028960, Avg. loss: 10.081349
Total training time: 10.69 seconds.
-- Epoch 7
Norm: 107.08, NNZs: 2821, Bias: 6.525889, T: 2367120, Avg. loss: 8.698781
Total training time: 12.51 seconds.
-- Epoch 8
Norm: 94.99, NNZs: 2821, Bias: 5.551746, T: 2705280, Avg. loss: 7.626218
Total training time: 14.31 seconds.
-- Epoch 9
Norm: 86.36, NNZs:

Norm: 13.30, NNZs: 2821, Bias: 0.334912, T: 23333040, Avg. loss: 1.446481
Total training time: 134.04 seconds.
-- Epoch 70
Norm: 13.34, NNZs: 2821, Bias: 0.364839, T: 23671200, Avg. loss: 1.437437
Total training time: 136.31 seconds.
-- Epoch 71
Norm: 13.04, NNZs: 2821, Bias: 0.319904, T: 24009360, Avg. loss: 1.439032
Total training time: 138.90 seconds.
-- Epoch 72
Norm: 12.85, NNZs: 2821, Bias: 0.425113, T: 24347520, Avg. loss: 1.431874
Total training time: 140.95 seconds.
-- Epoch 73
Norm: 12.60, NNZs: 2821, Bias: 0.328720, T: 24685680, Avg. loss: 1.407047
Total training time: 142.96 seconds.
-- Epoch 74
Norm: 12.49, NNZs: 2821, Bias: 0.344698, T: 25023840, Avg. loss: 1.397943
Total training time: 144.97 seconds.
-- Epoch 75
Norm: 12.25, NNZs: 2821, Bias: 0.286192, T: 25362000, Avg. loss: 1.388411
Total training time: 146.95 seconds.
-- Epoch 76
Norm: 12.40, NNZs: 2821, Bias: 0.324801, T: 25700160, Avg. loss: 1.371707
Total training time: 148.94 seconds.
-- Epoch 77
Norm: 12.26, NNZ

Norm: 7.67, NNZs: 2821, Bias: 0.054241, T: 45989760, Avg. loss: 1.089687
Total training time: 274.41 seconds.
-- Epoch 137
Norm: 7.61, NNZs: 2821, Bias: 0.007579, T: 46327920, Avg. loss: 1.084300
Total training time: 276.53 seconds.
-- Epoch 138
Norm: 7.54, NNZs: 2821, Bias: 0.028249, T: 46666080, Avg. loss: 1.086508
Total training time: 278.70 seconds.
-- Epoch 139
Norm: 7.55, NNZs: 2821, Bias: 0.036073, T: 47004240, Avg. loss: 1.067346
Total training time: 280.82 seconds.
-- Epoch 140
Norm: 7.58, NNZs: 2821, Bias: 0.030342, T: 47342400, Avg. loss: 1.072400
Total training time: 283.02 seconds.
-- Epoch 141
Norm: 7.55, NNZs: 2821, Bias: 0.047304, T: 47680560, Avg. loss: 1.073983
Total training time: 285.19 seconds.
-- Epoch 142
Norm: 7.49, NNZs: 2821, Bias: 0.030670, T: 48018720, Avg. loss: 1.074484
Total training time: 287.36 seconds.
-- Epoch 143
Norm: 7.42, NNZs: 2821, Bias: 0.017413, T: 48356880, Avg. loss: 1.066570
Total training time: 289.52 seconds.
-- Epoch 144
Norm: 7.33, NNZs

Norm: 61.74, NNZs: 2821, Bias: 3.638984, T: 4734240, Avg. loss: 4.505544
Total training time: 25.36 seconds.
-- Epoch 15
Norm: 57.33, NNZs: 2821, Bias: 2.979402, T: 5072400, Avg. loss: 4.253581
Total training time: 27.67 seconds.
-- Epoch 16
Norm: 53.61, NNZs: 2821, Bias: 2.758193, T: 5410560, Avg. loss: 4.018119
Total training time: 29.52 seconds.
-- Epoch 17
Norm: 50.56, NNZs: 2821, Bias: 2.622007, T: 5748720, Avg. loss: 3.782858
Total training time: 31.35 seconds.
-- Epoch 18
Norm: 47.98, NNZs: 2821, Bias: 2.130572, T: 6086880, Avg. loss: 3.652208
Total training time: 33.17 seconds.
-- Epoch 19
Norm: 46.19, NNZs: 2821, Bias: 2.401489, T: 6425040, Avg. loss: 3.502360
Total training time: 35.01 seconds.
-- Epoch 20
Norm: 43.84, NNZs: 2821, Bias: 2.231207, T: 6763200, Avg. loss: 3.334449
Total training time: 36.85 seconds.
-- Epoch 21
Norm: 42.11, NNZs: 2821, Bias: 2.103176, T: 7101360, Avg. loss: 3.183245
Total training time: 38.70 seconds.
-- Epoch 22
Norm: 40.13, NNZs: 2821, Bias: 2

Norm: 11.96, NNZs: 2821, Bias: 0.279289, T: 27729120, Avg. loss: 1.325708
Total training time: 160.57 seconds.
-- Epoch 83
Norm: 11.96, NNZs: 2821, Bias: 0.306727, T: 28067280, Avg. loss: 1.323961
Total training time: 163.01 seconds.
-- Epoch 84
Norm: 11.79, NNZs: 2821, Bias: 0.287934, T: 28405440, Avg. loss: 1.311401
Total training time: 165.41 seconds.
-- Epoch 85
Norm: 11.74, NNZs: 2821, Bias: 0.292895, T: 28743600, Avg. loss: 1.317502
Total training time: 167.37 seconds.
-- Epoch 86
Norm: 11.56, NNZs: 2821, Bias: 0.339328, T: 29081760, Avg. loss: 1.298275
Total training time: 169.37 seconds.
-- Epoch 87
Norm: 11.35, NNZs: 2821, Bias: 0.251576, T: 29419920, Avg. loss: 1.294140
Total training time: 171.36 seconds.
-- Epoch 88
Norm: 11.25, NNZs: 2821, Bias: 0.239940, T: 29758080, Avg. loss: 1.282011
Total training time: 173.34 seconds.
-- Epoch 89
Norm: 11.28, NNZs: 2821, Bias: 0.275125, T: 30096240, Avg. loss: 1.273794
Total training time: 175.35 seconds.
-- Epoch 90
Norm: 11.16, NNZ

Norm: 262.23, NNZs: 2821, Bias: 16.719232, T: 1014480, Avg. loss: 22.592377
Total training time: 5.28 seconds.
-- Epoch 4
Norm: 193.32, NNZs: 2821, Bias: 12.462147, T: 1352640, Avg. loss: 15.863026
Total training time: 7.06 seconds.
-- Epoch 5
Norm: 157.09, NNZs: 2821, Bias: 9.614282, T: 1690800, Avg. loss: 12.168015
Total training time: 8.84 seconds.
-- Epoch 6
Norm: 131.85, NNZs: 2821, Bias: 7.668883, T: 2028960, Avg. loss: 9.863567
Total training time: 10.64 seconds.
-- Epoch 7
Norm: 115.49, NNZs: 2821, Bias: 6.556457, T: 2367120, Avg. loss: 8.527280
Total training time: 12.43 seconds.
-- Epoch 8
Norm: 102.55, NNZs: 2821, Bias: 6.288954, T: 2705280, Avg. loss: 7.556575
Total training time: 14.23 seconds.
-- Epoch 9
Norm: 92.96, NNZs: 2821, Bias: 5.036031, T: 3043440, Avg. loss: 6.765219
Total training time: 16.22 seconds.
-- Epoch 10
Norm: 83.77, NNZs: 2821, Bias: 4.530820, T: 3381600, Avg. loss: 6.078408
Total training time: 18.62 seconds.
-- Epoch 11
Norm: 77.89, NNZs: 2821, Bias:

Norm: 13.83, NNZs: 2821, Bias: 0.429215, T: 24009360, Avg. loss: 1.423033
Total training time: 139.64 seconds.
-- Epoch 72
Norm: 13.62, NNZs: 2821, Bias: 0.398515, T: 24347520, Avg. loss: 1.421035
Total training time: 141.61 seconds.
-- Epoch 73
Norm: 13.40, NNZs: 2821, Bias: 0.343569, T: 24685680, Avg. loss: 1.416343
Total training time: 143.57 seconds.
-- Epoch 74
Norm: 13.32, NNZs: 2821, Bias: 0.351422, T: 25023840, Avg. loss: 1.391306
Total training time: 145.66 seconds.
-- Epoch 75
Norm: 13.01, NNZs: 2821, Bias: 0.326954, T: 25362000, Avg. loss: 1.390410
Total training time: 147.73 seconds.
-- Epoch 76
Norm: 12.96, NNZs: 2821, Bias: 0.341476, T: 25700160, Avg. loss: 1.372651
Total training time: 150.02 seconds.
-- Epoch 77
Norm: 12.95, NNZs: 2821, Bias: 0.412097, T: 26038320, Avg. loss: 1.371468
Total training time: 152.65 seconds.
-- Epoch 78
Norm: 12.60, NNZs: 2821, Bias: 0.288929, T: 26376480, Avg. loss: 1.365228
Total training time: 154.74 seconds.
-- Epoch 79
Norm: 12.44, NNZ

Norm: 8.03, NNZs: 2821, Bias: 0.107186, T: 46666080, Avg. loss: 1.083326
Total training time: 279.04 seconds.
-- Epoch 139
Norm: 8.00, NNZs: 2821, Bias: 0.121836, T: 47004240, Avg. loss: 1.082462
Total training time: 281.44 seconds.
-- Epoch 140
Norm: 7.93, NNZs: 2821, Bias: 0.064066, T: 47342400, Avg. loss: 1.078781
Total training time: 283.44 seconds.
-- Epoch 141
Norm: 7.92, NNZs: 2821, Bias: 0.086882, T: 47680560, Avg. loss: 1.069384
Total training time: 285.48 seconds.
-- Epoch 142
Norm: 7.86, NNZs: 2821, Bias: 0.060372, T: 48018720, Avg. loss: 1.075510
Total training time: 287.54 seconds.
-- Epoch 143
Norm: 7.80, NNZs: 2821, Bias: 0.068482, T: 48356880, Avg. loss: 1.066829
Total training time: 289.57 seconds.
-- Epoch 144
Norm: 7.76, NNZs: 2821, Bias: 0.049538, T: 48695040, Avg. loss: 1.058411
Total training time: 291.58 seconds.
-- Epoch 145
Norm: 7.77, NNZs: 2821, Bias: 0.069865, T: 49033200, Avg. loss: 1.064975
Total training time: 293.60 seconds.
-- Epoch 146
Norm: 7.66, NNZs

In [53]:
all_true_pred = np.vstack([ np.vstack(each) for each in pairs_true_pred])
np.save('SGD_PCA.npy', all_true_pred)

# ============== 

In [52]:
from sklearn.metrics import recall_score, precision_score, f1_score, precision_recall_curve, precision_recall_curve

baccu=[]; recall = []; prec =[]; f1=[]; 
for each in range(0, 3):
    baccu.append( balanced_accuracy_score(all_true_pred[2*each,:], all_true_pred[2*each+1,:]) )
    recall.append( recall_score(all_true_pred[2*each,:], all_true_pred[2*each+1,:], average='binary') )
    prec.append( precision_score(all_true_pred[2*each,:], all_true_pred[2*each+1,:], average='binary') )
    f1.append( f1_score(all_true_pred[2*each,:], all_true_pred[2*each+1,:], average='binary') )
    
# Use AUC function to calculate the area under the curve of precision recall curve
print("baccu: ", np.mean(baccu))
print("recall: ", np.mean(recall))
print("prec: ", np.mean(prec))
print("f1: ", np.mean(f1))


baccu:  0.587755934705
recall:  0.524108643432
prec:  0.176768482007
f1:  0.26434979333


In [32]:
#========= Median positive and negative =========#

X_train, X_test, y_train, y_test, weight_train, weight_test = \
    train_test_split(X_new, y_true, sample_weight, test_size=0.33, random_state=42)

print( np.array([type(each)!=bool for each in y_true]).sum()  ) 
#class_weights = class_weight.compute_class_weight('balanced',
#                                                 np.unique(y_train),
#                                                 y_train)
#class_weights = dict(zip( np.unique(y_train), class_weights))
#class_weights[True] = class_weights[True]*1.0
##
## Let's do sample weights
#min_pos = df[ df['label'] == True]['up_votes'].min()
#max_neg = df[ df['label'] == False]['up_votes'].max()
#
#
#
#print(dict(zip( np.unique(y_train), class_weights)) )
#


0


In [33]:

# Define a simple sequential model
def create_model():
    model = Sequential()
    model.add(Dense(10, activation='relu', kernel_initializer='he_normal', input_shape=(X_train.shape[1],)))
    model.add(Dense(2, activation= 'relu', kernel_initializer='he_normal'))
    model.add(Dense(1, activation='sigmoid'))

    # compile the model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[tf.keras.metrics.BinaryAccuracy(name='binary_accuracy')])
    
    return model

# Create a basic model instance
model = create_model()

# Display the model's architecture
model.summary()


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 10)                28120     
_________________________________________________________________
dense_4 (Dense)              (None, 2)                 22        
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 3         
Total params: 28,145
Trainable params: 28,145
Non-trainable params: 0
_________________________________________________________________


In [34]:
# Callback define
patience = 3; epochs = 70;
checkpoint_filepath = './check_point/01_sim_tdidfNN_mdl.ckpt';

early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=patience, mode='max')

model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=False, monitor='val_loss', mode='max', save_best_only=True)

# fit the model
history = model.fit(X_train.toarray(), y_train, validation_data=(X_test.toarray(), y_test), \
                    epochs=epochs, batch_size=36, verbose=2, class_weight=class_weights,\
                    callbacks=[early_stopping, model_checkpoint_callback])


Epoch 1/70
INFO:tensorflow:Assets written to: ./check_point/01_sim_tdidfNN_mdl.ckpt/assets
9394/9394 - 23s - loss: 0.6653 - binary_accuracy: 0.6103 - val_loss: 0.6366 - val_binary_accuracy: 0.6348
Epoch 2/70
INFO:tensorflow:Assets written to: ./check_point/01_sim_tdidfNN_mdl.ckpt/assets
9394/9394 - 22s - loss: 0.6475 - binary_accuracy: 0.6190 - val_loss: 0.6456 - val_binary_accuracy: 0.6286
Epoch 3/70
INFO:tensorflow:Assets written to: ./check_point/01_sim_tdidfNN_mdl.ckpt/assets
9394/9394 - 23s - loss: 0.6428 - binary_accuracy: 0.6258 - val_loss: 0.6530 - val_binary_accuracy: 0.6217
Epoch 4/70
9394/9394 - 23s - loss: 0.6398 - binary_accuracy: 0.6360 - val_loss: 0.6422 - val_binary_accuracy: 0.6335
Epoch 5/70
INFO:tensorflow:Assets written to: ./check_point/01_sim_tdidfNN_mdl.ckpt/assets
9394/9394 - 23s - loss: 0.6374 - binary_accuracy: 0.6406 - val_loss: 0.6702 - val_binary_accuracy: 0.6102
Epoch 6/70
9394/9394 - 23s - loss: 0.6354 - binary_accuracy: 0.6480 - val_loss: 0.6468 - val_bi

In [35]:
# Load model and evaluate on test
model = tf.keras.models.load_model('./check_point/01_sim_tdidfNN_mdl.ckpt')
pred_test = model.predict(X_test.toarray()) > 0.5;

from sklearn.metrics import accuracy_score, confusion_matrix, balanced_accuracy_score
print( "accuracy_score: ", accuracy_score(y_test, pred_test) )
print( "balanced_accuracy_score: ", balanced_accuracy_score(y_test, pred_test) )

tn, fp, fn, tp = confusion_matrix(y_test, pred_test).ravel()
print(tn, fp, fn, tp)
print(pred_test.shape)

accuracy_score:  0.610151479674
balanced_accuracy_score:  0.609126626756
88952 56753 8179 12673
(166557, 1)


In [36]:
### Try Google trends

In [None]:
# Define a sequential model
# Use Token based text embedding trained on English Google News 7B corpus
def create_model(): 
    
    embedding = "https://tfhub.dev/google/nnlm-en-dim50/2"
    hub_layer = hub.KerasLayer(embedding, input_shape=[], 
                           dtype=tf.string, trainable=True)
    model = tf.keras.Sequential()
    model.add(hub_layer)
    model.add(tf.keras.layers.Dense(10, activation='relu'))
    model.add(tf.keras.layers.Dense(10, activation='relu'))
    model.add(tf.keras.layers.Dense(10, activation='relu'))
    model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
    
    # compile the model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[tf.keras.metrics.BinaryAccuracy(name='binary_accuracy')])
    
    return model
    
# Create a basic model instance
model = create_model()

# Display the model's architecture
model.summary()


In [None]:
# Train and Test seprate
X_train, X_test, y_train, y_test = train_test_split(df['title_clean'], y_true, test_size=0.33, random_state=42)

# Callback define
patience = 3; epochs = 70;
checkpoint_filepath = './check_point/02_pre_nnlm-en-dim50_mdl.ckpt';

early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=patience, mode='max')

model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=False, monitor='val_loss', mode='max', save_best_only=True)


# fit the model
history = model.fit(x=X_train, y=y_train, validation_data=(X_test, y_test), \
                    epochs=epochs, batch_size=64, verbose=2, sample_weight=weight_train,\
                    callbacks=[early_stopping, model_checkpoint_callback])


In [None]:
# Load model and evaluate on test
model = tf.keras.models.load_model('./check_point/02_pre_nnlm-en-dim50_mdl.ckpt')
pred_test = model.predict(X_test) > 0.5;

from sklearn.metrics import accuracy_score, confusion_matrix, balanced_accuracy_score
print( "accuracy_score: ", accuracy_score(y_test, pred_test) )
print( "balanced_accuracy_score: ", balanced_accuracy_score(y_test, pred_test) )

tn, fp, fn, tp = confusion_matrix(y_test, pred_test).ravel()
print(tn, fp, fn, tp)
print(pred_test.shape)

In [None]:
# Define a sequential model
# Use Token based text embedding trained on English Google News 7B corpus
# Use pretrain embedding

embed = hub.load("https://tfhub.dev/google/nnlm-en-dim50/2")
X_train = embed(df['title_clean'].values).numpy()

# Train and Test seprate
X_train, X_test, y_train, y_test = train_test_split(X_train, y_true, test_size=0.33, random_state=42)

# Define a simple sequential model
def create_model():
    model = Sequential()
    model.add(Dense(10, activation='relu', kernel_initializer='he_normal', input_shape=(X_train.shape[1],)))
    model.add(Dense( 2, activation= 'relu', kernel_initializer='he_normal'))
    model.add(Dense( 1, activation='sigmoid'))

    # compile the model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[tf.keras.metrics.BinaryAccuracy(name='binary_accuracy')])
    
    return model

# Create a basic model instance
model = create_model()

# Display the model's architecture
model.summary()



In [None]:

# Callback define
patience = 3; epochs = 70;
checkpoint_filepath = './check_point/03_preEmbed_nnlm-en-dim50_mdl.ckpt';

early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=patience, mode='max')

model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=False, monitor='val_loss', mode='max', save_best_only=True)


# fit the model
history = model.fit(x=X_train, y=y_train, validation_data=(X_test, y_test), \
                    epochs=epochs, batch_size=64, verbose=2, sample_weight=weight_train,\
                    callbacks=[early_stopping, model_checkpoint_callback])


In [None]:
# Load model and evaluate on test
model = tf.keras.models.load_model('./check_point/03_preEmbed_nnlm-en-dim50_mdl.ckpt')
pred_test = model.predict(X_test) > 0.5;

from sklearn.metrics import accuracy_score, confusion_matrix, balanced_accuracy_score
print( "accuracy_score: ", accuracy_score(y_test, pred_test) )
print( "balanced_accuracy_score: ", balanced_accuracy_score(y_test, pred_test) )

tn, fp, fn, tp = confusion_matrix(y_test, pred_test).ravel()
print(tn, fp, fn, tp)
print(pred_test.shape)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df['title_clean'], y_true, test_size=0.33, random_state=42)

In [None]:
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [None]:
history = model.fit(x=X_train, y=y_train,
                    epochs=150, batch_size=32, verbose=2, class_weight=class_weights,
                    validation_data=(X_test, y_test))

In [None]:
reg.score(X_tfidf, df['up_votes'])

In [None]:
reg.predict(X_tfidf).mean()


In [None]:
display(X_tfidf.sum(0).mean() )
display(df['up_votes'].mean() )

In [None]:
from sklearn.neural_network import MLPRegressor

In [None]:
#regr = MLPRegressor(random_state=1, max_iter=500).fit(X_tfidf, df['up_votes'].values)

In [None]:
print((df['up_votes'].values > df['up_votes'].values.mean()).sum())
print(df.shape)

In [None]:
y_true = (df['up_votes'].values>np.quantile( df['up_votes'].values, 0.50))

In [None]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0).fit(X_tfidf, y_true )

In [None]:
y_pred = clf.predict(X_tfidf)

In [None]:
from sklearn.metrics import classification_report

In [None]:
print(classification_report(y_true, y_pred))

In [None]:
'''
# Customize my own metrics

class BalancedAccuracy(Metric):
    def __init__(self, name="balanced_accuracy", **kwargs):
        super(BalancedAccuracy, self).__init__(name=name, **kwargs)
        self.balanced_accuracy = self.add_weight(name="ctp", initializer="zeros")

    def update_state(self, y_true, y_pred, sample_weight=None):
        y_pred = y_pred.nupmy()
        y_true = y_true.nupmy()

        value = balanced_accuracy_score(y_true, y_pred, sample_weight)
        #values = tf.multiply(values, sample_weight)
        self.balanced_accuracy.assign_add((value))

    def result(self):
        return self.balanced_accuracy

    def reset_states(self):
        # The state of the metric will be reset at the start of each epoch.
        self.balanced_accuracy.assign(0.0)
'''