# Importing Packages 

In [6]:
#============ Importing Packages ============# 

#--------- Drawing Packages ---------#

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import (MultipleLocator, FormatStrFormatter, AutoMinorLocator, NullFormatter, LogLocator)
from set_size import set_size
from collections import Counter

#--------- Tensorflow Packages ---------#
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_datasets as tfds
from sklearn.model_selection import train_test_split
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from sklearn.metrics import f1_score
from sklearn.utils import class_weight
from tensorflow.keras.metrics import Metric
from sklearn.metrics import balanced_accuracy_score

#============== Packages for word2vec ==============#
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

#============== Packages for classification ==============#
from sklearn.linear_model import LinearRegression, PoissonRegressor
from sklearn.preprocessing import normalize
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score
from sklearn.svm import SVC
import keras
#--------- Utilities Packages ---------#

import sys
print(sys.executable)
import os
import re
import pdb
import shelve
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix, hstack
from tqdm import tqdm
from sklearn.utils import class_weight
import enchant

import nltk
import obspy

#nltk.download('wordnet')
#nltk.download('stopwords')
#nltk.download('word_tokenize')
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk.corpus import words as dict_w
from nltk.stem.porter import PorterStemmer

# Scipy Signal
from scipy import signal

# Detrend the Signal
from obspy.signal.detrend import polynomial

#--------- Remove Warnings ---------#
import warnings
warnings.filterwarnings("ignore")


/home/chiangwe/anaconda3/envs/NetHawkes/bin/python


Using TensorFlow backend.


In [7]:
#========= Read in =========#
df = pd.read_csv('Eluvio_DS_Challenge_processes.csv')
#display( df.sort_values('up_votes', ascending=False).head(5)['title'].values )

class_weights = class_weight.compute_class_weight('balanced',  np.unique(df['label']), df['label'])
class_weights = dict(zip( np.unique(df['label']), class_weights))


df = df[ df['title_clean'].apply(lambda x: type(x)==str) ] 
y_true = df['label']


In [8]:
#========= TDIDF =========#

#print(  df['title_clean'].apply(lambda x: type(x)!=str ).sum()  )
bow_converter = CountVectorizer()
x = bow_converter.fit_transform(df['title_clean'])

words = bow_converter.get_feature_names()

bigram_converter = CountVectorizer(tokenizer=lambda doc: doc, ngram_range=[2,2]) 
trigram_converter = CountVectorizer(tokenizer=lambda doc: doc, ngram_range=[3,3])

tfidf_transform = TfidfTransformer(norm=None)
X_tfidf = tfidf_transform.fit_transform(x)

X_tfidf = normalize(X_tfidf,axis=1)

#========= ===  =========#


In [9]:
# Parameters
params = {'dim': ( X_tfidf.shape[1], ),
          'batch_size': 32,
          'n_classes': 2,
          'n_channels': 1,
          'shuffle': True}

class DataGenerator(keras.utils.Sequence):
    'Generates data for Keras'
    def __init__(self, list_IDs, labels, batch_size=32, dim=(32,32,32), n_channels=1,
                 n_classes=2, shuffle=True):
        'Initialization'
        self.dim = dim
        self.batch_size = batch_size
        self.labels = labels
        self.list_IDs = list_IDs
        self.n_channels = n_channels
        self.n_classes = n_classes
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        'Denotes the number of batches per epoch'
        return int(np.floor(len(self.list_IDs) / self.batch_size))

    def __getitem__(self, index):
        'Generate one batch of data'
        # Generate indexes of the batch
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        # Find list of IDs
        list_IDs_temp = [self.list_IDs[k] for k in indexes]

        # Generate data
        X, y = self.__data_generation(list_IDs_temp)

        return X, y

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle == True:
            np.random.shuffle(self.indexes)

    def __data_generation(self, list_IDs_temp):
        'Generates data containing batch_size samples' # X : (n_samples, *dim, n_channels)
        # Initialization
        X = np.empty((self.batch_size, self.dim[0] ))
        y = np.empty((self.batch_size), dtype=int)

        # Generate data
        for i, ID in enumerate(list_IDs_temp):
            # Store sample
            # Or load in here
            X[i,] =  X_tfidf[i, :]

            # Store class
            y[i] = self.labels[ID]

        return X, keras.utils.to_categorical(y, num_classes=self.n_classes)

In [10]:
def create_model(input_shape):
    model = Sequential()
    model.add(Dense(20, activation='relu', kernel_initializer='he_normal', input_shape=(input_shape,)))
    model.add(Dense(10, activation= 'relu', kernel_initializer='he_normal'))
    model.add(Dense(1, activation='sigmoid'))

    # compile the model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[tf.keras.metrics.BinaryAccuracy(name='binary_accuracy')])
    
    return model

In [5]:
# Define a simple sequential model
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.decomposition import PCA

pairs_true_pred = []
for each_seed in [42, 50, 123]:
    X_train, X_test, y_train, y_test, list_tr, list_te = \
        train_test_split(X_tfidf, y_true, range(0, X_tfidf.shape[1]), test_size=0.33, random_state=each_seed)
    
    # Create a basic model instance
    model = create_model(X_train.shape[1])

    # Display the model's architecture
    #model.summary()
    
    # Callback define
    patience = 3; epochs = 70;
    checkpoint_filepath = './check_point/01_sim_tdidfNN_mdl.ckpt';

    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=patience, mode='max')

    
    # Generators
    training_generator = DataGenerator(list_tr, y_train, **params)
    validation_generator = DataGenerator(list_te, y_train, **params)


    model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_filepath,
        save_weights_only=False, monitor='val_loss', mode='max', save_best_only=True)

    # fit the model
    history = model.fit_generator(training_generator, y_train, validation_data=validation_generator, \
                    epochs=epochs, batch_size=36, verbose=2, class_weight=class_weights,\
                    callbacks=[early_stopping, model_checkpoint_callback])

    # Load model and evaluate on test
    model = tf.keras.models.load_model('./check_point/01_sim_tdidfNN_mdl.ckpt')
    pred_test = model.predict(X_test.toarray()) > 0.5;
    
    pairs_true_pred.append([y_test, pred_test])

Epoch 1/70
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: ./check_point/01_sim_tdidfNN_mdl.ckpt/assets
9394/9394 - 37s - loss: 0.6702 - binary_accuracy: 0.5861 - val_loss: 0.6384 - val_binary_accuracy: 0.6451
Epoch 2/70
9394/9394 - 34s - loss: 0.6464 - binary_accuracy: 0.6134 - val_loss: 0.6135 - val_binary_accuracy: 0.6678
Epoch 3/70
INFO:tensorflow:Assets written to: ./check_point/01_sim_tdidfNN_mdl.ckpt/assets
9394/9394 - 34s - loss: 0.6339 - binary_accuracy: 0.6429 - val_loss: 0.6611 - val_binary_accuracy: 0.6136
Epoch 4/70
9394/9394 - 34s - loss: 0.6203 - binary_accuracy: 0.6591 - val_loss: 0.6543 - val_binary_accuracy: 0.6052
Epoch 5/70
INFO:tensorflow:Assets written to: ./check_point/01_sim_tdidfNN_mdl.ckpt/assets
9394/9394 - 34s - loss: 0.6050 - binary_accuracy: 0.6695 - val_loss: 0.6654 - val_binary_accuracy: 0.6057
Epoch 6/70
INFO:tensorflow:Assets written to: ./check_point/01_sim_tdidfNN_mdl.ckpt/assets
939

In [8]:

all_true_pred = np.hstack([ np.hstack([np.expand_dims(each[0], 1), each[1]]) for each in pairs_true_pred]).T
print(all_true_pred.shape)
np.save('SGD_NN_PCA_orig.npy', all_true_pred)

(6, 166557)


In [9]:
from sklearn.metrics import recall_score, precision_score, f1_score, precision_recall_curve, precision_recall_curve
print(all_true_pred.shape)
baccu=[]; recall = []; prec =[]; f1=[]; 
for each in range(0, 3):
    baccu.append( balanced_accuracy_score(all_true_pred[2*each,:], all_true_pred[2*each+1,:]) )
    recall.append( recall_score(all_true_pred[2*each,:], all_true_pred[2*each+1,:], average='binary') )
    prec.append( precision_score(all_true_pred[2*each,:], all_true_pred[2*each+1,:], average='binary') )
    f1.append( f1_score(all_true_pred[2*each,:], all_true_pred[2*each+1,:], average='binary') )
    
# Use AUC function to calculate the area under the curve of precision recall curve
print(baccu)
print("baccu: ", np.mean(baccu))
print("recall: ", np.mean(recall))
print("prec: ", np.mean(prec))
print("f1: ", np.mean(f1))


(6, 166557)
[0.57176108886647581, 0.6019019051266763, 0.60190144160449544]
baccu:  0.591854811866
recall:  0.598351091917
prec:  0.170759413345
f1:  0.265585035887


In [None]:
all_true_pred = np.vstack([ np.vstack(each) for each in pairs_true_pred])
np.save('SGD_PCA.npy', all_true_pred)

# ============== 

In [None]:
from sklearn.metrics import recall_score, precision_score, f1_score, precision_recall_curve, precision_recall_curve

baccu=[]; recall = []; prec =[]; f1=[]; 
for each in range(0, 3):
    baccu.append( balanced_accuracy_score(all_true_pred[2*each,:], all_true_pred[2*each+1,:]) )
    recall.append( recall_score(all_true_pred[2*each,:], all_true_pred[2*each+1,:], average='binary') )
    prec.append( precision_score(all_true_pred[2*each,:], all_true_pred[2*each+1,:], average='binary') )
    f1.append( f1_score(all_true_pred[2*each,:], all_true_pred[2*each+1,:], average='binary') )
    
# Use AUC function to calculate the area under the curve of precision recall curve
print("baccu: ", np.mean(baccu))
print("recall: ", np.mean(recall))
print("prec: ", np.mean(prec))
print("f1: ", np.mean(f1))


In [None]:
#========= Median positive and negative =========#

X_train, X_test, y_train, y_test, weight_train, weight_test = \
    train_test_split(X_new, y_true, sample_weight, test_size=0.33, random_state=42)

print( np.array([type(each)!=bool for each in y_true]).sum()  ) 
#class_weights = class_weight.compute_class_weight('balanced',
#                                                 np.unique(y_train),
#                                                 y_train)
#class_weights = dict(zip( np.unique(y_train), class_weights))
#class_weights[True] = class_weights[True]*1.0
##
## Let's do sample weights
#min_pos = df[ df['label'] == True]['up_votes'].min()
#max_neg = df[ df['label'] == False]['up_votes'].max()
#
#
#
#print(dict(zip( np.unique(y_train), class_weights)) )
#


In [None]:

# Define a simple sequential model
def create_model():
    model = Sequential()
    model.add(Dense(10, activation='relu', kernel_initializer='he_normal', input_shape=(X_train.shape[1],)))
    model.add(Dense(2, activation= 'relu', kernel_initializer='he_normal'))
    model.add(Dense(1, activation='sigmoid'))

    # compile the model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[tf.keras.metrics.BinaryAccuracy(name='binary_accuracy')])
    
    return model

# Create a basic model instance
model = create_model()

# Display the model's architecture
model.summary()


In [None]:
# Callback define
patience = 3; epochs = 70;
checkpoint_filepath = './check_point/01_sim_tdidfNN_mdl.ckpt';

early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=patience, mode='max')

model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=False, monitor='val_loss', mode='max', save_best_only=True)

# fit the model
history = model.fit(X_train.toarray(), y_train, validation_data=(X_test.toarray(), y_test), \
                    epochs=epochs, batch_size=36, verbose=2, class_weight=class_weights,\
                    callbacks=[early_stopping, model_checkpoint_callback])


In [None]:
# Load model and evaluate on test
model = tf.keras.models.load_model('./check_point/01_sim_tdidfNN_mdl.ckpt')
pred_test = model.predict(X_test.toarray()) > 0.5;

from sklearn.metrics import accuracy_score, confusion_matrix, balanced_accuracy_score
print( "accuracy_score: ", accuracy_score(y_test, pred_test) )
print( "balanced_accuracy_score: ", balanced_accuracy_score(y_test, pred_test) )

tn, fp, fn, tp = confusion_matrix(y_test, pred_test).ravel()
print(tn, fp, fn, tp)
print(pred_test.shape)

In [None]:
### Try Google trends

In [None]:
# Define a sequential model
# Use Token based text embedding trained on English Google News 7B corpus
def create_model(): 
    
    embedding = "https://tfhub.dev/google/nnlm-en-dim50/2"
    hub_layer = hub.KerasLayer(embedding, input_shape=[], 
                           dtype=tf.string, trainable=True)
    model = tf.keras.Sequential()
    model.add(hub_layer)
    model.add(tf.keras.layers.Dense(10, activation='relu'))
    model.add(tf.keras.layers.Dense(10, activation='relu'))
    model.add(tf.keras.layers.Dense(10, activation='relu'))
    model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
    
    # compile the model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[tf.keras.metrics.BinaryAccuracy(name='binary_accuracy')])
    
    return model
    
# Create a basic model instance
model = create_model()

# Display the model's architecture
model.summary()


In [None]:
# Train and Test seprate
X_train, X_test, y_train, y_test = train_test_split(df['title_clean'], y_true, test_size=0.33, random_state=42)

# Callback define
patience = 3; epochs = 70;
checkpoint_filepath = './check_point/02_pre_nnlm-en-dim50_mdl.ckpt';

early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=patience, mode='max')

model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=False, monitor='val_loss', mode='max', save_best_only=True)


# fit the model
history = model.fit(x=X_train, y=y_train, validation_data=(X_test, y_test), \
                    epochs=epochs, batch_size=64, verbose=2, sample_weight=weight_train,\
                    callbacks=[early_stopping, model_checkpoint_callback])


In [None]:
# Load model and evaluate on test
model = tf.keras.models.load_model('./check_point/02_pre_nnlm-en-dim50_mdl.ckpt')
pred_test = model.predict(X_test) > 0.5;

from sklearn.metrics import accuracy_score, confusion_matrix, balanced_accuracy_score
print( "accuracy_score: ", accuracy_score(y_test, pred_test) )
print( "balanced_accuracy_score: ", balanced_accuracy_score(y_test, pred_test) )

tn, fp, fn, tp = confusion_matrix(y_test, pred_test).ravel()
print(tn, fp, fn, tp)
print(pred_test.shape)

In [None]:
# Define a sequential model
# Use Token based text embedding trained on English Google News 7B corpus
# Use pretrain embedding

embed = hub.load("https://tfhub.dev/google/nnlm-en-dim50/2")
X_train = embed(df['title_clean'].values).numpy()

# Train and Test seprate
X_train, X_test, y_train, y_test = train_test_split(X_train, y_true, test_size=0.33, random_state=42)

# Define a simple sequential model
def create_model():
    model = Sequential()
    model.add(Dense(10, activation='relu', kernel_initializer='he_normal', input_shape=(X_train.shape[1],)))
    model.add(Dense( 2, activation= 'relu', kernel_initializer='he_normal'))
    model.add(Dense( 1, activation='sigmoid'))

    # compile the model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[tf.keras.metrics.BinaryAccuracy(name='binary_accuracy')])
    
    return model

# Create a basic model instance
model = create_model()

# Display the model's architecture
model.summary()



In [None]:

# Callback define
patience = 3; epochs = 70;
checkpoint_filepath = './check_point/03_preEmbed_nnlm-en-dim50_mdl.ckpt';

early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=patience, mode='max')

model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=False, monitor='val_loss', mode='max', save_best_only=True)


# fit the model
history = model.fit(x=X_train, y=y_train, validation_data=(X_test, y_test), \
                    epochs=epochs, batch_size=64, verbose=2, sample_weight=weight_train,\
                    callbacks=[early_stopping, model_checkpoint_callback])


In [None]:
# Load model and evaluate on test
model = tf.keras.models.load_model('./check_point/03_preEmbed_nnlm-en-dim50_mdl.ckpt')
pred_test = model.predict(X_test) > 0.5;

from sklearn.metrics import accuracy_score, confusion_matrix, balanced_accuracy_score
print( "accuracy_score: ", accuracy_score(y_test, pred_test) )
print( "balanced_accuracy_score: ", balanced_accuracy_score(y_test, pred_test) )

tn, fp, fn, tp = confusion_matrix(y_test, pred_test).ravel()
print(tn, fp, fn, tp)
print(pred_test.shape)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df['title_clean'], y_true, test_size=0.33, random_state=42)

In [None]:
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [None]:
history = model.fit(x=X_train, y=y_train,
                    epochs=150, batch_size=32, verbose=2, class_weight=class_weights,
                    validation_data=(X_test, y_test))

In [None]:
reg.score(X_tfidf, df['up_votes'])

In [None]:
reg.predict(X_tfidf).mean()


In [None]:
display(X_tfidf.sum(0).mean() )
display(df['up_votes'].mean() )

In [None]:
from sklearn.neural_network import MLPRegressor

In [None]:
#regr = MLPRegressor(random_state=1, max_iter=500).fit(X_tfidf, df['up_votes'].values)

In [None]:
print((df['up_votes'].values > df['up_votes'].values.mean()).sum())
print(df.shape)

In [None]:
y_true = (df['up_votes'].values>np.quantile( df['up_votes'].values, 0.50))

In [None]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0).fit(X_tfidf, y_true )

In [None]:
y_pred = clf.predict(X_tfidf)

In [None]:
from sklearn.metrics import classification_report

In [None]:
print(classification_report(y_true, y_pred))

In [None]:
'''
# Customize my own metrics

class BalancedAccuracy(Metric):
    def __init__(self, name="balanced_accuracy", **kwargs):
        super(BalancedAccuracy, self).__init__(name=name, **kwargs)
        self.balanced_accuracy = self.add_weight(name="ctp", initializer="zeros")

    def update_state(self, y_true, y_pred, sample_weight=None):
        y_pred = y_pred.nupmy()
        y_true = y_true.nupmy()

        value = balanced_accuracy_score(y_true, y_pred, sample_weight)
        #values = tf.multiply(values, sample_weight)
        self.balanced_accuracy.assign_add((value))

    def result(self):
        return self.balanced_accuracy

    def reset_states(self):
        # The state of the metric will be reset at the start of each epoch.
        self.balanced_accuracy.assign(0.0)
'''