## Keras MLP 

For more information about Keras, have a look [here](https://keras.io/).

**Classification**

In [None]:
# setup
%matplotlib notebook
# set this to your working directory

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

df = pd.read_pickle('sc_cases_cleaned.pkl',compression='gzip')
df=df.reset_index(drop=True)
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(min_df=0.01, # at min 1% of docs
                        max_df=.9,  
                        max_features=1000,
                        stop_words='english',
                        ngram_range=(1,3))
X = vectorizer.fit_transform(df['opinion_text'])
pd.to_pickle(X,'X.pkl')
vocab = vectorizer.get_feature_names()
pd.to_pickle(vocab,'vocab.pkl')
Y = df['x_republican']

X.shape

In [None]:
# Getting started with Keras
from keras.models import Sequential
from keras.layers import Dense

# Number of input columns fed to the first layer.
n_features = X.shape[1]

# Two hidden ReLU layers of 50 units followed by a single sigmoid output unit.
model = Sequential([
    Dense(50, input_dim=n_features, activation='relu'),  # first hidden layer
    Dense(50, activation='relu'),                        # second hidden layer
    Dense(1, activation='sigmoid'),                      # output layer
])
model.summary()

In [None]:
# Visualize a model

# Requires graphviz!

!pip install pydot
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot
dot = model_to_dot(model,
                   show_shapes=True,
                   show_layer_names=False,
                   dpi=70)
SVG(dot.create(prog='dot', format='svg'))

In [None]:
# Configure training for the binary classifier.
compile_options = dict(
    loss='binary_crossentropy',  # cost function
    optimizer='adam',            # use adam as the optimizer
    metrics=['accuracy'],        # compute accuracy, for scoring
)
model.compile(**compile_options)



In [None]:
# Train for 5 epochs on the dense feature matrix, holding out 20% of the rows
# for validation; the returned history object is kept for plotting.
X_dense = X.todense()
model_info = model.fit(X_dense, Y, epochs=5, validation_split=.2)

In [None]:
# Display the model's current parameters (the learned coefficients).
learned_weights = model.get_weights()
learned_weights

In [None]:
# Plot training and validation accuracy against the epoch index.
for metric_key in ('accuracy', 'val_accuracy'):
    plt.plot(model_info.epoch, model_info.history[metric_key])
plt.legend(['train', 'val'], loc='best')

In [None]:
# Predicted probabilities from the sigmoid output, then hard 0/1 labels
# obtained by thresholding them.
threshold = .5
Ypred_prob = model.predict(X.todense())
Ypred = (Ypred_prob > threshold).astype(float)

# Preview the first five probabilities, then the first five labels.
for preview in (Ypred_prob, Ypred):
    print(preview.squeeze()[:5])

In [None]:
# Save a model
# NOTE: despite the .pkl suffix, the file is written by Keras' own
# model.save(), not by pickle.
saved_model_path = 'keras-clf.pkl'
model.save(saved_model_path)

In [None]:
# Restore the classifier from disk, replacing the in-memory `model`.
from keras.models import load_model

saved_model_path = 'keras-clf.pkl'
model = load_model(saved_model_path)

**Regression**

In [None]:
# Regression model with R-squared
# Target for the regression example.
Yreg = df['log_cite_count']

# Same MLP shape as before, but the output unit has no activation so it can
# emit an unbounded real value.
model = Sequential([
    Dense(100, input_dim=X.shape[1], activation='relu'),  # first hidden layer
    Dense(50, activation='relu'),                         # second hidden layer
    Dense(1),                                             # linear output layer
])

from keras import backend as K
def r2(y_true, y_pred):
    """Coefficient of determination computed with Keras backend ops.

    Args:
        y_true: tensor of observed targets.
        y_pred: tensor of model predictions, compatible in shape with y_true.

    Returns:
        1 - SS_res / (SS_tot + epsilon); the backend epsilon keeps the
        division defined when y_true is constant.
    """
    residual_ss = K.sum(K.square(y_true - y_pred))
    centered = y_true - K.mean(y_true)
    total_ss = K.sum(K.square(centered))
    return 1 - residual_ss / (total_ss + K.epsilon())

from sklearn.metrics import r2_score

# Mean squared error loss, with the custom r2 metric reported during training.
regression_compile = dict(
    loss='mean_squared_error',  # cost function
    optimizer='adam',           # use adam as the optimizer
    metrics=[r2],               # compute r-squared
)
model.compile(**regression_compile)
model_info = model.fit(X.todense(), Yreg, epochs=15)

# In-sample predictions, flattened for comparison with the target.
Ypred = model.predict(X.todense())
Ypred_flat = Ypred.squeeze()

print(Yreg[:5], Ypred_flat[:5])
r2_score(Yreg, Ypred_flat)


In [None]:
%matplotlib inline

# Plot performance by epoch
plt.plot(model_info.epoch,model_info.history['r2'])
plt.legend(['train', 'val'], loc='best')


## Autoencoders

Autoencoders are neural nets that perform domain-specific lossy compression.

In [None]:
# Autoencoder

from keras.models import Sequential
from keras.layers import Dense

n_inputs = X.shape[1]

# Encoder narrows n_inputs -> 100 -> 25; decoder widens 25 -> 100 -> n_inputs.
model = Sequential([
    Dense(100, input_dim=n_inputs, activation='relu'),         # first compression layer
    Dense(25, activation='relu', name="compression_layer"),    # final compression layer
    Dense(100, activation='relu'),                             # first reconstruction layer
    Dense(n_inputs, activation='relu'),                        # final reconstruction layer
])
model.summary()

In [None]:
# Visualize a model

# Requires graphviz
from IPython.display import SVG
# Import from the public keras.utils namespace; the keras.utils.vis_utils
# module path is not part of the public API and is absent in Keras 3.
from keras.utils import model_to_dot

# Build the graphviz description of the model, then render it as inline SVG.
dot = model_to_dot(model,
                   show_shapes=True,
                   show_layer_names=False,
                   dpi=70)
SVG(dot.create(prog='dot', format='svg'))

In [None]:
# Fit the autoencoder: the input matrix is also the reconstruction target.
autoencoder_compile = dict(
    loss='mean_squared_error',  # cost function
    optimizer='adam',           # use adam as the optimizer
    metrics=[r2],               # report r-squared (r2 is defined in the regression cell)
)
model.compile(**autoencoder_compile)

model_info = model.fit(
    X.todense(),  # inputs
    X.todense(),  # targets: reconstruct the inputs
    epochs=10,
    validation_split=.2,
)

In [None]:
# compress the data

import keras

# Sub-model that shares the trained autoencoder's input and stops at the
# bottleneck layer, so its output is the compressed representation.
bottleneck_output = model.get_layer("compression_layer").output
compression_model = keras.Model(inputs=model.input, outputs=bottleneck_output)

X_compressed = compression_model(X.todense())
print(X_compressed.shape)

In [None]:
#%% PCA Viz

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# visualize X and X_compressed
from sklearn.decomposition import PCA
pca = PCA(n_components=3,svd_solver='randomized')
Xpca = pca.fit_transform(X.todense())
print(pca.explained_variance_ratio_)

sns.scatterplot(
    x=Xpca[:,0], y=Xpca[:,1],
    hue=Y,
    palette=sns.color_palette("hls", len(set(Y))), alpha=0.3)

plt.show()

In [None]:
# Same PCA view as for X, this time on the autoencoder's compressed output.
from sklearn.decomposition import PCA

pca = PCA(n_components=3, svd_solver='randomized')
Xpca = pca.fit_transform(X_compressed)
print(pca.explained_variance_ratio_)

# One colour per distinct label value.
class_palette = sns.color_palette("hls", len(set(Y)))
first_component = Xpca[:, 0]
second_component = Xpca[:, 1]

sns.scatterplot(x=first_component, y=second_component,
                hue=Y, palette=class_palette, alpha=0.3)
plt.show()

## Embedding Lookup

Keras provides functionality to feed just words (actually indices of words) as model input. The model then performs an embedding lookup (we go from sparse one-hot to dense) which then becomes the input for further computation in the model. For a more detailed tutorial, have a look [here](https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/). 

First, we have to pre-process the data once again.

In [None]:
from keras.preprocessing.text import text_to_word_sequence

# Tokenize every opinion into a list of words (one list per document).
tokenized = list(map(text_to_word_sequence, df["opinion_text"]))

# Peek at the first 50 tokens of the first document.
first_document = tokenized[0]
print(first_document[:50])


In [None]:
from collections import Counter

# Corpus-wide token frequencies across all tokenized documents.
counter = Counter(token for document in tokenized for token in document)
print(counter.most_common(10))

# Vocabulary size = number of distinct tokens.
num_words = len(counter)
print(num_words)  ## 58'787

# Length of the longest document, in tokens.
longest_document = max(map(len, tokenized))
print(longest_document)  # 26'097, this is one of the challenges of working with legal text

In [None]:
# Map each word of each opinion to an integer index below length_vocab.
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences

length_vocab = 10000
X_one_hot = []
for opinion in df["opinion_text"]:
    X_one_hot.append(one_hot(opinion, n=length_vocab))

# First 50 word indices of the first opinion.
print(X_one_hot[0][:50])

In [None]:
# Bring every index sequence to the same length: longer ones are cut at the
# end, shorter ones are padded at the end.
max_seq_length = 2000
padding_options = dict(padding='post', truncating='post', maxlen=max_seq_length)
X_one_hot_padded = pad_sequences(X_one_hot, **padding_options)
X_one_hot_padded.shape  # (768, 2000)

**Embedding lookup**

In [None]:
from keras.layers import Embedding

# A model consisting only of the embedding lookup: each of the length_vocab
# word indices maps to a 64-dimensional vector.
lookup_layer = Embedding(length_vocab, 64,
                         input_length=max_seq_length,
                         name="embedding_layer")
model = Sequential([lookup_layer])
model.summary()  # 640'000 params because 64 dim for 10'000 words

# that's it


## LSTM in keras

Because we have an embedding lookup now, we can train an LSTM.


In [None]:
from keras.layers import LSTM

# Embedding lookup -> LSTM -> dense ReLU layer -> single sigmoid output.
model = Sequential([
    Embedding(length_vocab, 32, input_length=max_seq_length, name="embedding_layer"),
    LSTM(32),
    Dense(32, activation="relu"),
    Dense(1, activation="sigmoid"),  # output layer
])
model.summary()


In [None]:
from IPython.display import SVG
# Import from the public keras.utils namespace; the keras.utils.vis_utils
# module path is not part of the public API and is absent in Keras 3.
from keras.utils import model_to_dot

# Build the graphviz description of the model (requires graphviz), then
# render it as inline SVG.
dot = model_to_dot(model,
                   show_shapes=True,
                   show_layer_names=False,
                   dpi=70)
SVG(dot.create(prog='dot', format='svg'))

In [None]:
# Compile and train the LSTM classifier on the padded index sequences.
lstm_compile = dict(
    loss='binary_crossentropy',  # cost function
    optimizer='adam',            # use adam as the optimizer
    metrics=['accuracy'],        # compute accuracy, for scoring
)
model.compile(**lstm_compile)

model_info = model.fit(X_one_hot_padded, Y,
                       epochs=3, batch_size=32, validation_split=.2)

**Text Vectorization Layer** <br>
More details are available [here](https://keras.io/api/layers/preprocessing_layers/core_preprocessing_layers/text_vectorization/).

In [None]:
from keras.layers.experimental.preprocessing import TextVectorization
import tensorflow as tf
from keras.layers import LSTM

max_features = 10000  # Maximum vocab size.
max_len = 2000  # Sequence length to pad the outputs to.

# Text-only dataset built from the raw opinion strings.
text_dataset = tf.data.Dataset.from_tensor_slices(df["opinion_text"])

# Create the layer: it turns strings into fixed-length integer sequences.
vectorize_layer = TextVectorization(max_tokens=max_features,
                                    output_mode='int',
                                    output_sequence_length=max_len)

# Now that the vocab layer has been created, call `adapt` on the text-only
# dataset to create the vocabulary. You don't have to batch, but for large
# datasets this means we're not keeping spare copies of the dataset.
batched_text = text_dataset.batch(64)
vectorize_layer.adapt(batched_text)


In [None]:
# End-to-end model that takes raw strings: the adapted vectorization layer
# converts text to indices before the embedding lookup and the LSTM.
model = tf.keras.models.Sequential()
pipeline_layers = (
    tf.keras.Input(shape=(1,), dtype=tf.string),
    vectorize_layer,
    Embedding(max_features, 64, name="embedding_layer"),
    LSTM(64),
    Dense(64, activation="relu"),
    Dense(1, activation="sigmoid"),  # output layer
)
for pipeline_layer in pipeline_layers:
    model.add(pipeline_layer)
model.summary()

string_model_compile = dict(
    loss='binary_crossentropy',  # cost function
    optimizer='adam',            # use adam as the optimizer
    metrics=['accuracy'],        # compute accuracy, for scoring
)
model.compile(**string_model_compile)

# Train directly on the raw opinion text.
model_info = model.fit(df["opinion_text"], Y,
                       epochs=3, batch_size=32, validation_split=.2)


**Deep learning tips, tricks and advanced features**

In [None]:
# Set up a basic model again for advanced features.
from keras.models import Sequential
from keras.layers import Activation, Dense

model = Sequential()
# First layer: fixes the input width and uses a non-default activation.
first_layer = Dense(64, input_dim=1000, activation='gelu')
model.add(first_layer)


In [None]:
# initializers: add one 64-unit layer per kernel initialisation scheme
for initializer_name in ('he_normal', 'he_uniform'):
    model.add(Dense(64, kernel_initializer=initializer_name))


In [None]:
# other activation functions (https://keras.io/activations/)
elu_layer = Dense(64, activation="elu")
model.add(elu_layer)

In [None]:
# batch normalization
# Import from keras.layers: the keras.layers.normalization path fails with
# ImportError on newer Keras releases, while keras.layers works on old and new.
from keras.layers import BatchNormalization

# Linear layer without bias, then normalisation, then the non-linearity.
model.add(Dense(64, use_bias=False))
model.add(BatchNormalization())
model.add(Activation('relu'))

In [None]:
# regularization
from keras.regularizers import l1, l2, l1_l2

# L2 penalty on the kernel weights, L1 penalty on the layer's activations.
l2_kernel_layer = Dense(64,
                        kernel_regularizer=l2(0.01),
                        activity_regularizer=l1(0.01))
# Combined L1 + L2 penalty on both the kernel weights and the activations.
elastic_layer = Dense(64,
                      kernel_regularizer=l1_l2(l1=0.01, l2=.01),
                      activity_regularizer=l1_l2(l1=0.01, l2=.01))

model.add(l2_kernel_layer)
model.add(elastic_layer)

In [None]:
# Dropout
from keras.layers import Dropout

# Dropout layer with a rate of 0.5.
dropout_rate = 0.5
model.add(Dropout(dropout_rate))

In [None]:
# Single sigmoid unit as the output layer of the demo model.
output_layer = Dense(1, activation='sigmoid')
model.add(output_layer)

In [None]:
# Print an overview of the layers stacked in the cells above.
model.summary()

In [None]:
# Optimizers and loss functions
# Each compile() call replaces the previous configuration, so only the LAST
# call in this cell is in effect when the model is trained in later cells.

# different loss functions
# categorical_crossentropy expects one-hot multi-class targets. It is shown for
# illustration only: with this model's single sigmoid output it is degenerate,
# so it must not be the configuration left active for training.
model.compile(optimizer='sgd',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Optimizers
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Final configuration (used by the fits below): binary cross-entropy matches
# the binary target and the sigmoid output.
model.compile(optimizer='sgd',
              loss='binary_crossentropy',
              metrics=['accuracy'])


In [None]:
# Early stopping
from keras.callbacks import EarlyStopping

# Stop when validation accuracy has not improved by at least min_delta for
# `patience` consecutive epochs.
stopping_rule = dict(monitor='val_accuracy',
                     min_delta=0.0001,
                     patience=5,
                     mode='auto')
earlystop = EarlyStopping(**stopping_rule)

# 100 epochs is an upper bound; the callback may end training earlier.
model.fit(X.todense(), Y,
          batch_size=128,
          epochs=100,
          validation_split=0.2,
          callbacks=[earlystop])

In [None]:
# Batch Training with Large Data
from numpy import memmap

# 'X.pkl' holds a *pickled sparse matrix*, not a raw array dump, so mapping it
# directly would feed pickle bytes (as uint8) to the model. Write the dense
# features to a flat binary file first and memory-map that file instead.
X_dense = X.toarray().astype('float32')
X_mm_writer = memmap('X_dense.dat', dtype='float32', mode='w+', shape=X_dense.shape)
X_mm_writer[:] = X_dense
X_mm_writer.flush()  # make sure the data is on disk before re-opening
del X_mm_writer

# Re-open read-only with the same dtype and shape that were written.
X_mm = memmap('X_dense.dat', dtype='float32', mode='r', shape=X_dense.shape)

model.fit(X_mm, Y, batch_size=128,
           epochs=3,
           validation_split=0.2)

In [None]:
# Grid search with KerasClassifier


from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV

# instantiate KerasClassifier with build function
def create_model(hidden_layers=1, input_dim=1000):
    """Build and compile a small binary classifier for use with KerasClassifier.

    Args:
        hidden_layers: number of extra 8-unit ReLU layers inserted between the
            16-unit input layer and the sigmoid output (0 adds none).
        input_dim: number of input features; defaults to 1000, the width that
            was previously hard-coded.

    Returns:
        A compiled Sequential model (binary cross-entropy loss, adam optimizer,
        accuracy metric).
    """
    model = Sequential()
    model.add(Dense(16, input_dim=input_dim,
                    activation='relu'))
    # The loop index is unused; only the repetition count matters.
    for _ in range(hidden_layers):
        model.add(Dense(8, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model

# Wrap the build function so scikit-learn can treat it as an estimator.
clf = KerasClassifier(build_fn=create_model)

# Cross-validated grid search over the number of hidden layers.
params = {'hidden_layers': list(range(4))}
grid = GridSearchCV(estimator=clf, param_grid=params)
grid.fit(X.todense(), Y)
grid.best_params_