## Keras MLP 

For more information about Keras, see the [official documentation](https://keras.io/).

**Classification**

In [None]:
# setup
%matplotlib notebook
# set this to your working directory

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

df = pd.read_pickle('sc_cases_cleaned.pkl',compression='gzip')
df=df.reset_index(drop=True)
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(min_df=0.01, # at min 1% of docs
                        max_df=.9,  
                        max_features=1000,
                        stop_words='english',
                        ngram_range=(1,3))
X = vectorizer.fit_transform(df['opinion_text'])
pd.to_pickle(X,'X.pkl')
vocab = vectorizer.get_feature_names()
pd.to_pickle(vocab,'vocab.pkl')
Y = df['x_republican']

X.shape

In [None]:
# Getting started with Keras
from keras.models import Sequential
from keras.layers import Dense

# Declare the whole layer stack up front instead of adding layers one by one.
n_features = X.shape[1]  # one input per column of the feature matrix
model = Sequential([
    Dense(50, input_dim=n_features, activation='relu'),  # first hidden layer, 50 units
    Dense(50, activation='relu'),                        # second hidden layer
    Dense(1, activation='sigmoid'),                      # single sigmoid output unit
])
model.summary()

In [None]:
# Visualize a model

# Requires graphviz!

!pip install pydot
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot
from keras.utils.vis_utils import plot_model
plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)

In [None]:
# Configure training for the binary classifier.
model.compile(
    optimizer='adam',            # Adam optimizer
    loss='binary_crossentropy',  # cost function being minimized
    metrics=['accuracy'],        # also report accuracy while training
)



In [None]:
# Train for 5 epochs on the densified features, holding out 20% of the
# samples for validation; the returned object keeps the per-epoch metrics.
X_dense = X.todense()
model_info = model.fit(X_dense, Y, epochs=5, validation_split=.2)

In [None]:
# Learned parameters of the network, returned by Keras as a list of arrays.
learned_weights = model.get_weights()
learned_weights

In [None]:
# Plot performance by epoch
# Use a dedicated figure/axes: with the interactive `%matplotlib notebook`
# backend, bare plt.plot calls draw into whichever figure is currently active,
# so re-running the cell would keep adding lines to it.
fig, ax = plt.subplots()
ax.plot(model_info.epoch, model_info.history['accuracy'], label='train')
ax.plot(model_info.epoch, model_info.history['val_accuracy'], label='val')
ax.set_xlabel('epoch')
ax.set_ylabel('accuracy')
ax.set_title('Classifier accuracy by epoch')
ax.legend(loc='best');

In [None]:
# Sigmoid outputs of the classifier for every document.
Ypred_prob = model.predict(X.todense())
print(np.squeeze(Ypred_prob)[:5])

# Hard 0/1 labels: threshold the predicted probabilities at 0.5.
Ypred = (Ypred_prob > 0.5).astype(float)
print(np.squeeze(Ypred)[:5])

In [None]:
# Save a model
# The file is written by Keras' own `model.save`, not by pickle, despite the
# '.pkl' suffix.
# NOTE(review): some Keras versions only accept '.keras' or '.h5' file names
# here -- confirm against the installed version. If the name changes, the
# load cell must be updated to match.
model.save('keras-clf.pkl')

In [None]:
# load model
# Rebinds `model` to the network restored from the path written by the save
# cell; the two paths must stay identical.
from keras.models import load_model
model = load_model('keras-clf.pkl')

**Regression**

In [None]:
# Regression model with R-squared
# Target for the regression task: the log_cite_count column.
Yreg = df['log_cite_count']

# Same feed-forward layout as the classifier, but with a wider first layer and
# a single output unit without an activation, as suits a real-valued target.
n_features = X.shape[1]
model = Sequential([
    Dense(100, input_dim=n_features, activation='relu'),  # first hidden layer
    Dense(50, activation='relu'),                         # second hidden layer
    Dense(1),                                             # output layer
])

from keras import backend as K
def r2(y_true, y_pred):
    """Coefficient of determination (R^2) built from Keras backend ops.

    Computes ``1 - SS_res / SS_tot``, where ``SS_res`` is the sum of squared
    residuals and ``SS_tot`` is the sum of squared deviations of ``y_true``
    from its mean. ``K.epsilon()`` is added to the denominator so that a
    constant ``y_true`` does not cause a division by zero.

    NOTE(review): when passed via ``metrics=[...]`` this is presumably
    evaluated batch by batch, so the logged value need not equal the
    full-sample R^2 -- confirm against ``sklearn.metrics.r2_score``.
    """
    residuals = y_true - y_pred
    deviations = y_true - K.mean(y_true)
    ss_res = K.sum(K.square(residuals))
    ss_tot = K.sum(K.square(deviations))
    return 1 - ss_res / (ss_tot + K.epsilon())

# Train the regressor on mean squared error, tracking the custom r2 metric.
model.compile(
    optimizer='adam',           # Adam optimizer
    loss='mean_squared_error',  # cost function
    metrics=[r2],               # report R-squared while training
)
model_info = model.fit(X.todense(), Yreg, epochs=15)

from sklearn.metrics import r2_score

# In-sample predictions, flattened to 1-D for comparison with the target.
Ypred = model.predict(X.todense())
Ypred_flat = Ypred.squeeze()

print(Yreg[:5], Ypred_flat[:5])
# R-squared over all rows, as computed by scikit-learn.
r2_score(Yreg, Ypred_flat)


In [None]:
%matplotlib inline

# Plot performance by epoch
plt.plot(model_info.epoch,model_info.history['r2'])
plt.legend(['train', 'val'], loc='best')


## Autoencoders

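Autoencoders are neural networks that perform domain-specific lossy compression: they are trained to reconstruct their own input after squeezing it through a narrow bottleneck layer.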

In [None]:
# Autoencoder

from keras.models import Sequential
from keras.layers import Dense

# Symmetric autoencoder: n_features -> 100 -> 25 -> 100 -> n_features.
n_features = X.shape[1]
autoencoder_layers = [
    Dense(100, input_dim=n_features, activation='relu'),     # first compression layer
    Dense(25, activation='relu', name="compression_layer"),  # bottleneck (final compression layer)
    Dense(100, activation='relu'),                           # first reconstruction layer
    Dense(n_features, activation='relu'),                    # final reconstruction layer
]
model = Sequential(autoencoder_layers)
model.summary()

In [None]:
# Visualize a model

# Requires graphviz
from IPython.display import SVG
# Import from the public keras.utils namespace; the keras.utils.vis_utils
# module path is not available in recent Keras releases.
from keras.utils import model_to_dot

# Build the graph description of the model, then render it inline as SVG.
dot = model_to_dot(model,
                   show_shapes=True,
                   show_layer_names=False,
                   dpi=70)
SVG(dot.create(prog='dot', format='svg'))

In [None]:
# Train the autoencoder to reproduce its own input: the same dense matrix is
# used as both features and target.
X_dense = X.todense()

model.compile(
    optimizer='adam',           # Adam optimizer
    loss='mean_squared_error',  # reconstruction error
    metrics=[r2],               # report R-squared of the reconstruction
)

model_info = model.fit(X_dense, X_dense, epochs=10, validation_split=.2)

In [None]:
# compress the data

import keras

# Sub-model that shares the trained layers but stops at the bottleneck,
# so its output is the 25-dimensional code for each document.
compression_model = keras.Model(inputs=model.input,
                                       outputs=model.get_layer("compression_layer").output)

# Use predict() rather than calling the model directly: it processes the data
# in batches and returns a NumPy array, which is what the scikit-learn PCA
# cell further down consumes.
X_compressed = compression_model.predict(X.todense())
print (X_compressed.shape)

In [None]:
#%% PCA Viz

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# visualize X and X_compressed
from sklearn.decomposition import PCA
pca = PCA(n_components=3,svd_solver='randomized')
Xpca = pca.fit_transform(X.todense())
print(pca.explained_variance_ratio_)

sns.scatterplot(
    x=Xpca[:,0], y=Xpca[:,1],
    hue=Y,
    palette=sns.color_palette("hls", len(set(Y))), alpha=0.3)

plt.show()

In [None]:
# Same PCA view as for the raw features, this time on the autoencoder's
# compressed representation.
from sklearn.decomposition import PCA
pca = PCA(n_components=3, svd_solver='randomized')
Xpca = pca.fit_transform(X_compressed)
print(pca.explained_variance_ratio_)

# First two components, coloured by the classification label.
n_classes = len(set(Y))
class_palette = sns.color_palette("hls", n_classes)
sns.scatterplot(x=Xpca[:, 0], y=Xpca[:, 1], hue=Y, palette=class_palette, alpha=0.3)
plt.show()