<a href="https://colab.research.google.com/github/emrapport/207_projects/blob/master/hyperparam_experiments_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

{DATASET_NAME = "contractions", 
 MODEL_NAME = "contractions",
 NUM_EPOCHS = 20,
 BATCH_SIZE = 1000,
 MAX_SEQUENCE_LENGTH = 20,
 N_MOST_FREQ_WORDS_TO_KEEP = 5000,
 MAX_RESPONSES_PER_POST = 50,
 EMBEDDING_DIM = 50}

In [0]:
# Each dict is one hyperparameter combination; the training loop below runs
# once per entry. Dict entries must use `key: value` (the previous `key = value`
# form is a SyntaxError inside a dict literal).
hyp_combos = [{'NUM_EPOCHS': 20,
                'BATCH_SIZE': 1000,
                'MAX_SEQUENCE_LENGTH': 20,
                'N_MOST_FREQ_WORDS_TO_KEEP': 5000,
                'MAX_RESPONSES_PER_POST': 50,
                # needs to map to one of the glove versions: 50, 100, 200, 300
                'EMBEDDING_DIM': 50},

                {'NUM_EPOCHS': 20,
                'BATCH_SIZE': 1000,
                'MAX_SEQUENCE_LENGTH': 20,
                'N_MOST_FREQ_WORDS_TO_KEEP': 5000,
                'MAX_RESPONSES_PER_POST': 50,
                # needs to map to one of the glove versions: 50, 100, 200, 300
                'EMBEDDING_DIM': 50}
                ]

# TODO make it so party as label just gets auto-run every time
# Notebook-level switch: True -> predict party, False -> predict gender.
PARTY_AS_LABEL = False

In [0]:
## all this stuff just needs to get run one time per notebook
# Setup cell: seeds the RNGs, imports dependencies, authenticates against
# Google Cloud from Colab, and loads the train/dev data plus the GloVe files.

# Set seeds for reproducible results.
from numpy.random import seed
seed(1)
# NOTE(review): a top-level `set_random_seed` import looks like the
# TensorFlow 1.x API - confirm the runtime's TensorFlow version.
from tensorflow import set_random_seed
set_random_seed(2)

# NOTE(review): the imports below mix the standalone `keras` package with
# `tensorflow.keras` - confirm both resolve to compatible versions.
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import HashingVectorizer
from keras.preprocessing.sequence import pad_sequences
from scipy.sparse import hstack, vstack
from keras.preprocessing.text import Tokenizer
from tensorflow.keras import Sequential, layers
from keras.utils import plot_model
import pandas as pd
import numpy as np
import copy
import time
import pickle
# Presumably needed so pandas can read the gs:// paths below - TODO confirm.
!pip install gcsfs

# Show up to 100 characters per column when displaying DataFrames.
pd.set_option('max_colwidth', 100)

# Google Cloud project and bucket used by this notebook.
project_id = 'w266-251323'
import uuid
bucket_name = 'fb-congressional-data/'
# Colab-only: authenticate the current user, then point gcloud at the project.
from google.colab import auth
auth.authenticate_user()
!gcloud config set project {project_id}

# Load the train/dev CSVs from the bucket; the first CSV column is the index.
train_df = pd.read_csv("gs://fb-congressional-data/contraction_expanded_data/train.csv", index_col=0)
dev_df = pd.read_csv("gs://fb-congressional-data/contraction_expanded_data/dev.csv", index_col=0)
# Copy every bucket object whose name starts with "glove" to /tmp/.
!gsutil cp gs://fb-congressional-data/glove* /tmp/

In [0]:
# all the functions go here
def remove_excess_rows_per_post(df, max_per_post):
  """Cap the number of rows kept for each post.

  Posts with at most `max_per_post` rows are kept unchanged. Every post with
  more rows is replaced by a random sample of exactly `max_per_post` of its
  rows.

  Args:
    df: DataFrame with a `post_id` column.
    max_per_post: maximum number of rows to keep per post.

  Returns:
    DataFrame with the untouched rows first (original index preserved),
    followed by the sampled rows (index reset to 0..k-1).
  """
  rows_per_post = df.post_id.value_counts()
  oversized_ids = rows_per_post[rows_per_post > max_per_post].index.values
  is_oversized = df.post_id.isin(oversized_ids)

  # rows belonging to posts that are already within the limit
  untouched_rows = df[~is_oversized]
  # sanity check: every post is either oversized or untouched
  assert(len(oversized_ids) + untouched_rows.post_id.nunique() == df.post_id.nunique())

  # draw max_per_post rows from each oversized post
  downsampled_rows = (
      df[is_oversized]
      .groupby('post_id')
      .apply(lambda post_rows: post_rows.sample(n=max_per_post))
      .reset_index(drop=True)
  )

  return pd.concat([untouched_rows, downsampled_rows])

def get_labels(train_df, test_df, party_label_ind):
  """Build binary label arrays for the train and dev frames.

  Args:
    train_df: DataFrame with `op_gender` and/or `op_category` columns.
    test_df: DataFrame with the same label column as `train_df`.
    party_label_ind: truthy -> label is 1 for 'Congress_Republican' in
      `op_category` (else 0), and 'Congress_Independent' rows are dropped
      from the training frame only. Falsy -> label is 1 for 'M' in
      `op_gender` (else 0).

  Returns:
    (y_train, y_dev) as numpy arrays of 0/1 labels.
  """
  def encode_binary(values, positive_value):
    # 1 where the value equals the positive class, 0 for anything else
    return [1 if value == positive_value else 0 for value in values]

  if party_label_ind:
    # NOTE(review): only the training frame is filtered; Independents in
    # test_df are encoded as 0 - confirm this is intended.
    train_df = train_df[train_df.op_category!='Congress_Independent']
    label_column, positive_value = 'op_category', 'Congress_Republican'
  else:
    label_column, positive_value = 'op_gender', 'M'

  raw_train = getattr(train_df, label_column).values
  raw_dev = getattr(test_df, label_column).values

  y_train = np.asarray(encode_binary(raw_train, positive_value))
  y_dev = np.asarray(encode_binary(raw_dev, positive_value))

  return y_train, y_dev

def get_inputs(train_df, 
               test_df, 
               n_words_to_keep,
               max_seq_length):
  """Tokenize and pad the response text of the train and test frames.

  The tokenizer is fit on `train_df.response_text` only; both frames are then
  encoded with that vocabulary and padded at the end to `max_seq_length`.

  Args:
    train_df: DataFrame with a `response_text` column, used to fit the tokenizer.
    test_df: DataFrame with a `response_text` column, encoded with the
      vocabulary learned from `train_df`.
    n_words_to_keep: vocabulary cut-off; words whose tokenizer index is above
      `n_words_to_keep + 1` are removed from the word index.
    max_seq_length: `maxlen` passed to `pad_sequences`.

  Returns:
    (X_train, X_test, tokenizer): the two padded sequence arrays and the
    fitted tokenizer with its trimmed `word_index`.
  """
  def get_text_list(init_list):
      # Entries that are not strings are replaced by the empty string so the
      # tokenizer only ever sees text.
      return [sentence if isinstance(sentence, str) else ""
              for sentence in init_list]

  # Use the frames passed in by the caller, not notebook-level globals.
  new_sentences_train = get_text_list(train_df.response_text.values)
  new_sentences_test = get_text_list(test_df.response_text.values)

  time_start = time.time()

  # this is the default list of filters + apostrophe
  # added because we have dealt with common contractions, so other apostrophes should mostly be possessive 
  tokenizer = Tokenizer(filters='!"\'#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', oov_token='UNK')
  tokenizer.fit_on_texts(new_sentences_train)

  currentTime = time.gmtime(time.time() - time_start)

  #Convert the gmtime struct to a string
  timeStr = time.strftime("%M minutes, %S seconds", currentTime)

  print("Tokenized in {}".format(timeStr))

  # suggestion from this issue: https://github.com/keras-team/keras/issues/8092
  # seems like OOV and num_words don't work correctly by default 
  # NOTE(review): a word whose index is exactly n_words_to_keep + 1 survives
  # this filter and then shares that index with the OOV token - confirm the
  # intended cut-off.
  tokenizer.word_index = {e:i for e,i in tokenizer.word_index.items() 
                              if i <= n_words_to_keep + 1} 
  tokenizer.word_index[tokenizer.oov_token] = n_words_to_keep + 1

  X_train = tokenizer.texts_to_sequences(new_sentences_train)
  X_test = tokenizer.texts_to_sequences(new_sentences_test)

  X_train = pad_sequences(X_train, padding='post', maxlen=max_seq_length)
  X_test = pad_sequences(X_test, padding='post', maxlen=max_seq_length)
  return X_train, X_test, tokenizer

def create_embedding_matrix(filepath, 
                            word_index, 
                            embedding_dim):
    """Build an embedding matrix from a whitespace-separated vector file.

    Each line of the file is expected to hold a word followed by its vector
    components. Row `word_index[word]` of the result receives the first
    `embedding_dim` components; rows for words absent from the file stay zero.

    Args:
      filepath: path to the text file of pretrained vectors (read as UTF-8).
      word_index: dict mapping word -> integer row index.
      embedding_dim: number of vector components to keep per word.

    Returns:
      numpy array of shape (vocab_size, embedding_dim).
    """
    # Add 2 rows (reserved 0 plus the manual UNK token), but never allocate
    # fewer rows than the largest index in word_index needs.
    highest_index = max(word_index.values(), default=0)
    vocab_size = max(len(word_index) + 2, highest_index + 1)
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    # Explicit encoding so the result does not depend on the platform default.
    with open(filepath, encoding='utf-8') as f:
        for line in f:
            parts = line.split()
            if not parts:
                # blank line: nothing to unpack
                continue
            word, *vector = parts
            if word in word_index:
                idx = word_index[word] 
                embedding_matrix[idx] = np.array(
                    vector, dtype=np.float32)[:embedding_dim]

    return embedding_matrix

def make_model(embedding_matrix, max_seq_length):
  """Build and compile the binary text classifier.

  Architecture: frozen embedding initialised from `embedding_matrix`, a
  Conv1D layer, dropout, global max pooling, a small dense layer, dropout and
  a single sigmoid output unit.

  Args:
    embedding_matrix: 2-D array; its first dimension is used as the embedding
      input size and its second as the embedding output size.
    max_seq_length: `input_length` of the embedding layer.

  Returns:
    The compiled Sequential model (adam, binary cross-entropy, accuracy).
  """
  vocab_rows = embedding_matrix.shape[0]
  vector_width = embedding_matrix.shape[1]

  # embedding weights come from the supplied matrix and are not trained
  frozen_embedding = layers.Embedding(
      vocab_rows,
      vector_width,
      weights=[embedding_matrix],
      input_length=max_seq_length,
      trainable=False,
  )

  network = Sequential([
      frozen_embedding,
      layers.Conv1D(128, 2, activation='relu', padding="same"),
      layers.Dropout(.5),
      #layers.Conv1D(32, 3, activation='relu'),
      layers.GlobalMaxPooling1D(),
      layers.Dense(20, activation='relu'),
      layers.Dropout(.5),
      layers.Dense(1, activation='sigmoid'),
  ])
  network.compile(
      optimizer='adam',
      loss='binary_crossentropy',
      metrics=['accuracy'],
  )
  return network

In [0]:
# Train one model per hyperparameter combination in hyp_combos.
# The Keras History of every run that finishes is collected here, in order.
histories = []

for hyp_dict in hyp_combos:
  new_train_df = remove_excess_rows_per_post(train_df, hyp_dict['MAX_RESPONSES_PER_POST'])
  print("You now have {} training rows".format(new_train_df.shape[0]))

  # Shuffle the row order of both frames.
  # NOTE: this rebinds the notebook-level dev_df, so it is reshuffled on
  # every iteration.
  new_train_df = new_train_df.sample(frac=1)
  dev_df = dev_df.sample(frac=1)

  # The label switch is a notebook-level setting; a combo may override it by
  # carrying its own 'PARTY_AS_LABEL' entry.
  # NOTE(review): with the party label enabled, get_labels drops
  # Congress_Independent rows from the training labels while get_inputs
  # encodes every row, so X_train and y_train can differ in length - confirm
  # before enabling.
  party_as_label = hyp_dict.get('PARTY_AS_LABEL', PARTY_AS_LABEL)
  y_train, y_dev = get_labels(new_train_df, dev_df, party_as_label)
  X_train, X_test, tokenizer = get_inputs(new_train_df, 
                                          dev_df, 
                                          hyp_dict['N_MOST_FREQ_WORDS_TO_KEEP'], 
                                          hyp_dict['MAX_SEQUENCE_LENGTH'])
  embedding_matrix = create_embedding_matrix(
                     '/tmp/glove.6B.{}d.txt'.format(hyp_dict['EMBEDDING_DIM']),
                      tokenizer.word_index, hyp_dict['EMBEDDING_DIM'])
  # Every setting comes from the current combo, not from notebook globals.
  model = make_model(embedding_matrix, hyp_dict['MAX_SEQUENCE_LENGTH'])
  # summary() prints by itself; wrapping it in print() adds a stray "None".
  model.summary()

  time_start = time.time()
  try:
    # Validate on the dev arrays built above for this combo.
    history = model.fit(X_train, y_train,
                        epochs=hyp_dict['NUM_EPOCHS'],
                        verbose=True,
                        class_weight={1: 1, 0: 2},
                        validation_data=(X_test, y_dev),
                        batch_size=hyp_dict['BATCH_SIZE'])
    histories.append(history)
  except Exception as ex:
    # Best effort: report the failure and move on to the next combination.
    print(ex)

  currentTime = time.gmtime(time.time() - time_start)

  #Convert the gmtime struct to a string
  timeStr = time.strftime("%M minutes, %S seconds", currentTime)

  print("Trained in {}".format(timeStr))
  
  
    