<a href="https://colab.research.google.com/github/francesita/ProfnerTask7a/blob/main/Copy_of_Profner.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Code for ProfNER Task 7a: Binary classification — identifying occupations in social media text




In [None]:
#mount googleDrive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import Accuracy, Recall
import tensorflow.keras.backend as K

## Preprocess tweets

1. Remove Twitter handles, links, hashtags, punctuation, etc., so that we are left solely with the text of the tweet.
2. Tokenize the tweets.

In [None]:
import re

def preprocess_tweets(tweet):
  """
  Strip Twitter-specific noise from a tweet and normalise its whitespace.

  - removes @handles (letters, digits and underscores after the '@')
  - removes URLs (anything of the form scheme://...)
  - drops the '#' of hashtags and turns '_' into a space, keeping the hashtag words
  - collapses runs of whitespace into single spaces

  Accented Spanish letters and punctuation are left untouched, and emojis are
  not removed here.

  Args:
    tweet: raw tweet text (str).

  Returns:
    The cleaned tweet (str).
  """
  # TODO: might want to try replacing @mentions with a word, such as person/persona.
  # Handles and URLs are replaced by a space so that neighbouring words do not merge.
  tweet = re.sub(r"(@[A-Za-z0-9_]+)|(\w+:\/\/\S+)", " ", tweet)
  tweet = tweet.replace("#", "").replace("_", " ")
  # Normalise whitespace last, because replacing '_' can introduce new spaces.
  return ' '.join(tweet.split())

In [None]:
import string
def remove_punctuation(tknzd_tweet):
  """
  Filter punctuation tokens out of a tokenized tweet.

  A token is dropped when it is found in string.punctuation (a substring
  test, because string.punctuation is a str) or in the extra list of
  Spanish/currency symbols below. All other tokens are kept, in order.

  Args:
    tknzd_tweet: iterable of string tokens.

  Returns:
    A new list with the remaining tokens.
  """
  extra_symbols = ['¿','¡',':','¨','...',"'",'€','£','$','"','@']
  return [
      tok for tok in tknzd_tweet
      if not (tok in string.punctuation or tok in extra_symbols)
  ]

In [None]:
!pip install demoji
import demoji
demoji.download_codes()
#remove emojis from tweet
def remove_emoji(tweet):
  """Return the tweet with demoji.replace applied, using '' as the replacement for emojis."""
  return demoji.replace(tweet, "")

We check that the tweets are preprocessed as expected by use of an example:

In [None]:
text = 'Cuñado, vén conmigo mañana para comer arroz. ¡Tú eres tán estúpida! @lal_oca #ror_dfa 😜🇵🇷💀⛵🈵🇸🇻 @'
preprocess_tweets(text)

'Cuñado, vén conmigo mañana para comer arroz. ¡Tú eres tán estúpida! ror dfa 😜🇵🇷💀⛵🈵🇸🇻 @'

## Preprocess tweets for pretrained mBERT and BERT


In [None]:
#defining function to preprocess with Bert
def preprocess_bert(tweets, pt_tokenizer_model, max_len):
    '''
    Encode tweets for a pretrained BERT model.

    Each tweet is cleaned with preprocess_tweets and passed to the tokenizer's
    encode_plus, which:
      -tokenizes the tweet
      -adds the special tokens ([CLS] at the start, [SEP] at the end)
      -pads or truncates the tweet to max_len tokens
      -maps tokens to their vocabulary ids
      -creates the attention mask (1 for real tokens, 0 for padding), which tells
       the model which positions to attend to when sentences have different lengths

    Args:
      tweets: iterable of raw tweet strings
      pt_tokenizer_model: name or path of the pretrained tokenizer,
        e.g. 'bert-base-multilingual-uncased'
      max_len: length every encoded tweet is padded/truncated to

    Returns:
      (encoded_tweets, attention_masks): two int32 numpy arrays with one row per tweet

    Adapted from the fine-tuning tutorial on skimai.com.
    '''

    tokenizer = BertTokenizer.from_pretrained(pt_tokenizer_model, do_lower_case = True)

    encoded_tweets = []
    attention_masks = []

    for tweet in tweets:
        encoded_tweet = tokenizer.encode_plus(
            text = preprocess_tweets(tweet),
            add_special_tokens=True, #cls and sep tokens
            max_length=max_len,
            # Explicit padding/truncation replaces the deprecated pad_to_max_length=True,
            # which only truncated through a backward-compatibility fallback.
            padding='max_length',
            truncation=True,
            return_attention_mask=True)

        #add outputs to list
        encoded_tweets.append(encoded_tweet['input_ids'])
        attention_masks.append(encoded_tweet['attention_mask'])

    #every row has max_len entries, so the lists convert to rectangular arrays
    encoded_tweets = np.asarray(encoded_tweets, dtype='int32')
    attention_masks = np.asarray(attention_masks, dtype='int32')

    return encoded_tweets, attention_masks



# Model using mBERT Sequence Classifier


Load the data for the model. The tweets are encoded with the mBERT tokenizer (see `preprocess_bert`), which differs from the encoding used for the BiLSTM classifier (not shown in this notebook).

In [None]:
#import data
# Each CSV is read into a DataFrame; later cells use its `tweet` and `label` columns.
# train.csv / val.csv hold the training and validation tweets.
# train-aug-trans.csv holds the augmented training tweets (presumably produced by translation — confirm).
# NOTE(review): these absolute paths require Google Drive to be mounted at /content/drive.
train_data = pd.read_csv('/content/drive/MyDrive/Datasets/profner/train.csv')
val_data = pd.read_csv('/content/drive/MyDrive/Datasets/profner/val.csv')
aug_data = pd.read_csv('/content/drive/MyDrive/Datasets/profner/train-aug-trans.csv')

In [None]:
train_data.loc[train_data['label'] == 1].head()

Unnamed: 0.1,Unnamed: 0,tweet,label
8,1252741192577581056,['Repartidores teniendo que trabajar en medio ...,1
9,1260266403409670144,['@carlesenric @salvadorilla Es imprescindible...,1
17,1256473507157999616,"['El mismo esfuerzo que heces tu , que ni te ...",1
18,1251254299029618690,"['Contexto:\n', '\n', '- Comisaría con cajas d...",1
19,1260647157549264896,['Que la directora del Centro Nacional de Epid...,1


In [None]:
aug_data.head()

Unnamed: 0.1,Unnamed: 0,tweet_id,tweet,label
0,8,1252741192577581056,"[""Los repartidores tienen que trabajar en medi...",1
1,9,1260266403409670144,['@carlesenric @salvadorilla Es imprescindible...,1
2,17,1256473507157999616,"['El mismo esfuerzo que haces, ni siquiera abr...",1
3,18,1251254299029618690,"['Contexto: \ n', '\ n', '- Comisaría de polic...",1
4,19,1260647157549264896,['Que el director del Centro Nacional de Epide...,1


In [None]:
es_train = list(train_data.tweet)
aug_train = list(aug_data.tweet)
val = val_data.tweet

es_labels = list(train_data.label)
aug_labels = list(aug_data.label) 
val_labels = val_data.label

In [None]:
# Convert the augmented tweets/labels to numpy arrays (later cells reuse them)
aug_train, aug_labels = np.asarray(aug_train), np.asarray(aug_labels)

# Full training set: the original Spanish examples followed by the augmented ones
train = np.concatenate([np.asarray(es_train), aug_train])
labels = np.concatenate([np.asarray(es_labels), aug_labels])

In [None]:
print('Number of tweets augmented are: ', len(aug_train))

Number of tweets augmented are:  1393


In [None]:
# Shuffle the training set. Tweets and labels are indexed with the same permutation
# so they stay aligned; the fixed seed makes the shuffle reproducible across runs.
SHUFFLE_SEED = 42
idx = np.random.default_rng(SHUFFLE_SEED).permutation(len(train))
x_train, y_train = train[idx],labels[idx]

# The validation set is used in its original order.
x_val, y_val = val,val_labels

In [None]:
print('Total number of training examples after agumemnted data is addeds is: ' , len(x_train))

Total number of training examples after agumemnted data is addeds is:  7393


In [None]:
!pip install transformers
from transformers import BertTokenizer, TFBertForSequenceClassification

In [None]:
#load mBERT tokenizer, we will use it later in a function (this one is used to encode to find max_len)
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased', do_lower_case = True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=871891.0, style=ProgressStyle(descripti…




In [None]:
# Measure tweet lengths in tokens (special tokens included) to help choose max_len.

# Pool the training and validation tweets.
all_tweets = np.concatenate([train_data.tweet, val_data.tweet])

# tokenizer.encode is used instead of our preprocess_bert function (encode_plus)
# because we want the length of each tweet with its special tokens.
# NOTE(review): the raw tweets are encoded here, whereas preprocess_bert encodes the
# output of preprocess_tweets, so these lengths may differ from the cleaned tweets.
all_tweets_encoded = []
for raw_tweet in all_tweets:
    all_tweets_encoded.append(tokenizer.encode(raw_tweet, add_special_tokens=True))

first_encoded = all_tweets_encoded[0]
print(first_encoded)
print(tokenizer.decode(first_encoded))

token_counts = [len(ids) for ids in all_tweets_encoded]

# Longest tweet
max_len = max(token_counts)
print('max length of tweets:', max_len)

# Mean length truncated to an int; we might want to use it instead of max_len in future
avg_len = int(np.average(token_counts))
print('average len:' ,avg_len)

[101, 138, 112, 11707, 131, 35366, 10155, 14618, 22754, 10278, 139, 156, 112, 117, 112, 23329, 119, 31270, 131, 35366, 34742, 12905, 139, 156, 112, 117, 112, 12761, 131, 35366, 78509, 139, 156, 112, 117, 112, 10209, 10295, 10295, 10295, 10295, 10295, 10295, 112, 140, 102]
[CLS] ['china : libera una pandemia \ n ','ee. uu : libera ovnis \ n ','argentina : libera presos \ n ','jajajajajajaja'] [SEP]
max length of tweets: 343
average len: 69


In [None]:
#define var with model type
m_tokenizer = 'bert-base-multilingual-uncased'
max_len = 80
#tokenize our tweets using the pre-proprocess function we defined earlier
train_inputs, train_masks = preprocess_bert(x_train, m_tokenizer, max_len)
val_inputs, val_masks = preprocess_bert(x_val, m_tokenizer, max_len)

In [None]:
print(train_inputs[0])
print(train_masks[0])
print(y_train[0])

[  101   138   112 10265 16422 10107 10128 23145 41881 11823 10119 12715
   110 10109 10117 29346 10102 10106 84315 10102 20241 70294   117 10173
 11589   119 10292 46720 38183 10321 10109 10426 17530 11310 10190   112
   140   102     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0]
0


In [None]:
print('shape of train_inputs:', train_inputs.shape)
print('shape of train_masks', train_masks.shape)

shape of train_inputs: (7393, 80)
shape of train_masks (7393, 80)


##Fine-tune mBERT and mBERT-Aug classification model

The following function is used to create both the mBERT model and the mBERT-Aug model

In [None]:
import tensorflow as tf

#builds the Keras model used for both the mBERT and the mBERT-Aug classifiers
def create_transformer_model(pre_trained_model, num_labels, max_len):
  """
  Build a Keras model that wraps a pretrained BERT sequence classifier.

  Args:
    pre_trained_model: name or path of the pretrained checkpoint to load
    num_labels: size of the classifier output and of the final Dense layer
    max_len: number of token ids per example; must match the length used when encoding

  Returns:
    An uncompiled tf.keras Model that takes [input_ids, attention_masks] and
    returns the sigmoid output of the final Dense layer.
  """
  transformer_model = TFBertForSequenceClassification.from_pretrained(pre_trained_model, num_labels=num_labels)

  #We create keras tensors for the token ids and the attention masks
  input_ids = tf.keras.layers.Input(shape=(max_len,), name='train_input', dtype=tf.int32)
  #shape must be a tuple: (max_len) is just an int, (max_len,) is a 1-tuple
  input_masks = tf.keras.layers.Input(shape=(max_len,), name='train_masks', dtype=tf.int32)

  #pass the mask by keyword so it cannot be bound to the wrong parameter;
  #element 0 of the transformer output holds the classification logits
  seq_outputs = transformer_model(input_ids, attention_mask=input_masks)[0]
  outputs = tf.keras.layers.Dense(num_labels, activation='sigmoid')(seq_outputs)

  model = tf.keras.models.Model(inputs=[input_ids,input_masks], outputs=outputs)

  #summary() prints the table itself and returns None, so it is not wrapped in print()
  model.summary()

  return model

In [None]:
pt_trans_model = 'bert-base-multilingual-uncased'
num_labels = 1
loss = 'binary_crossentropy'
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)  #learning-rate is changed depending on the model we wish to train


# Build the model with the same max_len that was used to encode the tweets,
# so the input layers always match the encoded arrays.
model = create_transformer_model(pt_trans_model, num_labels, max_len)
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

Cause: while/else statement not yet supported
Cause: while/else statement not yet supported

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
train_input (InputLayer)        [(None, 80)]         0                                            
__________________________________________________________________________________________________
train_masks (InputLayer)        [(None, 80)]         0                                            
__________________________________________________________________________________________________
tf_bert_for_sequence_classifica TFSequenceClassifier 167357185   train_input[0][0]                
                                                                 train_masks[0][0]                
__________________________________________________________________________________________________
d

In [None]:
#make labels into numpy arrays, and prep x_val
x_val = [val_inputs, val_masks]
#turn labels into tensors
y_train = np.asarray(y_train, dtype='int32')
y_val = np.asarray(y_val, dtype='int32')

In [None]:
#fit model to input data

with tf.device('/device:GPU:0'):
  model.fit([train_inputs, train_masks], y_train, validation_data=(x_val, y_val), epochs=3, batch_size = 32)


In [None]:
#save model
# NOTE(review): 'filepath/filename' is a placeholder. The testing section loads weights from
# '/content/drive/MyDrive/Colab/saved_model/mBert_profner_aug.hdf5', so point this at the
# path you intend to load from.
model.save_weights('filepath/filename')

##Testing and Error Analysis on mBert Classifier

In [None]:
#load model if not already---> in this case we load the augmented data model
model.load_weights('/content/drive/MyDrive/Colab/saved_model/mBert_profner_aug.hdf5')

In [None]:
def assign_class(a_pred):
  """
  Threshold predicted scores at 0.5.

  Args:
    a_pred: indexable sequence of scores (e.g. the array returned by model.predict).

  Returns:
    A list with 1 where the score is >= 0.5 and 0 otherwise, one entry per score.
  """
  return [1 if a_pred[pos] >= 0.5 else 0 for pos in range(len(a_pred))]

In [None]:
#make predictions on model
y_pred = model.predict([val_inputs, val_masks])
es_pred = assign_class(y_pred)


###Confusion Matrix for mBert Model

In [None]:
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.metrics import classification_report

# Ground truth and thresholded predictions for the mBERT model.
# Note: y_pred is rebound here from the raw model outputs to the 0/1 class list.
y_true = y_val
y_pred = es_pred

# labels=[1,0] lists the positive class (1) first in the confusion matrix
print(confusion_matrix(y_true, y_pred, labels=[1,0]))
# the report lists classes in sorted label order (0, then 1), matching target_names
print(classification_report(y_true, y_pred, target_names=["non-prof", "prof"]))
# f1_score with its default arguments
print('f1-score:', f1_score(y_true, y_pred))

[[ 420   57]
 [  54 1469]]
              precision    recall  f1-score   support

    non-prof       0.96      0.96      0.96      1523
        prof       0.89      0.88      0.88       477

    accuracy                           0.94      2000
   macro avg       0.92      0.92      0.92      2000
weighted avg       0.94      0.94      0.94      2000

f1-score: 0.8832807570977917


# Model using BERT Uncased Classifier
We train a model with bert-base-uncased on the English translations of the tweets.

In [None]:
#load english tweet data
train_data = pd.read_csv('/content/drive/MyDrive/Datasets/profner/train-eng.csv')
val_data = pd.read_csv('/content/drive/MyDrive/Datasets/profner/val-eng.csv')

In [None]:
test_data = pd.read_csv('/content/drive/MyDrive/Datasets/profner/test-eng.csv')

In [None]:
eng_train = train_data.tweet
eng_val = val_data.tweet

labels = train_data.label
val_labels = val_data.label

In [None]:
eng_test = test_data.tweet 

In [None]:
# Shuffle the training set with one permutation for tweets and labels so they stay aligned.
# .iloc makes the positional indexing of the pandas Series explicit, and the fixed
# seed makes the shuffle reproducible across runs.
SHUFFLE_SEED = 42
idx = np.random.default_rng(SHUFFLE_SEED).permutation(len(eng_train))
x_train, y_train = eng_train.iloc[idx], labels.iloc[idx]

# The validation set is used in its original order.
x_val, y_val = eng_val, val_labels

## Preprocess tweets for BERT
- We use the `preprocess_bert` function defined earlier in the notebook to encode our tweets, this time with the tokenizer of the pretrained bert-base-uncased model.

In [None]:
#define var with model type
eng_tokenizer = 'bert-base-uncased'
max_len = 80
#preprocess using bert
eng_train_input, eng_train_masks = preprocess_bert(x_train, eng_tokenizer, max_len) 
eng_val_input, eng_val_masks = preprocess_bert(x_val, eng_tokenizer, max_len) 

In [None]:
#define var with model type
eng_tokenizer = 'bert-base-uncased'
max_len = 80
#preprocess using bert
eng_test_input, eng_test_masks = preprocess_bert(eng_test, eng_tokenizer, max_len)

## Fine-tune BERT uncased classification model
We use `create_eng_transformer_model`, which builds the same architecture as `create_transformer_model` from the mBERT section. We provide the transformer model type (bert-base-uncased in our case), the number of output labels for our problem (`num_labels`), and the `max_len` of each tweet.

In [None]:
import tensorflow as tf

#same architecture as the mBERT model, so reuse its builder instead of duplicating the code
def create_eng_transformer_model(pre_trained_model, num_labels, max_len):
  """
  Build the Keras wrapper around a pretrained BERT sequence classifier for the English tweets.

  Thin wrapper around create_transformer_model (defined in the mBERT section, whose
  cell must have been run first); kept so that existing calls keep working.

  Args:
    pre_trained_model: name or path of the pretrained checkpoint to load
    num_labels: size of the classifier output and of the final Dense layer
    max_len: number of token ids per example; must match the length used when encoding

  Returns:
    The uncompiled tf.keras Model returned by create_transformer_model.
  """
  return create_transformer_model(pre_trained_model, num_labels, max_len)

In [None]:
pt_trans_model = 'bert-base-uncased'
num_labels = 1
loss = 'binary_crossentropy'
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)


# Build the model with the same max_len that was used to encode the English tweets,
# so the input layers always match the encoded arrays.
eng_model = create_eng_transformer_model(pt_trans_model, num_labels, max_len)
eng_model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

In [None]:
#make labels into numpy arrays, and prep x_val
x_val = [eng_val_input, eng_val_masks]
#turn labels into tensors
y_train = np.asarray(y_train, dtype='int32')
y_val = np.asarray(y_val, dtype='int32')

In [None]:
#fit model to input data
with tf.device('/device:GPU:0'):
  eng_model.fit([eng_train_input, eng_train_masks], y_train, validation_data=(x_val, y_val), epochs=3, batch_size = 32)

In [None]:
eng_model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
train_input (InputLayer)        [(None, 80)]         0                                            
__________________________________________________________________________________________________
train_masks (InputLayer)        [(None, 80)]         0                                            
__________________________________________________________________________________________________
tf_bert_for_sequence_classifica TFSequenceClassifier 109483009   train_input[0][0]                
                                                                 train_masks[0][0]                
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 1)            2           tf_bert_for_sequence_classi

In [None]:
eng_model.save_weights('filepath/filename')

##Testing and Error Analysis on Bert Classifier


In [None]:
#import model if needed
eng_model.load_weights('/content/drive/MyDrive/Colab/saved_model/engBert_profner.hdf5')

In [None]:
#make predictions on model
eng_y_pred = eng_model.predict(x_val)
eng_pred = assign_class(eng_y_pred)

## Confusion Matrix and classification report

In [None]:
from sklearn.metrics import confusion_matrix, f1_score, precision_score
from sklearn.metrics import classification_report

# Ground truth and thresholded predictions for the English BERT model
y_true, y_pred = y_val, eng_pred

# Confusion matrix with the positive class (1) listed first
print(confusion_matrix(y_true, y_pred, labels=[1,0]))
# Per-class precision/recall/F1; target_names follow sorted label order (0, then 1)
print(classification_report(y_true, y_pred, target_names=["non-prof", "prof"]))
# f1_score with its default arguments
print('f1-score:', f1_score(y_true, y_pred))

[[ 418   59]
 [  53 1470]]
              precision    recall  f1-score   support

    non-prof       0.96      0.97      0.96      1523
        prof       0.89      0.88      0.88       477

    accuracy                           0.94      2000
   macro avg       0.92      0.92      0.92      2000
weighted avg       0.94      0.94      0.94      2000

f1-score: 0.8818565400843882


# Concatenate transformers and create Bilingual model

Step one:
  Encode the Spanish text for mBERT and the English text for bert-base.
  The encodings will be used to initialize a transformer model in Spanish, and then to initialize a transformer model in English.
  

In [None]:
!pip install transformers
from transformers import BertTokenizer, TFBertForSequenceClassification

Installing collected packages: tokenizers, sacremoses, transformers
Successfully installed sacremoses-0.0.43 tokenizers-0.10.1 transformers-4.3.3


In [None]:
#import data in English and Spanish for training
eng_train_data = pd.read_csv('/content/drive/MyDrive/Datasets/profner/train-eng.csv')
es_train_data = pd.read_csv('/content/drive/MyDrive/Datasets/profner/train.csv')

eng_train = eng_train_data.tweet
es_train = es_train_data.tweet

labels = eng_train_data.label


In [None]:
# Shuffle the English and Spanish training sets with ONE shared permutation, so that
# row i of both languages (and of the labels) still refers to the same tweet.
# NOTE(review): this assumes train-eng.csv and train.csv list the tweets in the same order — confirm.
assert len(eng_train) == len(es_train), "English and Spanish training sets must have the same number of tweets"

# Fixed seed for a reproducible shuffle; .iloc makes the positional indexing explicit.
SHUFFLE_SEED = 42
idx = np.random.default_rng(SHUFFLE_SEED).permutation(len(eng_train))

#shuffle train data in English
eng_x_train, eng_y_train = eng_train.iloc[idx], labels.iloc[idx]

#shuffle train data in Spanish (same order and same labels as the English data)
es_x_train, es_y_train = es_train.iloc[idx], labels.iloc[idx]

In [None]:
#Import data in English and in Spanish Validation
#Import data for English and Spanish tweets
eng_val_data = pd.read_csv('/content/drive/MyDrive/Datasets/profner/val-eng.csv')
es_val_data = pd.read_csv('/content/drive/MyDrive/Datasets/profner/val.csv')

#use dataframe to assign appropriate values to variables
eng_val = eng_val_data.tweet
es_val = es_val_data.tweet

val_labels = eng_val_data.label

In [None]:
#Import the TEST data in Spanish and in English
es_test_data = pd.read_csv('/content/drive/MyDrive/Datasets/profner/test.csv')
eng_test_data = pd.read_csv('/content/drive/MyDrive/Datasets/profner/test-eng.csv')

#keep only the tweet column of each test DataFrame
es_test = es_test_data.tweet
eng_test = eng_test_data.tweet

In [None]:
eng_tokenizer = 'bert-base-uncased'
es_tokenizer = 'bert-base-multilingual-uncased'
max_len = 80

#train
# These encodings must be built from eng_x_train / es_x_train (shuffled in this section)
# so that they line up with eng_y_train. es_train_input is not defined anywhere else in
# the notebook, and an eng_train_input left over from the English-only section would
# have been shuffled with a different permutation.
eng_train_input, eng_train_masks = preprocess_bert(eng_x_train, eng_tokenizer, max_len)
es_train_input, es_train_masks = preprocess_bert(es_x_train, es_tokenizer, max_len)

#validation
es_val_input, es_val_masks = preprocess_bert(es_val, es_tokenizer, max_len)
eng_val_input, eng_val_masks = preprocess_bert(eng_val, eng_tokenizer, max_len)

In [None]:
eng_tokenizer = 'bert-base-uncased'
es_tokenizer = 'bert-base-multilingual-uncased'
max_len = 80

es_test_input, es_test_masks = preprocess_bert(es_test, es_tokenizer, max_len)
eng_test_input, eng_test_masks = preprocess_bert(eng_test, eng_tokenizer, max_len)

In [None]:
#make labels into numpy arrays, and prep x_val
x_val = [eng_val_input, eng_val_masks, es_val_input, es_val_masks]

#turn labels into numpy arrays and then concatenate
y_train = np.asarray(eng_y_train, dtype='int32')

#turn labels into numpy arrays and then concatenate (the same labels in same order bc not randomized)
y_val = np.asarray(val_labels, dtype='int32')


##Create Bilingual model
We concatenate the output of mBERT and BERT and feed it to a dense layer to obtain predictions. Code for building model and training is below.

In [None]:
import tensorflow as tf

#builds the bilingual model: English BERT and mBERT outputs concatenated into one classifier
def create_concat_transformer_model(pre_trained_model_1, pre_trained_model_2, num_labels, max_len):
  """
  Build a Keras model that combines two pretrained BERT sequence classifiers.

  Args:
    pre_trained_model_1: checkpoint name or path for the English branch
    pre_trained_model_2: checkpoint name or path for the Spanish branch
    num_labels: output size of each branch and of the final Dense layer
    max_len: number of token ids per example; must match the length used when encoding

  Returns:
    An uncompiled tf.keras Model that takes
    [eng_input_ids, eng_input_masks, es_input_ids, es_input_masks]
    and returns the sigmoid output of the final Dense layer.
  """
  eng_transformer_model = TFBertForSequenceClassification.from_pretrained(pre_trained_model_1, num_labels=num_labels)
  es_transformer_model = TFBertForSequenceClassification.from_pretrained(pre_trained_model_2, num_labels=num_labels)

  #We create keras tensors English (shape must be a tuple: (max_len) is just an int)
  eng_input_ids = tf.keras.layers.Input(shape=(max_len,), name='eng_train_input', dtype=tf.int32)
  eng_input_masks = tf.keras.layers.Input(shape=(max_len,), name='eng_train_masks', dtype=tf.int32)

  #We create keras tensors Spanish
  es_input_ids = tf.keras.layers.Input(shape=(max_len,), name='es_train_input', dtype=tf.int32)
  es_input_masks = tf.keras.layers.Input(shape=(max_len,), name='es_train_masks', dtype=tf.int32)

  #pass the masks by keyword so they cannot be bound to the wrong parameter;
  #element 0 of each transformer output holds the classification logits
  eng_seq_outputs = eng_transformer_model(eng_input_ids, attention_mask=eng_input_masks)[0]
  es_seq_outputs = es_transformer_model(es_input_ids, attention_mask=es_input_masks)[0]

  #concatenate the outputs of the two transformer models along axis 1
  concat_outputs = tf.concat([eng_seq_outputs, es_seq_outputs],1)
  outputs = tf.keras.layers.Dense(num_labels, activation='sigmoid')(concat_outputs)

  model = tf.keras.models.Model(inputs=[eng_input_ids,eng_input_masks, es_input_ids, es_input_masks], outputs=outputs)

  #summary() prints the table itself and returns None, so it is not wrapped in print()
  model.summary()

  return model

In [None]:
#call create model function and compile
eng_trans_model = 'bert-base-uncased'
es_trans_model = 'bert-base-multilingual-uncased'
num_labels = 1
loss = 'binary_crossentropy'
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)


# Build the model with the same max_len that was used to encode the tweets,
# so the four input layers always match the encoded arrays.
concat_model = create_concat_transformer_model(eng_trans_model, es_trans_model, num_labels, max_len)
concat_model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

Cause: while/else statement not yet supported
Cause: while/else statement not yet supported

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
eng_train_input (InputLayer)    [(None, 80)]         0                                            
__________________________________________________________________________________________________
eng_train_masks (InputLayer)    [(None, 80)]         0                                            
__________________________________________________________________________________________________
es_train_input (InputLayer)     [(None, 80)]         0                                            
__________________________________________________________________________________________________
es_train_masks (InputLayer)     [(None, 80)]         0                                            
_

In [None]:
#fit model to input data

with tf.device('/device:GPU:0'):
  concat_model.fit([eng_train_input, eng_train_masks, es_train_input, es_train_masks], y_train, validation_data=(x_val, y_val), epochs=4, batch_size = 32)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [None]:
#save model
concat_model.save_weights('filepath')

##Predict and Confusion Matrix

In [None]:
concat_model.load_weights('/content/drive/MyDrive/Colab/saved_model/concat_model_weight_1.hdf5')

In [None]:
# NOTE(review): despite its name, x_test holds the VALIDATION encodings (the *_val_* arrays);
# the test encodings (eng_test_input, es_test_input, ...) are not used here.
x_test = [eng_val_input, eng_val_masks, es_val_input, es_val_masks]
pred = concat_model.predict(x_test)

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

y_true = y_val
y_pred = assign_class(pred)

print(confusion_matrix(y_true, y_pred, labels=[1,0]))
print(classification_report(y_true, y_pred, target_names=["non-prof", "prof"]))
print('f1-score:', f1_score(y_true, y_pred))

[[ 419   58]
 [  38 1485]]
              precision    recall  f1-score   support

    non-prof       0.96      0.98      0.97      1523
        prof       0.92      0.88      0.90       477

    accuracy                           0.95      2000
   macro avg       0.94      0.93      0.93      2000
weighted avg       0.95      0.95      0.95      2000

f1-score: 0.8972162740899358


In [None]:
print(len(y_pred))

27000
