In [1]:
from keras.layers import TextVectorization, Embedding
from keras_nlp.layers import TransformerEncoder
import os
import pandas as pd
import sys

PATH = "/Users/davidflorezmazuera/Library/CloudStorage/GoogleDrive-270191@student.pwr.edu.pl/Mi unidad/Spanish_V2"



In [2]:
authors_path = os.path.join(PATH, "authors-genre.csv")

dataframe = pd.read_csv(authors_path, usecols=[1,3], sep=";")
dataframe = dataframe.dropna()


In [3]:
sys.path.append(os.path.dirname(os.path.abspath('')))
from web_scrapping.script import Book, Library

path = os.path.join(PATH, "metadata_cleaned.json")
library = Library.from_books_path(path)

def get_words_from_metadata(library: Library, book_id: str):
    '''
        Input: library - Library object
               book_id - id of the book
               Output: words - int'''
    id_in_metadata = book_id.split('_')[0]
    # each book in metadata_cleaned.json has an id that is the same that id_in_metadata 
    books = library.search('book_id', id_in_metadata)
    return books[0].words


Loading books from path


100%|██████████| 1001/1001 [00:00<00:00, 214209.10it/s]


In [4]:
dataframe['words'] = dataframe['book_id'].apply(lambda book_id: get_words_from_metadata(library, book_id))
print(dataframe.head())

  book_id gender   words
0     0_0   male  108712
1     1_0   male   10160
2     2_0   male    9046
3     3_0   male   10955
4     4_0   male   16000


In [5]:
# Filter the dataframe to get only the books that have more than 100 words
dataframe = dataframe[dataframe['words'].apply(lambda words: words > 100)]
# Subsampling to get same number of females and males

female_dataframe = dataframe[dataframe['gender']=='female']
male_dataframe = dataframe[dataframe['gender']=='male']
number_of_females = female_dataframe['book_id'].count()
male_dataframe = male_dataframe.sample(n=number_of_females, random_state=1)

mixed_df = pd.concat((female_dataframe,male_dataframe))
dataframe = mixed_df
dataframe = dataframe.sample(frac=1, random_state=1)

In [6]:
books = []
for book_id in dataframe['book_id']:
    # get the .txt file
    path = os.path.join(PATH, f"{book_id}.txt")
    with open(path) as f:
        books.append(f.read())


In [7]:
from transformers import TFXLMRobertaModel
model = TFXLMRobertaModel.from_pretrained("jplu/tf-xlm-roberta-base")
#tokenizer = XLMRobertaTokenizer.from_pretrained("jplu/tf-xlm-roberta-base")


Some layers from the model checkpoint at jplu/tf-xlm-roberta-base were not used when initializing TFXLMRobertaModel: ['lm_head']
- This IS expected if you are initializing TFXLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFXLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFXLMRobertaModel were initialized from the model checkpoint at jplu/tf-xlm-roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFXLMRobertaModel for predictions without further training.


In [8]:
from transformers import XLMRobertaTokenizer
tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")

In [9]:
inputs = tokenizer(books, return_tensors="tf", padding=True, truncation=True, max_length=512)
inputs

{'input_ids': <tf.Tensor: shape=(22, 512), dtype=int32, numpy=
array([[     0,  33172, 109113, ...,     90,      4,      2],
       [     0,   3994,    618, ...,    110,     40,      2],
       [     0,  14467,    141, ...,    441,    320,      2],
       ...,
       [     0,    241,    339, ...,    458,    272,      2],
       [     0,    572,     62, ...,     10,    876,      2],
       [     0,  70661,  44060, ...,  53317,     12,      2]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(22, 512), dtype=int32, numpy=
array([[1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       ...,
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1]], dtype=int32)>}

In [10]:
pre_y = dataframe['gender'].values

# Transform female to 0 and male to 1
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(pre_y)
y

array([1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0])

In [11]:
# detaset
import keras
import tensorflow as tf

In [12]:
dataset=dict(inputs) #Create a tensorflow dataset
#train test split, we use 10% of the data for validation
dataset.keys()

dict_keys(['input_ids', 'attention_mask'])

In [13]:
inputs

{'input_ids': <tf.Tensor: shape=(22, 512), dtype=int32, numpy=
array([[     0,  33172, 109113, ...,     90,      4,      2],
       [     0,   3994,    618, ...,    110,     40,      2],
       [     0,  14467,    141, ...,    441,    320,      2],
       ...,
       [     0,    241,    339, ...,    458,    272,      2],
       [     0,    572,     62, ...,     10,    876,      2],
       [     0,  70661,  44060, ...,  53317,     12,      2]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(22, 512), dtype=int32, numpy=
array([[1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       ...,
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1]], dtype=int32)>}

In [14]:
ds = tf.data.Dataset.from_tensor_slices((dataset, y))
ds = ds.shuffle(1000).batch(32).prefetch(tf.data.experimental.AUTOTUNE)

In [17]:
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
)
history = model.fit(
    ds,
    batch_size=32,
)




InvalidArgumentError: Graph execution error:

Detected at node 'sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/assert_equal_1/Assert/Assert' defined at (most recent call last):
    File "/opt/homebrew/anaconda3/envs/torch-gpu/lib/python3.8/runpy.py", line 194, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "/opt/homebrew/anaconda3/envs/torch-gpu/lib/python3.8/runpy.py", line 87, in _run_code
      exec(code, run_globals)
    File "/opt/homebrew/anaconda3/envs/torch-gpu/lib/python3.8/site-packages/ipykernel_launcher.py", line 16, in <module>
      app.launch_new_instance()
    File "/opt/homebrew/anaconda3/envs/torch-gpu/lib/python3.8/site-packages/traitlets/config/application.py", line 976, in launch_instance
      app.start()
    File "/opt/homebrew/anaconda3/envs/torch-gpu/lib/python3.8/site-packages/ipykernel/kernelapp.py", line 619, in start
      self.io_loop.start()
    File "/opt/homebrew/anaconda3/envs/torch-gpu/lib/python3.8/site-packages/tornado/platform/asyncio.py", line 199, in start
      self.asyncio_loop.run_forever()
    File "/opt/homebrew/anaconda3/envs/torch-gpu/lib/python3.8/asyncio/base_events.py", line 570, in run_forever
      self._run_once()
    File "/opt/homebrew/anaconda3/envs/torch-gpu/lib/python3.8/asyncio/base_events.py", line 1859, in _run_once
      handle._run()
    File "/opt/homebrew/anaconda3/envs/torch-gpu/lib/python3.8/asyncio/events.py", line 81, in _run
      self._context.run(self._callback, *self._args)
    File "/opt/homebrew/anaconda3/envs/torch-gpu/lib/python3.8/site-packages/tornado/ioloop.py", line 688, in <lambda>
      lambda f: self._run_callback(functools.partial(callback, future))
    File "/opt/homebrew/anaconda3/envs/torch-gpu/lib/python3.8/site-packages/tornado/ioloop.py", line 741, in _run_callback
      ret = callback()
    File "/opt/homebrew/anaconda3/envs/torch-gpu/lib/python3.8/site-packages/tornado/gen.py", line 814, in inner
      self.ctx_run(self.run)
    File "/opt/homebrew/anaconda3/envs/torch-gpu/lib/python3.8/site-packages/tornado/gen.py", line 775, in run
      yielded = self.gen.send(value)
    File "/opt/homebrew/anaconda3/envs/torch-gpu/lib/python3.8/site-packages/ipykernel/kernelbase.py", line 358, in process_one
      yield gen.maybe_future(dispatch(*args))
    File "/opt/homebrew/anaconda3/envs/torch-gpu/lib/python3.8/site-packages/tornado/gen.py", line 234, in wrapper
      yielded = ctx_run(next, result)
    File "/opt/homebrew/anaconda3/envs/torch-gpu/lib/python3.8/site-packages/ipykernel/kernelbase.py", line 261, in dispatch_shell
      yield gen.maybe_future(handler(stream, idents, msg))
    File "/opt/homebrew/anaconda3/envs/torch-gpu/lib/python3.8/site-packages/tornado/gen.py", line 234, in wrapper
      yielded = ctx_run(next, result)
    File "/opt/homebrew/anaconda3/envs/torch-gpu/lib/python3.8/site-packages/ipykernel/kernelbase.py", line 536, in execute_request
      self.do_execute(
    File "/opt/homebrew/anaconda3/envs/torch-gpu/lib/python3.8/site-packages/tornado/gen.py", line 234, in wrapper
      yielded = ctx_run(next, result)
    File "/opt/homebrew/anaconda3/envs/torch-gpu/lib/python3.8/site-packages/ipykernel/ipkernel.py", line 302, in do_execute
      res = shell.run_cell(code, store_history=store_history, silent=silent)
    File "/opt/homebrew/anaconda3/envs/torch-gpu/lib/python3.8/site-packages/ipykernel/zmqshell.py", line 539, in run_cell
      return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
    File "/opt/homebrew/anaconda3/envs/torch-gpu/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 2881, in run_cell
      result = self._run_cell(
    File "/opt/homebrew/anaconda3/envs/torch-gpu/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 2936, in _run_cell
      return runner(coro)
    File "/opt/homebrew/anaconda3/envs/torch-gpu/lib/python3.8/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "/opt/homebrew/anaconda3/envs/torch-gpu/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3135, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "/opt/homebrew/anaconda3/envs/torch-gpu/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3338, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "/opt/homebrew/anaconda3/envs/torch-gpu/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3398, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "<ipython-input-17-2c10428570f9>", line 5, in <cell line: 5>
      history = model.fit(
    File "/opt/homebrew/anaconda3/envs/torch-gpu/lib/python3.8/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/opt/homebrew/anaconda3/envs/torch-gpu/lib/python3.8/site-packages/keras/engine/training.py", line 1564, in fit
      tmp_logs = self.train_function(iterator)
    File "/opt/homebrew/anaconda3/envs/torch-gpu/lib/python3.8/site-packages/keras/engine/training.py", line 1160, in train_function
      return step_function(self, iterator)
    File "/opt/homebrew/anaconda3/envs/torch-gpu/lib/python3.8/site-packages/keras/engine/training.py", line 1146, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/opt/homebrew/anaconda3/envs/torch-gpu/lib/python3.8/site-packages/keras/engine/training.py", line 1135, in run_step
      outputs = model.train_step(data)
    File "/opt/homebrew/anaconda3/envs/torch-gpu/lib/python3.8/site-packages/transformers/modeling_tf_utils.py", line 1554, in train_step
      loss = self.compiled_loss(y, y_pred, sample_weight, regularization_losses=self.losses)
    File "/opt/homebrew/anaconda3/envs/torch-gpu/lib/python3.8/site-packages/keras/engine/compile_utils.py", line 265, in __call__
      loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "/opt/homebrew/anaconda3/envs/torch-gpu/lib/python3.8/site-packages/keras/losses.py", line 152, in __call__
      losses = call_fn(y_true, y_pred)
    File "/opt/homebrew/anaconda3/envs/torch-gpu/lib/python3.8/site-packages/keras/losses.py", line 272, in call
      return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "/opt/homebrew/anaconda3/envs/torch-gpu/lib/python3.8/site-packages/keras/losses.py", line 2084, in sparse_categorical_crossentropy
      return backend.sparse_categorical_crossentropy(
    File "/opt/homebrew/anaconda3/envs/torch-gpu/lib/python3.8/site-packages/keras/backend.py", line 5630, in sparse_categorical_crossentropy
      res = tf.nn.sparse_softmax_cross_entropy_with_logits(
Node: 'sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/assert_equal_1/Assert/Assert'
assertion failed: [Condition x == y did not hold element-wise:] [x (sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/Shape_1:0) = ] [22 1] [y (sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/strided_slice:0) = ] [22 512]
	 [[{{node sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/assert_equal_1/Assert/Assert}}]] [Op:__inference_train_function_47619]