In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf

In [None]:
data_path = "/content/ner_dataset.csv"

data = pd.read_csv(data_path, encoding= 'unicode_escape')
# filling the first column that determines which sentence each word belongs to.
data.fillna(method = 'ffill', inplace = True)
df = data.groupby('Sentence #').agg({'Word': ' '.join, 'Tag': list}).reset_index()
df.rename(columns={'Word': 'Sentence', 'Tag': 'Tag'}, inplace=True)
df.head()

In [None]:
#data['Sentences'] = data['Sentence'].apply(lambda x: ' '.join(x))
X = df['Sentence']
Y = df['Tag']

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

X_text_train, X_text_test, y_tag_train, y_tag_test = train_test_split(X,Y, test_size=0.2,random_state=42)

tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X_text_train)

In [None]:
word2idx = tokenizer.word_index
idx2word = tokenizer.index_word
vocab_size = tokenizer.num_words

In [None]:
X_train = tokenizer.texts_to_sequences(X_text_train)

X_test = tokenizer.texts_to_sequences(X_text_test)

maxlen = 100

X_train = tf.keras.preprocessing.sequence.pad_sequences(X_train, maxlen=maxlen, padding='post', truncating='post', value=0)
X_test = tf.keras.preprocessing.sequence.pad_sequences(X_test, maxlen=maxlen, padding='post', truncating='post', value=0)


In [None]:
# Récupérer les tags uniques
unique_tags = df['Tag'].explode().unique()

# Créer le dictionnaire de mapping
word2idx_tag = {tag: idx for idx, tag in enumerate(unique_tags)}
word2idx_tag['O'] = 0

print(unique_tags)


['O' 'B-geo' 'B-gpe' 'B-tim' 'B-org' 'I-geo' 'B-per' 'I-per' 'I-org'
 'I-tim' 'B-art' 'I-art' 'B-nat' 'I-gpe' 'I-nat' 'B-eve' 'I-eve']


In [None]:
y_train = y_tag_train.apply(lambda tags: [word2idx_tag[tag] for tag in tags])
print(y_train[0])
y_test = y_tag_test.apply(lambda tags: [word2idx_tag[tag] for tag in tags])

[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0]


In [None]:
y_train = tf.keras.preprocessing.sequence.pad_sequences(y_train, maxlen=maxlen, padding='post', truncating='post')
y_test = tf.keras.preprocessing.sequence.pad_sequences(y_test, maxlen=maxlen, padding='post', truncating='post')

In [None]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(38367, 100)
(9592, 100)
(38367, 100)
(9592, 100)


In [None]:
# ---------------------------------------------------------
# modele simple AVEC TENSOR SLICE
# --------------------------------------------------------

In [None]:
import tensorflow as tf

train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test))

In [None]:
embedding_dim = 300
maxlen = 100
max_words = 36000
nb_tags = len(unique_tags)

BATCH_SIZE = 132
SHUFFLE_BUFFER_SIZE = 132

train_dataset = train_dataset.shuffle(SHUFFLE_BUFFER_SIZE).batch(BATCH_SIZE)
test_dataset = test_dataset.batch(BATCH_SIZE)

model = Sequential()
#model.add(Input(shape=(maxlen,), name='input_layer'))
model.add(Embedding(input_dim = max_words, output_dim = nb_tags, mask_zero=True, name='embedding_layer'))
model.add(RNN(GRUCell(32), return_sequences=True, name='gru_layer'))
#test ajout de pooling
#test ajout de droput
model.add(Dense(nb_tags, name='output_layer'))

model.summary()

Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_layer (Embedding  (None, 100, 17)           612000    
 )                                                               
                                                                 
 gru_layer (RNN)             (None, 100, 32)           4896      
                                                                 
 output_layer (Dense)        (None, 100, 17)           561       
                                                                 
Total params: 617457 (2.36 MB)
Trainable params: 617457 (2.36 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

history = model.fit(X_train, y_train, epochs=10, validation_data = [X_test, y_test])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
model.evaluate(y_test)
# tres tres mauvais



[0.0, 0.0]

In [None]:
# ---------------------------------------------------------
# modele bidirectionnel SANS TENSOR SLICE
# --------------------------------------------------------

In [None]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, Dense, GlobalAveragePooling1D, RNN, GRUCell, Dropout, Bidirectional, LSTMCell, LSTM, Input, Dropout


embedding_dim = 300
maxlen = 100
max_words = 36000
nb_tags = len(unique_tags)

model_bi_lstm = Sequential()
model_bi_lstm.add(Embedding(max_words, embedding_dim, input_length=maxlen))
model_bi_lstm.add(Bidirectional(LSTM(units=100, activation='tanh', return_sequences=True)))
model_bi_lstm.add(Bidirectional(LSTM(units=100, activation='tanh', return_sequences=True)))
model_bi_lstm.add(Dense(nb_tags, activation='softmax'))

model_bi_lstm.summary()


Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 100, 300)          10800000  
                                                                 
 bidirectional_4 (Bidirecti  (None, 100, 200)          320800    
 onal)                                                           
                                                                 
 bidirectional_5 (Bidirecti  (None, 100, 200)          240800    
 onal)                                                           
                                                                 
 dense_2 (Dense)             (None, 100, 17)           3417      
                                                                 
Total params: 11365017 (43.35 MB)
Trainable params: 11365017 (43.35 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
model_bi_lstm.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

history = model_bi_lstm.fit(X_train, y_train, epochs=15, validation_data = [X_test, y_test])

model_bi_lstm.evaluate(y_test)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15

In [None]:
# ---------------------------------------------------------
# modele bidirectionnel SANS TENSOR SLICE
# --------------------------------------------------------

In [None]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Masking, Embedding, Dense, GlobalAveragePooling1D, RNN, GRUCell, Dropout, Bidirectional, LSTMCell, LSTM, Input, Dropout
from keras.models import Model

vector_size = 16
i = Input(shape=(100,))
x = Embedding(input_dim=max_words, output_dim=vector_size, mask_zero=True)(i)
x = Masking()(x)  # Masking layer to handle variable length sequences
x = Bidirectional(LSTM(32, return_sequences=True))(x)
x = Dense(nb_tags)(x)

model = Model(i, x)

In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.losses import SparseCategoricalCrossentropy

early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

model_checkpoint = ModelCheckpoint('/content/best_model.keras', monitor='val_loss', save_best_only=True)

model.compile(
    optimizer="adam",
    loss=SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy']
)

model.fit(X_train, y_train,
          validation_data=(X_test, y_test),
          epochs=20,
          callbacks=[early_stopping, model_checkpoint])


In [None]:
# ---------------------------------------------------------
# Predictions
# --------------------------------------------------------

In [None]:
input_text = "Apple is planning to open a new store in Tokyo next month."

input_sequence = tokenizer.texts_to_sequences([input_text])
input_sequence_padded = pad_sequences(input_sequence, maxlen = maxlen, padding='post')

idx2tag = {idx: tag for tag, idx in word2idx_tag.items()}

In [None]:

predictions = model.predict(input_sequence_padded)
decoded_predictions = [idx2tag[np.argmax(pred)] for pred in predictions[0]]

print(decoded_predictions)

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [None]:
# ---------------------------------------------------------
# Spacy et NLTK
# --------------------------------------------------------

In [None]:
import spacy

nlp = spacy.load("en_core_web_sm")

input_text = "Apple is planning to open a new store in Tokyo next month."

doc = nlp(input_text)
ner_tags_spacy = [(ent.text, ent.label_) for ent in doc.ents]

print(ner_tags_spacy)

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag, ne_chunk

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

input_text = "Apple is planning to open a new store in Tokyo next month."

words = word_tokenize(input_text)

tagged = pos_tag(words)

entities = ne_chunk(tagged)

ner_tags_nltk = [(chunk.text, chunk.label()) for chunk in entities if hasattr(chunk, 'label')]

print(ner_tags_nltk)
