<a href="https://colab.research.google.com/github/gdeni89/NLP-summarization-of-French-written-documents/blob/main/NLP_Abstractive_approach_v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Dataset and preprocessing**

In [None]:
# Install the `datasets` package that the next cell imports `load_dataset` from.
# NOTE(review): the version is not pinned, so a fresh runtime may pull a different release.
! pip install datasets

In [None]:
from datasets import load_dataset

# Load three French summarization corpora through `load_dataset`.
# The second argument selects the configuration of each dataset.
dataset_orangesum = load_dataset("GEM/OrangeSum", "abstract") # we can also specify "title" to obtain pairs of text-title
dataset_xlsum = load_dataset("csebuetnlp/xlsum", "french")
dataset_mlsum = load_dataset("mlsum", "fr")

In [None]:
# Core libraries used throughout the preprocessing cells below.
import pandas as pd
import numpy as np
import tensorflow as tf
import re
from nltk.corpus import stopwords
import time
# NOTE(review): the two imports below reach into `tensorflow.python.*`, and
# `_zero_state_tensors` is underscore-prefixed; confirm both still exist in
# the TensorFlow version printed by this cell.
from tensorflow.python.layers.core import Dense
from tensorflow.python.ops.rnn_cell_impl import _zero_state_tensors
# Print the installed TensorFlow version.
print('TensorFlow Version: {}'.format(tf.__version__))

In [None]:
# Abort early unless TensorFlow reports the first GPU device.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')

In [None]:
# Cache for the stop-word set so the NLTK corpus is read once, not once per document.
_STOPWORDS_CACHE = {}


def _french_stopwords():
    '''Return the NLTK French stop-word set, loading it only on first use.'''
    if 'french' not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE['french'] = frozenset(stopwords.words("french"))
    return _STOPWORDS_CACHE['french']


def clean_text(text, remove_stopwords = True):
    '''Lower-case a text, strip URLs, HTML remnants and punctuation, and
    optionally drop French stop words.

    Args:
        text: the raw string to clean.
        remove_stopwords: when True (default), remove every token found in
            NLTK's French stop-word list and re-join the rest with single spaces.

    Returns:
        The cleaned string. When remove_stopwords is False the whitespace is
        not normalised, so consecutive spaces may remain; callers here split
        on whitespace afterwards.
    '''
    # Convert words to lower case
    text = text.lower()

    # Remove URLs up to the next whitespace only. The previous pattern used
    # `.*`, which deleted everything after a link until the end of the line.
    text = re.sub(r'https?://\S+', '', text)

    # Remove HTML remnants. `<br />` must be handled BEFORE the punctuation
    # pass, because that pass replaces '/' with a space and the tag would
    # then never match.
    text = re.sub(r'<a href', ' ', text)
    text = re.sub(r'&amp;', '', text)
    text = re.sub(r'<br />', ' ', text)

    # Replace punctuation and symbols with spaces
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)

    # Split elisions (l'homme -> l homme). French text frequently uses the
    # typographic apostrophe, so both forms are treated the same way.
    text = re.sub(r'[\'’]', ' ', text)

    # Optionally, remove stop words
    if remove_stopwords:
        stops = _french_stopwords()
        text = " ".join(w for w in text.split() if w not in stops)

    return text

In [None]:
# Display the `shape` attribute of the loaded OrangeSum dataset object.
dataset_orangesum.shape

In [None]:
# Build one DataFrame per OrangeSum split.
df_train_OS, df_validation_OS, df_test_OS = (
    pd.DataFrame(dataset_orangesum[split_name])
    for split_name in ('train', 'validation', 'test')
)

# Count missing values per column of the training frame.
df_train_OS.isna().sum()

In [None]:
# Preview the first rows of the training frame.
df_train_OS.head()

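Cleaning the training set: unwanted characters are stripped from both the summaries and the articles, and French stop words are removed from the articles only.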

In [None]:
import nltk
# The French stop-word list must be available before clean_text removes stop words.
nltk.download('stopwords')

# Clean the summaries and texts.
# Summaries keep their stop words; the input texts have theirs removed
# (clean_text's default).
# The loop variables are deliberately not called `input`: that name would
# shadow the builtin for the rest of the notebook session.
clean_target = [
    clean_text(raw_target, remove_stopwords=False)
    for raw_target in df_train_OS.target
]
print("Summaries are complete.")

clean_input = [clean_text(raw_input) for raw_input in df_train_OS.input]
print("Texts are complete.")

In [None]:
# Print the first five cleaned summary/text pairs as a sanity check
for news_number in range(1, 6):
    position = news_number - 1
    print("Clean News #", news_number)
    print(clean_target[position])
    print(clean_input[position])
    print()

In [None]:
def count_words(count_dict, text):
    '''Add the whitespace-separated word frequencies of every string in
    `text` to `count_dict`, updating the dictionary in place.'''
    for sentence in text:
        for token in sentence.split():
            count_dict[token] = count_dict.get(token, 0) + 1

In [None]:
# Tally how often each word occurs across the cleaned summaries and texts;
# the number of distinct keys is the vocabulary size.
word_counts = {}
for corpus in (clean_target, clean_input):
    count_words(word_counts, corpus)

print("Size of Vocabulary:", len(word_counts))

In [None]:
%tensorflow_version 2.x
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

In [None]:
# Load Conceptnet Numberbatch's (CN) embeddings 
# (https://github.com/commonsense/conceptnet-numberbatch)
import numpy as np

import tensorflow as tf
import timeit

device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  print(
      '\n\nThis error most likely means that this notebook is not '
      'configured to use a GPU.  Change this in Notebook Settings via the '
      'command palette (cmd/ctrl-shift-P) or the Edit menu.\n\n')
  raise SystemError('GPU device not found')

#data_path = 'drive/MyDrive/numberbatch-fr.txt'
data_path = 'drive/MyDrive/Colab Notebooks/numberbatch-fr-clean.txt'

# Maps each word to its embedding vector (float32).
embeddings_index = {}
# Lines that could not be parsed as "<word> <float> <float> ...".
skipped_lines = 0

# Reading the file is plain Python I/O, so it is not wrapped in a
# `tf.device` context (no TensorFlow op runs here).
with open(data_path, encoding='utf-8') as f:
    for line in f:
        # Strip the trailing newline so it does not end up in the last component.
        values = line.rstrip().split(' ')
        # A usable entry needs a word plus at least two components; shorter
        # lines are presumably a "rows cols" header or blank lines.
        if len(values) < 3:
            skipped_lines += 1
            continue
        word = values[0]
        try:
            # Store numbers rather than strings: a string array is much
            # larger and only converts (or fails) later, when it is copied
            # into the embedding matrix.
            embedding = np.asarray(values[1:], dtype='float32')
        except ValueError:
            skipped_lines += 1
            continue
        embeddings_index[word] = embedding

print('Word embeddings:', len(embeddings_index))
if skipped_lines:
    print('Skipped malformed lines:', skipped_lines)

In [None]:
# memory footprint support libraries/code
!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
!pip install gputil
!pip install psutil
!pip install humanize
import psutil
import humanize
import os
import GPUtil as GPU
GPUs = GPU.getGPUs()
# XXX: only one GPU on Colab and isn’t guaranteed
gpu = GPUs[0]
def printm():
 process = psutil.Process(os.getpid())
 print("Gen RAM Free: " + humanize.naturalsize( psutil.virtual_memory().available ), " | Proc size: " + humanize.naturalsize( process.memory_info().rss))
 print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal))
printm()

In [None]:
# Find the number of words that are missing from CN and are used at least
# `threshold` times. These are the words that stay in the vocabulary without
# a pretrained vector, so the comparison uses the same `>=` rule as the
# vocabulary-building cell.
threshold = 20

missing_words = sum(
    1
    for word, count in word_counts.items()
    if count >= threshold and word not in embeddings_index
)

# Scale to percent before rounding so no float artifact
# (e.g. 7.000000000000001) reaches the printed value; guard the empty case.
missing_ratio = round(100 * missing_words / len(word_counts), 2) if word_counts else 0.0

print("Number of words missing from CN:", missing_words)
print("Percent of words that are missing from vocabulary: {}%".format(missing_ratio))

In [None]:
# Limit the vocab that we will use to words that appear ≥ threshold
# or that already have a pretrained embedding.

# Dictionary to convert words to integers; ids follow the order of word_counts
vocab_to_int = {}
for word, count in word_counts.items():
    if count >= threshold or word in embeddings_index:
        vocab_to_int[word] = len(vocab_to_int)

# Special tokens that will be added to our vocab
codes = ["<UNK>","<PAD>","<EOS>","<GO>"]

# Add codes to vocab, each taking the next free id
for code in codes:
    vocab_to_int[code] = len(vocab_to_int)

# Dictionary to convert integers to words
int_to_vocab = {index: word for word, index in vocab_to_int.items()}

# Scale to percent before rounding so no float artifact reaches the printed
# value; guard against an empty word_counts.
usage_ratio = round(100 * len(vocab_to_int) / len(word_counts), 2) if word_counts else 0.0

print("Total number of unique words:", len(word_counts))
print("Number of words we will use:", len(vocab_to_int))
print("Percent of words we will use: {}%".format(usage_ratio))

In [None]:
# Need to use 300 for embedding dimensions to match CN's vectors.
embedding_dim = 300
nb_words = len(vocab_to_int)

# Dedicated, seeded generator: the random vectors given to words without a
# CN embedding are then identical on every "Restart & Run All". The seed
# value itself is arbitrary.
embedding_rng = np.random.default_rng(42)

# Create matrix with default values of zero; row i holds the vector of the
# word whose id is i.
word_embedding_matrix = np.zeros((nb_words, embedding_dim), dtype=np.float32)
for word, i in vocab_to_int.items():
    if word not in embeddings_index:
        # If word not in CN, create a random embedding for it and remember
        # it in embeddings_index, as before.
        embeddings_index[word] = embedding_rng.uniform(-1.0, 1.0, embedding_dim)
    word_embedding_matrix[i] = embeddings_index[word]

# Check if value matches len(vocab_to_int)
print(len(word_embedding_matrix))

In [None]:
def convert_to_ints(text, word_count, unk_count, eos=False):
    '''Encode each string of `text` as a list of vocabulary ids.

       Words absent from vocab_to_int are encoded with the <UNK> id.
       `word_count` and `unk_count` are running totals: the number of words
       and of <UNK>s seen here is added to them and the new totals are
       returned. When `eos` is true, the <EOS> id is appended to every
       encoded sequence.

       Returns (list of id lists, word_count, unk_count).'''
    encoded = []
    for sentence in text:
        tokens = sentence.split()
        word_count += len(tokens)
        ids = []
        for token in tokens:
            token_id = vocab_to_int.get(token)
            if token_id is None:
                token_id = vocab_to_int["<UNK>"]
                unk_count += 1
            ids.append(token_id)
        if eos:
            ids.append(vocab_to_int["<EOS>"])
        encoded.append(ids)
    return encoded, word_count, unk_count

In [None]:
# Apply convert_to_ints to clean_target and clean_input.
# The counters are threaded through both calls, so the totals below cover
# summaries and texts together.
word_count = 0
unk_count = 0

int_target, word_count, unk_count = convert_to_ints(clean_target, word_count, unk_count)
int_input, word_count, unk_count = convert_to_ints(clean_input, word_count, unk_count, eos=True)

# Scale to percent before rounding so no float artifact reaches the printed
# value; guard against an empty corpus.
unk_percent = round(100 * unk_count / word_count, 2) if word_count else 0.0

print("Total number of words in headlines:", word_count)
print("Total number of UNKs in headlines:", unk_count)
print("Percent of words that are UNK: {}%".format(unk_percent))

In [None]:
def create_lengths(text):
    '''Return a one-column DataFrame ("counts") holding len() of every
    element of `text`, in order.'''
    return pd.DataFrame(list(map(len, text)), columns=['counts'])

In [None]:
# Sequence-length frames for the encoded summaries and texts
lengths_target, lengths_input = (
    create_lengths(sequences) for sequences in (int_target, int_input)
)

# Descriptive statistics for both, separated by a blank line
print(
    "Summaries:",
    lengths_target.describe(),
    "",
    "Texts:",
    lengths_input.describe(),
    sep="\n",
)

In [None]:
# Print the 90th, 95th and 99th percentiles of the "input" lengths
for percentile in (90, 95, 99):
    print(np.percentile(lengths_input.counts, percentile))

In [None]:
# Print the 90th, 95th and 99th percentiles of the "target" lengths
for percentile in (90, 95, 99):
    print(np.percentile(lengths_target.counts, percentile))

In [None]:
def unk_counter(sentence):
    '''Return how many elements of `sentence` equal the <UNK> id of vocab_to_int.'''
    return sum(1 for token in sentence if token == vocab_to_int["<UNK>"])