In [None]:
# Mount Google Drive at /content/drive so the dataset under MyDrive can be read
# and written by the following cells. force_remount=True asks Colab to remount
# even when the drive is already mounted.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

In [None]:
import pandas as pd

# Directory and base name (without extension) of the e-mail dataset.
# The same pair is reused by svgd_csv_xls to write the results back.
file_path = "/content/drive/MyDrive/Datascientest/Projet PY_email Datascientest/Data/Enron Cleaned Data/"
file_name = "enron_unique_1000_NER"
file = file_path+file_name+'.csv'

df_emails = pd.read_csv(file)

# Row range [deb, fin) processed by the summarization loops.
deb = 0
fin = df_emails.shape[0]

# Kept as the last expression of the cell: a notebook only renders the value of
# the final expression, so a preview placed mid-cell is silently discarded.
df_emails.tail(10)

In [None]:
# Save the current results to CSV.
def svgd_csv_xls (file_path,file_name):
  """Write the module-level ``df_emails`` DataFrame to a UTF-8 CSV file.

  file_path : directory prefix, concatenated as-is (no separator is added).

  file_name : base name of the output file, without extension.

  The destination is ``file_path + file_name + ".csv"``; the index is not
  written. Returns None.
  """
  destination = "".join([file_path, file_name, ".csv"])
  print("File : ",destination)
  df_emails.to_csv(destination, index=False, encoding='utf-8')
  print("Svgde effectuée")


# Fonctions Summary methods

In [None]:
import pandas as pd
import numpy as np

def TF(token, artikle):
    """
    Term-frequency score of a word inside one artikle.

    token : word whose TF score is computed.

    artikle : dictionary built from a text; every value is iterated as a
    sequence of words.

    Returns log(1 + n), where n is the number of words equal to token over
    all values of the dictionary.
    """
    occurrences = sum(
        1
        for key in artikle
        for word in artikle[key]
        if word == token
    )
    return np.log(occurrences+1)

def IDF(token, corpus):
    """
    Inverse-document-frequency score of a word over a corpus of artikles.

    token : word whose IDF score is computed.

    corpus : iterable of artikles (dictionaries); an artikle counts as
    containing the word when the word is found in at least one of its values.

    Returns log(N / (d + 1) + 1), with N the corpus size and d the number of
    artikles containing the word.
    """
    n_artikles = len(corpus)
    containing = 0

    for artikle in corpus:
        if any(token in artikle[key] for key in artikle):
            containing += 1

    return np.log(n_artikles/(containing+1) +1)

def TFIDF(token, artikle, corpus):
    """
    TF-IDF score of a word in a text.

    token : word to score.

    artikle : artikle in which the term frequency is measured.

    corpus : list of artikles over which the inverse document frequency is
    measured.

    Returns the product TF(token, artikle) * IDF(token, corpus).
    """
    term_frequency = TF(token, artikle)
    inverse_document_frequency = IDF(token, corpus)
    return term_frequency*inverse_document_frequency

def score_sentence(corpus, artikle, sentence):
    """
    Score a sentence as the mean TF-IDF score of its words.

    corpus : list of artikles.

    artikle : dictionary of sentences the sentence belongs to.

    sentence : sentence given as a list of words.

    Returns the mean of the word scores, or 0.0 for an empty sentence.
    """
    word_scores = [TFIDF(word, artikle, corpus) for word in sentence]
    if not word_scores:
        # np.mean([]) is NaN (plus a RuntimeWarning). np.argsort orders NaN
        # last, so an empty sentence would be ranked above every real one when
        # the top scores are selected. Give it the lowest possible score.
        return 0.0
    return np.mean(word_scores)

def best_sentences(scores_artikle, nb_sentences):
    """
    Return the indices of the highest-scoring sentences, in ascending index order.

    scores_artikle : list with the score of every sentence of a text.

    nb_sentences : number of sentences to select. When it exceeds the number
    of scores, every index is returned; when it is zero or negative, an empty
    list is returned.
    """
    if nb_sentences <= 0:
        # A slice such as scores[-0:] is the whole array, so asking for zero
        # sentences would otherwise return every index.
        return []

    return sorted(np.argsort(scores_artikle)[-nb_sentences:])

def summarize_TFIDF(i, n_sentences, df):
    """
    Extractive summary of one article using TF-IDF sentence scores.

    i : index of the article in df.

    n_sentences : number of sentences to keep.

    df : DataFrame with a 'body_dict' column (one dictionary of sentences per
    article, used for scoring) and a 'body' column (the text that is split
    into sentences to build the summary).

    Returns the selected sentences, in their original order, joined by a
    single space, or "See Original" when the guard below triggers.
    """
    corpus = df['body_dict']
    artikle = corpus[i]
    texte = df['body'][i]

    # NOTE(review): this guard compares the number of articles in the corpus,
    # not the number of sentences of the article, with n_sentences - confirm
    # that this is the intended check.
    if len(corpus) <= n_sentences:
        print("Longeur corpus inférieure au nombre minimal de phrases retenu pour le résumé")
        return "See Original"

    # Score every sentence of the article.
    scores_artikle = [score_sentence(corpus, artikle, sentence) for sentence in artikle.values()]

    # Indices of the best-scoring sentences.
    result = best_sentences(scores_artikle, n_sentences)

    # Split the original text into sentences.
    # sent_tokenize must already be in scope (presumably
    # nltk.tokenize.sent_tokenize - TODO confirm; it is not imported in the
    # visible cells).
    tokenized_article = sent_tokenize(texte)

    # The indices come from 'body_dict' while the sentences come from 'body';
    # the two splits may disagree, so out-of-range indices are skipped instead
    # of raising IndexError.
    summary = [tokenized_article[idx] for idx in result if idx < len(tokenized_article)]

    # Join with a space: joining with '' glues consecutive sentences together.
    return ' '.join(summary)

# Loaded spaCy pipelines, keyed by model name, so the (large) model is loaded
# once instead of once per summarized text.
_SPACY_PIPELINES = {}


def _get_spacy_pipeline(model_name='en_core_web_lg'):
  """Return the spaCy pipeline `model_name`, loading it only on first use."""
  if model_name not in _SPACY_PIPELINES:
    _SPACY_PIPELINES[model_name] = spacy.load(model_name)
  return _SPACY_PIPELINES[model_name]


def summarize_spacy (text,ratio):
  """
  Extractive summary based on normalized word frequencies.

  text : string to summarize.

  ratio : fraction of the sentences to keep; the number of kept sentences is
  int(number_of_sentences * ratio).

  Returns the best-scoring sentences, highest score first, joined by single
  spaces. Returns 'See Original' when no sentence would be selected or when
  the text has no scorable word.

  Relies on `spacy`, `stopwords` and `punctuation` being defined elsewhere in
  the notebook.
  """
  # Method adapted from
  # https://www.numpyninja.com/post/text-summarization-through-use-of-spacy-library
  from heapq import nlargest

  doc = _get_spacy_pipeline()(text)

  # Word frequencies, keyed by lower-cased text so that the lower-cased lookup
  # done when scoring sentences matches capitalised occurrences as well.
  word_frequencies = {}
  for word in doc:
    key = word.text.lower()
    if key in stopwords or key in punctuation:
      continue
    word_frequencies[key] = word_frequencies.get(key, 0) + 1

  if not word_frequencies:
    # Empty text, or only stop words / punctuation: max() below would raise.
    return 'See Original'

  # Normalize by the highest frequency.
  max_frequency = max(word_frequencies.values())
  word_frequencies = {key: count/max_frequency for key, count in word_frequencies.items()}

  # Score each sentence as the sum of the normalized frequencies of its words.
  sentence_tokens = list(doc.sents)
  sentence_scores = {}
  for sent in sentence_tokens:
    for word in sent:
      key = word.text.lower()
      if key in word_frequencies:
        sentence_scores[sent] = sentence_scores.get(sent, 0) + word_frequencies[key]

  # Keep the `ratio` share of sentences with the highest scores.
  select_length = int(len(sentence_tokens)*ratio)
  if select_length == 0:
    return 'See Original'

  best = nlargest(select_length, sentence_scores, key=sentence_scores.get)
  # Join with a space: joining with '' glues consecutive sentences together.
  return ' '.join(sent.text for sent in best)


In [None]:
# Install the `transformers` and `datasets` packages; the second command also
# requests the `sentencepiece` extra of transformers.
!pip install transformers
!pip install datasets transformers[sentencepiece]

In [None]:
# T5 (performs poorly)
from transformers import T5Tokenizer, T5Config, T5ForConditionalGeneration
my_model = T5ForConditionalGeneration.from_pretrained('t5-small')
t5_tokenizer = T5Tokenizer.from_pretrained('t5-small')
# Backward-compatible alias. The BART cell rebinds the global `tokenizer`,
# which is why summarize_T5 uses the dedicated `t5_tokenizer` name.
tokenizer = t5_tokenizer

def summarize_T5 (text):
  """
  Abstractive summary of `text` with the t5-small model.

  text : string to summarize.

  Returns the decoded summary, without special tokens.
  """
  # T5 task prefix; the trailing space separates it from the text.
  text = "summarize: " + text
  # Encode the input, explicitly truncated to 512 tokens.
  input_ids = t5_tokenizer.encode(text, return_tensors='pt', max_length=512, truncation=True)
  # Generate the summary ids.
  summary_ids = my_model.generate(input_ids)
  # Decode without special tokens (padding / end-of-sequence markers).
  t5_summary = t5_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
  return t5_summary

In [None]:
from transformers import BartForConditionalGeneration, BartTokenizer, BartConfig
# Load the model and tokenizer for bart-large-cnn under dedicated names: the T5
# cell also binds a global `tokenizer`, so a shared name makes the result
# depend on which cell ran last.
bart_tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
bart_model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
# Backward-compatible aliases for the names this cell originally defined.
tokenizer = bart_tokenizer
model = bart_model

def summarize_BART (text):
  """
  Abstractive summary of `text` with facebook/bart-large-cnn.

  text : string to summarize.

  Returns the decoded summary, without special tokens.
  """
  # Truncate to the model's position limit; longer inputs cannot be processed
  # by the model.
  inputs = bart_tokenizer(
      [text],
      return_tensors='pt',
      truncation=True,
      max_length=bart_model.config.max_position_embeddings,
  )
  summary_ids = bart_model.generate(
      inputs['input_ids'],
      attention_mask=inputs['attention_mask'],
      early_stopping=True,
  )
  # Decode the first (only) generated sequence.
  bart_summary = bart_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
  return bart_summary

TF-IDF

In [None]:

# Number of sentences kept in a TF-IDF summary; e-mails with at most this many
# sentences are kept whole.
max_sent = 3

# Make sure the output columns exist and can hold Python objects (dict / str).
for col in ('body_dict', 'summary_TFIDF'):
  if col not in df_emails.columns:
    df_emails[col] = None
  df_emails[col] = df_emails[col].astype(object)

print("Traitement TF-IDF en cours pour ",fin, "records.")

# Pass 1: build the sentence dictionary of every e-mail first. The IDF part of
# the score iterates over the whole 'body_dict' column, so every entry must be
# a dictionary before the first summary is computed.
for idx in range(deb,fin):
  # Stop words are removed before TF-IDF scoring.
  text = remove_stop_words(df_emails.at[idx, 'body_clean'], stopwords)
  df_emails.at[idx, 'body_dict'] = dictionarize(text)

# Only the processed rows form the corpus.
corpus_df = df_emails.iloc[deb:fin]

# Pass 2: extractive summary of every e-mail.
for idx in range(deb,fin):
  print(" id : ",idx)
  if len(df_emails.at[idx, 'body_dict']) > max_sent:
    df_emails.at[idx, 'summary_TFIDF'] = summarize_TFIDF(idx, max_sent, corpus_df)
  else:
    # Too short to summarize: keep the cleaned original text.
    df_emails.at[idx, 'summary_TFIDF'] = df_emails.at[idx, 'body_clean']


print("Traitement TF-IDF terminé pour ",fin, "records.")
svgd_csv_xls (file_path,file_name)

Spacy

In [None]:
# Share of sentences kept by the spaCy summary.
spacy_ratio = 0.20

# Make sure the output column exists and can hold strings.
if 'summary_spacy' not in df_emails.columns:
  df_emails['summary_spacy'] = None
df_emails['summary_spacy'] = df_emails['summary_spacy'].astype(object)

print("Résumé par Spacy en cours pour ",fin, "records.")
for idx in range(deb,fin):
  print(" id : ",idx)
  # Extractive summary using spaCy; str() guards against non-string cells.
  text = df_emails.at[idx, 'body_clean']
  df_emails.at[idx, 'summary_spacy'] = summarize_spacy (str(text),spacy_ratio)

print("Résumé par Spacy terminé pour ",fin, "records.")
svgd_csv_xls (file_path,file_name)

BART

In [None]:
# Make sure the output column exists and can hold strings.
if 'summary_BART' not in df_emails.columns:
  df_emails['summary_BART'] = None
df_emails['summary_BART'] = df_emails['summary_BART'].astype(object)

print("Traitement résume avec BART en cours pour ",fin, "records.")
for idx in range(deb,fin):
  print("id : ",idx)
  # NOTE(review): 'body_clean' is recomputed here although the TF-IDF and spaCy
  # cells read it earlier - confirm the intended cell order.
  text = data_clean(str(df_emails.at[idx, 'body']))
  df_emails.at[idx, 'body_clean'] = text

  # Abstractive summary using Transformers BART. Only the model call is inside
  # the try block, so an unrelated failure cannot replace a valid summary.
  try:
    df_emails.at[idx, 'summary_BART'] = summarize_BART (text)
  except Exception as exc:
    # Keep going on a failed record, but report why. Catching Exception (not a
    # bare except) lets KeyboardInterrupt stop the loop.
    print("Erreur BART id ",idx," : ",exc)
    df_emails.at[idx, 'summary_BART'] = "Erreur"

  # Intermediate save every 100 records.
  if idx % 100 == 0:
    file_name_tmp = "enron_unique_output_"+str(idx)
    svgd_csv_xls (file_path,file_name_tmp)
    print(file_name_tmp," sauvegardé.")

print("Traitement résume avec BART terminé pour ",fin, "records.")

svgd_csv_xls (file_path,file_name)

# Calcul similarités des résumés

In [None]:
# Load the spaCy pipeline once; it is used by the similarity computation.
# (A second, discarded spacy.load call only doubled the loading time.)
nlp = spacy.load('en_core_web_lg')

In [None]:
# (summary column, value meaning "no usable summary") for every method.
summary_columns = [
  ('summary_TFIDF', ""),
  ('summary_spacy', ""),
  ('summary_T5', "Erreur"),
  ('summary_BART', "Erreur"),
]

print("Traitement en cours pour ",fin, "records")

for idx in range(deb,fin):
  print("ID:",idx)
  # Similarity is measured against the cleaned original text.
  doc1 = nlp(str(df_emails.at[idx, 'body_clean']))
  max_sim = 0

  for column, unusable in summary_columns:
    summary = df_emails.at[idx, column]
    # Test the raw text, not the parsed Doc: a spaCy Doc never compares equal
    # to a str, so a check on the Doc always passes. Non-string cells (e.g.
    # missing values) are skipped as well.
    if not isinstance(summary, str) or summary == unusable:
      continue
    sim = doc1.similarity(nlp(summary))
    df_emails.at[idx, column + '_sim'] = sim
    max_sim = max(max_sim, sim)

  # Best similarity over all methods for this e-mail.
  df_emails.at[idx, 'best_sim'] = max_sim


svgd_csv_xls (file_path,file_name)