<a href="https://colab.research.google.com/github/marco-scatassi/TLDR_text_summarization/blob/main/1_text_summarization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Library Import - General


In [163]:
import pandas as pd
import numpy as np

# Data Loading


In [95]:
# Load the training split; the file is read as JSON Lines (one record per line).
train = pd.read_json('/content/train_lemmatized.json', orient="records", lines=True)
# Keep a handle on the lemmatized sentences; the POS-tagging cell applies POS_tagging to this Series.
train_lemmatized = train['document_lemmatized']
# Preview the first row to check the columns.
train.head(1)

Unnamed: 0,id,document,summary,ext_labels,rg_labels,document_normalized,document_tokenized,document_stopwords,document_lemmatized
0,train-TLDR_RS_2019-07-25907.json,"[hey y' all , i 've been a lurker in this comm...",i 'm publishing betas of some stuff i 've been...,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.040456223300000003, 0.022386650100000002, 0...",[hey y all i have been a lurked in this commun...,"[[hey, y, all, i, have, been, a, lurked, in, t...","[[hey, lurked, community, sons, time, contribu...","[[hey, lurked, community, son, time, contribut..."


In [64]:
# Load the validation split; the file is read as JSON Lines (one record per line).
val = pd.read_json('/content/val_lemmatized.json', orient="records", lines=True)
# Keep a handle on the lemmatized sentences of the validation documents.
val_lemmatized = val['document_lemmatized']
# Preview the first row to check the columns.
val.head(1)

Unnamed: 0,id,document,summary,ext_labels,rg_labels,document_normalized,document_tokenized,document_stopwords,document_lemmatized
0,train-TLDR_RS_2019-02-45730.json,[so i just had the thought of what if people c...,name a new eve every 2 - 5 generations so that...,"[0, 0, 1, 0, 0]","[0.14523170130000002, 0.0821840568, 0.47712410...",[so i just had the thought of what if people c...,"[[so, i, just, had, the, thought, of, what, if...","[[thought, people, could, name, children, even...","[[thought, people, could, name, child, even, c..."


In [63]:
# Load the test split; the file is read as JSON Lines (one record per line).
# NOTE(review): the preview recorded for this cell shows the same first row as the
# validation preview (an id starting with "train-") -- confirm this file really is the test split.
test = pd.read_json('/content/test_lemmatized.json', orient="records", lines=True)
# Keep a handle on the lemmatized sentences of the test documents.
test_lemmatized = test['document_lemmatized']
# Preview the first row to check the columns.
test.head(1)

Unnamed: 0,id,document,summary,ext_labels,rg_labels,document_normalized,document_tokenized,document_stopwords,document_lemmatized
0,train-TLDR_RS_2019-02-45730.json,[so i just had the thought of what if people c...,name a new eve every 2 - 5 generations so that...,"[0, 0, 1, 0, 0]","[0.14523170130000002, 0.0821840568, 0.47712410...",[so i just had the thought of what if people c...,"[[so, i, just, had, the, thought, of, what, if...","[[thought, people, could, name, children, even...","[[thought, people, could, name, child, even, c..."


# POS tagging

## Library import

In [49]:
import nltk
# Fetch the NLTK resources used for POS tagging (each call is skipped when the package is already up to date).
# Presumably the perceptron model is what nltk.tag.pos_tag_sents loads -- TODO confirm.
nltk.download('averaged_perceptron_tagger')
# NOTE(review): 'tagsets' is not referenced by any code visible in this notebook section -- confirm it is still needed.
nltk.download('tagsets')
# Presumably required because POS_tagging defaults to tagset='universal' -- TODO confirm.
nltk.download('universal_tagset')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package tagsets to /root/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.


True

`extract_tags` serve per estrarre il solo tag da ciascuna coppia (word, tag) restituita da *nltk.tag.pos_tag_sents*.

In [128]:
def extract_tags(document_tags: list):
  """Strip the words from a tagged document, keeping only the tags.

  Args:
    document_tags: one entry per sentence, each entry being a sequence of
      (word, tag) tuples.

  Returns:
    A 2-D numpy object array with one row per sentence and as many columns
    as the longest sentence. Cell [i, j] holds the tag of word j of
    sentence i; positions past the end of a shorter sentence (and any
    element that is not a tuple) hold the empty string ''. An empty
    document yields an array of shape (0, 0).
  """
  sentences = [list(sentence) for sentence in document_tags]
  # Width of the padded matrix; default=0 keeps an empty document from raising.
  width = max((len(sentence) for sentence in sentences), default=0)

  # Pre-fill with the padding value so only real tags need to be written.
  tags = np.full((len(sentences), width), '', dtype=object)
  for row, sentence in enumerate(sentences):
    for col, word_tag in enumerate(sentence):
      if isinstance(word_tag, tuple):
        tags[row, col] = word_tag[1]
  return tags

Lavora direttamente su una lista di frasi (ovvero il documento); in questo modo è più efficiente.

In [130]:
def POS_tagging(document: list, tagset:str = 'universal', lang:str='eng'):
  """Tag a whole document in one call and return only the tags.

  Args:
    document: the sentences of the document, passed as-is to
      nltk.tag.pos_tag_sents.
    tagset: tagset name forwarded to nltk.tag.pos_tag_sents.
    lang: language code forwarded to nltk.tag.pos_tag_sents.

  Returns:
    Whatever extract_tags returns for the tagged sentences.
  """
  # All sentences go to the tagger together instead of one call per sentence.
  tagged_sentences = nltk.tag.pos_tag_sents(document, tagset=tagset, lang=lang)
  return extract_tags(tagged_sentences)

In [132]:
# Tag every training document: each cell of the new 'POS_tag' column holds the result of POS_tagging for that document.
train['POS_tag'] = train_lemmatized.apply(POS_tagging)

  doc_tags = doc_tags.apply(lambda subList: pd.Series(subList))


# Text Summarization

## Library import

In [233]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

## Features definition


In [341]:
def sentence_relative_positions(sentence: list, document: list):
  """Return where a sentence sits in its document, scaled to [0, 1].

  Args:
    sentence: the sentence to locate (compared with == against each
      sentence of the document).
    document: the sentences of the document (list or pandas Series).

  Returns:
    0.0 for the first sentence, 1.0 for the last, linearly spaced in
    between. A document with a single sentence yields 0.0. When the same
    sentence occurs more than once, the first occurrence is used.

  Raises:
    IndexError: if the sentence does not occur in the document.
  """
  sentences = list(document)
  for position, candidate in enumerate(sentences):
    if candidate == sentence:
      break
  else:
    raise IndexError('sentence not found in document')

  # A one-sentence document has no span to normalize over; avoid dividing by zero.
  if len(sentences) < 2:
    return 0.0
  return position/(len(sentences)-1)

In [348]:
def word_in_sentence_relative(sentence: list, document:list):
  """Return the share of the document's words that belong to this sentence.

  Args:
    sentence: the words of the sentence.
    document: the sentences of the document (list or pandas Series), each
      a sequence of words.

  Returns:
    len(sentence) divided by the total number of words in the document,
    or 0.0 when the document contains no words.
  """
  # Count words sentence by sentence: sentences have different lengths, so
  # the document cannot be flattened through a rectangular numpy array.
  total_words = sum(len(doc_sentence) for doc_sentence in document)
  if total_words == 0:
    return 0.0
  return len(sentence)/total_words

In [370]:
def POS_tag_ratio(sentence: list, tag_name: str, document: list, document_tags: list):
  """Return the fraction of a sentence's words that carry a given POS tag.

  Args:
    sentence: the words of the sentence.
    tag_name: the tag to count (e.g. 'NOUN').
    document: the sentences of the document (list or pandas Series); used
      to find the position of `sentence`.
    document_tags: per-sentence tags, indexable by sentence position
      (e.g. the array returned by extract_tags, or a list of tag lists).

  Returns:
    Number of tags equal to `tag_name` divided by len(sentence), or 0 for
    an empty sentence. When the same sentence occurs more than once in the
    document, the tags of its first occurrence are used.

  Raises:
    IndexError: if a non-empty sentence does not occur in the document.
  """
  sentence_length = len(sentence)
  if sentence_length == 0:
    return 0

  # Plain iteration works for both lists and Series, unlike document.apply.
  for sentence_index, candidate in enumerate(document):
    if candidate == sentence:
      break
  else:
    raise IndexError('sentence not found in document')

  tags = document_tags[sentence_index]
  # Element-wise count so list-based tags work as well as numpy rows;
  # padding entries ('') never match a real tag name.
  matches = sum(1 for tag in tags if tag == tag_name)
  return matches/sentence_length

In [264]:
def TF_ISF_and_Sentence_similarity_score_n_gram(document: list, n: int=1):
  """Score every sentence of a document by TF-ISF weight and by similarity.

  Each sentence is joined into a space-separated string and the sentences
  are vectorized with TfidfVectorizer restricted to n-grams of size `n`,
  i.e. the sentences play the role of the "documents" of the TF-IDF model.

  Args:
    document: the sentences of the document, each a sequence of words.
    n: n-gram size used for both bounds of ngram_range.

  Returns:
    A tuple (tf_isf_relative, sentence_similarity_relative):
      - tf_isf_relative: per-sentence sum of TF-IDF weights divided by the
        largest such sum in the document;
      - sentence_similarity_relative: per-sentence sum of one row of the
        pairwise cosine-similarity matrix divided by the number of sentences.
  """
  sentence_texts = [' '.join(words) for words in document]

  tfidf = TfidfVectorizer(ngram_range=(n,n)).fit_transform(sentence_texts)

  # Total TF-IDF weight per sentence, scaled by the best-scoring sentence.
  weight_per_sentence = tfidf.sum(axis=1)
  tf_isf_relative = weight_per_sentence/weight_per_sentence.max()

  # Row sums of the pairwise similarity matrix, averaged over the sentence count.
  similarity_per_sentence = np.array([row.sum() for row in cosine_similarity(tfidf)])
  sentence_similarity_relative = similarity_per_sentence/len(sentence_texts)

  return (tf_isf_relative, sentence_similarity_relative)

## Matrix generation

In [372]:
def sentence_feature_matrix(document_data: pd.Series):
  """Compute the per-sentence features of one document.

  Args:
    document_data: a two-element record whose first element is the list
      of sentences of the document and whose second element is the
      per-sentence tag structure passed to POS_tag_ratio as
      `document_tags`. A pandas Series (e.g. one DataFrame row) is read by
      position; any other sequence is indexed with [0] and [1].

  Returns:
    A dict mapping feature name to a pandas Series with one value per
    sentence. Keys: 'sentence_relative_positions',
    'word_in_sentence_relative' and one 'POS_tag_ratio_<TAG>' entry for
    each of NOUN, VERB and ADJ.
  """
  # Read a Series by position explicitly: integer keys on a label-indexed
  # Series rely on a deprecated positional fallback.
  if isinstance(document_data, pd.Series):
    sentences, doc_tag = document_data.iloc[0], document_data.iloc[1]
  else:
    sentences, doc_tag = document_data[0], document_data[1]
  doc = pd.Series(sentences)

  s = dict()

  s['sentence_relative_positions'] = doc.apply(sentence_relative_positions, document = doc)
  s['word_in_sentence_relative'] = doc.apply(word_in_sentence_relative, document = doc)

  # One ratio per tag, each under its own key so no tag overwrites another.
  for tag_name in ['NOUN','VERB','ADJ']:
    s['POS_tag_ratio_' + tag_name] = doc.apply(POS_tag_ratio, tag_name = tag_name, document = doc, document_tags = doc_tag)

  return s

In [None]:
# Try the feature builder on the first training document: the selected row carries its lemmatized sentences and its POS tags, in that order.
sentence_feature_matrix(train[['document_lemmatized','POS_tag']].iloc[0])