In [None]:
# Download the large English spaCy pipeline; it is loaded further down with spacy.load('en_core_web_lg').
!python -m spacy download en_core_web_lg

Collecting en_core_web_lg==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.2.5/en_core_web_lg-2.2.5.tar.gz (827.9 MB)
[K     |████████████████████████████████| 827.9 MB 1.2 MB/s 
Building wheels for collected packages: en-core-web-lg
  Building wheel for en-core-web-lg (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-web-lg: filename=en_core_web_lg-2.2.5-py3-none-any.whl size=829180942 sha256=c8249dfe7441de6f3859e6231c1441c652f5cac29072ab2483f44bd51932c6ce
  Stored in directory: /tmp/pip-ephem-wheel-cache-42olfrar/wheels/11/95/ba/2c36cc368c0bd339b44a791c2c1881a1fb714b78c29a4cb8f5
Successfully built en-core-web-lg
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np


In [None]:
!pip install -q transformers  rouge-score sentence-transformers

[K     |████████████████████████████████| 2.6 MB 4.1 MB/s 
[K     |████████████████████████████████| 85 kB 5.6 MB/s 
[K     |████████████████████████████████| 636 kB 57.4 MB/s 
[K     |████████████████████████████████| 3.3 MB 53.4 MB/s 
[K     |████████████████████████████████| 895 kB 64.3 MB/s 
[K     |████████████████████████████████| 1.2 MB 57.4 MB/s 
[?25h  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone


In [None]:
import tensorflow_datasets as tfds 
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import spacy
from tqdm.notebook import tqdm

import tensorflow_hub as hub
from tensorflow import keras 
import tensorflow as tf
from IPython.display import Image 
import matplotlib.pyplot as plt

import os 

# spaCy pipeline; used below to split articles into sentences.
nlp = spacy.load("en_core_web_lg")

# Local scratch folder for generated artifacts.
os.makedirs("data", exist_ok=True)

# Project folder, relative to the working directory (note the trailing slash).
sum_dir = "drive/MyDrive/mlexperiments/extractivesummarization/"
cnn_df = pd.read_json(f"{sum_dir}data/test/test.json")

In [None]:
# from sentence_transformers import SentenceTransformer
# model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

## 

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch

# Load the tokenizer and the encoder for the same pretrained checkpoint.
_CHECKPOINT = "sentence-transformers/paraphrase-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(_CHECKPOINT)
sentence_model = AutoModel.from_pretrained(_CHECKPOINT)



def mean_pooling(model_output, attention_mask):
    """Average token embeddings per sequence, ignoring masked-out positions.

    Args:
        model_output: Indexable whose first element holds the token embeddings,
            shaped (batch, tokens, dim).
        attention_mask: (batch, tokens) tensor; positions with 0 are excluded.

    Returns:
        Tensor of shape (batch, dim) with the masked mean of each sequence.
    """
    hidden_states = model_output[0]
    # Broadcast the mask over the embedding dimension so it can weight every feature.
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_total = (hidden_states * mask).sum(1)
    # Clamp the token count so a fully masked row does not divide by zero.
    token_counts = mask.sum(1).clamp(min=1e-9)
    return masked_total / token_counts


def get_sentence_embedding(sentences):
    """Embed texts with the module-level ``tokenizer`` and ``sentence_model``.

    Args:
        sentences: Text(s) handed to ``tokenizer``; callers in this notebook
            pass a list of strings.

    Returns:
        The mean-pooled embeddings from ``mean_pooling``, one row per input text.
    """
    batch = tokenizer(
        sentences,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors='pt',
    )
    # Inference only: no autograd graph is recorded for the encoder pass.
    with torch.no_grad():
        encoder_out = sentence_model(**batch)
    return mean_pooling(encoder_out, batch['attention_mask'])

# Example texts to embed: two short sentences and one long multi-sentence passage.
sentences = ['This framework generates embeddings for each input sentence',
             'Sentences are passed as a list of string.',
             'Twitter is rolling out changes to its newly rebuilt API that will allow third-party developers to build tools and other solutions specifically for its audio chatroom product, Twitter Spaces. The company today announced it’s shipping new endpoints to support Spaces on the Twitter API v2, with the initial focus on enabling discovery of live or scheduled Spaces. This may later be followed by an API update that will make it possible for developers to build out more tools for Spaces’ hosts.With the current API update, Twitter hopes developers will build new products that enable users — both on and off Twitter — to find Twitter Spaces more easily, the company says. This could potentially broaden the reach of Spaces and introduce its audio chats to more people, which could give Twitter a leg up in the increasingly competitive landscape for audio-based social networking. Today, Twitter Spaces isn’t only taking on Clubhouse, but also the audio chat experiences being offered by Facebook, Discord, Reddit, Public.com, Spotify and smaller social apps.']




Downloading:   0%|          | 0.00/516 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

In [None]:
%timeit
semmb = get_sentence_embedding(sentences)
semmb.shape

torch.Size([3, 384])

## Build a Classification Head Model 



## Create Data Pipeline

In [None]:
from rouge_score import rouge_scorer 
# Shared ROUGE scorer (rouge1 and rougeL, with stemming enabled) used by get_rougue_score below.
scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)

def get_rougue_score(text, highlights, metric="rougeL"):
  """Return the best ROUGE F-measure between ``text`` and any highlight.

  Args:
    text: Candidate sentence.
    highlights: Iterable of highlight strings to compare against.
    metric: Key of the score to read from the scorer result.

  Returns:
    The largest F-measure found, or 0 when nothing scores above zero.
  """
  f_scores = (scorer.score(text, h_text)[metric].fmeasure for h_text in highlights)
  # Seed with 0 so an empty ``highlights`` (or all-zero scores) yields 0.
  return max([0, *f_scores])


def get_label(sent, doc_dict, score_threshold=0.85):
  """Build one ``(sentence, article, label)`` example for a sentence record.

  Args:
    sent: Dict with ``"docid"`` and ``"text"`` keys.
    doc_dict: Mapping of doc id to ``{"article": ..., "highlight": ...}``.
    score_threshold: ROUGE score at or above which the label is 1.

  Returns:
    Tuple ``(sentence, article, label)`` where label is 0 or 1.
  """
  doc_id, sentence = sent["docid"], sent["text"]
  doc_entry = doc_dict[doc_id]
  # Highlights are stored as a single newline-separated string.
  highlights = doc_entry["highlight"].split("\n")
  # Reuse the shared thresholding helper instead of duplicating it here.
  label = get_label_only(sentence, highlights, score_threshold)
  return (sentence, doc_entry["article"], label)
  

def get_label_only(sentence, highlights, score_threshold=0.85):
  """Return the binary label of ``sentence`` against ``highlights``.

  Args:
    sentence: Candidate sentence.
    highlights: Iterable of highlight strings.
    score_threshold: ROUGE score at or above which the label is 1.

  Returns:
    0 when the best ROUGE score is strictly below the threshold, otherwise 1.
  """
  best_score = get_rougue_score(sentence, highlights)
  below_threshold = best_score < score_threshold
  return int(not below_threshold)
  

In [None]:
def get_dicts(df, folder="test", min_sent_length=14):
  """Index the articles of ``df`` and split them into sentence records.

  Args:
    df: DataFrame with ``article`` and ``highlights`` columns.
    folder: Unused; kept so existing calls remain valid.
    min_sent_length: A sentence is kept only if it has more than this many
      spaCy tokens.

  Returns:
    Tuple ``(doc_dict, sents_list)``:
      * ``doc_dict`` maps each index label of ``df`` to
        ``{"article": ..., "highlight": ...}``.
      * ``sents_list`` is a list of ``{"sentid", "docid", "text"}`` dicts with
        ``sentid`` counting up from 0 across all documents.
  """
  # Iterate the columns positionally so duplicate index labels cannot turn a
  # lookup into a Series.
  doc_dict = {
      doc_id: {"article": article, "highlight": highlight}
      for doc_id, article, highlight in zip(df.index, df.article, df.highlights)
  }

  sents_list = []
  for doc_id, entry in tqdm(doc_dict.items()):
    for sent in nlp(entry["article"]).sents:
      # len() of a spaCy span is its token count.
      if len(sent) > min_sent_length:
        sents_list.append(
            {"sentid": len(sents_list), "docid": doc_id, "text": str(sent)})

  return doc_dict, sents_list

  

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows of cnn_df; the fixed random_state keeps the split reproducible.
train_df, test_df = train_test_split( cnn_df, test_size=0.2, random_state=42)
print(train_df.shape, test_df.shape)

# Build the document lookup and sentence records for each partition (test first, then train).
test_doc_dict, test_sents_list = get_dicts(test_df)
train_doc_dict, train_sents_list = get_dicts(train_df)

(9192, 2) (2298, 2)


  0%|          | 0/2298 [00:00<?, ?it/s]

  0%|          | 0/9192 [00:00<?, ?it/s]

In [None]:

def sub_sample(sents_batch, doc_dict, random_state=42):
  """Label every sentence and return a class-balanced DataFrame.

  Args:
    sents_batch: Iterable of sentence dicts accepted by ``get_label``.
    doc_dict: Document lookup passed through to ``get_label``.
    random_state: Seed for the down-sampling so reruns select the same rows.

  Returns:
    DataFrame with columns ``sents``, ``docs`` and ``y`` containing the same
    number of positive (``y == 1``) and negative (``y == 0``) rows, positives
    first. Empty when either class has no rows.
  """
  vals = [get_label(x, doc_dict) for x in sents_batch]

  sub_df = pd.DataFrame({
      "sents": [row[0] for row in vals],
      "docs": [row[1] for row in vals],
      "y": [row[2] for row in vals],
  })
  pos_df = sub_df[sub_df.y == 1]
  neg_df = sub_df[sub_df.y == 0]

  # Balance on the smaller class so sampling can never ask for more rows than exist.
  n_per_class = min(len(pos_df), len(neg_df))
  if n_per_class == 0:
    return sub_df.iloc[0:0]
  if len(pos_df) > n_per_class:
    pos_df = pos_df.sample(n_per_class, random_state=random_state)
  sub_neg_df = neg_df.sample(n_per_class, random_state=random_state)

  # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
  return pd.concat([pos_df, sub_neg_df])




In [None]:
# Build the class-balanced train/test frames from the sentence records.
train_bdf = sub_sample(train_sents_list, train_doc_dict)
test_bdf = sub_sample(test_sents_list, test_doc_dict)

# Write both frames as JSON into the local data/ folder.
train_bdf.to_json("data/train_bdf.json")
test_bdf.to_json("data/test_bdf.json")

# Copy the data/ folder to the project directory held in sum_dir.
!gsutil cp -r data $sum_dir

In [None]:
def get_batched_vals(sents_batch):
    """Label a batch of sentence dicts against the global ``train_doc_dict``.

    Args:
        sents_batch: Iterable of sentence dicts accepted by ``get_label``.

    Returns:
        Three parallel lists ``(sents, docs, y)`` taken from the first, second
        and third element of each ``get_label`` result.
    """
    labelled = [get_label(item, train_doc_dict) for item in sents_batch]
    sents = [row[0] for row in labelled]
    docs = [row[1] for row in labelled]
    y = [row[2] for row in labelled]
    return sents, docs, y

## Embed a Sample Batch (Sanity Check)

In [None]:
# Take the first 64 rows of the balanced training frame as a sample batch.
batch_start, batch_end = 0, 64
batch = train_bdf.iloc[batch_start:batch_end]
sentences, docs, y = list(batch.sents), list(batch.docs), list(batch.y)

# NOTE: `sentences` and `docs` are rebound here from lists of strings to embedding tensors.
sentences = get_sentence_embedding(sentences)
docs = get_sentence_embedding(docs)
# Labels become a float column vector (one row per example).
y = torch.reshape(torch.FloatTensor(y), (-1,1))

In [None]:
train_bdf.shape

In [None]:
train_bdf

In [None]:
class Net(nn.Module):
    """Binary classifier over a pair of feature vectors.

    The two inputs and their element-wise product are concatenated and passed
    through a small MLP that ends in a sigmoid, giving one value in (0, 1)
    per row.
    """

    def __init__(self, in_features=384):
        """Create the classification head.

        Args:
            in_features: Width of each of the two input feature vectors.
        """
        super().__init__()

        # The head sees [doc, sentence, doc * sentence], hence 3 * in_features inputs.
        head_layers = [
            nn.Dropout(p=0.5),
            nn.Linear(3 * in_features, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(p=0.5),
            nn.Linear(512, 64),
            nn.BatchNorm1d(64),
            nn.Sigmoid(),
            nn.Dropout(p=0.5),
            nn.Linear(64, 1),
            nn.Sigmoid(),
        ]
        self.cls_head = nn.Sequential(*head_layers)

    def forward(self, doc_feats, sentence_feats):
        """Score each (document, sentence) pair.

        Args:
            doc_feats: Tensor of shape (batch, in_features).
            sentence_feats: Tensor of shape (batch, in_features).

        Returns:
            Tensor of shape (batch, 1) holding the sigmoid output of the head.
        """
        # Element-wise product of the two feature vectors, used as an interaction term.
        interaction = doc_feats * sentence_feats
        fused = torch.cat((doc_feats, sentence_feats, interaction), dim=1)
        return self.cls_head(fused)

## Train a model

In [None]:
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter


model = Net()

# Running statistics; they are reset at the start of every epoch below.
losses = []
correct = 0
total = 0


# Set device to CUDA if a CUDA device is available, else CPU. Copy model to selected device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

print("Cuda is available?", torch.cuda.is_available())
learning_rate = 1e-5
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
# Net already ends in a sigmoid, so binary cross-entropy on probabilities is used.
criterion = torch.nn.BCELoss()


writer = SummaryWriter(os.path.join("", "summary"))

batch_size = 128
n_epochs = 1
# Ceiling division: no trailing empty batch when the row count is a multiple of batch_size.
n_batches = (len(train_bdf) + batch_size - 1) // batch_size

for epoch in range(n_epochs):
  print("Epoch ... [{} / {}]".format(epoch + 1, n_epochs))
  model.train()

  losses = []
  correct = 0
  total = 0

  # train_bdf is built as all positives followed by all negatives, so slicing it
  # in order would give single-class batches. Shuffle once per epoch instead.
  epoch_df = train_bdf.sample(frac=1, random_state=epoch)

  progress = tqdm(range(n_batches))
  for i in progress:
    batch_start = i*batch_size
    batch_end = (i+1)*batch_size

    batch = epoch_df.iloc[batch_start:batch_end]
    # BatchNorm1d cannot normalise a single-row batch in training mode, so
    # skip batches with fewer than two rows (this also covers an empty slice).
    if len(batch) < 2:
      continue
    sentences, docs, y = list(batch.sents), list(batch.docs), list(batch.y)

    # get embeddings for docs and sentences
    sentences = get_sentence_embedding(sentences)
    docs = get_sentence_embedding(docs)
    y = torch.reshape(torch.FloatTensor(y), (-1,1))

    sentences, docs, y = map(lambda x: x.to(device), [sentences, docs, y])

    optimizer.zero_grad()

    # Net.forward expects (doc_feats, sentence_feats), in that order.
    prob = model(docs, sentences)
    loss = criterion(prob, y)

    loss.backward()
    optimizer.step()

    losses.append(loss.item())
    batch_correct = torch.count_nonzero(y == (prob > 0.5)).item()
    correct += batch_correct
    total += len(y)
    # Divide by the real batch length so the last, shorter batch is reported correctly.
    progress.set_postfix(loss=loss.item(), acc=round(batch_correct / len(y), 4))

  if total == 0:
    # Nothing was trained on (e.g. empty train_bdf); avoid dividing by zero.
    print("\tTraining: no batches were processed")
    continue

  epoch_loss = sum(losses) / len(losses)
  epoch_acc = correct / total
  writer.add_scalar('train_loss', epoch_loss, epoch)
  writer.add_scalar('train_acc', epoch_acc, epoch)

  print("\tTraining: Loss={:.2f}\t Accuracy={:.2f}\t".format(epoch_loss, epoch_acc))

# Push any buffered scalars to the event file.
writer.flush()
        

In [None]:
import os 
# Local folder for saved model files.
os.makedirs("models", exist_ok=True)

# Saves the whole module object (not only its weights).
torch.save(model, "models/bal_exsum.pth")

# Copy the models/ folder to the project directory held in sum_dir.
!gsutil cp -r models $sum_dir

In [None]:
# Additionally save only the state_dict under a separate file name.
torch.save(model.state_dict(), "models/bal_exsum_dict.pth")

# Copy models/ again so the new file is included.
!gsutil cp -r models $sum_dir

In [None]:
!ls -lh models