# THE OFFICIAL COLAB NOTEBOOK OF TEAM ABC FOR AUTOMIN @ INTERSPEECH 2021 

In [None]:
# MOUNT DRIVE, SET CUDA DEVICE, NECESSARY INSTALLATIONS ...

from google.colab import drive
drive.mount('/content/drive')

import torch

# Pick the compute device: the CUDA GPU when one is visible, otherwise the CPU.
if torch.cuda.is_available():

    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")

    # Make GPU 0 the current CUDA device. This call is only valid when CUDA is
    # usable, so it has to stay inside this branch.
    torch.cuda.set_device(0)

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name())

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

!pip install sentencepiece
!pip install transformers

from transformers import pipeline
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
import nltk
nltk.download('punkt')
import numpy as np

# Base checkpoint; gen_tscs() uses this tokenizer to measure chunk lengths in tokens.
model_checkpoint = "facebook/bart-large-xsum"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

# Summarization pipeline loaded from the checkpoint directory on Drive.
# `device` is a CUDA ordinal, or -1 to run on the CPU when no GPU is available.
summarizer1 = pipeline(
    "summarization",
    model="/content/drive/MyDrive/AutoMin-2021/bart_large_xsum_samsum/checkpoint",
    device=0 if torch.cuda.is_available() else -1,
)

from IPython.display import clear_output
clear_output()

In [None]:
### CUSTOMIZED STRIP, REPLACE AND PREPROCESS FUNCTIONS ###

def stripp(string):
    """Trim ``string`` and drop everything before its first letter.

    The input is first stripped of surrounding whitespace; any leading
    characters that are not alphabetic are then removed.

    Returns:
        The remaining text, starting at the first alphabetic character, or
        ``None`` when the input contains no alphabetic character at all.
    """
    string = string.strip()
    for pos, char in enumerate(string):
        if char.isalpha():
            return string[pos:]
    return None

# Literal (old, new) substitutions applied by preprocess(), in this exact order.
# Some pairs repeat on purpose: removing one filler can bring the two halves of
# another one together, so the sequence is kept as originally written.
_PREPROCESS_REPLACEMENTS = (
  (" '", "'"),
  (" ,", ","),
  (" .", "."),
  (" ?", "?"),
  ("Ehmm", ""),
  (" Ehm", ""),
  (" mmm", ""),
  (" hmm", ""),
  (" uh", ""),
  (" uh ,", ""),
  (" uh .", ""),
  (" um", ""),
  (" um ,", ""),
  (" um .", ""),
  (" Uh", ""),
  (" Uh ,", ""),
  (" Uh .", ""),
  (" Um", ""),
  (" Um ,", ""),
  (" Um .", ""),
  ("Uh", ""),
  ("Um", ""),
  ("Yeah", ""),
  (" yeah", ""),
  ("Ehm, ", ""),
  ("Hmm, ", ""),
  ("Ehm. ", ""),
  ("Hmm. ", ""),
  ("Yeah", ""),
  (" yeah", ""),
  ("Ehm", ""),
  ("Hmm", ""),
  ("Ehm", ""),
  ("Hmm", ""),
  ("Mhm", ""),
  (" {disfmarker}", ""),
  (" {vocalsound}", ""),
  (" {gap}", ""),
  ("...", "."),
  ("..", "."),
  (",,", ","),
  (",,", ","),
  (",.", ""),
  (".,", "."),
  ("  ", " "),
  ("(", ""),
  (")", ""),
  ("Person", "PERSON"),
  ("is going to", "will"),
  ("are going to", "will"),
  ("are discussing", "discussed"),
)


def preprocess(ctx):
  """Clean a transcript or summary string with a fixed list of substitutions.

  Tightens spacing before punctuation, removes filler words and bracketed
  markers, collapses repeated punctuation, upper-cases "Person" and rewrites
  a few present-tense phrases ("is going to" -> "will", "is working" ->
  "worked", "discuss" -> "discussed").

  Args:
    ctx: text to clean.

  Returns:
    The cleaned text.
  """
  import re  # local import: this cell runs before the notebook-level `import re`

  for old, new in _PREPROCESS_REPLACEMENTS:
    ctx = ctx.replace(old, new)

  # Only the bare word "discuss" becomes past tense. A plain substring replace
  # also hit text that already read "discussed" and produced "discusseded".
  ctx = re.sub(r"\bdiscuss\b", "discussed", ctx)
  ctx = ctx.replace("are working", "worked")
  ctx = ctx.replace("is working", "worked")

  return ctx

def replacee(i):
  """Normalise tokenised contractions and possessives in a sentence.

  "do n't" becomes "do not", any other "n't" becomes "not", "wo n't" becomes
  "won't", "it 's" becomes "it is", remaining " 's" suffixes are removed,
  " and" is replaced by "," and doubled commas are collapsed.

  Args:
    i: sentence text; may be empty.

  Returns:
    The rewritten sentence.
  """
  import re  # local import: this cell runs before the notebook-level `import re`

  # Join "wo n't" first. Previously the generic "n't" rule ran before this one
  # and turned it into "wo not", so this rule never fired.
  i = i.replace("wo n't", "won't")
  i = i.replace("do n't", "do not")
  # Expand every remaining "n't" except the one inside "won't".
  i = re.sub(r"(?<!wo)n't", "not", i)
  i = i.replace("it 's", "it is")
  i = i.replace(" 's", "")
  # startswith() is safe on inputs shorter than two characters, unlike i[0]+i[1].
  if i.startswith("'s"):
    i = i.replace("'s ", "")
  i = i.replace(" and", ",")
  i = i.replace(",,", ",")
  return i

### USEFUL UTIL FUNCTIONS FOR GENERATION AND FORMATTING ###

def summarize(tsc):
  """Run ``summarizer1`` on ``tsc`` and return the first result's 'summary_text'."""
  first_result = summarizer1(tsc)[0]
  return first_result['summary_text']

def gen_tscs(transcript_id, length):
  """Split one transcript into text chunks bounded by a token budget.

  Looks up ``transcript_id`` in the notebook-level dict ``t``. Every utterance
  is cleaned with preprocess() and stripp(); very short ones are dropped. Each
  kept utterance becomes a "ROLE: text\\n" line that is appended to the current
  chunk, and a new chunk is opened whenever ``tokenizer.encode`` of
  chunk + line reaches ``length`` tokens.

  Args:
    transcript_id: key of the transcript in ``t``.
    length: token count at which a chunk (or a single line) counts as too long.

  Returns:
    (tscs_preprocessed, attendees): a dict mapping the transcript id to its
    list of chunk strings, and a list holding one list of unique roles per
    matched transcript.
  """
  tscs_preprocessed = {}
  attendees = []

  for key, entry in t.items():
    # IF ALL THE TRANSCRIPTS ARE NEEDED, REMOVE THIS FILTER.
    if key != transcript_id:
      continue

    roles = entry['roles']
    attendees.append(list(set(roles)))
    chunks = ['']
    current = 0

    for role, utterance in zip(roles, entry['utterances']):
      utterance = preprocess(utterance)

      # Word count on an alphanumeric-only view of the utterance.
      words = re.sub(r"[^a-zA-Z0-9]+", ' ', utterance).split(' ')
      word_count = len(words)
      if word_count <= 4:
        continue
      if word_count < 7 and 's' in words:
        continue

      utterance = stripp(utterance)
      if utterance is None or len(utterance) == 1:
        continue

      line = role + ': ' + utterance + '\n'

      if len(tokenizer.encode(line)) >= length:
        # A SINGLE LINE LONGER THAN "length": CUT IT IN TWO AT A SENTENCE BOUNDARY.
        # NOTE(review): when the line contains no '.', `half` is 0, so the first
        # piece is just '.\n' and the second one repeats the role prefix.
        parts = line.split('.')
        half = len(parts) // 2
        pieces = [
            '. '.join(parts[:half]) + '.\n',
            role + ': ' + '. '.join(parts[half:]),
        ]
      else:
        pieces = [line]

      for piece in pieces:
        # Open a new chunk when adding this piece would reach the budget.
        if len(tokenizer.encode(chunks[current] + piece)) >= length:
          current += 1
          chunks.append('')
        chunks[current] += piece

    tscs_preprocessed.update({key: chunks})

  return tscs_preprocessed, attendees

def format_summary(s2):
  """Turn generated summary sections into a bulleted minute, with pronouns.

  The sections in ``s2`` are concatenated and split on '.'. Each fragment is
  cleaned with stripp() and preprocess(); a fragment is dropped when it is
  empty or a single character, still starts with a space, or yields 6 or fewer
  tokens in the word check. Fragments starting with "PE" get a '-' bullet. The
  list is passed through insert_pronouns(), then every bulleted sentence opens
  a new group and the other sentences are indented under the current group.

  Args:
    s2: iterable of summary strings (one element of gen_summary()'s first
      return value).

  Returns:
    The formatted summary as one newline-joined string.
  """
  sentences = []
  for i in ''.join(s2).split('.'):
    # stripping the spaces
    i = i.replace('  ', ' ')
    # Empty fragments (e.g. from "..") used to reach i[0] and raise IndexError.
    if len(i) <= 1:
      continue
    if i[0] == ' ' and i[1].isalpha():
      i = stripp(i)
    if i is None:
      continue
    if i[0] == ' ':
      continue
    i = preprocess(i)

    # Letters-only word check: short fragments are dropped.
    check = re.sub(r"[^a-zA-Z0-9]+", ' ', i)
    check = ''.join(ch for ch in check if not ch.isdigit())
    check = check.replace('  ', ' ').split(' ')
    if len(check) <= 6:
      continue

    # formatting
    if i.startswith('PE'):
      sentences.append('-' + i + '.')
    else:
      sentences.append(i + '.')

  sentences = insert_pronouns(sentences)

  groups = ['']
  for sentence in sentences:
    if sentence[1:3] == 'PE':
      # A "-PE..." bullet starts a new group.
      groups.append(' ' + sentence)
    else:
      groups[-1] += '\n  ' + sentence

  if '' in groups:
    groups.remove('')
  return '\n'.join(groups)

def insert_pronouns(summ1):
  """Replace a repeated leading subject in follow-up bullets with "They".

  For every line containing '-', up to the next three lines are compared with
  it through check_req(). When check_req() returns a shared prefix, that
  prefix is replaced by 'They' in the later line and a few verb forms are
  adjusted to agree with it. ``summ1`` is modified in place and returned.
  """
  total = len(summ1)
  agreement_fixes = (
      ("They's", 'Their'),
      ("They is", 'They are'),
      ("They has", 'They have'),
      ("They wants", 'They want'),
  )

  for line_no, line in enumerate(summ1):
    if '-' not in line:
      continue

    # Look at most three lines ahead, fewer near the end of the list.
    lookahead = min(3, total - line_no - 1)
    for target in range(line_no + 1, line_no + 1 + lookahead):
      shared, _ = check_req(line, summ1[target])
      if not shared:
        continue
      rewritten = summ1[target].replace(shared, 'They')
      for old, new in agreement_fixes:
        rewritten = rewritten.replace(old, new)
      summ1[target] = rewritten

  return summ1

def check_req(line1, line2):
  """Check whether two bulleted lines open with the same subject token.

  Both lines must contain '-' and share their first 8 characters (presumably
  sized for "-PERSON" plus one digit — TODO confirm). The 9th character then
  decides the outcome:

  * both ' '                  -> the 8-character prefixes are returned
  * both ','                  -> no match
  * both the same other char  -> the 9-character prefixes are returned
  * different, one is "'"     -> the 8-character prefixes are returned
  * anything else             -> no match

  Lines too short to have a 9th character count as no match (previously they
  raised IndexError).

  Returns:
    (prefix1, prefix2) on a match, otherwise (False, False).
  """
  prefix_len = 8
  no_match = (False, False)

  if '-' not in line1 or '-' not in line2:
    return no_match
  if len(line1) <= prefix_len or len(line2) <= prefix_len:
    return no_match

  st1, st2 = line1[:prefix_len], line2[:prefix_len]
  if st1 != st2:
    return no_match

  next1, next2 = line1[prefix_len], line2[prefix_len]
  if next1 == next2:
    if next1 == ' ':
      return st1, st2
    if next1 == ',':
      return no_match
    return st1 + next1, st2 + next2

  if next1 == "'" or next2 == "'":
    return st1, st2
  return no_match

def gen_summary(tscs_preprocessed):
  """Summarize every chunk of every transcript and group the results.

  Args:
    tscs_preprocessed: dict mapping a transcript id to its list of text
      chunks (the first return value of gen_tscs()).

  Returns:
    (s2, filename): ``s2`` holds one list of section strings per transcript,
    each section being chunk summaries joined by spaces; ``filename`` holds
    the ids with 'meeting' replaced by 'minute' and '_transcript' removed.
  """
  s2 = []
  filename = []

  for name, chunks in tscs_preprocessed.items():
    filename.append(name.replace('meeting', 'minute').replace('_transcript', ''))

    # Section size grows with the number of chunks.
    section = 8
    for limit, size in ((11, 2), (18, 4), (24, 6)):
      if len(chunks) < limit:
        section = size
        break

    sections = ['']
    for index, chunk in enumerate(chunks):
      sections[-1] += summarize(chunk) + ' '
      # NOTE(review): index 0 satisfies this test, so the first section always
      # holds exactly one chunk summary — confirm this is intended.
      if index % section == 0:
        sections.append('')

    s2.append(sections)

  return s2, filename

In [None]:
### LOAD THE DATA ... A DICTIONARY :- key : meeting_id(str) ; value : roles(list), utterance(list) ###

import json
import re
# Parse the test-set JSON into `t`. gen_tscs() iterates t.items() and reads
# each value's 'roles' and 'utterances' entries.
with open('/content/drive/MyDrive/AutoMin-2021/test.json', 'r') as out:
  t = json.load(out)

In [None]:
### IF INFERENCING ON A SPECIFIC TRANSCRIPT, INPUT THE MEETING ID... ###
m_id = 'minutes_en_test_001'

### EACH gen_tscs CALL BELOW CHUNKS THE TRANSCRIPT WITH A DIFFERENT TOKEN LIMIT PER CHUNK ###
### UNCOMMENT THE FIRST TWO LINES BEFORE RUNNING THE CELLS THAT USE tscs_preprocessed2 / tscs_preprocessed3 ###
# tscs_preprocessed3, attendees = gen_tscs(m_id, 512) #for longer summary
# tscs_preprocessed2, attendees = gen_tscs(m_id, 768)
tscs_preprocessed1, attendees = gen_tscs(m_id, 1024) #for shorter summary

# Number of transcripts processed, followed by the chunked text itself.
print(len(tscs_preprocessed1))
print(tscs_preprocessed1)

1
{'minutes_en_test_001': ["PERSON5: Hi, how are you? Good morning.\nPERSON13: Good morning. I'm, well, fine. Still at home.\nPERSON13: You know. And you?\nPERSON5: I'm also at home. But, ehm, the Czech republic government, they already lifted the Kind of lifted the rules. So, ehm, from this Monday we can actually go out even if it's not like the necessity. It the meetings of. Up to 10 people are allowed.\nPERSON13: Ha. So here in LOCATION1, we have to wait until the 4th of May, some commercial activity can already be open. But we think that we have to wait until June for the free circulation of people. And fortunately, starting from the 4th of June we are allowed to reach our family. If it's in the same region.\nPERSON13: And so finally, I will reach my parents. Cause we live in two different cities. Because I'm in Trento and my family is in Bolzano. Which is pretty near around 50 kilometres. But, but\nPERSON5: So you're looking forward.\nPERSON13: exactly. Hi guys, good morning.\nPER

In [None]:
tscs_preprocessed1[m_id]

["PERSON5: Hi, how are you? Good morning.\nPERSON13: Good morning. I'm, well, fine. Still at home.\nPERSON13: You know. And you?\nPERSON5: I'm also at home. But, ehm, the Czech republic government, they already lifted the Kind of lifted the rules. So, ehm, from this Monday we can actually go out even if it's not like the necessity. It the meetings of. Up to 10 people are allowed.\nPERSON13: Ha. So here in LOCATION1, we have to wait until the 4th of May, some commercial activity can already be open. But we think that we have to wait until June for the free circulation of people. And fortunately, starting from the 4th of June we are allowed to reach our family. If it's in the same region.\nPERSON13: And so finally, I will reach my parents. Cause we live in two different cities. Because I'm in Trento and my family is in Bolzano. Which is pretty near around 50 kilometres. But, but\nPERSON5: So you're looking forward.\nPERSON13: exactly. Hi guys, good morning.\nPERSON5: So I went to the par

In [None]:
# OVERVIEW THE SECTIONED BLOCKS OF CONVERSATIONS FROM THE TRANSCRIPT ...

# Number the blocks from 1. enumerate() replaces the manual counter that was
# named `id` and rebound the builtin of that name for the rest of the notebook.
for block_no, block in enumerate(tscs_preprocessed1[m_id], start=1):
  print('{} - {}'.format(block_no, block))

In [None]:
### THE BELOW 4 CELLS WOULD GIVE YOU 4 SUMMARIES VARYING IN LENGTH; ###
### THIS WOULD NORMALLY AFFECT THE COVERAGE AND ADEQUACY OF THE SUMMARIES; ###
### YOU CAN CHOOSE A SUITABLE SUMMARY FOR EVERY SINGLE TRANSCRIPT !!! ###

s2_short, filename = gen_summary(tscs_preprocessed1)
print(format_summary(s2_short[0]))


  The Czech Republic government has lifted the rules.
  People can go out even if they don't need to, but they have to wait until June for the free circulation of people.
 -PERSON5 is in Trento and PERSON13 is in Bolzano.
 -PERSON1, PERSON5, PERSON6, PERSON15, PERSON16 and PERSON8 will write a project management guide for Organizing Committee 6.
 -PERSON2 can has always believed in selforganization.
  The internal reviews should be ready by mid June at the latest.
  The project management guides are due in the end of June.
  The test sets are due on the 8th of June PERSON6 and PERSON2 discusseded the layout of the PROJECT1 test set.
  They want to have it populated and described by the August date so that they can submit as a deliver.
 -PERSON6, PERSON1 and PERSON9 discussed how to organize the test sets and create file lists.
 -PERSON6, PERSON1, PERSON9 and PERSON16 discusseded the evaluation of spoken language translation.
 -PERSON6 asks PERSON1 and PERSON3 to review the PROJECT1 te

In [None]:
# NOTE: needs `tscs_preprocessed2`, which is only created by the commented-out
# `gen_tscs(m_id, 768)` line in the chunking cell — uncomment that line first.
s2_avg, filename = gen_summary(tscs_preprocessed2)
print(format_summary(s2_avg[0]))

In [None]:
# NOTE: needs `tscs_preprocessed3`, which is only created by the commented-out
# `gen_tscs(m_id, 512)` line in the chunking cell — uncomment that line first.
s2_long, filename = gen_summary(tscs_preprocessed3)
print(format_summary(s2_long[0]))

In [None]:
print(attendees)

[['PERSON5', 'PERSON9', 'PERSON1', 'PERSON6', 'PERSON16', 'PERSON15', 'PERSON13']]


In [None]:
# SAVING THE MINUTE ...

import datetime

# Build the whole text before opening the file, so that a failure while
# formatting does not leave a truncated file behind.
tday = datetime.date.today()
# `s2_longer` was never defined in this notebook; `s2_long` is the variant built
# from the 512-token chunks. Use s2_short or s2_avg here to save another length.
formatted_summary = format_summary(s2_long[0])
att = ', '.join(attendees[0])
minute_text = 'DATE : {}\nATTENDEES : {}\n\n\nSUMMARY-\n{}\n\n\nMinuted by: Team ABC'.format(tday, att, formatted_summary)

# The context manager closes the file even if the write fails.
with open('/content/drive/MyDrive/AutoMin-2021/outputs/new/{}.txt'.format(filename[0]), 'w') as outfile:
  outfile.write(minute_text)

In [None]:
### A FORMAT SUMMARY FUNCTION, WITHOUT PRONOUN INSERTION ###

def format_summary1(s2):
  """Turn generated summary sections into a bulleted minute, without pronouns.

  The sections in ``s2`` are concatenated and split on '.'. Each fragment is
  cleaned with stripp() and preprocess(); a fragment is dropped when it is
  empty or a single character, still starts with a space, or yields 6 or fewer
  tokens in the word check. A fragment starting with "PE" opens a new ' -'
  bullet; any other fragment is indented under the current bullet.

  Args:
    s2: iterable of summary strings.

  Returns:
    The formatted summary as one newline-joined string.
  """
  groups = ['']
  for i in ''.join(s2).split('.'):
    # stripping the spaces
    i = i.replace('  ', ' ')
    # Empty fragments (e.g. from "..") used to reach i[0] and raise IndexError.
    if len(i) <= 1:
      continue
    if i[0] == ' ' and i[1].isalpha():
      i = stripp(i)
    if i is None:
      continue
    if i[0] == ' ':
      continue
    i = preprocess(i)

    # Letters-only word check: short fragments are dropped.
    check = re.sub(r"[^a-zA-Z0-9]+", ' ', i)
    check = ''.join(ch for ch in check if not ch.isdigit())
    check = check.replace('  ', ' ').split(' ')
    if len(check) <= 6:
      continue

    # formatting
    if i.startswith('PE'):
      groups.append(' -' + i + '.')
    else:
      groups[-1] += '\n  ' + i + '.'

  if '' in groups:
    groups.remove('')
  return '\n'.join(groups)

# If we want to further shorten the obtained summary...
This method sacrifices grammaticality and readability in order to achieve compactness, by applying NLTK stopword reduction on top of a general BART summarization.

In [None]:
# RUN THE CELLS BELOW AND USE THIS FUNCTION INSTEAD OF THE 'format_summary()' version ...

def format_summary_short(s2):
  """Format summary sections as a compact, stop-word-reduced bulleted minute.

  The sections in ``s2`` are concatenated and split on '.'. A fragment is
  dropped when it is empty or a single character, still starts with a space
  after stripp(), or yields 6 or fewer tokens in the word check. Kept
  fragments go through shorten() and then replacee(). A fragment that started
  with 'P' opens a new ' -' bullet; any other fragment is indented under the
  current bullet.

  Args:
    s2: iterable of summary strings.

  Returns:
    The formatted summary as one newline-joined string.
  """
  groups = ['']
  for i in ''.join(s2).split('.'):

    # stripping the spaces
    i = i.replace('  ', ' ')
    # Empty fragments (e.g. from "..") used to reach i[0] and raise IndexError.
    if len(i) <= 1:
      continue
    if i[0] == ' ' and i[1].isalpha():
      i = stripp(i)
    if i[0] == ' ':
      continue

    # Letters-only word check: short fragments are dropped.
    check = re.sub(r"[^a-zA-Z0-9]+", ' ', i)
    check = ''.join(ch for ch in check if not ch.isdigit())
    check = check.replace('  ', ' ').split(' ')
    if len(check) <= 6:
      continue

    # formatting: decide on the bullet before the text is shortened
    starts_bullet = i[0] == 'P'
    i = replacee(shorten(i))
    if starts_bullet:
      groups.append(' -' + i + '.')
    else:
      groups[-1] += '\n  ' + i + '.'

  if '' in groups:
    groups.remove('')
  return '\n'.join(groups)

In [None]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
  
# NLTK English stop words that shorten() keeps in the text.
_SHORTEN_KEEP = frozenset({
    'to', 'of', 'from', 'as', 'has', 'do', 'not', 'on', 'in', 'if', 'is',
    'it', 'for', 'with', 'he', 'can', 'does', 'between', 'over', 'until',
    'after', 'when', 'have', 'them', 'into', 'by', 'and', 'will', 'what',
    'him',
})

# Extra tokens shorten() drops on top of the NLTK list.
_SHORTEN_EXTRA = frozenset({'They', 'which', 'On', 'It', 'The', 'He', 'manually'})


def shorten(example_sent):
  """Remove stop words from a sentence to make it more compact.

  The stop-word set is NLTK's English list minus ``_SHORTEN_KEEP`` plus
  ``_SHORTEN_EXTRA``. Tokens are compared as-is, so matching is
  case-sensitive.

  Args:
    example_sent: sentence text.

  Returns:
    The tokens that are not stop words, joined by single spaces.
  """
  stop_words = (set(stopwords.words('english')) - _SHORTEN_KEEP) | _SHORTEN_EXTRA
  return ' '.join(w for w in word_tokenize(example_sent) if w not in stop_words)

In [None]:
from transformers import BartTokenizer, BartForConditionalGeneration, BartConfig
# NOTE: these assignments rebind the notebook-level `model` and `tokenizer` that
# the setup cell created from "facebook/bart-large-xsum". gen_tscs() reads the
# global `tokenizer`, so calls made after this cell use the bart-large-cnn one.
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
# Move the model to the device selected in the setup cell.
model=model.to(device)

In [None]:
# Re-summarize each string in s2[0] with the BART model loaded above and
# concatenate the decoded outputs, separated by spaces, into `summary`.
# NOTE(review): `s2` is not assigned at notebook level in the cells shown here
# (gen_summary() results are bound to s2_short / s2_avg / s2_long / s2_shorter)
# — confirm which of them is meant.
summary = ''
for s in s2[0]:
  # Trim the section and remove its newlines before tokenizing.
  preprocess_text = s.strip().replace("\n","")
  inputs = tokenizer(preprocess_text, return_tensors='pt').to(device)
  # Beam search with 4 beams; generated output is capped at 512 tokens.
  summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=512)
  output = tokenizer.decode(summary_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
  summary = summary + output + ' '

In [None]:
# NOTE(review): `tscs_preprocessed4` is not created in any cell shown here;
# build it with gen_tscs() (or pass one of the existing chunkings) first.
s2_shorter, filename = gen_summary(tscs_preprocessed4)
print(format_summary_short(s2_shorter[0]))

-PERSON14 is preparing data for the audible cell ties data preparation.
 PERSON5 is watching a lecture on PROJECT2 ORGANIZATION4 live streaming.
 .
-PERSON5 needs to get the input files for the evaluation.
 The most important part is Antrecorp.
 PERSON5 and PERSON14 are on a conference call.
 They need to change the language or nation names in the released files.
 The most up to date OST versions are in the repositor.
 PERSON5 needs to create the support director for the files he's processing.
 He also needs to prepare the versioned files for the subtitles for the caraoke and the OST file for the preview.
 .
-PERSON14 wants PERSON1 to synchronize with PERSON5 on the on the audible SLT test set and depth set forced alignment files.
 He wants them to run the alignments and produce OST files to OSTP plus ASS files.
 PERSON1's priorities are the SLT evaluation, the clean up of the stable and preparing the files that PERSON5 will digest.
 PERSON5 is working on a caraoke as timeview.
 The de

# TextRank Script for ranking sentences...
This method uses GloVe embeddings to compute cosine-similarity scores between sentences, and ranks the individual sentences with the PageRank algorithm.

In [None]:
import numpy as np
import pandas as pd
import nltk
nltk.download('punkt') # one time execution
import re

In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove*.zip

In [None]:
# ENTER THE MINUTE_ID
min_id = 'minutes_en_test_001'

import os
path = '/content/drive/MyDrive/AutoMin-2021/outputs/new'
os.chdir(path)
summaries = []

# Read the saved minute; the context manager closes the file afterwards.
with open(path+'/'+min_id+'.txt', 'r') as sumfile:
    summ = sumfile.readlines()

# Keep only the summary body: the saving cell writes 5 lines before it
# (DATE, ATTENDEES, two blank lines, "SUMMARY-") and 3 lines after it
# (two blank lines, "Minuted by: ...").
summ = summ[5:-3]

# Flatten the bulleted layout into one space-separated string.
text = ''
for line in summ:
    line = line.replace(' -', '')
    line = line.replace('  ', '')
    line = line.replace('\n', '')
    text = text + line + ' '
summaries.append(text)

In [None]:
from nltk.tokenize import sent_tokenize

# Split every stored summary into sentences and collect them in one flat list.
sentences = [
    sentence
    for summary_text in summaries
    for sentence in sent_tokenize(summary_text)
]
print('Total no. of sentences: ', len(sentences))

In [None]:
# EXTRACT WORD VECTORS

# Map each word to its GloVe vector. Every line of the file holds a word
# followed by its coefficients, separated by whitespace.
word_embeddings = {}
with open('/content/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        word_embeddings[word] = coefs


# REMOVE PUNCTUATIONS, NUMBERS AND SPECIAL CHARACTERS
# regex=True is required: current pandas treats the pattern as a literal string
# by default, which would leave punctuation and digits in place.
clean_sentences = pd.Series(sentences).str.replace("[^a-zA-Z]", " ", regex=True)


# MAKE ALPHABETS TO LOWERCASE
clean_sentences = [s.lower() for s in clean_sentences]


# REMOVE STOPWORDS

nltk.download('stopwords')
from nltk.corpus import stopwords
# NLTK's English stop-word list; read as a global by remove_stopwords().
stop_words = stopwords.words('english')

def remove_stopwords(sen):
    """Join the tokens of ``sen`` that are not in ``stop_words`` with spaces.

    ``sen`` is an iterable of tokens (the caller passes ``str.split()`` output).
    """
    sen_new = " ".join([i for i in sen if i not in stop_words])
    return sen_new

# Tokenize each cleaned sentence on whitespace and drop its stop words.
clean_sentences = [remove_stopwords(r.split()) for r in clean_sentences]


# EXTRACT SENTENCE VECTORS

# One vector per cleaned sentence: the sum of its word vectors divided by the
# word count (+0.001); words without an embedding contribute a zero vector,
# and an empty sentence gets a zero vector.
sentence_vectors = []
for cleaned in clean_sentences:
  if cleaned:
    tokens = cleaned.split()
    token_vectors = [word_embeddings.get(tok, np.zeros((100,))) for tok in tokens]
    vector = sum(token_vectors) / (len(tokens) + 0.001)
  else:
    vector = np.zeros((100,))
  sentence_vectors.append(vector)


# BUILD THE SENTENCE-TO-SENTENCE SIMILARITY MATRIX
from sklearn.metrics.pairwise import cosine_similarity

sim_mat = np.zeros([len(sentences), len(sentences)])

if sentence_vectors:
  # One call on the stacked vectors yields every pairwise cosine similarity,
  # instead of one scikit-learn call per sentence pair.
  sim_mat[:, :] = cosine_similarity(np.vstack(sentence_vectors))
  # A sentence is not compared with itself.
  np.fill_diagonal(sim_mat, 0)


# PAGERANK SCORING

import networkx as nx

# Build a graph from the similarity matrix and score its nodes with PageRank.
nx_graph = nx.from_numpy_array(sim_mat)
scores = nx.pagerank(nx_graph)
# Pair every sentence with its score and sort from highest to lowest.
ranked_sentences = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)

In [None]:
# REVIEW THE RANKINGS BEFORE ELIMINATING IRRELEVANT INFO FROM THE SUMMARY

# Print the sentences from highest to lowest score.
for _score, sentence in ranked_sentences:
  print(sentence)

In [None]:
# ENTER THE FRACTION OF SENTENCES THAT SEEM UNINFORMATIVE; THIS IS USUALLY AROUND 0.15 (~15%) FOR THE MINUTES BELONGING TO A LENGTHY TRANSCRIPT

rem_perc = 0.15

import math
remove_count = math.ceil(len(sentences)*rem_perc)
print('No. of sentences removed: ', remove_count)

print('\n\nReduced Summary(jumbled): \n')
# Keep the top-ranked sentences and leave out the `remove_count` lowest ones.
keep_count = max(len(ranked_sentences) - remove_count, 0)
for _score, sentence in ranked_sentences[:keep_count]:
  print(sentence)