#Library

In [1]:
import pandas as pd 
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from copy import deepcopy
import regex as re

# Load CSV

In [2]:
from google.colab import files

uploaded = files.upload()

Saving IMDB.csv to IMDB.csv


In [3]:
df = pd.read_csv('IMDB.csv')

In [4]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
df['sentiment'].value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

In [6]:
df.shape

(50000, 2)

In [7]:
df_original = deepcopy(df)

In [8]:
df_original.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


# Preprocessing

## Remove tags and split into array

In [12]:
def remove_tags(string):
    result = re.sub('<.*?>',' ',string)
    return result

df['review_split'] = df['review'].apply(lambda x: re.sub('<.*?>',' ',x).split())
df.head()

Unnamed: 0,review,sentiment,review_split
0,One of the other reviewers has mentioned that ...,positive,"[One, of, the, other, reviewers, has, mentione..."
1,A wonderful little production. <br /><br />The...,positive,"[A, wonderful, little, production., The, filmi..."
2,I thought this was a wonderful way to spend ti...,positive,"[I, thought, this, was, a, wonderful, way, to,..."
3,Basically there's a family where a little boy ...,negative,"[Basically, there's, a, family, where, a, litt..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,"[Petter, Mattei's, ""Love, in, the, Time, of, M..."


## Stop words removal 

In [13]:
! python -m nltk.downloader stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [14]:
def stop_words_removal(text) : 
  stops = set(stopwords.words("english"))
  return [word for word in text if word not in (stops)]

df['review_wo_stop_words'] = df['review_split'].apply(stop_words_removal)
print(df['review_split'][0])
print(df['review_wo_stop_words'][0])

['One', 'of', 'the', 'other', 'reviewers', 'has', 'mentioned', 'that', 'after', 'watching', 'just', '1', 'Oz', 'episode', "you'll", 'be', 'hooked.', 'They', 'are', 'right,', 'as', 'this', 'is', 'exactly', 'what', 'happened', 'with', 'me.', 'The', 'first', 'thing', 'that', 'struck', 'me', 'about', 'Oz', 'was', 'its', 'brutality', 'and', 'unflinching', 'scenes', 'of', 'violence,', 'which', 'set', 'in', 'right', 'from', 'the', 'word', 'GO.', 'Trust', 'me,', 'this', 'is', 'not', 'a', 'show', 'for', 'the', 'faint', 'hearted', 'or', 'timid.', 'This', 'show', 'pulls', 'no', 'punches', 'with', 'regards', 'to', 'drugs,', 'sex', 'or', 'violence.', 'Its', 'is', 'hardcore,', 'in', 'the', 'classic', 'use', 'of', 'the', 'word.', 'It', 'is', 'called', 'OZ', 'as', 'that', 'is', 'the', 'nickname', 'given', 'to', 'the', 'Oswald', 'Maximum', 'Security', 'State', 'Penitentary.', 'It', 'focuses', 'mainly', 'on', 'Emerald', 'City,', 'an', 'experimental', 'section', 'of', 'the', 'prison', 'where', 'all', 'th

In [15]:
df.head()

Unnamed: 0,review,sentiment,review_split,review_wo_stop_words
0,One of the other reviewers has mentioned that ...,positive,"[One, of, the, other, reviewers, has, mentione...","[One, reviewers, mentioned, watching, 1, Oz, e..."
1,A wonderful little production. <br /><br />The...,positive,"[A, wonderful, little, production., The, filmi...","[A, wonderful, little, production., The, filmi..."
2,I thought this was a wonderful way to spend ti...,positive,"[I, thought, this, was, a, wonderful, way, to,...","[I, thought, wonderful, way, spend, time, hot,..."
3,Basically there's a family where a little boy ...,negative,"[Basically, there's, a, family, where, a, litt...","[Basically, there's, family, little, boy, (Jak..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,"[Petter, Mattei's, ""Love, in, the, Time, of, M...","[Petter, Mattei's, ""Love, Time, Money"", visual..."


## Lemmatization

In [16]:
! python -m nltk.downloader wordnet

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [17]:
def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(w) for w in text] 

df['review_lemmatize'] = df['review_wo_stop_words'].apply(lemmatize_text)
print(df['review_wo_stop_words'][0])
print(df['review_lemmatize'][0])

['One', 'reviewers', 'mentioned', 'watching', '1', 'Oz', 'episode', 'hooked.', 'They', 'right,', 'exactly', 'happened', 'me.', 'The', 'first', 'thing', 'struck', 'Oz', 'brutality', 'unflinching', 'scenes', 'violence,', 'set', 'right', 'word', 'GO.', 'Trust', 'me,', 'show', 'faint', 'hearted', 'timid.', 'This', 'show', 'pulls', 'punches', 'regards', 'drugs,', 'sex', 'violence.', 'Its', 'hardcore,', 'classic', 'use', 'word.', 'It', 'called', 'OZ', 'nickname', 'given', 'Oswald', 'Maximum', 'Security', 'State', 'Penitentary.', 'It', 'focuses', 'mainly', 'Emerald', 'City,', 'experimental', 'section', 'prison', 'cells', 'glass', 'fronts', 'face', 'inwards,', 'privacy', 'high', 'agenda.', 'Em', 'City', 'home', 'many..Aryans,', 'Muslims,', 'gangstas,', 'Latinos,', 'Christians,', 'Italians,', 'Irish', 'more....so', 'scuffles,', 'death', 'stares,', 'dodgy', 'dealings', 'shady', 'agreements', 'never', 'far', 'away.', 'I', 'would', 'say', 'main', 'appeal', 'show', 'due', 'fact', 'goes', 'shows', '

# Summary with Tools for Benchmark

In [40]:
df_sum = df

In [41]:
df_sum['review_removetags'] = df_sum['review'].apply(remove_tags)

In [42]:
pip install pysummarization

Collecting pysummarization
  Downloading pysummarization-1.1.8.tar.gz (51 kB)
[?25l[K     |██████▍                         | 10 kB 19.9 MB/s eta 0:00:01[K     |████████████▊                   | 20 kB 17.5 MB/s eta 0:00:01[K     |███████████████████             | 30 kB 11.3 MB/s eta 0:00:01[K     |█████████████████████████▌      | 40 kB 9.6 MB/s eta 0:00:01[K     |███████████████████████████████▉| 51 kB 5.3 MB/s eta 0:00:01[K     |████████████████████████████████| 51 kB 146 kB/s 
Building wheels for collected packages: pysummarization
  Building wheel for pysummarization (setup.py) ... [?25l[?25hdone
  Created wheel for pysummarization: filename=pysummarization-1.1.8-py3-none-any.whl size=59453 sha256=aaa19b3a7453f16fce0ccb95d932aa4b56e218a92516cf2f3adfa4b39f25918e
  Stored in directory: /root/.cache/pip/wheels/da/14/3e/02d15001af23ca877c5149b66280a605e5cdbbe76972598afa
Successfully built pysummarization
Installing collected packages: pysummarization
Successfully installe

In [43]:
from pysummarization.nlpbase.auto_abstractor import AutoAbstractor
from pysummarization.tokenizabledoc.simple_tokenizer import SimpleTokenizer
from pysummarization.abstractabledoc.top_n_rank_abstractor import TopNRankAbstractor

In [44]:
auto_abstractor = AutoAbstractor()
auto_abstractor.tokenizable_doc = SimpleTokenizer()
auto_abstractor.delimiter_list = ['.', '\n']
abstractable_doc = TopNRankAbstractor()

# result_dict = auto_abstractor.summarize(document, abstractable_doc)

In [45]:
def getSumm(text):
  result_dict = auto_abstractor.summarize(text, abstractable_doc)
  summ = getTheLarger(result_dict)
  return summ

def getTheLarger(arrResult):
  num = 0
  for i in range(len(arrResult['scoring_data'])):
    if(arrResult['scoring_data'][i][1] > num):
      num = arrResult['scoring_data'][i][1]
      res = arrResult['summarize_result'][i]
  return res

In [46]:
df_sum['review_sentence_benchmark'] = df_sum['review_removetags'].apply(getSumm)

In [47]:
df_sum['review_benchmark'] = df_sum['review_sentence_benchmark'].apply(lambda x: re.sub('<.*?>',' ',x).split())

In [48]:
df_sum['review_benchmark'] = df_sum['review_benchmark'].apply(stop_words_removal)

In [49]:
df_sum['review_benchmark'] = df_sum['review_benchmark'].apply(lemmatize_text)

In [50]:
df_sum['review_benchmark']

0        [The, first, thing, struck, Oz, brutality, unf...
1        [The, actor, extremely, well, chosen-, Michael...
2        [While, may, disappointed, realize, Match, Poi...
3        [And, Jake, closet, totally, ruin, film!, I, e...
4        [This, variation, Arthur, Schnitzler's, play, ...
                               ...                        
49995    [Sharon, Stone, great,, always, is,, even, mov...
49996    [Bad, plot,, bad, dialogue,, bad, acting,, idi...
49997    [I, still, practicing, Catholic, would, consid...
49998    [This, second, rate,, excessively, vicious, We...
49999    [I, would, say, movie, worth, rental,, hardly,...
Name: review_benchmark, Length: 50000, dtype: object

In [51]:
# sentenceValue = dict()

# for s in df_sum['review_split_sentence']:
#   for word, freq in freqTable.items():
#     if word in s.lower():
#       if s in sentenceValue:
#         sentenceValue[s] +=freq
#       else:
#         sentenceValue[s] = freq

# AddPrep


In [52]:
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
def split_Sentence(text):
    result = sent_tokenize(text)
    return result

df_sum['review_split_sentence'] = df_sum['review'].apply(remove_tags)
df_sum['review_split_sentence'] = df_sum['review_split_sentence'].apply(split_Sentence)
df_sum.head()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Unnamed: 0,review,sentiment,review_split,review_wo_stop_words,review_lemmatize,review_removetags,review_sentence_benchmark,review_benchmark,review_split_sentence
0,One of the other reviewers has mentioned that ...,positive,"[One, of, the, other, reviewers, has, mentione...","[One, reviewers, mentioned, watching, 1, Oz, e...","[One, reviewer, mentioned, watching, 1, Oz, ep...",One of the other reviewers has mentioned that ...,The first thing that struck me about Oz was ...,"[The, first, thing, struck, Oz, brutality, unf...",[One of the other reviewers has mentioned that...
1,A wonderful little production. <br /><br />The...,positive,"[A, wonderful, little, production., The, filmi...","[A, wonderful, little, production., The, filmi...","[A, wonderful, little, production., The, filmi...",A wonderful little production. The filming t...,The actors are extremely well chosen- Micha...,"[The, actor, extremely, well, chosen-, Michael...","[A wonderful little production., The filming t..."
2,I thought this was a wonderful way to spend ti...,positive,"[I, thought, this, was, a, wonderful, way, to,...","[I, thought, wonderful, way, spend, time, hot,...","[I, thought, wonderful, way, spend, time, hot,...",I thought this was a wonderful way to spend ti...,While some may be disappointed when they real...,"[While, may, disappointed, realize, Match, Poi...",[I thought this was a wonderful way to spend t...
3,Basically there's a family where a little boy ...,negative,"[Basically, there's, a, family, where, a, litt...","[Basically, there's, family, little, boy, (Jak...","[Basically, there's, family, little, boy, (Jak...",Basically there's a family where a little boy ...,And then we have Jake with his closet which t...,"[And, Jake, closet, totally, ruin, film!, I, e...",[Basically there's a family where a little boy...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,"[Petter, Mattei's, ""Love, in, the, Time, of, M...","[Petter, Mattei's, ""Love, Time, Money"", visual...","[Petter, Mattei's, ""Love, Time, Money"", visual...","Petter Mattei's ""Love in the Time of Money"" is...",This being a variation on the Arthur Schnit...,"[This, variation, Arthur, Schnitzler's, play, ...","[Petter Mattei's ""Love in the Time of Money"" i..."


# Summary With WordFrequency Algorithm

In [53]:
import string 
punc = ['.',',','?','!']
for i in range(len(df_sum)):
  sentence = df_sum['review_split_sentence'][i]
  p = len(sentence)
  q = len(sentence[p-1])
  if(p!=1):
    if sentence[p-1][q-1] not in punc:
      del df_sum['review_split_sentence'][i][p-1]


In [54]:
review_split_prep =[]
for i in range(len(df_sum)):
  temp = ''
  for j in range(len(df_sum['review_split_sentence'][i])):
    temp += ' '
    temp +=(df_sum['review_split_sentence'][i][j])
  # print(temp)
  review_split_prep.append(temp)

In [55]:
df_sum['review_split_new'] = review_split_prep

In [56]:
#importing libraries
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
import bs4 as BeautifulSoup
import urllib.request  

def _create_dictionary_table(text_string) -> dict:
   
    #removing stop words
    stop_words = set(stopwords.words("english"))
    
    words = word_tokenize(text_string)
    
    #reducing words to their root form
    stem = PorterStemmer()
    
    #creating dictionary for the word frequency table
    frequency_table = dict()
    for wd in words:
        wd = stem.stem(wd)
        if wd in stop_words:
            continue
        if wd in frequency_table:
            frequency_table[wd] += 1
        else:
            frequency_table[wd] = 1

    return frequency_table


def _calculate_sentence_scores(sentences, frequency_table) -> dict:   

    #algorithm for scoring a sentence by its words
    sentence_weight = dict()

    for sentence in sentences:
        sentence_wordcount = (len(word_tokenize(sentence)))
        sentence_wordcount_without_stop_words = 0
        for word_weight in frequency_table:
            if word_weight in sentence.lower():
                sentence_wordcount_without_stop_words += 1
                if sentence[:20] in sentence_weight:
                    sentence_weight[sentence[:20]] += frequency_table[word_weight]
                else:
                    sentence_weight[sentence[:20]] = frequency_table[word_weight]

        sentence_weight[sentence[:20]] = sentence_weight[sentence[:20]] / sentence_wordcount_without_stop_words

       

    return sentence_weight

def _calculate_average_score(sentence_weight) -> int:
   
    #calculating the average score for the sentences
    sum_values = 0
    for entry in sentence_weight:
        sum_values += sentence_weight[entry]

    #getting sentence average value from source text
    average_score = (sum_values / len(sentence_weight))

    return average_score

def _get_article_summary(sentences, sentence_weight, threshold):
    sentence_counter = 0
    article_summary = ''

    for sentence in sentences:
        if sentence[:20] in sentence_weight and sentence_weight[sentence[:20]] >= (threshold):
            article_summary += " " + sentence
            sentence_counter += 1

    return article_summary

def _run_article_summary(article):
    
    #creating a dictionary for the word frequency table
    frequency_table = _create_dictionary_table(article)

    #tokenizing the sentences
    sentences = sent_tokenize(article)

    #algorithm for scoring a sentence by its words
    sentence_scores = _calculate_sentence_scores(sentences, frequency_table)

    #getting the threshold
    threshold = _calculate_average_score(sentence_scores)

    #producing the summary
    article_summary = _get_article_summary(sentences, sentence_scores, 1.5 * threshold)

    return article_summary

# if __name__ == '__main__':
#     summary_results = _run_article_summary(article_content)
#     print(summary_results)

In [57]:
summ_new = []
for i in range(len(df_sum)):
  # print(i)
  summ_new.append(_run_article_summary(df_sum['review_split_new'][i]))

In [58]:
summ_new

[' They are right, as this is exactly what happened with me. Trust me, this is not a show for the faint hearted or timid.',
 '',
 '',
 '',
 '',
 '',
 '',
 ' As it is now, the show is just awful.',
 '',
 ' !',
 '',
 ' The horror. The horror. There are no rules.',
 '',
 ' I do not know.',
 '',
 '',
 ' In and of itself it is not a bad film.',
 ' Horrible. Just awful.',
 '',
 '',
 " And if you're looking for Qaulen, he's the one wearing the helicopter.",
 ' !',
 '',
 '',
 '',
 '',
 ' If so, I want what he was smoking.',
 '',
 ' What crappy dancing.',
 ' Then again, calling Cold Mountain" a war movie is not entirely accurate. However, "Cold Mountain" is not so much about the Civil War itself as it is about the period and the people of the times.',
 '',
 ' Those aspects are dealt brilliantly.',
 '',
 ' Your loss, world.',
 '',
 ' It is, but not in that way. Just say no to this movie, though.',
 " That's it.",
 ' Where to begin? What do they do?',
 ' !',
 ' Truly grotesque. Creepy....',
 '',


# COSINE

In [59]:
df_cosine = df_sum

In [60]:
from nltk.corpus import stopwords
import numpy as np
from nltk.cluster.util import cosine_distance
import networkx as nx

In [61]:
def read_text(text):
  article = text.split(".")
  # print(article)
  sentences = []
  for sentence in article:
    sentences.append(sentence.replace("[^a-zA-Z]", " ").split(" "))
  sentences.pop() 
  # print(sentences)
  return sentences

In [62]:
def sentence_similarity(sent1, sent2, stopwords=None):
    if stopwords is None:
        stopwords = []
 
    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]
 
    all_words = list(set(sent1 + sent2))
 
    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)
 
    # build the vector for the first sentence
    for w in sent1:
        if w in stopwords:
            continue
        vector1[all_words.index(w)] += 1
 
    # build the vector for the second sentence
    for w in sent2:
        if w in stopwords:
            continue
        vector2[all_words.index(w)] += 1
 
    return 1 - cosine_distance(vector1, vector2)

In [63]:
def build_similarity_matrix(sentences, stop_words):
    # Create an empty similarity matrix
    similarity_matrix = np.zeros((len(sentences), len(sentences)))
 
    for idx1 in range(len(sentences)):
        for idx2 in range(len(sentences)):
            if idx1 == idx2: #ignore if both are same sentences
                continue 
            similarity_matrix[idx1][idx2] = sentence_similarity(sentences[idx1], sentences[idx2], stop_words)
    return similarity_matrix

In [64]:
def generate_summary(file_name, top_n=5):
    stop_words = stopwords.words('english')
    summarize_text = []

    # Step 1 - Read text anc split it
    sentences =  read_text(file_name)

    # Step 2 - Generate Similary Martix across sentences
    sentence_similarity_martix = build_similarity_matrix(sentences, stop_words)

    # Step 3 - Rank sentences in similarity martix
    sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_martix)
    scores = nx.pagerank(sentence_similarity_graph)
    # print(scores)

    # Step 4 - Sort the rank and pick top sentences
    ranked_sentence = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)    

    for i in range(top_n):
      # print("---", ranked_sentence[i][1])
      summarize_text.append(" ".join(ranked_sentence[i][1]))

    return(summarize_text)
    # Step 5 - Offcourse, output the summarize texr
    # print("Summarize Text: \n", ". ".join(summarize_text))

In [66]:
def read_text(arr_text):
  sentences = []
  for sentence in arr_text:
    # print(sentence)
    sentences.append(sentence.replace("[^a-zA-Z]", " ").split(" "))
    # print(sentences)

  # sentences.pop() 
  # print(sentences)
  return sentences

In [None]:
summ_cosine = []
for i in range(len(df_cosine)):
  # print(i)
  summ_cosine.append(generate_summary( df_cosine['review_split_sentence'][i],1))

In [None]:
print(summ_cosine[0])

# T5 Algorithm

In [None]:
rom transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('t5-base')

def tokenize(batch):
    tokenized_input = tokenizer(batch['source'], padding='max_length', truncation=True, max_length=max_source)
    tokenized_label = tokenizer(batch['target'], padding='max_length', truncation=True, max_length=max_target)

    tokenized_input['labels'] = tokenized_label['input_ids']

    return tokenized_input

train_dataset = train_dataset.map(tokenize, batched=True, batch_size=512)
val_dataset = val_dataset.map(tokenize, batched=True, batch_size=len(val_dataset))

train_dataset.set_format('numpy', columns=['input_ids', 'attention_mask', 'labels'])
val_dataset.set_format('numpy', columns=['input_ids', 'attention_mask', 'labels'])

In [None]:
import pandas as pd

df = df_sum

source_text = df['source']
target_text = df['target']

tokenized_source_text = tokenizer(list(source_text), truncation=False, padding=False)
tokenized_target_text = tokenizer(list(target_text), truncation=False, padding=False)

max_source = 0
for item in tokenized_source_text['input_ids']:
    if len(item) > max_source:
        max_source = len(item)

max_target = 0
for item in tokenized_target_text['input_ids']:
    if len(item) > max_target:
        max_target = len(item)

In [None]:
from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments

model = T5ForConditionalGeneration.from_pretrained('t5-base')

output_dir = 'your/output/dir'

training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_accumulation_steps=1, 
    prediction_loss_only=True, 
    learning_rate=0.001,
    evaluation_strategy='steps',
    save_steps=1000, 
    save_total_limit=1, 
    remove_unused_columns=True, 
    run_name='run_name', 
    logging_steps=1000,
    eval_steps=1000, 
    logging_first_step=False, 
    load_best_model_at_end=True, 
    metric_for_best_model="loss", 
    greater_is_better=False 
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

trainer.train()
trainer.save_model(output_dir + '/model')

In [None]:
import torch

print("input:")
input_text = input()

with torch.no_grad():
    tokenized_text = tokenizer(input_text, truncation=True, padding=True, return_tensors='pt')

    source_ids = tokenized_text['input_ids'].to(device, dtype = torch.long)
    source_mask = tokenized_text['attention_mask'].to(device, dtype = torch.long)

    generated_ids = model.generate(
        input_ids = source_ids,
        attention_mask = source_mask, 
        max_length=512,
        num_beams=5,
        repetition_penalty=1, 
        length_penalty=1, 
        early_stopping=True,
        no_repeat_ngram_size=2
    )

    pred = tokenizer.decode(generated_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)

print("\noutput:\n" + pred)

# Evaluate with Blue Score

In [None]:
import nltk
BLEUscore = nltk.translate.bleu_score.sentence_bleu([reference], hypothesis)