In [None]:
!pip install numpy pandas faiss-gpu torch transformers sentence_transformers --quiet

[K     |████████████████████████████████| 89.7MB 56kB/s 
[K     |████████████████████████████████| 2.5MB 45.5MB/s 
[K     |████████████████████████████████| 92kB 12.9MB/s 
[K     |████████████████████████████████| 901kB 39.8MB/s 
[K     |████████████████████████████████| 3.3MB 42.0MB/s 
[K     |████████████████████████████████| 1.2MB 39.7MB/s 
[?25h  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone


In [None]:
# Base directory that the CSV files in this notebook are read from and written to.
# NOTE(review): the path sits under /content/drive — presumably a mounted Google
# Drive in Colab; the mount step is not shown here, confirm it runs first.
data_folder = '/content/drive/MyDrive/ML-DL-DS/data/'

In [None]:
import re
import numpy as np
import pandas as pd
import faiss
import torch
from torch.utils.data import DataLoader
from transformers import BartTokenizer, BartForConditionalGeneration
from sentence_transformers import SentenceTransformer

# Fix NumPy's global seed so randomized steps are repeatable between runs.
np.random.seed(0)
# Limit displayed cell text to 100 characters. The full option name is used
# because pandas' shorthand lookup can break if another option containing
# "max_colwidth" is ever added.
pd.set_option("display.max_colwidth", 100)

In [None]:
# Source data: Wikipedia movie plots from Kaggle
# (https://www.kaggle.com/jrobischon/wikipedia-movie-plots).
# Only the title and plot columns are loaded.
movies = pd.read_csv(
    data_folder + 'wiki_movie_plots_deduped.csv',
    usecols=['Title', 'Plot'],
)

print(f"Plots of {len(movies.index)} movies!")
movies.sample(n=2)

Plots of 34886 movies!


Unnamed: 0,Title,Plot
23500,Belly of the Beast,Jake Hopper (Steven Seagal) is a successful businessman and retired CIA agent who runs a success...
26709,Payback,Insurance employee Kunal (Munish Khan) suffers an almost fatal accident but is rescued by passer...


In [None]:
def clean_text(text, max_words=1024):
    """
    Truncates a string to its first `max_words` words and normalizes whitespace.

    Splitting on whitespace already discards newlines, carriage returns and
    tabs and collapses runs of spaces, so the kept words only need to be
    joined back together with single spaces.

    Args:
        text: The string to clean. Anything that is not a string (for example
            `None`, or the NaN pandas uses for a missing value) is treated as
            empty text instead of raising.
        max_words: Maximum number of whitespace-separated words to keep.

    Returns:
        The cleaned string, with no leading or trailing whitespace.
    """
    if not isinstance(text, str):
        return ''
    return ' '.join(text.split()[:max_words])

In [None]:
# Quick check of clean_text on a sample with repeated spaces and a "\r\n" line break.
clean_text("Lorem    ipsum dolor sit amet, consectetur adipiscing elit.\r\nSem integer vitae justo eget magna fermentum iaculis.")

'Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sem integer vitae justo eget magna fermentum iaculis.'

In [None]:
# Clean every plot with clean_text and store the result back in the 'Plot' column.
movies['Plot'] = movies['Plot'].map(clean_text)

In [None]:
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cuda


In [None]:
# Summarization model: loaded from the pretrained checkpoint, moved to
# `device` and switched to evaluation mode.
model = (
    BartForConditionalGeneration
    .from_pretrained('sshleifer/distilbart-cnn-12-6')
    .to(device)
    .eval()
)

# Tokenizer loaded from the same checkpoint as the model.
tokenizer = BartTokenizer.from_pretrained('sshleifer/distilbart-cnn-12-6')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1802.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1222317369.0, style=ProgressStyle(descr…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898822.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=26.0, style=ProgressStyle(description_w…




In [None]:
# Demo input for the summarizer: an excerpt of a Harry Potter movie plot.
# The excerpt is cut off mid-word at the end ("Ron Wea").
FIRST_HP_MOVIE_PLOT = """Albus Dumbledore, Minerva McGonagall, and Rubeus Hagrid, professors of Hogwarts School of Witchcraft and Wizardry, deliver an orphaned infant named Harry Potter to his only remaining relatives, the Dursleys. Ten years later, Harry has been battling a disjointed life with the Dursleys, inadvertently causing an accident during a family outing, and begins receiving unsolicited letters by owls. Finally, Hagrid re-appears, and informs Harry that he is actually a wizard, and has been accepted into Hogwarts, against the Dursleys' wishes. He also tells Harry of the latter's past; Harry is the orphaned son of two wizards who met their demise at the hands of Lord Voldemort, a malevolent, all-powerful wizard, by a Killing Curse, with Harry being the only survivor in the chaos thus, leading to his fame in the wizarding world as "The Boy Who Lived". Hagrid takes Harry to Diagon Alley to purchase school supplies, then takes him to King's Cross station to board a train to the school. While on the train, Harry meets Ron Wea"""

In [None]:
# Encode the demo plot as PyTorch tensors, truncating at 1024 tokens.
inputs = tokenizer(
    [FIRST_HP_MOVIE_PLOT],
    return_tensors='pt',
    truncation=True,
    padding=True,
    max_length=1024,
)

# Generate the summary token ids (max 128 tokens) on `device`.
summary_ids = model.generate(
    inputs['input_ids'].to(device),
    early_stopping=True,
    max_length=128,
)

# Decode the token ids back into text, dropping special tokens.
summaries = tokenizer.batch_decode(
    summary_ids,
    clean_up_tokenization_spaces=True,
    skip_special_tokens=True,
)

In [None]:
# Display the generated summary of the demo plot.
summaries[0]

' Albus Dumbledore, Minerva McGonagall, and Rubeus Hagrid deliver Harry Potter to his only remaining relatives, the Dursleys. Harry is the orphaned son of two wizards who met their demise at the hands of Lord Voldemort, a malevolent, all-powerful wizard, by a Killing Curse. Harry was the only survivor in the chaos leading to his fame in the wizarding world as "The Boy Who Lived"'

In [None]:
# Plain Python list of all (cleaned) plot texts, in DataFrame row order.
plot_list = movies['Plot'].tolist()

def split_list(lst, n):
    """Yield consecutive slices of `lst` holding `n` items each (the last one may be shorter)."""
    starts = range(0, len(lst), n)
    yield from (lst[start:start + n] for start in starts)

# Group the plots into batches of `n` items for the summarization loop.
n = 32  # batch size
batches = [*split_list(plot_list, n)]

In [None]:
# Summarize every plot, one batch at a time, collecting the decoded summaries
# in input order so they can be attached to `movies` as a new column.
all_sumaries = []
for batch in batches:
    with torch.no_grad():
        # Tokenize: plots longer than 1024 tokens are truncated, the rest are
        # padded so the batch forms a rectangular tensor.
        inputs = tokenizer(batch,
                           max_length=1024,
                           padding=True,
                           truncation=True,
                           return_tensors='pt')

        # Generate summaries (max 128 tokens). The attention mask is passed
        # explicitly so padded positions are masked out, rather than relying
        # on generate() to infer the mask from the pad tokens.
        summary_ids = model.generate(inputs['input_ids'].to(device),
                                     attention_mask=inputs['attention_mask'].to(device),
                                     max_length=128,
                                     early_stopping=True).to('cpu')
        decoded = tokenizer.batch_decode(summary_ids,
                                         skip_special_tokens=True,
                                         clean_up_tokenization_spaces=True)
        all_sumaries.extend(txt.strip() for txt in decoded)

        # Drop the batch tensors and release cached GPU memory before the next batch.
        del inputs, summary_ids
        torch.cuda.empty_cache()

movies['PlotSummary'] = all_sumaries

In [None]:
# Write the DataFrame (now including 'PlotSummary') to a new CSV in data_folder;
# index=False keeps the row index out of the file.
movies.to_csv(data_folder + 'wiki_movie_plots_deduped_summarized.csv', index=False)

In [None]:
# Preview the first rows to compare each plot with its generated summary.
movies.head()

Unnamed: 0,Title,Plot,PlotSummary
0,Kansas Saloon Smashers,"A bartender is working at a saloon, serving drinks to customers. After he fills a stereotypicall...",Carrie Nation and her followers burst into a saloon and attack a bartender. The group then begin...
1,Love by the Light of the Moon,"The moon, painted with a smiling face hangs over a park at night. A young couple walking past a ...","The moon, painted with a smiling face hangs over a park at night. A young couple walking past a ..."
2,The Martyred Presidents,"The film, just over a minute long, is composed of two shots. In the first, a girl sits at the ba...","The film, just over a minute long, is composed of two shots. In the first, a girl sits at the ba..."
3,"Terrible Teddy, the Grizzly King","Lasting just 61 seconds and consisting of two shots, the first shot is set in a wood during wint...","The first shot is set in a wood during winter and lasts just 61 seconds. Two men, bearing signs ..."
4,Jack and the Beanstalk,"The earliest known adaptation of the classic fairytale, this films shows Jack trading his cow fo...",The earliest known adaptation of the classic fairytale shows Jack trading his cow for the beans....


In [None]:
# Remove the column-width limit so the full plot and summary are displayed.
# The full option name is used because pandas' shorthand lookup can break if
# another option containing "max_colwidth" is ever added.
pd.set_option("display.max_colwidth", None)
# regex=False: the title is matched as a literal substring, not as a regex pattern.
movies[movies.Title.str.contains('Harry Potter and the Order of the Phoenix', regex=False)]

Unnamed: 0,Title,Plot,PlotSummary
15225,Harry Potter and the Order of the Phoenix,"During another summer with his Aunt Petunia and Uncle Vernon, Harry Potter and Dudley are attacked by Dementors. After using magic to save Dudley and himself, Harry is expelled from Hogwarts, but the decision is later reversed after a hearing at the Ministry of Magic. Harry is whisked off by a group of wizards including Mad-Eye Moody, Remus Lupin, and several new faces, including Nymphadora Tonks and Kingsley Shacklebolt, to Number 12, Grimmauld Place, the childhood home of his godfather, Sirius Black. The building also serves as the headquarters of the Order of the Phoenix, a secret organisation founded by Albus Dumbledore, informs Harry Potter that the Ministry of Magic is oblivious to Lord Voldemort's return. At the Order's headquarters, Sirius Black, mentions that Voldemort is after an object he did not have during his previous attack. Back at Hogwarts, Harry learns that Minister of Magic Cornelius Fudge has appointed a new Defence Against the Dark Arts professor: Dolores Umbridge. She and Harry immediately clash, and she punishes Harry for his ""lies"" by forcing him to write a message with a magic quill, scarring his hand. When Ron and Hermione notice Harry's scars they are outraged, but Harry refuses to go to Dumbledore, who has distanced himself from Harry since the summer. As Umbridge's control over the school increases, Ron and Hermione aid Harry in forming a secret group to train students in defensive spells, calling themselves ""Dumbledore's Army"". Umbridge recruits the Slytherin students to expose the group. Meanwhile, Harry and Cho Chang develop romantic feelings for each other. Harry has a vision involving an attack upon Arthur Weasley, from the point of view of Arthur's attacker. Concerned that Voldemort will exploit this connection to Harry, Dumbledore instructs Severus Snape to give Harry Occlumency lessons to defend his mind from Voldemort's influence. 
The connection between Harry and Voldemort leads Harry to further isolate himself from his friends. Meanwhile, Bellatrix Lestrange, Sirius' deranged Death Eater cousin, escapes from Azkaban along with nine other Death Eaters. At Hogwarts, Umbridge and her Inquisitorial Squad expose Dumbledore's Army. Dumbledore escapes as Fudge orders his arrest. Umbridge becomes the new Headmistress. Harry's relationship with Cho falls apart, as he believes she betrayed Dumbledore's Army to Umbridge. Harry discovers through Snape's memories why Snape hated Harry's father James, who often ridiculed him. Harry has another vision, this one of Sirius being tortured by Voldemort. Harry, Ron and Hermione rush to Umbridge's fireplace to alert the Order via the Floo Network, since hers is the only fireplace not being monitored, but Umbridge stops them before they can do so. As Umbridge tortures Harry, Hermione tricks Umbridge into entering the Forbidden Forest in search of Dumbledore's ""secret weapon"". She and Harry lead her to the hiding place of Hagrid's giant half-brother, Grawp, only to be confronted by centaurs who kidnap Umbridge after she attacks and insults them. Harry, Hermione, Ron, Luna, Neville and Ginny fly to the Ministry of Magic on Thestrals in an attempt to save Sirius. The six enter the Department of Mysteries where they uncover a bottled prophecy, the object Voldemort was after. However, they are ambushed by Death Eaters including Lucius Malfoy and Bellatrix Lestrange. Lucius reveals that Harry only saw a dream of Sirius being tortured; it was simply a ruse to lure Harry into the Death Eaters' grasp. Harry refuses to give Lucius the prophecy, and a fight between Dumbledore's Army and the Death Eaters ensues. The Death Eaters take everyone except Harry as hostages, threatening to kill them unless he surrenders the prophecy. Harry obliges just as Sirius and Remus Lupin arrive with Order members Nymphadora Tonks, Kingsley Shacklebolt and Mad-Eye Moody. 
As they attack the Death Eaters, Lucius drops the prophecy, destroying it. Just as Sirius overpowers Lucius, Bellatrix kills Sirius. Voldemort appears, but Dumbledore arrives through the Floo Network moments before Voldemort can kill Harry. A duel between Voldemort and Dumbledore ensues, destroying much of the Atrium, while Bellatrix escapes. After the two prove evenly matched, Voldemort possesses Harry to try to get Dumbledore to sacrifice Harry, but the love Harry feels for his friends and Sirius makes it impossible for Voldemort to remain in his body. Ministry officials arrive before Voldemort disapparates; Fudge is forced to admit that Voldemort has returned and is forced out of his position as Minister in disgrace. Umbridge is removed from Hogwarts and Dumbledore returns as headmaster. Dumbledore explains that he distanced himself from Harry throughout the year hoping it would lessen the risk of Voldemort using their connection. Harry comes to terms with the prophecy; ""Neither can live while the other survives.""","Harry is expelled from Hogwarts after using magic to save Dudley and himself from Dementors. The Ministry of Magic is oblivious to Lord Voldemort's return. Harry has a vision involving an attack upon Arthur Weasley, from the point of view of the attacker. The connection between Harry and Voldemort leads Harry to isolate himself from his friends."
