In [1]:
# Uses ths guide: https://stackabuse.com/text-summarization-with-nltk-in-python/

In [2]:
##adding .lower in other sections (such as tokenize version)
#terminal upload .py 
#

In [3]:
# You'll need all these packages. You might need to install the extra nltk packages (see: https://www.nltk.org/data.html)

import re
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
import pandas as pd
import heapq

In [4]:
# Read movie summaries into Pandas
# Store Movie ID in first column, summary in second column.
# Should make a n x 2 matrix where n = number of movie summaries

df = pd.read_csv('/generate_summaries/plot_summaries.txt',sep='\t', names = ["ID", "summary"])
df.columns = df.columns.str.strip()
print(df)

             ID                                            summary
0      23890098  Shlykov, a hard-working taxi driver and Lyosha...
1      31186339  The nation of Panem consists of a wealthy Capi...
2      20663735  Poovalli Induchoodan  is sentenced for six yea...
3       2231378  The Lemon Drop Kid , a New York City swindler,...
4        595909  Seventh-day Adventist Church pastor Michael Ch...
...         ...                                                ...
42298  34808485  The story is about Reema , a young Muslim scho...
42299   1096473  In 1928 Hollywood, director Leo Andreyev  look...
42300  35102018  American Luthier focuses on Randy Parsons’ tra...
42301   8628195  Abdur Rehman Khan , a middle-aged dry fruit se...
42302   6040782  1940 - Operation Dynamo has just taken place. ...

[42303 rows x 2 columns]


In [6]:
# Remove Square Brackets and Extra Spaces

size = len(df.summary)
article_text = ["" for x in range(size)]

for i in range(size):
    article_text[i] = re.sub(r'\[[0-9]*\]', ' ', df.summary[i])
    article_text[i] = re.sub(r'\s+', ' ', df.summary[i])


# QC. The list should be as long as the total number of rows in the Pandas dataframe
print('The article_text list contains {0} article summaries.'.format(len(article_text)))

The article_text list contains 42303 article summaries.


In [7]:
# QC. Slice the first 6 elements in the article_text list. Make sure that they are cleaned correctly and
# Make sure that one summary is still one element in the list

article_text[0:5]

# we have some ([( before certain names but they are also there in the original data



["Shlykov, a hard-working taxi driver and Lyosha, a saxophonist, develop a bizarre love-hate relationship, and despite their prejudices, realize they aren't so different after all.",
 'The nation of Panem consists of a wealthy Capitol and twelve poorer districts. As punishment for a past rebellion, each district must provide a boy and girl between the ages of 12 and 18 selected by lottery for the annual Hunger Games. The tributes must fight to the death in an arena; the sole survivor is rewarded with fame and wealth. In her first Reaping, 12-year-old Primrose Everdeen is chosen from District 12. Her older sister Katniss volunteers to take her place. Peeta Mellark, a baker\'s son who once gave Katniss bread when she was starving, is the other District 12 tribute. Katniss and Peeta are taken to the Capitol, accompanied by their frequently drunk mentor, past victor Haymitch Abernathy. He warns them about the "Career" tributes who train intensively at special academies and almost always wi

In [8]:
# Remove special characters and digits

formatted_article_text = ["" for x in range(size)]
for i in range(size):
    formatted_article_text[i] = re.sub('[^a-zA-Z]', ' ', article_text[i])
    formatted_article_text[i] = re.sub(r'\s+', ' ', formatted_article_text[i])

# QC. The list should be as long as the total number of rows in the Pandas dataframe
print('The formatted_article_text list contains {0} article summaries.'.format(len(formatted_article_text)))



The formatted_article_text list contains 42303 article summaries.


In [9]:
# QC. Slice the first 6 elements in the article_text list. Make sure that they are cleaned correctly and
# Make sure that one summary is still one element in the list

formatted_article_text[0:5]

# Removing digits alsoo removes dates included in the original text
# It is okay since we will use the article_text to differentiate words.



['Shlykov a hard working taxi driver and Lyosha a saxophonist develop a bizarre love hate relationship and despite their prejudices realize they aren t so different after all ',
 'The nation of Panem consists of a wealthy Capitol and twelve poorer districts As punishment for a past rebellion each district must provide a boy and girl between the ages of and selected by lottery for the annual Hunger Games The tributes must fight to the death in an arena the sole survivor is rewarded with fame and wealth In her first Reaping year old Primrose Everdeen is chosen from District Her older sister Katniss volunteers to take her place Peeta Mellark a baker s son who once gave Katniss bread when she was starving is the other District tribute Katniss and Peeta are taken to the Capitol accompanied by their frequently drunk mentor past victor Haymitch Abernathy He warns them about the Career tributes who train intensively at special academies and almost always win During a TV interview with Caesar F

In [10]:
# Convert Text to Sentences

sentence_list = ["" for x in range(size)]
for i in range(size):
  # this should be article_text because formatted_article_text does not include any punctuation
    sentence_list[i] = nltk.sent_tokenize(article_text[i])

# QC. The list should be as long as the total number of rows in the Pandas dataframe
print('The sentence_text list contains {0} article summaries.'.format(len(sentence_list)))

The sentence_text list contains 42303 article summaries.


In [11]:
# QC. Slice the first 6 elements in the sentence_text list. Make sure that they are cleaned correctly and
# Make sure that one summary is still one element in the list

sentence_list[0:5]

[["Shlykov, a hard-working taxi driver and Lyosha, a saxophonist, develop a bizarre love-hate relationship, and despite their prejudices, realize they aren't so different after all."],
 ['The nation of Panem consists of a wealthy Capitol and twelve poorer districts.',
  'As punishment for a past rebellion, each district must provide a boy and girl between the ages of 12 and 18 selected by lottery for the annual Hunger Games.',
  'The tributes must fight to the death in an arena; the sole survivor is rewarded with fame and wealth.',
  'In her first Reaping, 12-year-old Primrose Everdeen is chosen from District 12.',
  'Her older sister Katniss volunteers to take her place.',
  "Peeta Mellark, a baker's son who once gave Katniss bread when she was starving, is the other District 12 tribute.",
  'Katniss and Peeta are taken to the Capitol, accompanied by their frequently drunk mentor, past victor Haymitch Abernathy.',
  'He warns them about the "Career" tributes who train intensively at s

In [12]:
# For QCing the word frequency loop. It calculates word frequencies for the 1st element in the list of summaries

# Find Weighted Frequency of Occurance
stopwords = nltk.corpus.stopwords.words('english')


word_frequencies_test = {}
for word in nltk.word_tokenize(formatted_article_text[1]):
    if word.lower() not in stopwords: #by adding .lower here we excluded 
        #'The' to be counted but stil counted private names (Capitols, Primrose)
        if word not in word_frequencies_test.keys():
            word_frequencies_test[word] = 1
        else:
            word_frequencies_test[word] += 1
            
                

In [13]:
print(word_frequencies_test)

# One thing to consider here is that while stopwords have him.her.the, and they are not counted. 
# If The, Him, and Her are used in the text, these capitalized ones are counted
# Also cpaitalized words are counted differently from lowercase words
# This produces useful and interpretable summaries, which is all we need this code to do

{'nation': 1, 'Panem': 1, 'consists': 1, 'wealthy': 1, 'Capitol': 2, 'twelve': 1, 'poorer': 1, 'districts': 1, 'punishment': 1, 'past': 2, 'rebellion': 1, 'district': 2, 'must': 2, 'provide': 2, 'boy': 2, 'girl': 2, 'ages': 1, 'selected': 1, 'lottery': 1, 'annual': 1, 'Hunger': 2, 'Games': 5, 'tributes': 4, 'fight': 1, 'death': 2, 'arena': 2, 'sole': 1, 'survivor': 2, 'rewarded': 1, 'fame': 1, 'wealth': 1, 'first': 2, 'Reaping': 1, 'year': 1, 'old': 1, 'Primrose': 1, 'Everdeen': 1, 'chosen': 1, 'District': 7, 'older': 1, 'sister': 1, 'Katniss': 24, 'volunteers': 1, 'take': 1, 'place': 1, 'Peeta': 16, 'Mellark': 1, 'baker': 1, 'son': 1, 'gave': 1, 'bread': 1, 'starving': 1, 'tribute': 3, 'taken': 1, 'accompanied': 1, 'frequently': 1, 'drunk': 1, 'mentor': 1, 'victor': 1, 'Haymitch': 4, 'Abernathy': 1, 'warns': 2, 'Career': 1, 'train': 1, 'intensively': 1, 'special': 1, 'academies': 1, 'almost': 1, 'always': 1, 'win': 2, 'TV': 1, 'interview': 1, 'Caesar': 1, 'Flickerman': 1, 'unexpectedl

In [14]:
# Find Weighted Frequency of Occurance
stopwords = nltk.corpus.stopwords.words('english')

word_frequencies = {x:{} for x in range(size)}

for i in range(size):
    for word in nltk.word_tokenize(formatted_article_text[i]):
        if word.lower() not in stopwords:
            if word not in word_frequencies[i].keys():
                word_frequencies[i][word] = 1
            else:
                word_frequencies[i][word] += 1
                
# QC. The list should be as long as the total number of rows in the Pandas dataframe
print('The word_frequencies dictionary is {0} rows long.'.format(len(word_frequencies)))

The word_frequencies dictionary is 42303 rows long.


In [15]:
# QC. Index the 1st element in the dictionary. The frequency counts should match word_frequencies_test
print(word_frequencies[1])


{'nation': 1, 'Panem': 1, 'consists': 1, 'wealthy': 1, 'Capitol': 2, 'twelve': 1, 'poorer': 1, 'districts': 1, 'punishment': 1, 'past': 2, 'rebellion': 1, 'district': 2, 'must': 2, 'provide': 2, 'boy': 2, 'girl': 2, 'ages': 1, 'selected': 1, 'lottery': 1, 'annual': 1, 'Hunger': 2, 'Games': 5, 'tributes': 4, 'fight': 1, 'death': 2, 'arena': 2, 'sole': 1, 'survivor': 2, 'rewarded': 1, 'fame': 1, 'wealth': 1, 'first': 2, 'Reaping': 1, 'year': 1, 'old': 1, 'Primrose': 1, 'Everdeen': 1, 'chosen': 1, 'District': 7, 'older': 1, 'sister': 1, 'Katniss': 24, 'volunteers': 1, 'take': 1, 'place': 1, 'Peeta': 16, 'Mellark': 1, 'baker': 1, 'son': 1, 'gave': 1, 'bread': 1, 'starving': 1, 'tribute': 3, 'taken': 1, 'accompanied': 1, 'frequently': 1, 'drunk': 1, 'mentor': 1, 'victor': 1, 'Haymitch': 4, 'Abernathy': 1, 'warns': 2, 'Career': 1, 'train': 1, 'intensively': 1, 'special': 1, 'academies': 1, 'almost': 1, 'always': 1, 'win': 2, 'TV': 1, 'interview': 1, 'Caesar': 1, 'Flickerman': 1, 'unexpectedl

In [16]:
# Get Weighted Frequency
maximum_frequncy = ["" for x in range(size)]
for i in range(size):
    maximum_frequncy[i] = max(word_frequencies[i].values())

    for word in word_frequencies[i].keys():
        word_frequencies[i][word] = (word_frequencies[i][word]/maximum_frequncy[i])
#QC   
print('The maximum_frequncy dictionary is {0} rows long.'.format(len(maximum_frequncy)))



The maximum_frequncy dictionary is 42303 rows long.


In [17]:
#QC. You can compare this result with the previous list of frequencies and see if it is the highest. 
print(maximum_frequncy[1])

24


In [18]:
# For QCing the sentence scoes loop. It calculates sentence scores for the 1st element in the list of summaries

sentence_scores_test = {}
for sent in sentence_list[1]:
    for word in nltk.word_tokenize(sent.lower()):
        if word in word_frequencies[1].keys():
            if len(sent.split(' ')) < 35: ## Change this to specify how long/short of sentences you want to include
                if sent not in sentence_scores_test.keys():
                    sentence_scores_test[sent] = word_frequencies[1][word]
                else:
                    sentence_scores_test[sent] += word_frequencies[1][word]
                    

In [19]:
# Check your work

print (sentence_scores_test)

{'The nation of Panem consists of a wealthy Capitol and twelve poorer districts.': 0.24999999999999997, 'As punishment for a past rebellion, each district must provide a boy and girl between the ages of 12 and 18 selected by lottery for the annual Hunger Games.': 0.7499999999999998, 'The tributes must fight to the death in an arena; the sole survivor is rewarded with fame and wealth.': 0.7083333333333333, 'In her first Reaping, 12-year-old Primrose Everdeen is chosen from District 12.': 0.20833333333333331, 'Her older sister Katniss volunteers to take her place.': 0.20833333333333331, "Peeta Mellark, a baker's son who once gave Katniss bread when she was starving, is the other District 12 tribute.": 0.41666666666666663, 'Katniss and Peeta are taken to the Capitol, accompanied by their frequently drunk mentor, past victor Haymitch Abernathy.': 0.3333333333333333, 'He warns them about the "Career" tributes who train intensively at special academies and almost always win.': 0.583333333333

In [20]:
# Calculate Sentence Scores

sentence_scores = {x:{} for x in range(size)}

for i in range(size):
    for sent in sentence_list[i]:
        for word in nltk.word_tokenize(sent.lower()):
            if word in word_frequencies[i].keys():
                if len(sent.split(' ')) < 35: ## We have some very short ones. With this, they are excluded
                    #Change this to specify how long/short of sentences you want to include
                    if sent not in sentence_scores[i].keys():
                        sentence_scores[i][sent] = word_frequencies[i][word]
                    else:
                        sentence_scores[i][sent] += word_frequencies[i][word]
#QC. 
print('The sentence_scores dictionary is {0} rows long.'.format(len(sentence_scores)))




The sentence_scores dictionary is 42303 rows long.


In [21]:
#QC sentence score of the hunger games (element 1). It should be same with above example. 

print (sentence_scores[1])

{'The nation of Panem consists of a wealthy Capitol and twelve poorer districts.': 0.24999999999999997, 'As punishment for a past rebellion, each district must provide a boy and girl between the ages of 12 and 18 selected by lottery for the annual Hunger Games.': 0.7499999999999998, 'The tributes must fight to the death in an arena; the sole survivor is rewarded with fame and wealth.': 0.7083333333333333, 'In her first Reaping, 12-year-old Primrose Everdeen is chosen from District 12.': 0.20833333333333331, 'Her older sister Katniss volunteers to take her place.': 0.20833333333333331, "Peeta Mellark, a baker's son who once gave Katniss bread when she was starving, is the other District 12 tribute.": 0.41666666666666663, 'Katniss and Peeta are taken to the Capitol, accompanied by their frequently drunk mentor, past victor Haymitch Abernathy.': 0.3333333333333333, 'He warns them about the "Career" tributes who train intensively at special academies and almost always win.': 0.583333333333

In [22]:
# Single example for Summary

summary_sentences_test = heapq.nlargest(2, sentence_scores[1], key=sentence_scores[1].get)

summary_test = ' '.join(summary_sentences_test)
print(summary_test)

# The results should in the order of highest to the lowest



As punishment for a past rebellion, each district must provide a boy and girl between the ages of 12 and 18 selected by lottery for the annual Hunger Games. The tributes must fight to the death in an arena; the sole survivor is rewarded with fame and wealth.


In [23]:
# To understand what is different than summary_test

print(summary_sentences_test)

['As punishment for a past rebellion, each district must provide a boy and girl between the ages of 12 and 18 selected by lottery for the annual Hunger Games.', 'The tributes must fight to the death in an arena; the sole survivor is rewarded with fame and wealth.']


In [24]:
# Get a Summary
# First part, creating a summary sentences list

#Change value here to get summary sentence length
summary_sentences = ["" for x in range(size)]
for i in range(size):
    summary_sentences[i] = heapq.nlargest(2, sentence_scores[i], key=sentence_scores[i].get)

#QC. 
print('The summary_sentences list is {0} rows long.'.format(len(summary_sentences)))

The summary_sentences list is 42303 rows long.


In [25]:
# Should be two sentences in each

print(summary_sentences[0:5])


[["Shlykov, a hard-working taxi driver and Lyosha, a saxophonist, develop a bizarre love-hate relationship, and despite their prejudices, realize they aren't so different after all."], ['As punishment for a past rebellion, each district must provide a boy and girl between the ages of 12 and 18 selected by lottery for the annual Hunger Games.', 'The tributes must fight to the death in an arena; the sole survivor is rewarded with fame and wealth.'], ['In court, Nandagopal Maarar , a close friend of Induchoodan and a famous supreme court lawyer, appears for Menon and manages to lay bare the murder plot and hidden intentions of other party .', "At Menon's funeral, Manapally Pavithran arrives to poke fun at Induchoodan and he also tries to carry out the postponed last rituals of his own father."], ['The Kid learns of this when he returns to the home after a late night to find the home deserted and money gone.', "He decides to recover the money, sneaking into Charley's home in the guise of a

In [26]:
# Now to get these two sentences separated '' into a joint sentences.

summary = ["" for x in range(size)]
for i in range(size):
    summary[i] = ' '.join(summary_sentences[i])

#QC 
print('The summary list is {0} rows long.'.format(len(summary)))

The summary list is 42303 rows long.


In [27]:
print(summary[0:5])

["Shlykov, a hard-working taxi driver and Lyosha, a saxophonist, develop a bizarre love-hate relationship, and despite their prejudices, realize they aren't so different after all.", 'As punishment for a past rebellion, each district must provide a boy and girl between the ages of 12 and 18 selected by lottery for the annual Hunger Games. The tributes must fight to the death in an arena; the sole survivor is rewarded with fame and wealth.', "In court, Nandagopal Maarar , a close friend of Induchoodan and a famous supreme court lawyer, appears for Menon and manages to lay bare the murder plot and hidden intentions of other party . At Menon's funeral, Manapally Pavithran arrives to poke fun at Induchoodan and he also tries to carry out the postponed last rituals of his own father.", "The Kid learns of this when he returns to the home after a late night to find the home deserted and money gone. He decides to recover the money, sneaking into Charley's home in the guise of an elderly woman.

In [40]:
summaries_without = ["" for x in range(size)]
for i in range(size):
    summaries_without[i] = re.sub(r'\. ', ' ', summary[i])

# QC. The list should be as long as the total number of rows in the Pandas dataframe
print('The formatted_article_text list contains {0} article summaries.'.format(len(summaries_without)))




The formatted_article_text list contains 42303 article summaries.


In [44]:
print(summaries_without[0])

Shlykov, a hard-working taxi driver and Lyosha, a saxophonist, develop a bizarre love-hate relationship, and despite their prejudices, realize they aren't so different after all.


In [45]:
# Save summaries_without
import numpy as np
np.savetxt('movie_summaries_without.txt',summaries_without, delimiter='str',fmt='%s')

In [26]:
# save summaries as a text file
import numpy as np
np.savetxt('movie_summaries.txt',summary, delimiter='str',fmt='%s')