In [None]:
# Uses this guide: https://stackabuse.com/text-summarization-with-nltk-in-python/

In [None]:
# You'll need all these packages. You might need to install the extra nltk packages (see: https://www.nltk.org/data.html)

import re
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
import pandas as pd
import heapq

In [None]:
# Load the tab-separated plot summaries into a pandas DataFrame.
# The file has no header row, so the columns are named here:
# "ID" (movie ID) first, "summary" (plot text) second -- one row per movie summary.
df = (
    pd.read_csv(
        'plot_summaries_short.txt',
        sep='\t',
        names=["ID", "summary"],
    )
    # Strip any surrounding whitespace from the column labels
    .rename(columns=str.strip)
)

print(df)

In [None]:
# Remove bracketed numeric markers (e.g. "[12]") and extra spaces

size = len(df.summary)
article_text = ["" for x in range(size)]

for i in range(size):
    # Step 1: replace bracketed numbers with a single space
    article_text[i] = re.sub(r'\[[0-9]*\]', ' ', df.summary[i])
    # Step 2: collapse whitespace runs in the text cleaned by step 1.
    # This must read article_text[i]; re-reading df.summary[i] here would
    # throw away the bracket removal done on the previous line.
    article_text[i] = re.sub(r'\s+', ' ', article_text[i])


# QC. The list should be as long as the total number of rows in the Pandas dataframe
print('The article_text list contains {0} article summaries.'.format(len(article_text)))

In [None]:
# QC. Show the first 5 elements of the article_text list. Check that they are cleaned correctly
# and that each summary is still exactly one element in the list

article_text[:5]

In [None]:
# Remove special characters and digits

# Letters only: every non-letter character becomes a space, then whitespace runs are collapsed
formatted_article_text = [
    re.sub(r'\s+', ' ', re.sub('[^a-zA-Z]', ' ', article_text[idx]))
    for idx in range(size)
]

# QC. The list should be as long as the total number of rows in the Pandas dataframe
print('The formatted_article_text list contains {0} article summaries.'.format(len(formatted_article_text)))

In [None]:
# QC. Show the first 5 elements of the formatted_article_text list. Check that they are cleaned
# correctly and that each summary is still exactly one element in the list

formatted_article_text[:5]

In [None]:
# Convert Text to Sentences

# One list of sentences per summary. Sentences come from article_text (punctuation kept),
# not from formatted_article_text.
sentence_list = [sent_tokenize(article_text[idx]) for idx in range(size)]

# QC. The list should be as long as the total number of rows in the Pandas dataframe
print('The sentence_list is {0} rows long.'.format(len(sentence_list)))

In [None]:
# QC. Show the first 5 elements of sentence_list. Check that each summary was split into
# sentences and that one summary is still one element (a list of sentences) in the outer list

sentence_list[:5]

In [None]:
# For QCing the word frequency loop. It counts word frequencies for the summary at index 1 only

# Find Weighted Frequency of Occurrence
stopwords = nltk.corpus.stopwords.words('english')

word_frequencies_test = {}
for word in nltk.word_tokenize(formatted_article_text[1]):
    # Skip stop words; everything else is tallied
    if word in stopwords:
        continue
    word_frequencies_test[word] = word_frequencies_test.get(word, 0) + 1

In [None]:
# QC. Show the raw word counts computed above for the summary at index 1
print(word_frequencies_test)

In [None]:
# Find Weighted Frequency of Occurrence
stopwords = nltk.corpus.stopwords.words('english')

# One {word: count} dictionary per summary, keyed by the summary's position
word_frequencies = {x: {} for x in range(size)}

for i in range(size):
    counts = word_frequencies[i]
    for word in nltk.word_tokenize(formatted_article_text[i]):
        # Skip stop words; everything else is tallied
        if word in stopwords:
            continue
        counts[word] = counts.get(word, 0) + 1

# QC. The dictionary should have as many entries as there are rows in the Pandas dataframe
print('The word_frequencies dictionary is {0} rows long.'.format(len(word_frequencies)))

In [None]:
# QC. Look up the entry at index 1 in the dictionary. The frequency counts should match word_frequencies_test
print(word_frequencies[1])

In [None]:
# Get Max Frequency

# Highest word count for each summary, in row order.
# default=0 covers a summary whose word_frequencies entry is empty (no counted words):
# without it, max() raises ValueError on the empty sequence and the whole cell fails.
maximum_frequncy = [
    max(word_frequencies[i].values(), default=0)
    for i in range(size)
]

# QC. The maximum_frequncy list should be as long as the total number of rows in the Pandas dataframe
print('The maximum_frequncy list is {0} rows long.'.format(len(maximum_frequncy)))

In [None]:
# QC to check that this list contains the max word frequency for each plot summary
# (one value per summary, in the same order as the rows)

print(maximum_frequncy)

In [None]:
# Get Weighted Frequency
# Rescale every count in word_frequencies (in place) to count / largest count in that summary.

for i in range(size):
    counts = word_frequencies[i]
    if not counts:
        # No counted words for this summary: nothing to normalize
        continue
    # Normalize by the current largest value instead of the stored maximum_frequncy[i].
    # On the first run the two are equal. On a re-run the current largest value is already
    # 1.0, so the weights are left unchanged instead of being divided by the raw max again.
    peak = max(counts.values())
    for word in counts:
        counts[word] = counts[word] / peak

# QC. The word_frequencies dictionary should have as many entries as there are rows in the Pandas dataframe
print('The word_frequncies list is {0} rows long.'.format(len(word_frequencies)))

In [None]:
# QC to check that each word contains its weighted frequency: the word's count divided by the max word count.
# Shows the entry for the summary at index 1.

print(word_frequencies[1])

In [None]:
# Calculate Sentence Scores
# A sentence's score is the sum of the weighted frequencies of its (lower-cased) tokens
# that appear in that summary's word_frequencies dictionary.

# Only sentences with fewer than this many space-separated words are scored.
# Change this to specify how long/short of sentences you want to include
max_sentence_words = 20

# One {sentence: score} dictionary per summary, keyed by the summary's position
sentence_scores = {x: {} for x in range(size)}

# NOTE(review): tokens are lower-cased here, but word_frequencies is built from
# formatted_article_text without lower-casing, so capitalized words never match -- confirm
# whether the counting cells should lower-case too.
for i in range(size):
    frequencies = word_frequencies[i]
    scores = sentence_scores[i]
    for sent in sentence_list[i]:
        if len(sent.split(' ')) >= max_sentence_words:
            continue
        for word in nltk.word_tokenize(sent.lower()):
            if word in frequencies:
                # Accumulate into this summary's own dictionary. Writing to
                # sentence_scores[sent] would leave every per-summary dictionary empty
                # and overwrite the score on each matching word.
                scores[sent] = scores.get(sent, 0) + frequencies[word]

In [None]:
# QC. Number of top-level entries in sentence_scores; expected to equal the number of summaries
len(sentence_scores)

In [None]:
# QC. Number of summaries in sentence_list, for comparison with len(sentence_scores)
len(sentence_list)

In [None]:
# QC. Show the sentence scores stored for the first summary (index 0)
print(sentence_scores[0])

In [None]:
# Get a Summary
# Builds an extractive summary for every row: the highest-scoring sentences of each
# plot summary, joined with spaces (highest score first).

# Change value here to get summary sentence length
summary_length = 2

# One summary string per plot summary, in row order. The key function must be the
# per-summary dictionary's .get: the outer sentence_scores dictionary is keyed by row
# position, so sentence_scores.get(sentence) returns None for every sentence.
summaries = [
    ' '.join(heapq.nlargest(summary_length, sentence_scores[idx], key=sentence_scores[idx].get))
    for idx in range(size)
]

# QC. Show the summary for the plot summary at index 1
summary_sentences = heapq.nlargest(summary_length, sentence_scores[1], key=sentence_scores[1].get)

summary = ' '.join(summary_sentences)
print(summary)

In [None]:
# OK, now it is time to score the summaries on arousal, valence, and dominance. There is a nice dictionary by
# Bradley and Lang that will let you do exactly that. You can borrow that here:
# https://github.com/dwzhou/SentimentAnalysis

# Basically, you'll want to classify each cell in the Pandas Dataframe (where each cell contains a movie summary)
# along arousal, valence, and dominance, so we can get a score for each.

In [None]:
# Note: this is super optional and likely not necessary. You could, instead of using a dictionary approach,
# train a classifier to do your sentiment analysis. This would be cooler, but also probably a lot of work,
# and I'm not sure it would gain us all that much. But if you are feeling energetic, or the Lang dictionary
# doesn't work, here are some hints about training a classifier.

# Train a text sentiment classifier. Here we are in good shape because most are trained on movie ratings.
# But you could also train the classifier on the unsummarized movie reviews.
# For ideas on how to do this, check out: https://towardsdatascience.com/sentiment-analysis-with-python-part-1-5ce197074184