In [1]:
# Uses ths guide: https://stackabuse.com/text-summarization-with-nltk-in-python/

In [1]:
# You'll need all these packages. You might need to install the extra nltk packages (see: https://www.nltk.org/data.html)

import re
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
import pandas as pd
import heapq

In [2]:
# Read movie summaries into Pandas
# Store Movie ID in first column, summary in second column.
# Should make a n x 2 matrix where n = number of movie summaries
df = pd.read_csv('plot_summaries.txt',sep='\t', names = ["ID", "summary"])
df.columns = df.columns.str.strip()
print(df)

             ID                                            summary
0      23890098  Shlykov, a hard-working taxi driver and Lyosha...
1      31186339  The nation of Panem consists of a wealthy Capi...
2      20663735  Poovalli Induchoodan  is sentenced for six yea...
3       2231378  The Lemon Drop Kid , a New York City swindler,...
4        595909  Seventh-day Adventist Church pastor Michael Ch...
...         ...                                                ...
42298  34808485  The story is about Reema , a young Muslim scho...
42299   1096473  In 1928 Hollywood, director Leo Andreyev  look...
42300  35102018  American Luthier focuses on Randy Parsons’ tra...
42301   8628195  Abdur Rehman Khan , a middle-aged dry fruit se...
42302   6040782  1940 - Operation Dynamo has just taken place. ...

[42303 rows x 2 columns]


In [3]:
# This needs to be better. Instead of defining string text, it needs to import the .txt file to pandas
# Where each row is a new movie summary

## article_text = '''
## In 1936, archaeologist Indiana Jones braves an ancient Peruvian temple filled with booby traps to retrieve a golden idol. Upon fleeing the temple, Indiana is confronted by rival archaeologist René Belloq and the indigenous Hovitos. Surrounded and outnumbered, Indiana is forced to surrender the idol to Belloq and escapes aboard a waiting Waco seaplane, in the process revealing his fear of snakes. Shortly after returning to the college in the United States where he teaches archaeology, Indiana is interviewed by two Army intelligence agents. They inform him that the Nazis, in their quest for occult power, are searching for his old mentor, Abner Ravenwood, who is the leading expert on the ancient Egyptian city of Tanis and possesses the headpiece of an artifact called the Staff of Ra. Indiana deduces that the Nazis are searching for Tanis because it is believed to be the location of the Ark of the Covenant, the biblical chest built by the Israelites to contain the fragments of the Ten Commandments; the Nazis believe that if they acquire it, their armies will become invincible. The Staff of Ra, meanwhile, is the key to finding the Well of Souls, a secret chamber in which the Ark is buried. The agents subsequently authorize Indiana to recover the Ark before the Nazis. Indiana travels to Nepal, only to find that Ravenwood has died and that the headpiece is in the possession of his daughter, Marion, Indiana's embittered former lover. Indiana offers to buy the headpiece for three thousand dollars, plus two thousand more when they return to the United States. Marion's tavern is suddenly raided by a group of thugs commanded by Nazi agent Toht. The tavern is burned down in the ensuing fight, during which Toht burns his hand on the searing hot headpiece as he tries to grab it. Indiana and Marion escape with the headpiece, with Marion declaring she will accompany Indiana in his search for the Ark so he can repay his debt. They travel to Cairo where they learn from Indiana's friend Sallah, a skilled excavator, that Belloq and the Nazis, led by Colonel Dietrich, are currently digging for the Well of Souls with a replica of the headpiece modeled after the scar on Toht's hand. In a bazaar, Nazi operatives attempt to kidnap Marion and as Indiana chases after them it appears that she dies in an explosion. While deciphering the markings on the headpiece, Indiana and Sallah realize that the Nazis have miscalculated the location of the Well of Souls. Using this to their advantage, they infiltrate the Nazi dig and use the Staff of Ra to determine the location correctly and uncover the Well of Souls, which is filled with snakes. Indiana fends off the snakes and acquires the Ark, but Belloq, Dietrich and the Nazis arrive to take it. They toss Marion into the well with Indiana and seal them in, but they manage to escape. After a fistfight with a giant Nazi mechanic, blowing up a flying wing on the airstrip, and chasing down a convoy of trucks, Indiana takes back the Ark before it can be shipped to Berlin. Indiana and Marion leave Cairo to escort the Ark to England on board a tramp steamer. The next morning, their boat is boarded by Belloq, Dietrich and the Nazis, who once again steal the Ark and kidnap Marion. Indiana stows away on their U-boat and follows them to an isolated island in the Aegean Sea where Belloq plans to test the power of the Ark before presenting it to Hitler. Indiana reveals himself and threatens to destroy the Ark with a rocket-propelled grenade launcher, but Belloq calls his bluff, knowing Indy cannot bear to eradicate an important historical artifact. Indiana surrenders and is tied to a post with Marion as Belloq performs a ceremonial opening of the Ark, which appears to contain nothing but sand. Suddenly, spirits resembling Old Testament Seraphim emerge from the Ark. Aware of the supernatural danger of looking at the opened Ark, Indiana warns Marion to close her eyes. The apparitions suddenly morph into "angels of death", and lightning bolts begin flying out of the Ark, gruesomely killing the Nazi soldiers, while Belloq, Dietrich and Toht meet even more gruesome fates. The fires rise into the sky, then fall back down to Earth and the Ark closes with a crack of thunder. Back in Washington, D.C., the Army intelligence agents tell a suspicious Indiana and Brody that the Ark "is someplace safe" to be studied by "top men". In reality, the Ark is sealed in a wooden crate labeled "top secret" and stored in a giant government warehouse filled with countless similar crates.
## '''




In [4]:
# Remove Square Brackets and Extra Spaces

size = len(df.summary)
article_text = ["" for x in range(size)]

for i in range(size):
    article_text[i] = re.sub(r'\[[0-9]*\]', ' ', df.summary[i])
    article_text[i] = re.sub(r'\s+', ' ', df.summary[i])


# QC. The list should be as long as the total number of rows in the Pandas dataframe
print('The article_text list contains {0} article summaries.'.format(len(article_text)))

The article_text list contains 42303 article summaries.


In [5]:
# QC. Slice the first 6 elements in the article_text list. Make sure that they are cleaned correctly and
# Make sure that one summary is still one element in the list

article_text[0:5]

# we have some ([( before certain names but they are also there in the original data



["Shlykov, a hard-working taxi driver and Lyosha, a saxophonist, develop a bizarre love-hate relationship, and despite their prejudices, realize they aren't so different after all.",
 'The nation of Panem consists of a wealthy Capitol and twelve poorer districts. As punishment for a past rebellion, each district must provide a boy and girl between the ages of 12 and 18 selected by lottery for the annual Hunger Games. The tributes must fight to the death in an arena; the sole survivor is rewarded with fame and wealth. In her first Reaping, 12-year-old Primrose Everdeen is chosen from District 12. Her older sister Katniss volunteers to take her place. Peeta Mellark, a baker\'s son who once gave Katniss bread when she was starving, is the other District 12 tribute. Katniss and Peeta are taken to the Capitol, accompanied by their frequently drunk mentor, past victor Haymitch Abernathy. He warns them about the "Career" tributes who train intensively at special academies and almost always wi

In [6]:
# Removing special characters and digits
## formatted_article_text = re.sub('[^a-zA-Z]', ' ', article_text )
## formatted_article_text = re.sub(r'\s+', ' ', formatted_article_text)

# Remove special characters and digits

formatted_article_text = ["" for x in range(size)]
for i in range(size):
    formatted_article_text[i] = re.sub('[^a-zA-Z]', ' ', article_text[i])
    formatted_article_text[i] = re.sub(r'\s+', ' ', formatted_article_text[i])

# QC. The list should be as long as the total number of rows in the Pandas dataframe
print('The formatted_article_text list contains {0} article summaries.'.format(len(formatted_article_text)))



The formatted_article_text list contains 42303 article summaries.


In [7]:
# QC. Slice the first 6 elements in the article_text list. Make sure that they are cleaned correctly and
# Make sure that one summary is still one element in the list

formatted_article_text[0:5]

##removing digits alsoo removes dates included in the original text. It is okay since we will use the article_text to differentiate words.



['Shlykov a hard working taxi driver and Lyosha a saxophonist develop a bizarre love hate relationship and despite their prejudices realize they aren t so different after all ',
 'The nation of Panem consists of a wealthy Capitol and twelve poorer districts As punishment for a past rebellion each district must provide a boy and girl between the ages of and selected by lottery for the annual Hunger Games The tributes must fight to the death in an arena the sole survivor is rewarded with fame and wealth In her first Reaping year old Primrose Everdeen is chosen from District Her older sister Katniss volunteers to take her place Peeta Mellark a baker s son who once gave Katniss bread when she was starving is the other District tribute Katniss and Peeta are taken to the Capitol accompanied by their frequently drunk mentor past victor Haymitch Abernathy He warns them about the Career tributes who train intensively at special academies and almost always win During a TV interview with Caesar F

In [8]:
# Convert Text to Sentences

sentence_list = ["" for x in range(size)]
for i in range(size):
  ##  sentence_list[i] = nltk.sent_tokenize(formatted_article_text[i]) #this should be 
  #article text because other one does not include any punctuations.
    sentence_list[i] = nltk.sent_tokenize(article_text[i])

# QC. The list should be as long as the total number of rows in the Pandas dataframe
print('The sentence_text list contains {0} article summaries.'.format(len(sentence_list)))

The sentence_text list contains 42303 article summaries.


In [9]:
# QC. Slice the first 6 elements in the sentence_text list. Make sure that they are cleaned correctly and
# Make sure that one summary is still one element in the list

sentence_list[0:5]

[["Shlykov, a hard-working taxi driver and Lyosha, a saxophonist, develop a bizarre love-hate relationship, and despite their prejudices, realize they aren't so different after all."],
 ['The nation of Panem consists of a wealthy Capitol and twelve poorer districts.',
  'As punishment for a past rebellion, each district must provide a boy and girl between the ages of 12 and 18 selected by lottery for the annual Hunger Games.',
  'The tributes must fight to the death in an arena; the sole survivor is rewarded with fame and wealth.',
  'In her first Reaping, 12-year-old Primrose Everdeen is chosen from District 12.',
  'Her older sister Katniss volunteers to take her place.',
  "Peeta Mellark, a baker's son who once gave Katniss bread when she was starving, is the other District 12 tribute.",
  'Katniss and Peeta are taken to the Capitol, accompanied by their frequently drunk mentor, past victor Haymitch Abernathy.',
  'He warns them about the "Career" tributes who train intensively at s

In [10]:
# For QCing the word frequency loop. It calculates word frequencies for the 1st element in the list of summaries

# Find Weighted Frequency of Occurance
stopwords = nltk.corpus.stopwords.words('english')

word_frequencies_test = {}
for word in nltk.word_tokenize(formatted_article_text[1]):
    if word not in stopwords:
        if word not in word_frequencies_test.keys():
            word_frequencies_test[word] = 1
        else:
            word_frequencies_test[word] += 1
            
                

In [11]:
print(word_frequencies_test)

##one thing to consider here is that while stopwords have him.her.the, and they are not counted. 
#if The, Him, and Her are used in the text
#capitalized ones are counted. Is it what we wanted? 
#also cpaitalized words are counted differently. Is it okay? 


{'The': 4, 'nation': 1, 'Panem': 1, 'consists': 1, 'wealthy': 1, 'Capitol': 2, 'twelve': 1, 'poorer': 1, 'districts': 1, 'As': 3, 'punishment': 1, 'past': 2, 'rebellion': 1, 'district': 2, 'must': 2, 'provide': 2, 'boy': 2, 'girl': 2, 'ages': 1, 'selected': 1, 'lottery': 1, 'annual': 1, 'Hunger': 2, 'Games': 5, 'tributes': 4, 'fight': 1, 'death': 2, 'arena': 2, 'sole': 1, 'survivor': 2, 'rewarded': 1, 'fame': 1, 'wealth': 1, 'In': 1, 'first': 2, 'Reaping': 1, 'year': 1, 'old': 1, 'Primrose': 1, 'Everdeen': 1, 'chosen': 1, 'District': 7, 'Her': 1, 'older': 1, 'sister': 1, 'Katniss': 24, 'volunteers': 1, 'take': 1, 'place': 1, 'Peeta': 16, 'Mellark': 1, 'baker': 1, 'son': 1, 'gave': 1, 'bread': 1, 'starving': 1, 'tribute': 3, 'taken': 1, 'accompanied': 1, 'frequently': 1, 'drunk': 1, 'mentor': 1, 'victor': 1, 'Haymitch': 4, 'Abernathy': 1, 'He': 2, 'warns': 2, 'Career': 1, 'train': 1, 'intensively': 1, 'special': 1, 'academies': 1, 'almost': 1, 'always': 1, 'win': 2, 'During': 1, 'TV': 1

In [12]:
# Find Weighted Frequency of Occurance
stopwords = nltk.corpus.stopwords.words('english')

word_frequencies = {x:{} for x in range(size)}

for i in range(size):
    for word in nltk.word_tokenize(formatted_article_text[i]):
        if word not in stopwords:
            if word not in word_frequencies[i].keys():
                word_frequencies[i][word] = 1
            else:
                word_frequencies[i][word] += 1
                
# QC. The list should be as long as the total number of rows in the Pandas dataframe
print('The word_frequencies dictionary is {0} rows long.'.format(len(word_frequencies)))

The word_frequencies dictionary is 42303 rows long.


In [13]:
# QC. Index the 1st element in the dictionary. The frequency counts should match word_frequencies_test
print(word_frequencies[1])


{'The': 4, 'nation': 1, 'Panem': 1, 'consists': 1, 'wealthy': 1, 'Capitol': 2, 'twelve': 1, 'poorer': 1, 'districts': 1, 'As': 3, 'punishment': 1, 'past': 2, 'rebellion': 1, 'district': 2, 'must': 2, 'provide': 2, 'boy': 2, 'girl': 2, 'ages': 1, 'selected': 1, 'lottery': 1, 'annual': 1, 'Hunger': 2, 'Games': 5, 'tributes': 4, 'fight': 1, 'death': 2, 'arena': 2, 'sole': 1, 'survivor': 2, 'rewarded': 1, 'fame': 1, 'wealth': 1, 'In': 1, 'first': 2, 'Reaping': 1, 'year': 1, 'old': 1, 'Primrose': 1, 'Everdeen': 1, 'chosen': 1, 'District': 7, 'Her': 1, 'older': 1, 'sister': 1, 'Katniss': 24, 'volunteers': 1, 'take': 1, 'place': 1, 'Peeta': 16, 'Mellark': 1, 'baker': 1, 'son': 1, 'gave': 1, 'bread': 1, 'starving': 1, 'tribute': 3, 'taken': 1, 'accompanied': 1, 'frequently': 1, 'drunk': 1, 'mentor': 1, 'victor': 1, 'Haymitch': 4, 'Abernathy': 1, 'He': 2, 'warns': 2, 'Career': 1, 'train': 1, 'intensively': 1, 'special': 1, 'academies': 1, 'almost': 1, 'always': 1, 'win': 2, 'During': 1, 'TV': 1

In [14]:
# Get Weighted Frequency
maximum_frequncy = ["" for x in range(size)]
for i in range(size):
    maximum_frequncy[i] = max(word_frequencies[i].values())

    for word in word_frequencies[i].keys():
        word_frequencies[i][word] = (word_frequencies[i][word]/maximum_frequncy[i])
 #QC   
print('The maximum_frequncy dictionary is {0} rows long.'.format(len(maximum_frequncy)))



The maximum_frequncy dictionary is 42303 rows long.


In [15]:
#QC. You can compare this result with the previous list of frequencies and see if it is the highest. 
print(maximum_frequncy[1])

24


In [16]:
# For QCing the sentence scoes loop. It calculates sentence scores for the 1st element in the list of summaries

sentence_scores_test = {}
for sent in sentence_list[1]:
    for word in nltk.word_tokenize(sent.lower()):
        if word in word_frequencies[1].keys():
            if len(sent.split(' ')) < 20: ## Change this to specify how long/short of sentences you want to include
                if sent not in sentence_scores_test.keys():
                    sentence_scores_test[sent] = word_frequencies[1][word]
                else:
                    sentence_scores_test[sent] += word_frequencies[1][word]
                    

In [17]:
#%%
print (sentence_scores_test)



{'The nation of Panem consists of a wealthy Capitol and twelve poorer districts.': 0.24999999999999997, 'The tributes must fight to the death in an arena; the sole survivor is rewarded with fame and wealth.': 0.7083333333333333, 'In her first Reaping, 12-year-old Primrose Everdeen is chosen from District 12.': 0.20833333333333331, 'Her older sister Katniss volunteers to take her place.': 0.20833333333333331, 'Katniss and Peeta are taken to the Capitol, accompanied by their frequently drunk mentor, past victor Haymitch Abernathy.': 0.3333333333333333, 'He warns them about the "Career" tributes who train intensively at special academies and almost always win.': 0.5833333333333335, 'During a TV interview with Caesar Flickerman, Peeta unexpectedly reveals his love for Katniss.': 0.20833333333333331, 'However, she discovers Peeta meant what he said.': 0.125, 'Peeta forms an uneasy alliance with the four Careers.': 0.20833333333333331, 'They later find Katniss and corner her up a tree.': 0.2

In [18]:
# Calculate Sentence Scores

sentence_scores = {x:{} for x in range(size)}

for i in range(size):
    for sent in sentence_list[i]:
        for word in nltk.word_tokenize(sent.lower()):
            if word in word_frequencies[i].keys():
                if len(sent.split(' ')) < 20: ## We have some very short ones. With this, they are excluded. Change this to specify how long/short of sentences you want to include
                    if sent not in sentence_scores[i].keys():
                        sentence_scores[i][sent] = word_frequencies[i][word]
                    else:
                        sentence_scores[i][sent] += word_frequencies[i][word]
#QC. 
print('The sentence_scores dictionary is {0} rows long.'.format(len(sentence_scores)))




The sentence_scores dictionary is 42303 rows long.


In [19]:

#%%
#QC sentence score of the hunger games. It should be same with above example. 
print (sentence_scores[1])

{'The nation of Panem consists of a wealthy Capitol and twelve poorer districts.': 0.24999999999999997, 'The tributes must fight to the death in an arena; the sole survivor is rewarded with fame and wealth.': 0.7083333333333333, 'In her first Reaping, 12-year-old Primrose Everdeen is chosen from District 12.': 0.20833333333333331, 'Her older sister Katniss volunteers to take her place.': 0.20833333333333331, 'Katniss and Peeta are taken to the Capitol, accompanied by their frequently drunk mentor, past victor Haymitch Abernathy.': 0.3333333333333333, 'He warns them about the "Career" tributes who train intensively at special academies and almost always win.': 0.5833333333333335, 'During a TV interview with Caesar Flickerman, Peeta unexpectedly reveals his love for Katniss.': 0.20833333333333331, 'However, she discovers Peeta meant what he said.': 0.125, 'Peeta forms an uneasy alliance with the four Careers.': 0.20833333333333331, 'They later find Katniss and corner her up a tree.': 0.2

In [20]:
#single example for Summary
summary_sentences_test = heapq.nlargest(2, sentence_scores[1], key=sentence_scores[1].get)

summary_test = ' '.join(summary_sentences_test)
print(summary_test)
#the results should in the order of highest to the lowest



The tributes must fight to the death in an arena; the sole survivor is rewarded with fame and wealth. Rue, hiding in a nearby tree, draws her attention to a poisonous tracker jacker nest hanging from a branch.


In [24]:
print(summary_sentences_test)
#to understand what is different than summary_test



['The tributes must fight to the death in an arena; the sole survivor is rewarded with fame and wealth.', 'Rue, hiding in a nearby tree, draws her attention to a poisonous tracker jacker nest hanging from a branch.']


In [22]:
# Get a Summary
# This needs to be better. Instead of summarizing the text defined above, this should be a for loop that
# Runs a text summary on every row in the pandas data frame defined above

#first part, creating a summary sentences list#

#Change value here to get summary sentence length
summary_sentences = ["" for x in range(size)]
for i in range(size):
    summary_sentences[i] = heapq.nlargest(2, sentence_scores[i], key=sentence_scores[i].get)

#QC. 
print('The summary_sentences list is {0} rows long.'.format(len(summary_sentences)))



The summary_sentences list is 42303 rows long.


In [23]:
print(summary_sentences[0:5])
#should be two sentences in each

[[], ['The tributes must fight to the death in an arena; the sole survivor is rewarded with fame and wealth.', 'Rue, hiding in a nearby tree, draws her attention to a poisonous tracker jacker nest hanging from a branch.'], ['On his way back to peaceful life, Induchoodan accepts Anuradha as his life partner.', 'Poovalli Induchoodan is sentenced for six years prison life for murdering his classmate.'], ["He decides to recover the money, sneaking into Charley's home in the guise of an elderly woman.", 'Later that night, the Kid returns to the original Nellie Thursday home to meet with Moose Moran .'], ['When she discovers the infant is missing, everyone joins forces to search for her, without success.', 'The tide of public opinion soon turns against the Chamberlains.']]


In [25]:
#now to get these two sentences separated '' into a joint sentences.
summary = ["" for x in range(size)]
for i in range(size):
    summary[i] = ' '.join(summary_sentences[i])
#QC. 
print('The summary list is {0} rows long.'.format(len(summary)))

The summary list is 42303 rows long.


In [26]:
print(summary[0:5])

['', 'The tributes must fight to the death in an arena; the sole survivor is rewarded with fame and wealth. Rue, hiding in a nearby tree, draws her attention to a poisonous tracker jacker nest hanging from a branch.', 'On his way back to peaceful life, Induchoodan accepts Anuradha as his life partner. Poovalli Induchoodan is sentenced for six years prison life for murdering his classmate.', "He decides to recover the money, sneaking into Charley's home in the guise of an elderly woman. Later that night, the Kid returns to the original Nellie Thursday home to meet with Moose Moran .", 'When she discovers the infant is missing, everyone joins forces to search for her, without success. The tide of public opinion soon turns against the Chamberlains.']


In [43]:
# OK, now it is time to score the summaries on arousal, valence, and dominance. There is a nice dictionary by
# Bradley and Lang that will let you do exactly that. You can borrow that here:
# https://github.com/dwzhou/SentimentAnalysis

# Biasically, you'll want to classify each cell in the Pandas Dataframe (where each cell contains a movie summary)
# along arousal, valence, and dominance. So we can get a score for each.

In [39]:
# Note: this is super optional and likely not necissary. You could, instead of using a dictionary approach,
# Train a classifier to do your sentiment analysis. This would be cooler, but also probably a lot of work
# And I'm not sure it would gain us all that much. But if you are feeling energetic, or the Lang dictionary
# doesn't work, here is some hints about training a classifier.

#Train a text sentiment classifier. Here we are in good shape because most are trained on movie ratings
# But you could also train the classifier on the un summarized movie reviews
# For ideas on how to do this, check out: https://towardsdatascience.com/sentiment-analysis-with-python-part-1-5ce197074184

In [27]:
#%%
# Author: Doris Zhou
# Date: September 29, 2017
# Performs sentiment analysis on a text file using ANEW.
# Parameters:
 #   --dir [path of directory]
  #      specifies directory of files to analyze
 #   --file [path of text file]
   #     specifies location of specific file to analyze
  #  --out [path of directory]
    #    specifies directory to create output files
   # --mode [mode]
   #     takes either "median" or "mean"; determines which is used to calculate sentence sentiment values
# add parameter to exclude duplicates? also mean or median analysis
   
   #as mentioned on the website and the website of Stanford NLP, you might need to (pip) install few extra things  

import csv
import sys
import os
#MacOS users have to use sudo to run the program.
os.system("sudo -t /Users/ezgiulusoy/Downloads/stanford-corenlp-full-2018-10-05/_psosx.py") #This py file was the problematic for me so I only use root for that.
import statistics
import time
import argparse
from stanfordcorenlp import StanfordCoreNLP


nlp = StanfordCoreNLP('/Users/ezgiulusoy/Downloads/stanford-corenlp-full-2018-10-05') #write the path to corenlp downloaded from https://stanfordnlp.github.io/CoreNLP/ 

from nltk import tokenize
from nltk.corpus import stopwords
from nltk import pos_tag #it was not in the original file but I needed this.
nltk.download('averaged_perceptron_tagger') #these will be used in the analysis. You might need to pip install nltk from Terminal first. 
nltk.download('wordnet')



[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/ezgiulusoy/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ezgiulusoy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [28]:

stops = set(stopwords.words("english"))
anew = "/Users/ezgiulusoy/Downloads/SentimentAnalysis-master/lib/EnglishShortened.csv" #write your own path here to the EnglishShortened.csv available on github link for this analysis.


In [None]:
#%%
##I SKIPPED THIS PART SINCE I HAVE THE DATA IN THE PROGRAM CURRENTLY##

# performs sentiment analysis on inputFile using the ANEW database, outputting results to a new CSV file in outputDir
def analyzefile(input_file, output_dir, mode):
    """
    Performs sentiment analysis on the text file given as input using the ANEW database.
    Outputs results to a new CSV file in output_dir.
    :param input_file: path of .txt file to analyze
    :param output_dir: path of directory to create new output file
    :param mode: determines how sentiment values for a sentence are computed (median or mean)
    :return:
    """
    output_file = os.path.join(output_dir, "Output Anew Sentiment " + os.path.basename(input_file).rstrip('.txt') + ".csv")

    # read file into string
    with open(input_file, 'r') as myfile:
        fulltext = myfile.read()
    # end method if file is empty
    if len(fulltext) < 1:
        print('Empty file.')
        return

In [30]:
from nltk.stem.wordnet import WordNetLemmatizer
lmtzr = WordNetLemmatizer()

In [34]:
 # QC for one sentence -- not working completely yet. 
      
sentences_test = sentence_list [0]
i = 1 # to store sentence index
# check each word in sentence for sentiment and write to output_file
#write your own output file 
with open('/Users/ezgiulusoy/Desktop/output', 'w', newline='') as csvfile:
    fieldnames = ['Sentence ID', 'Sentence', 'Sentiment', 'Sentiment Label', 'Arousal', 'Dominance',
                    '# Words Found', 'Found Words', 'All Words']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    # analyze each sentence for sentiment
    for s in sentences_test:
        #print("S" + str(i) +": " + s)
        all_words = []
        found_words = []
        total_words = 0
        v_list = []  # holds valence scores
        a_list = []  # holds arousal scores
        d_list = []  # holds dominance scores
            
        # search for each valid word's sentiment in ANEW
        pos_tag(word_tokenize(sentences_test[0]))
        words = nlp.pos_tag(s.lower())
        for index, p in enumerate(words):
            # don't process stops or words w/ punctuation
            w = p[0]
            pos = p[1]
            if w in stops or not w.isalpha():
                continue

            # check for negation in 3 words before current word
            j = index-1
            neg = False
            while j >= 0 and j >= index-3:
                if words[j][0] == 'not' or words[j][0] == 'no' or words[j][0] == 'n\'t':
                    neg = True
                    break
                j -= 1

            # lemmatize word based on pos
            if pos[0] == 'N' or pos[0] == 'V':
                lemma = lmtzr.lemmatize(w, pos=pos[0].lower())
            else:
                lemma = w

            all_words.append(lemma)

            # search for lemmatized word in ANEW
            with open(anew) as csvfile:
                reader = csv.DictReader(csvfile)
                for row in reader:
                    if row['Word'].casefold() == lemma.casefold():
                        if neg:
                            found_words.append("neg-"+lemma)
                        else:
                            found_words.append(lemma)
                        v = float(row['valence'])
                        a = float(row['arousal'])
                        d = float(row['dominance'])

                        if neg:
                            # reverse polarity for this word
                            v = 5 - (v - 5)
                            a = 5 - (a - 5)
                            d = 5 - (d - 5)

                        v_list.append(v)
                        a_list.append(a)
                        d_list.append(d)

        if len(found_words) == 0:  # no words found in ANEW for this sentence
            writer.writerow({'Sentence ID': i,
                                'Sentence': s,
                                'Sentiment': 'N/A',
                                'Sentiment Label': 'N/A',
                                'Arousal': 'N/A',
                                'Dominance': 'N/A',
                                '# Words Found': 0,
                                'Found Words': 'N/A',
                                'All Words': all_words
                                })
            i += 1
        else:  # output sentiment info for this sentence

            # get values
            mode = [] #I added this because I skipped inputfile section above
            if mode == 'median':
                sentiment = statistics.median(v_list)
                arousal = statistics.median(a_list)
                dominance = statistics.median(d_list)
            else:
                sentiment = statistics.mean(v_list)
                arousal = statistics.mean(a_list)
                dominance = statistics.mean(d_list)

            # set sentiment label
            label = 'neutral'
            if sentiment > 6:
                label = 'positive'
            elif sentiment < 4:
                label = 'negative'
                
            writer.writerow({'Sentence ID': i,  #this section does not work on spyder but for some readon just worked here.
                                'Sentence': s,
                                'Sentiment': sentiment,
                                'Sentiment Label': label,
                                'Arousal': arousal,
                                'Dominance': dominance,
                                '# Words Found': ("%d out of %d" % (len(found_words), len(all_words))),
                                'Found Words': found_words,
                                'All Words': all_words
                                })
            i += 1

 

In [None]:
#I did not yet come to this section.
def main(input_file, input_dir, output_dir, mode):
    """
    Runs analyzefile on the appropriate files, provided that the input paths are valid.
    :param input_file:
    :param input_dir:
    :param output_dir:
    :param mode:
    :return:
    """

    if len(output_dir) < 0 or not os.path.exists(output_dir):  # empty output
        print('No output directory specified, or path does not exist')
        sys.exit(0)
    elif len(input_file) == 0 and len(input_dir)  == 0:  # empty input
        print('No input specified. Please give either a single file or a directory of files to analyze.')
        sys.exit(1)
    elif len(input_file) > 0:  # handle single file
        if os.path.exists(input_file):
            analyzefile(input_file, output_dir, mode)
        else:
            print('Input file "' + input_file + '" is invalid.')
            sys.exit(0)
    elif len(input_dir) > 0:  # handle directory
        if os.path.isdir(input_dir):
            directory = os.fsencode(input_dir)
            for file in os.listdir(directory):
                filename = os.path.join(input_dir, os.fsdecode(file))
                if filename.endswith(".txt"):
                    start_time = time.time()
                    print("Starting sentiment analysis of " + filename + "...")
                    analyzefile(filename, output_dir, mode)
                    print("Finished analyzing " + filename + " in " + str((time.time() - start_time)) + " seconds")
        else:
            print('Input directory "' + input_dir + '" is invalid.')
            sys.exit(0)


if __name__ == '__main__':
    # get arguments from command line
    parser = argparse.ArgumentParser(description='Sentiment analysis with ANEW.')
    parser.add_argument('--file', type=str, dest='input_file', default='',
                        help='a string to hold the path of one file to process')
    parser.add_argument('--dir', type=str, dest='input_dir', default='',
                        help='a string to hold the path of a directory of files to process')
    parser.add_argument('--out', type=str, dest='output_dir', default='',
                        help='a string to hold the path of the output directory')
    parser.add_argument('--mode', type=str, dest='mode', default='mean',
                        help='mode with which to calculate sentiment in the sentence: mean or median')
    args = parser.parse_args()

    # run main
    sys.exit(main(args.input_file, args.input_dir, args.output_dir, args.mode))



