In [1]:
import matplotlib.pyplot as plt
import collections 
import numpy as np
import pandas as pd
import seaborn as sns
from functools import reduce

In [2]:
class Speech:
    def __init__(self, name, year, nbPositiveWords, nbNegativeWords, nbTotalWords,speechList,topwords):
        self.name = name
        self.year = year
        self.nbPositiveWords = nbPositiveWords 
        self.nbNegativeWords = nbNegativeWords 
        self.nbTotalWords = nbTotalWords
        self.speechList = speechList
        self.topwords = topwords
        # divide by the number of words because some speeches are longer than others.
        self.ratio = (nbPositiveWords-nbNegativeWords)/nbTotalWords*100 


## getLexicons() Function will take blank positiveList and negativeList and return all the positive and negative words from the files positive_words.txt and negative_words.txt.

In [3]:
def getLexicons(positiveList, negativeList):
    ## Positive lexicon
    positiveLex = open('positive_words.txt', 'r')
    for words in positiveLex:
        word=words.rstrip("\n") # to eliminate the line break
        positiveList.append(word) # add the word to the list

    ## Negative lexicon 
    negativeLex = open('negative_words.txt', 'r')
    for words in negativeLex:
        word=words.rstrip("\n") # to eliminate the line break
        negativeList.append(word) # add the word to the list

    ## close the files
    positiveLex.close()
    negativeLex.close()

    return positiveList, negativeList

## This getSpeechAnalysis() Function takes Speech, President name, year, positive word list and negative word list and process the following: 1. Convert the words into lower case, 2. Eliminate punctuations, 3. Generate array of words 4. Calculate positive and negative score comparing the positive / negative words, 5.Track the most common 100 words with the length > 5. 

In [4]:
def getSpeechAnalysis(speechName, name, year, positiveList, negativeList): 
    ## Open the speech
    speech = open(speechName, 'r')
    speechList = [] 
    for line in speech:
        word=line.rstrip(" ").split()    # Generate a bag of words
        for element in word: 
            if element[-1]=='.' or element[-1]==",": #  Eliminate punctuations
                element = element[:-1]
            speechList.append(element.lower()) # Convert the words into lower case
            b=[]
            for i in speechList:
                if (len(i)> 5) :
                    b.append(i)
            word_counter = collections.Counter(b)
            topwords=word_counter.most_common(100)
            
    ## Count the nb of positive words and negative words
    positiveWords = 0
    negativeWords = 0
    for words in speechList: # we browse the speech
        if positiveList.count(words)==1: # if the word is a positive one
            positiveWords+=1
        if negativeList.count(words)==1: # if the word is a negative one
            negativeWords+=1
    
    ## Close the files
    speech.close()

    # we create a speech
    speechClass = Speech(name, year, positiveWords, negativeWords, len(speechList),speechList,topwords)
    return speechClass

In [5]:
## Initialization 
positiveList = []
negativeList = []
getLexicons(positiveList, negativeList)

([';;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;',
  ';',
  '; Opinion Lexicon: Positive',
  ';',
  '; This file contains a list of POSITIVE opinion words (or sentiment words).',
  ';',
  '; This file and the papers can all be downloaded from',
  ';    http://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html',
  ';',
  '; If you use this list, please cite one of the following two papers:',
  ';',
  ';   Minqing Hu and Bing Liu. "Mining and Summarizing Customer Reviews."',
  ';       Proceedings of the ACM SIGKDD International Conference on Knowledge',
  ';       Discovery and Data Mining (KDD-2004), Aug 22-25, 2004, Seattle,',
  ';       Washington, USA,',
  ';   Bing Liu, Minqing Hu and Junsheng Cheng. "Opinion Observer: Analyzing',
  ';       and Comparing Opinions on the Web." Proceedings of the 14th',
  ';       International World Wide Web conference (WWW-2005), May 10-14,',
  ';       2005, Chiba, Japan.',
  ';',
  '; Notes:',
  ';    1. The appearan

In [6]:
# list of all the speeches, Presidents and years:
Files = [
            [ 'Trump', 2017, 'Trump.txt'],
            [ 'Obama', 2013, 'Obama.txt'],
            [ 'Bush', 2001, 'Bush.txt'],
            [ 'Reagan', 1981, 'Reagan.txt'],
            [ 'Kennedy', 1961, 'Kennedy.txt'],
            [ 'Roosevelt', 1941, 'Roosevelt.txt']
    ]

In [7]:
## Initialization of speeches
speeches = []
# Pass the data to the Function getSpeechAnalysis
sizeOfFiles = len(Files)
for i in range(sizeOfFiles-1, -1, -1): # browse all the array from end to beginning
    speeches.append(getSpeechAnalysis('.\speech\\'+Files[i][2], Files[i][0], Files[i][1], positiveList, negativeList))

# Initialization
iFiles = 0
Result = []
Speechtext = []
nameOfSpeaker = "" 
ratio = []
year = []
nbPositiveWords = []
nbNegativeWords = []
nbTotalWords = []
speechList = []
topwords = []

while (iFiles<sizeOfFiles):
    if nameOfSpeaker!=speeches[iFiles].name: # if the President changes
        if (len(nameOfSpeaker)>0): # if the President is defined 
            Result.append([nameOfSpeaker, ratio, year, nbPositiveWords, nbNegativeWords, nbTotalWords]); 
            Speechtext.append([nameOfSpeaker, topwords, speechList]);
        nameOfSpeaker=speeches[iFiles].name # Replace by actual President
        #  reinitialize the lists
        ratio = [] 
        year = []  
        nbPositiveWords = []
        nbNegativeWords = []
        nbTotalWords = []
        speechList = []
        topwords = []
        
    ratio.append(speeches[iFiles].ratio) # if the President is still the same, we add the ratio to its ratio list
    year.append(speeches[iFiles].year) 
    nbPositiveWords.append(speeches[iFiles].nbPositiveWords)
    nbNegativeWords.append(speeches[iFiles].nbNegativeWords)
    nbTotalWords.append(speeches[iFiles].nbTotalWords)
    speechList.append(speeches[iFiles].speechList)
    topwords.append(speeches[iFiles].topwords)
    iFiles+=1
    
Result.append([nameOfSpeaker, ratio, year, nbPositiveWords, nbNegativeWords, nbTotalWords]); # we add the last speaker's parameters to the List
Speechtext.append([nameOfSpeaker,topwords, speechList]);

FileNotFoundError: [Errno 2] No such file or directory: '.\\speech\\Roosevelt.txt'

In [None]:
print("(President,Ratio,Year,Positve Words, Negative Words, Total Words)")
print(Result)

## Visualization: Positiveness ratio in speeches depending on the year

In [None]:
color = ['magenta', 'black', 'red', 'blue', 'orange', 'green']
iColor = 0

for i in range(0, len(Result)):
    if (iColor>=len(color)):
        iColor=0
    plt.scatter(Result[i][2],Result[i][1], c = color[iColor], label=Result[i][0]) 
    iColor+=1

plt.legend()
plt.title('Positiveness ratio in speeches depending on the year')
plt.xlabel('Year')
plt.ylabel('Positive Ratio')
plt.show()

## Visualization: Positiv Vs Negative words in speeches

In [None]:
# Scatter Plot for positive and negative word
for i in range(0, len(Result)):
    if (iColor>=len(color)):
        iColor=0
    plt.scatter(Result[i][3],Result[i][4], c = color[iColor], label=Result[i][0]) 
    iColor+=1
plt.legend()
plt.title('Positiv Vs Negative words in speeches')
plt.xlabel('Positive Words')
plt.ylabel('Negative Words')
plt.show()

## Visualization: Scatter Plot for President by length of speech

In [None]:
# Scatter Plot for President by length of speech
for i in range(0, len(Result)):
    if (iColor>=len(color)):
        iColor=0
    plt.scatter(Result[i][0],Result[i][5], c = color[iColor], label=Result[i][2]) 
    iColor+=1
plt.legend()
plt.title('Presiden Vs Length of speech')
plt.xlabel('Presidents')
plt.ylabel('Length of Speech')
plt.show()

## The function getSpeechAnalysis() calculated the top 100 most frequent words from every speech. The result has generated in the below section  

In [None]:
# Top 100 words from the speech of President Roosevelt
print(Speechtext[0][1])

## The following Dataframe stores top common words from all the speeches 

In [None]:
# Create Dataframes from list
Roosevelt=pd.DataFrame(Speechtext[0][1][0],columns =['word','Roosevelt'])
Kennedy=pd.DataFrame(Speechtext[1][1][0],columns =['word','Kennedy'])
Reagan=pd.DataFrame(Speechtext[2][1][0],columns =['word','Reagan'])
Bush=pd.DataFrame(Speechtext[3][1][0],columns =['word','Bush'])
Obama=pd.DataFrame(Speechtext[4][1][0],columns =['word','Obama'])
Trump=pd.DataFrame(Speechtext[5][1][0],columns =['word','Trump'])

# Join all the dataframes

data_frames = [Roosevelt, Kennedy, Reagan, Bush, Obama, Trump]
df_merged = reduce(lambda  left,right: pd.merge(left,right,on=['word'],how='inner'), data_frames)
df_merged

## Visualization: Heatmap from all the top common words

In [None]:
sns.heatmap(df_merged.iloc[:, 1:7])

# 6. This Function will calculate the following measures:
## a. Number of Sentences and Words per Speech and Average Number of words per sentence.
## b. Average number of words of length > 5 per sentence. 
## c. Number of unique words per 1000 words 


In [None]:
def SentenceAnalysis(speechName, name):
    speech = open(speechName, 'r')
    a=speech.read()
    sentences = a.split('.') # List of sentences
    words = a.split()        # List of words
    b=[]
    for i in words:
        if (len(i)> 5) :
            b.append(i)
    word_counter = collections.Counter(words)   # Number of Unique words
    # will return Name of the President, Number of Sentences, Number of Words, Average words per sentence, Number of words with the length > 5, Average 6 or more letter words per sentence and Unique words per 1000. 
    return name,len(sentences),len(words),len(words) / len(sentences),len(b), len(b) / len(sentences),len(word_counter)/len(words)*1000

In [None]:
speechresult = []
# we apply the function to all the speeches
sizeOfFiles = len(Files)
for i in range(sizeOfFiles-1, -1, -1): # browse all the array from end to beginning
    speechresult.append(SentenceAnalysis('.\speech\\'+Files[i][2], Files[i][0]))
print("(President,Sentences, Words, Average sentence Length, No of words length > 5, Average sentence length (words>5), Unique words per 1000)")
print(speechresult)

## Visualizatio: Number of Sentences Vs Average sentence length

In [None]:
color = ['magenta', 'black', 'red', 'blue', 'orange', 'green']
iColor = 0

for i in range(0, len(speechresult)):
    if (iColor>=len(color)):
        iColor=0
    plt.scatter(speechresult[i][1],speechresult[i][3], c = color[iColor], label=speechresult[i][0]) 
    iColor+=1

plt.legend()
plt.title('Number of Sentences Vs Average sentence length')
plt.xlabel('Number of Sentences')
plt.ylabel('Average sentence length')
plt.show()

## Visualization: Total Number of Words Vs Unique words

In [None]:
for i in range(0, len(speechresult)):
    if (iColor>=len(color)):
        iColor=0
    plt.scatter(speechresult[i][2],speechresult[i][6], c = color[iColor], label=speechresult[i][0]) 
    iColor+=1

plt.legend()
plt.title('Total Number of Words Vs Unique words')
plt.xlabel('Total Number of Words')
plt.ylabel('Unique Words')
plt.show()

## Thank You 