# Nobel Peace Prize Speech Analysis

## This notebook walks through how I updated my initial analysis and the improvements I made.

In [29]:
import os
import nltk
import re
import collections as col
from nltk.corpus import stopwords

In [46]:
#define the path where the speech files are found
path = "nobelspeeches"
nobel_speeches = os.listdir(path)

#join the path with the files to find the appropriate file when looking for it
files = sorted([os.path.join(path, file) for file in nobel_speeches if file.endswith('.txt')])

### I added a few pieces of punctuation to the stopwords to prevent them from coming through in the final dictionary.

In [59]:
stops = set(stopwords.words('english'))

stops.update(["–","…","*", " ", ""])

#### Instead of tokenizing the words, I used a for loop to iterate through them and pull them out of each file into a list.

#### I also used regex to tackle some of the more challenging issues within the data so that the final result was as clean as possible.

#### The collections module is useful for quickly forming dictionaries out of iterable datatypes and then sorting the dictionaries by their frequency

In [64]:
vocab_list = []

for file in files:

    file1 = open(file, encoding = "utf8")
    file2 = file1.read()
    file3 = re.sub("\b\W?(\w*)\W?\b", r"/1", file2)
    file4 = re.sub(r"[\b?\(\n*)](\w*)[\(\n*)\b?]", r" \1", file3)
    file4 = re.sub(r"\\n*", " ", file3)
    file5 = re.sub(r"(,|;|:|\.|)?(\w*)(,|;|:|\.|)?", r"\2", file4)
    file5.replace("\n", " ")
    lst = file5.lower().split(" ")

    for word in lst:
        if word.startswith("\'"):
            pass
        elif word not in stops:
            vocab_list.append(word)

most_frequent_words = col.Counter(vocab_list).most_common()

#### Instead of a list with a word and the occurence value, I used a dictionary and took advantage of the key:value strengths.

#### This data is far cleaner and was organized in a much shorter time with far fewer lines of code.

In [65]:
most_frequent_words

[('peace', 361),
 ('world', 303),
 ('people', 257),
 ('us', 238),
 ('human', 234),
 ('one', 187),
 ('must', 171),
 ('women', 152),
 ('war', 149),
 ('nobel', 145),
 ('also', 139),
 ('would', 128),
 ('prize', 122),
 ('many', 119),
 ('rights', 114),
 ('today', 108),
 ('years', 107),
 ('children', 104),
 ('weapons', 104),
 ('new', 98),
 ('social', 90),
 ('change', 88),
 ('time', 85),
 ('nuclear', 85),
 ('international', 84),
 ('every', 83),
 ('could', 82),
 ('country', 81),
 ('work', 76),
 ('make', 76),
 ('great', 76),
 ('nations', 75),
 ('believe', 73),
 ('freedom', 71),
 ('countries', 70),
 ('life', 70),
 ('political', 70),
 ('still', 68),
 ('global', 66),
 ('even', 65),
 ('first', 65),
 ('poor', 65),
 ('climate', 64),
 ('day', 62),
 ('conflict', 61),
 ('see', 60),
 ('security', 60),
 ('future', 60),
 ('end', 59),
 ('child', 59),
 ('together', 58),
 ('community', 58),
 ('justice', 58),
 ('without', 58),
 ('states', 58),
 ('love', 57),
 ('violence', 57),
 ('like', 56),
 ('poverty', 56),
 