# GETTING AND LOADING A CORPUS

https://nlpforhackers.io/corpora/
http://lucumr.pocoo.org/2015/11/18/pythons-hidden-re-gems/

The goal of this step is to develop an initial list of each character and their spoken lines, or a cleanish list of the lines within the text. (Dictionaries are Hash value arbitrary, so may not be ordered the same. Lists are used instead.)

In [1]:
# Select and Read a file into "f" using a list and stripping out all Project Gutenberg headers and footers

from pathlib import Path
import re
data_folder = Path("data/poe/")
file_to_open = data_folder / "2149-0.txt"
f = open(file_to_open, 'r')
first_document = list(f)
#print(first_document)

# Determine whether a Project Gutenberg Text
first_header_index = 0
second_header_index = 0
footer_index = 0
if any("GUTENBERG" in s for s in first_document):
    for first_header_index in range( len(first_document) ):
        if ( ( first_document[first_header_index].find('*END*THE SMALL PRINT!') ) != -1 ) :
            break
        else:
            for first_header_index in range( len(first_document) ):
                if ( ( first_document[first_header_index].find('START OF THIS PROJECT GUTENBERG') ) != -1 ) :
                    break        
        
    second_document = list(first_document[first_header_index + 1 :])

    for second_header_index in range( len(second_document) ):
        if ( ( second_document[second_header_index].find('www.gutenberg.org') ) != -1 ) :
            break            
    for footer_index in range( len(first_document) ):
        if ( ( first_document[footer_index].find('End of Project') ) != -1 ) :
            break
        else:
            for footer_index in range( len(first_document) ):
                if ( ( first_document[footer_index].find('End of the Project') ) != -1 ) :
                    break    
        
    print(first_header_index)            
    print(second_header_index)
    print(footer_index)     
    
    script = list()
    if (second_header_index < (first_header_index + 100)):
        script = list(first_document[first_header_index +1 + second_header_index +1 : footer_index-1])
    else:
        script = list(first_document[first_header_index +1 : footer_index-1])
else:
    script = first_document

#print(script)

20
9261
9276


In [2]:
#Compile a list of speakers
r = re.compile("[A-Z0-9][A-Z0-9]+")
speakers = []
for line in script:
    mtch = r.match(line)
    if mtch:
        speakers.append(mtch.group())
#print(speakers)

#Omit speakers from the list of text
s = re.compile(r"\b[A-Z{3}\.]+\b")
spoken = list(filter(lambda i: not r.search(i), script))

#print(spoken)

# RE-CREATING SENTENCES AND PHRASES

During this step, we concatenate lines in batches to allow the identification of sentences with regular expressions. Then we identify phrases with stop words.

In [3]:
#Concatenate lines into list entries for future sentence splitting

newLines = []
singleLine = ''
singleLines = []

#Remove all line returns(ok)
for j in range(0, len(spoken)):
    spoken[j] = spoken[j].replace('\n', '')
    
#Split 5 lines at a time into new list
for k in range( 0, len(spoken), 3):
    newLines = []
    for line in range( 0, 3 ):
        try:
            newLines.append(' '+spoken[line+k])
        except:
            #print("Index Error at", k, line)
            break
    #Join 5-line groups into one line and append to a list
    singleLine = ''.join(newLines)
    singleLines.append(singleLine)

#print(singleLines)

In [4]:
#Create list of sentences
sentences = []
for m in range(0, len(singleLines)):
    mtch = re.findall("[A-Z][^\.!?]*[\.!?]", singleLines[m], re.M|re.I)
    if mtch:
        sentences.append(mtch)

#print(sentences)

In [5]:
#Clean the stopword list
stoplist = []
clean_line = []
data_folder = Path("data/")
file_to_open = data_folder / "snowball_stop.txt"
f = open(file_to_open, 'r')
full_stop = list(f)

for n in range( 0, len(full_stop), 1 ):
    clean_line = full_stop[n].split('|')
    stoplist.append(clean_line[0])

for p in range(len(stoplist)):
    stoplist[p] = stoplist[p].replace('\n', '')
    
#print(stoplist)

In [6]:
# Create list of phrases using stopwords
phrases = []
candidate_phrases = []

for q in range(len(sentences)):
    for r in sentences[q]:
        words = re.split("\\s+", r)
        previous_stop = False
 
        # Examine each word to determine if it is a phrase boundary marker or part of a phrase or alone
        for w in words:
 
            if w in stoplist and not previous_stop:
                # phrase boundary encountered, so put a hard indicator
                candidate_phrases.append(";")
                previous_stop = True
            elif w not in stoplist and len(w) > 3:
                # keep adding words to list until a phrase boundary is detected
                candidate_phrases.append(w.strip())
                previous_stop = False
 
    # Create a list of candidate phrases without boundary demarcation
    phrases = re.split(";+", ' '.join(candidate_phrases))

# Clean up phrases    
re2 = re.compile('[^\.!?,"(){}\*:]*[\.!?,"(){}\*:]')
for s in range(len(phrases)):
    phrases[s] = re.sub(re2, '', phrases[s])
    phrases[s] = phrases[s].strip(' ')
    phrases[s] = phrases[s].replace(' ', '_')
    phrases[s] = phrases[s].replace('__', '_')
    phrases[s] = phrases[s].strip('_')

for s in range(len(phrases)):
    try:
        phrases.remove('')
        phrases.remove(' ')
        phrases.remove('/n')
    except:
        pass
    
#for t in range(50):
    #print(phrases[t])

#print(phrases)

# PERFORM A PHRASE FREQUENCY COUNT

Now we can identify common phrases by performing a frequency count on each phrase.  Moreover, if the corpus is large enough, commonly used phrases will be evident with higher counts across many texts.  For this reason the phrase list along with counts, will be stored in a file.

In [7]:
# Phrase frequency count
from operator import itemgetter
wordfreq = []
for u in range(len(phrases)):
    utterance = phrases[u]
    uttcnt = 0
    uttcnt = phrases.count(utterance)
    if uttcnt > 1:
        wordfreq.append(uttcnt)
    
zipped = list(zip(phrases, wordfreq))
sortzip = sorted(zipped, key=itemgetter(1), reverse=True)

for v in range(50):
    print(sortzip[v])

('flashed', 35)
('spoke_much', 35)
('watch', 35)
('long_time_heard_moving_among', 35)
('told', 35)
('were', 35)
('have_recovered', 35)
('present', 35)
('vessel', 35)
("mate's_gang", 35)
('have_never_seen_rigged_either', 35)
('bear', 35)
('endeavouring', 35)
('galley', 35)
('bowsprit', 35)
('never_even_succeed', 35)
('small_leather_trunk_belonging', 35)
('returned_without', 35)
('lengths', 35)
('Peters', 35)
('conduct', 35)
('thought_likely', 35)
('still_dared', 35)
('ecstatic', 35)
('commenced_cutting', 35)
('their_bodies_being_carried', 35)
('head', 35)
('resemblance', 35)
('ensued_upon_drinking', 35)
('addition', 35)
('three_pounds', 35)
('never', 35)
('never_left', 35)
('incredible', 35)
('fifty', 35)
('account_given', 34)
('sent', 34)
('longer', 34)
('sober', 34)
('minute', 34)
('water', 34)
('insensible_upon', 34)
('avoid_coming', 34)
('nature', 34)
('reviving', 34)
('severest_gales_ever_experienced', 34)
('have_effectually_cooled_incipient_passion', 34)
('bring', 34)
('Augustus_g

# TERM FREQUENCY–INVERSE DOCUMENT FREQUENCY (TF-IDF)
![image.png](attachment:image.png)
The quintessential early Natural Language Processing tool, the TF-IDF analysis for context and sentiment evaluation is useful only over a large corpus. It must be understood that the corpus is not just a sample to be evaluated, but instead is the entire population that sets a 'benchmark' for evaluation, if you will. 

Here we establish a Term Frequency (TF) count of word frequencies, just as we showed a phrase frequency count in the last step in this notebook.

In [8]:
#Establish wordList
wordList = []
for u in range(len(sentences)):
    for v in sentences[u]:
        words = re.split("\\s+", v)
        wordList.extend(words)
        
for w in range(50):
    print(wordList[w])

extraordinary
series
of
adventure
in
the
South
Seas
and
elsewhere,
of
which
an
account
is
given
in
the
following
pages,
accident
threw
me
into
the
society
of
several
gentlemen
in
Richmond,
Va.
interest
in
all
matters
relating
to
the
regions
I
had
visited,
and
who
were
constantly
urging
it
upon


In [None]:
#Establish wordDict
wordDict = {}
for w in range(len(wordList)):
    newWord = wordList[w]
    newWord = newWord.lower()
    newWord = newWord.replace('.', '')
    wordDict[w] = newWord
print(wordDict)
    
#Perform word counts on dict
countDict = {}
for x in range(len(wordDict)):
    term = wordDict[x]
    count = 1
    for y in range(len(wordDict)):
        try:
            if wordDict[y].find(term) > 0:
                count += 1
        except:
            pass
        countDict[term] = count

#for k, v in countDict.items():
    #print(k, v)

In [None]:
# Computes ratio of word's appearances to total words
bow = wordList
bowCount = len(bow) #BOW = Bag of Words
tfDict = {}
for term, count in countDict.items():
    tfDict[term] = count/float(bowCount)

num = dict(sorted(tfDict.items(), key=lambda x: x[1], reverse = True))
for k, v in num.items():
    print(k, v)

## TO BE CONTINUED...
At this point, this notebook is finished. Please refer to Part 2 for a continuation of this process, wherein the code in this notebook is converted into objects and a multiple document corpus is compiled.