In [67]:
import nltk
import string

# used for looping through folders/files
from os import listdir
from os.path import isfile, join

#Calc tfidf and cosine similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [68]:
# All text entries to compare will appear here
BASE_INPUT_DIR = "./training/"

## Preprocess Data

#### File information

In [69]:
def returnListOfFilePaths(folderPath):
    fileInfo = []
    listOfFileNames = [fileName for fileName in listdir(folderPath) if isfile(join(folderPath, fileName))]
    listOfFilePaths = [join(folderPath, fileName) for fileName in listdir(folderPath) if isfile(join(folderPath, fileName))]
    fileInfo.append(listOfFileNames)
    fileInfo.append(listOfFilePaths)
    return fileInfo

fileNames, filePaths = returnListOfFilePaths(BASE_INPUT_DIR)
print(fileNames, "\n", filePaths)

['10ten.txt', '1one.txt', '2two.txt', '3three.txt', '4four.txt', '5five.txt', '6six.txt', '7seven.txt', '8eight.txt', '9nine.txt'] 
 ['./training/10ten.txt', './training/1one.txt', './training/2two.txt', './training/3three.txt', './training/4four.txt', './training/5five.txt', './training/6six.txt', './training/7seven.txt', './training/8eight.txt', './training/9nine.txt']


In [70]:
# Get document contents
def create_docContentDict(filePaths):
    rawContentDict = {}
    for filePath in filePaths:
        with open(filePath, "r") as ifile:
            fileContent = ifile.read()
        rawContentDict[filePath] = fileContent
    return rawContentDict
rawContentDict = create_docContentDict(filePaths)
print(rawContentDict)

{'./training/10ten.txt': "HIS LIFE\n\n\nThe few events in the long life of Izaak Walton have been carefully\ninvestigated by Sir Harris Nicolas.  All that can be extricated from\ndocuments by the alchemy of research has been selected, and I am unaware\nof any important acquisitions since Sir Harris Nicolas's second edition\nof 1860.  Izaak was of an old family of Staffordshire yeomen, probably\ndescendants of George Walton of Yoxhall, who died in 1571.  Izaak's\nfather was Jarvis Walton, who died in February 1595-6; of Izaak's mother\nnothing is known.  Izaak himself was born at Stafford, on August 9, 1593,\nand was baptized on September 21.  He died on December 15, 1683, having\nlived in the reigns of Elizabeth, James I., Charles I., under the\nCommonwealth, and under Charles II.  The anxious and changeful age\nthrough which he passed is in contrast with his very pacific character\nand tranquil pursuits.\n\n", './training/1one.txt': 'Washington, Dec. 24th, 1848.\n\nMy dear father:--\n

## Create Custom tokenizer

### Define functions to use within the tokenizer
We'd like to;
- tokenize the input
- remove stop words
- perform stemming
- remove punctuation
- convert input to lowercase

#### Tokenize

In [71]:
def tokenizeContent(contentsRaw):
    tokenized = nltk.tokenize.word_tokenize(contentsRaw)
    return tokenized

#### Remove Stop words

In [72]:
def removeStopWordsFromTokenized(contentsTokenized):
    stop_word_set = set(nltk.corpus.stopwords.words("english"))
    filteredContents = [word for word in contentsTokenized if word not in stop_word_set]
    return filteredContents

#### Stemming

In [73]:
def performPorterStemmingOnContents(contentsTokenized):
    porterStemmer = nltk.stem.PorterStemmer()
    filteredContents = [porterStemmer.stem(word) for word in contentsTokenized]
    return filteredContents

#### Remove Punctuation

In [74]:
def removePunctuationFromTokenized(contentsTokenized):
    excludePuncuation = set(string.punctuation)
    
    # manually add additional punctuation to remove
    doubleSingleQuote = '\'\''
    doubleDash = '--'
    doubleTick = '``'

    excludePuncuation.add(doubleSingleQuote)
    excludePuncuation.add(doubleDash)
    excludePuncuation.add(doubleTick)

    filteredContents = [word for word in contentsTokenized if word not in excludePuncuation]
    return filteredContents

#### Convert terms to lowercase

In [75]:
def convertItemsToLower(contentsRaw):
    filteredContents = [term.lower() for term in contentsRaw]
    return filteredContents

### Test that functions are working as expected

In [76]:
# get contents of a file for testing
# TODO: may need to make a copy of this here
content_test = rawContentDict[filePaths[0]]

# visually inspect
print(content_test[:300])

HIS LIFE


The few events in the long life of Izaak Walton have been carefully
investigated by Sir Harris Nicolas.  All that can be extricated from
documents by the alchemy of research has been selected, and I am unaware
of any important acquisitions since Sir Harris Nicolas's second edition
of 1860


In [77]:
# test tokenization
content_test_tokenized = tokenizeContent(content_test)

# visually inspect
print(content_test_tokenized[:30])

['HIS', 'LIFE', 'The', 'few', 'events', 'in', 'the', 'long', 'life', 'of', 'Izaak', 'Walton', 'have', 'been', 'carefully', 'investigated', 'by', 'Sir', 'Harris', 'Nicolas', '.', 'All', 'that', 'can', 'be', 'extricated', 'from', 'documents', 'by', 'the']


In [89]:
# test remove stop words
content_test_rmStop = removeStopWordsFromTokenized(content_test_tokenized)

# visually inspect
print(content_test_rmStop[:30])

['HIS', 'LIFE', 'The', 'events', 'long', 'life', 'Izaak', 'Walton', 'carefully', 'investigated', 'Sir', 'Harris', 'Nicolas', '.', 'All', 'extricated', 'documents', 'alchemy', 'research', 'selected', ',', 'I', 'unaware', 'important', 'acquisitions', 'since', 'Sir', 'Harris', 'Nicolas', "'s"]


In [79]:
# Test stemming
#content_test_stemmed = performPorterStemmingOnContents(content_test_rmStop)

# visually inspect
#print(content_test_stemmed[:30])

['hi', 'life', 'the', 'event', 'long', 'life', 'izaak', 'walton', 'care', 'investig', 'sir', 'harri', 'nicola', '.', 'all', 'extric', 'document', 'alchemi', 'research', 'select', ',', 'I', 'unawar', 'import', 'acquisit', 'sinc', 'sir', 'harri', 'nicola', "'s"]


In [91]:
# Test remove punctuation
content_test_cleaned = removePunctuationFromTokenized(content_test_rmStop)

# visually inspect
print(content_test_cleaned[:30])

['HIS', 'LIFE', 'The', 'events', 'long', 'life', 'Izaak', 'Walton', 'carefully', 'investigated', 'Sir', 'Harris', 'Nicolas', 'All', 'extricated', 'documents', 'alchemy', 'research', 'selected', 'I', 'unaware', 'important', 'acquisitions', 'since', 'Sir', 'Harris', 'Nicolas', "'s", 'second', 'edition']


In [92]:
# Test convert to lower
content_test_clean_lower = convertItemsToLower(content_test_cleaned)
print(content_test_clean_lower[:30])

['his', 'life', 'the', 'events', 'long', 'life', 'izaak', 'walton', 'carefully', 'investigated', 'sir', 'harris', 'nicolas', 'all', 'extricated', 'documents', 'alchemy', 'research', 'selected', 'i', 'unaware', 'important', 'acquisitions', 'since', 'sir', 'harris', 'nicolas', "'s", 'second', 'edition']


### Wrap into a function to be used by NLTK

In [93]:
# process data without writing inspection file information to file
def processData(rawContents):
    cleaned = tokenizeContent(rawContents)
    cleaned = removeStopWordsFromTokenized(cleaned)
    cleaned = performPorterStemmingOnContents(cleaned)    
    cleaned = removePunctuationFromTokenized(cleaned)
    cleaned = convertItemsToLower(cleaned)
    return cleaned

## Create Functions For Output
- TFIDF
- Cosine Similarity
    - this function will both calcuate and output results

In [94]:
# print TFIDF values in 'table' format
def print_TFIDF_for_all(term, values, fileNames):
    values = values.transpose() # files along 'x-axis', terms along 'y-axis'
    numValues = len(values[0])
    print('                ', end="")   #bank space for formatting output
    for n in range(len(fileNames)):
        print('{0:18}'.format(fileNames[n]), end="")    #file names
    print()
    for i in range(len(term)):
        print('{0:8}'.format(term[i]), end='\t|  ')     #the term
        for j in range(numValues):
            print('{0:.12f}'.format(values[i][j]), end='   ') #the value, corresponding to the file name, for the term
        print()

In [95]:
# write TFIDF values in 'table' format
def write_TFIDF_for_all(term, values, fileNames):
    filePath = "../results/tfid.txt"
    outFile = open(filePath, 'a')
    title = "TFIDF\n"
    outFile.write(title)
    values = values.transpose() # files along 'x-axis', terms along 'y-axis'
    numValues = len(values[0])
    outFile.write('               \t')   #bank space for formatting output
    for n in range(len(fileNames)):
        outFile.write('{0:18}'.format(fileNames[n]))    #file names
    outFile.write("\n")
    for i in range(len(term)):
        outFile.write('{0:15}'.format(term[i]))     #the term
        outFile.write('\t|  ')
        for j in range(numValues):
            outFile.write('{0:.12f}'.format(values[i][j])) #the value, corresponding to the file name, for the term
            outFile.write('   ')
        outFile.write("\n")

    outFile.close()

In [96]:
# TODO: modify this to build matrix then print from matrix form
def calc_and_print_CosineSimilarity_for_all(tfs, fileNames):
    #print(cosine_similarity(tfs[0], tfs[1]))
    print("\n\n\n========COSINE SIMILARITY====================================================================\n")
    numFiles = len(fileNames)
    names = []
    print('                   ', end="")    #formatting
    for i in range(numFiles):
        if i == 0:
            for k in range(numFiles):
                print(fileNames[k], end='   ')
            print()

        print(fileNames[i], end='   ')
        for n in range(numFiles):
            #print(fileNames[n], end='\t')
            matrixValue = cosine_similarity(tfs[i], tfs[n])
            numValue = matrixValue[0][0]
            #print(numValue, end='\t')
            names.append(fileNames[n])
            print(" {0:.8f}".format(numValue), end='         ')
            #(cosine_similarity(tfs[i], tfs[n]))[0][0]

        print()
    print("\n\n=============================================================================================\n")

In [97]:
def calc_and_write_CosineSimilarity_for_all(tfs, fileNames):
    filePath = "../results/cosine_similarity.txt"
    outFile = open(filePath, 'a')
    title = "COSINE SIMILARITY\n"
    outFile.write(title)
    numFiles = len(fileNames)
    names = []
    outFile.write('                   ')
    for i in range(numFiles):
        if i == 0:
            for k in range(numFiles):
                outFile.write(fileNames[k])
                outFile.write('   ')
            outFile.write("\n")
        outFile.write(fileNames[i])
        outFile.write('   ')

        for n in range(numFiles):
            matrixValue = cosine_similarity(tfs[i], tfs[n])
            numValue = matrixValue[0][0]
            names.append(fileNames[n])
            outFile.write('{0:.8f}'.format(numValue))
            outFile.write('         ')
            #(cosine_similarity(tfs[i], tfs[n]))[0][0]

        outFile.write("\n")

    outFile.close()

## Wrap Everything into `Main()`

In [98]:
def main(printResults=True):
    baseFolderPath = "./training/"

    fileNames, filePathList = returnListOfFilePaths(baseFolderPath)

    rawContentDict = create_docContentDict(filePathList)

    # calculate tfidf
    tfidf = TfidfVectorizer(tokenizer=processData, stop_words='english')
    tfs = tfidf.fit_transform(rawContentDict.values())
    tfs_Values = tfs.toarray()
    tfs_Term = tfidf.get_feature_names()
    
    if printResults:
        # print results
        print_TFIDF_for_all(tfs_Term, tfs_Values, fileNames)
        calc_and_print_CosineSimilarity_for_all(tfs, fileNames)
    else:
        # write results to file
        write_TFIDF_for_all(tfs_Term, tfs_Values, fileNames)   
        calc_and_write_CosineSimilarity_for_all(tfs, fileNames)

In [99]:
main()

  'stop_words.' % sorted(inconsistent))


                10ten.txt         1one.txt          2two.txt          3three.txt        4four.txt         5five.txt         6six.txt          7seven.txt        8eight.txt        9nine.txt         
's      	|  0.102591663252   0.000000000000   0.087959889294   0.067286897574   0.082491998138   0.000000000000   0.000000000000   0.067566551444   0.000000000000   0.095684641768   
.bi     	|  0.000000000000   0.000000000000   0.000000000000   0.125341578829   0.000000000000   0.000000000000   0.000000000000   0.000000000000   0.000000000000   0.000000000000   
.for    	|  0.000000000000   0.000000000000   0.000000000000   0.125341578829   0.000000000000   0.000000000000   0.000000000000   0.000000000000   0.000000000000   0.000000000000   
.that   	|  0.000000000000   0.000000000000   0.000000000000   0.125341578829   0.000000000000   0.000000000000   0.000000000000   0.000000000000   0.000000000000   0.000000000000   
1       	|  0.000000000000   0.000000000000   0.000000000000   0.000000

land    	|  0.000000000000   0.142909147931   0.000000000000   0.000000000000   0.000000000000   0.000000000000   0.000000000000   0.000000000000   0.000000000000   0.000000000000   
lang    	|  0.000000000000   0.000000000000   0.000000000000   0.000000000000   0.000000000000   0.000000000000   0.000000000000   0.000000000000   0.000000000000   0.089120352560   
latin   	|  0.000000000000   0.000000000000   0.000000000000   0.000000000000   0.000000000000   0.000000000000   0.000000000000   0.251725032343   0.000000000000   0.000000000000   
left    	|  0.000000000000   0.000000000000   0.000000000000   0.000000000000   0.000000000000   0.000000000000   0.215112934778   0.000000000000   0.000000000000   0.000000000000   
lesli   	|  0.000000000000   0.000000000000   0.000000000000   0.000000000000   0.000000000000   0.000000000000   0.000000000000   0.125862516171   0.000000000000   0.000000000000   
let     	|  0.000000000000   0.142909147931   0.000000000000   0.000000000000   0.000

thu     	|  0.000000000000   0.000000000000   0.000000000000   0.106551793208   0.000000000000   0.000000000000   0.000000000000   0.000000000000   0.000000000000   0.075760441709   
told    	|  0.000000000000   0.000000000000   0.000000000000   0.000000000000   0.000000000000   0.000000000000   0.107556467389   0.000000000000   0.000000000000   0.000000000000   
topic   	|  0.000000000000   0.000000000000   0.000000000000   0.000000000000   0.000000000000   0.000000000000   0.107556467389   0.000000000000   0.000000000000   0.000000000000   
tradit  	|  0.000000000000   0.000000000000   0.000000000000   0.000000000000   0.000000000000   0.000000000000   0.107556467389   0.000000000000   0.000000000000   0.000000000000   
tranquil	|  0.095553529070   0.000000000000   0.000000000000   0.000000000000   0.000000000000   0.000000000000   0.000000000000   0.000000000000   0.000000000000   0.000000000000   
treatment	|  0.000000000000   0.000000000000   0.000000000000   0.000000000000   0.00