In [None]:
import re
import math
import string
import numpy as np
import matplotlib.pyplot as plt
from sklearn import decomposition, preprocessing
from sklearn.metrics.pairwise import euclidean_distances
%matplotlib inline

In [None]:
def textFileToWordCounts(filename,minCount):
    """Count the words in a text file, keeping only the frequent ones.

    Each whitespace-separated token is lower-cased and stripped of every
    character outside ``a-z``; tokens that end up empty (numbers, bare
    punctuation) are ignored.

    Parameters
    ----------
    filename : str
        Path of the text file to read.
    minCount : int
        Minimum number of occurrences a word needs to be kept.

    Returns
    -------
    dict
        Maps each surviving word to its occurrence count, in order of first
        appearance in the file.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    # dictionary of words to counts
    wordmap = {}

    # the context manager closes the file even if reading raises part-way
    with open(filename) as fileptr:
        for line in fileptr:
            # split on any run of whitespace: splitting on " " alone leaves
            # tabs inside a token, and the regex below would then glue the
            # two neighbouring words into one
            for token in line.split():
                # lowercase first, then strip everything that is not a-z
                striptoken = re.sub(r"[^a-z]", "", token.lower())
                if striptoken:
                    wordmap[striptoken] = wordmap.get(striptoken, 0) + 1

    # prune out words with fewer than minCount occurrences
    return {word: count for word, count in wordmap.items() if count >= minCount}


In [None]:
# sanity check on a single document: keep words that occur at least 3 times,
# then show how many survived followed by the counts themselves
tmpWordMap = textFileToWordCounts('data/purdue.txt', 3)
print(len(tmpWordMap), tmpWordMap, sep='\n')

In [None]:
import os

# calculate term vector for set of files
directory = 'data'
minThresh=3
fileMap = {}
uniqueWords = set()
# os.listdir returns names in arbitrary order; sorting makes the row order of
# the matrix (and every plot label derived from it) reproducible
for filename in sorted(os.listdir(directory)):
    filepath = os.path.join(directory, filename)
    # skip anything that is not a regular file (e.g. a checkpoints folder),
    # since open() would fail on it
    if not os.path.isfile(filepath):
        continue
    tmpWordMap = textFileToWordCounts(filepath, minThresh)
    fileMap[filename] = tmpWordMap
    # add this document's words to the overall vocabulary
    uniqueWords.update(tmpWordMap)

# create a document term matrix for the set of files:
# one row per file, one column per word of the sorted vocabulary
fileList = list(fileMap.keys())
numFiles = len(fileList)
uniqueWordList = sorted(uniqueWords)
numWords = len(uniqueWordList)
print('Num words ' + str(numWords) + ', num files '+ str(numFiles))

# look up each word's column once instead of scanning the list per token
colIndex = {word: idx for idx, word in enumerate(uniqueWordList)}

# initialize matrix to zeros, then fill in the counts of each document
termMatrix = np.zeros((numFiles,numWords))
for rowIdx, file in enumerate(fileList):
    for token, count in fileMap[file].items():
        termMatrix[rowIdx, colIndex[token]] = count
print(termMatrix[:6,:12])
print(fileList)

In [None]:
# visualize the data (first 30 word columns only)
plt.imshow(termMatrix[:, 0:30])
# show the first entry of the sorted vocabulary as the cell's output
uniqueWordList[0]

In [None]:
# reduce dimensionality with PCA
def applyPCA(data,numCmps):
    """Mean-center ``data`` and project it onto ``numCmps`` principal components.

    Parameters
    ----------
    data : array-like
        Matrix handed to ``preprocessing.scale``; only the mean is removed,
        the per-column scale is left untouched (``with_std=False``).
    numCmps : int
        Value passed to ``PCA`` as ``n_components``.

    Returns
    -------
    list
        ``[projected, ratios]``: the transformed data and the fitted model's
        ``explained_variance_ratio_``.
    """
    centered = preprocessing.scale(data, with_std=False)
    model = decomposition.PCA(n_components=numCmps)
    projected = model.fit_transform(centered)
    return [projected, model.explained_variance_ratio_]

In [None]:
# run PCA on the raw counts, asking for as many components as there are files
transData, exvar = applyPCA(termMatrix, numFiles)
# determine best number of dimensions: explained-variance ratio per component,
# with components numbered from 1
cmpIdx = range(1, 1 + numFiles)
plt.scatter(cmpIdx, exvar)

In [None]:
# plot transformed data, 1st two dimensions
plt.scatter(transData[:, 0], transData[:, 1])
# label each point with its file name, offset by (+3, -2) from the marker
for idx, file in enumerate(fileList):
    plt.text(transData[idx, 0] + 3, transData[idx, 1] - 2, file)

In [None]:
# normalize for document length: each row becomes count / total count of the row
docLens = termMatrix.sum(axis=1, keepdims=True)
# a document whose words were all pruned has length 0; keep its row at zero
# instead of dividing by zero, which would feed NaNs into the PCA step
normMatrix = np.divide(termMatrix, docLens,
                       out=np.zeros_like(termMatrix, dtype=float),
                       where=docLens > 0)
plt.imshow(normMatrix[:,:30])

In [None]:
# apply PCA to the length-normalized term matrix
transNormData, exvar = applyPCA(normMatrix, numFiles)

# scatter the documents on the first two components
plt.scatter(transNormData[:, 0], transNormData[:, 1])
# label points with the file name followed by its row index in parentheses
for idx, file in enumerate(fileList):
    plt.text(transNormData[idx, 0], transNormData[idx, 1], file + '(' + str(idx) + ')')

In [None]:
# pairwise Euclidean distances between documents in the normalized PCA space,
# drawn as a heat map
dists = euclidean_distances(transNormData)
plt.imshow(dists, cmap='RdBu')

In [None]:
# multiply term frequency by inverse document frequency
# IDF(t) = log(Num of docs / Num of docs with term t in it)
tfidfMatrix = normMatrix.copy()
for j in range(normMatrix.shape[1]):
    # number of documents in which term j has a positive frequency
    docswterm = sum(1 for i in range(numFiles) if normMatrix[i,j] > 0)
    # a term found in no document gets weight 0 instead of log(N/0)
    termidf = math.log(numFiles/docswterm) if docswterm > 0 else 0
    tfidfMatrix[:,j] = normMatrix[:,j]*termidf
plt.imshow(tfidfMatrix[:,:30])

In [None]:
# apply PCA to the TF-IDF weighted matrix
transTFIDFData, exvar = applyPCA(tfidfMatrix, numFiles)

# scatter the documents on the first two components
plt.scatter(transTFIDFData[:, 0], transTFIDFData[:, 1])
# label each point with its file name
for idx, file in enumerate(fileList):
    plt.text(transTFIDFData[idx, 0], transTFIDFData[idx, 1], file)

In [None]:
# pairwise Euclidean distances between documents in the TF-IDF PCA space,
# drawn as a heat map
dists = euclidean_distances(transTFIDFData)
plt.imshow(dists, cmap='RdBu')

In [None]:
# find words with largest variance across documents
# NOTE(review): the name `vars` shadows the Python builtin; it is kept here so
# any other cell that reads this list keeps working
vars = [np.var(tfidfMatrix[:, j]) for j in range(tfidfMatrix.shape[1])]
print(max(vars))
# print every word whose variance is strictly above the 90th percentile,
# together with its TF-IDF column
thresh = np.percentile(vars, 90)
for j, wordVar in enumerate(vars):
    if wordVar > thresh:
        print(uniqueWordList[j])
        print(tfidfMatrix[:, j])