In [None]:
from os import listdir
from os.path import isfile, join
from string import punctuation
from collections import Counter
from nltk.stem import PorterStemmer
import operator
import math

In [None]:
# Porter stemmer instance used by the tokenization step
ps = PorterStemmer()

# Stopwords

In [None]:
# Stopword list: one entry per line of the file, surrounding whitespace removed
with open( './data/stopwords.txt', 'r' ) as stopwords_file:
    stopwords = [ line.strip() for line in stopwords_file ]

# Reading files

In [None]:
# Directory that is scanned for the input documents
mypath = './data/citeseer/'

In [None]:
# Regular files found directly inside mypath (sub-directories are skipped).
# os.listdir returns entries in arbitrary order, so the names are sorted to keep
# the position of each file - and therefore its document id - stable between runs.
onlyfiles = sorted( f for f in listdir( mypath ) if isfile( join( mypath, f ) ) )

In [None]:
# Load every document as { 'doc_id' : position in onlyfiles, 'content' : text on a single line }
dict_files = []
for doc_id, file in enumerate( onlyfiles ):
    with open( join( mypath, file ), 'r' ) as myfile:
        # Line breaks become spaces: deleting them would glue the last word of a
        # line to the first word of the next one and corrupt the tokenization
        content = myfile.read().replace( '\n', ' ' )
    dict_files.append( { 'doc_id' : doc_id, 'content' : content } )

In [None]:
# Number of documents that were loaded
num_files = len( dict_files )
print( num_files )

In [None]:
#dict_files[ 0 ]

# Tokenization, punctuation removal, stemming and stopword filtering

In [None]:
# Show the characters that the tokenization step strips out
punctuation

In [None]:
# Turn every document into { 'doc_id' : id, 'words' : list of terms }, where the terms
# are lower-cased, stripped of punctuation, stemmed and free of stopwords.
punctuation_chars = set( punctuation )
stopword_set = set( stopwords )  # constant-time lookups instead of scanning the list for every token
dict_tokenized_files = []
for file in dict_files:
    # Drop punctuation characters, lower-case the rest, then split on whitespace
    words = ''.join( c.lower() for c in file[ 'content' ] if c not in punctuation_chars ).split()
    words2 = []
    for word in words:
        # Test the raw token first: stemming rewrites many stopwords
        # (e.g. 'this' becomes 'thi'), so checking only the stem lets them through
        if word in stopword_set:
            continue
        word = ps.stem( word )
        # Also discard tokens whose stem is itself a listed stopword
        if word not in stopword_set:
            words2.append( word )
    dict_tokenized_files.append( { 'doc_id' : file[ 'doc_id' ], 'words' : words2 } )

In [None]:
# Number of documents after tokenization
len( dict_tokenized_files )

In [None]:
# Collection size: how many terms are kept across all documents
total_words = sum( len( tokenized[ 'words' ] ) for tokenized in dict_tokenized_files )
print( total_words )

In [None]:
#dict_tokenized_files

# Sparse matrix

In [None]:
# Index keyed by term. Every entry is a dict with:
#   'docs'         - number of documents that contain the term
#   'word_id'      - integer id, handed out in order of first appearance
#   'frecs_by_doc' - { doc_id : occurrences of the term in that document }
sparse_matrix = dict()
word_id = 0
for file in dict_tokenized_files:
    # Counter keeps its keys in first-occurrence order, so ids no longer depend on
    # the per-process hash-randomised iteration order of a set of strings
    frecs_by_doc = Counter( file[ 'words' ] )
    for word, frec in frecs_by_doc.items():
        entry = sparse_matrix.get( word )
        if entry is None:
            # First time the term is seen anywhere in the collection
            entry = { 'docs' : 0, 'word_id' : word_id, 'frecs_by_doc' : dict() }
            sparse_matrix[ word ] = entry
            word_id += 1
        entry[ 'docs' ] += 1
        entry[ 'frecs_by_doc' ][ file[ 'doc_id' ] ] = frec

In [None]:
#sparse_matrix

In [None]:
# Vocabulary size: number of distinct terms in the index
len( sparse_matrix )

In [None]:
# Rank the vocabulary by collection-wide occurrence count and show the 20 most frequent terms
total_frecuencies = sorted(
    ( ( term, sum( entry[ 'frecs_by_doc' ].values() ) ) for term, entry in sparse_matrix.items() ),
    key = operator.itemgetter( 1 ),
    reverse = True
)
top_20 = total_frecuencies[ :20 ]
print( top_20 )

In [None]:
# Terms among the 20 most frequent that also appear in the stopword list
which_stops = [ term for term, _ in top_20 if term in stopwords ]
print( which_stops )

In [None]:
# Shortest prefix of the frequency ranking that accounts for 15% of all term occurrences
total_words_15 = total_words * 0.15
acum = 0
words_15 = []
for word in total_frecuencies:
    # Stop as soon as the running total reaches the threshold: with '<=' a total that
    # lands exactly on 15% pulled in one extra word, and without the break the loop
    # kept walking the whole vocabulary for nothing
    if acum >= total_words_15:
        break
    words_15.append( word[ 0 ] )
    acum += word[ 1 ]
print( words_15 )

# TF-IDF

In [None]:
# Highest raw term count inside each document.
# Position i holds the value for the i-th entry of dict_tokenized_files.
max_frecs_by_doc = []
for file in dict_tokenized_files:
    # default = 0 covers documents left without any term; most_common( 1 )[ 0 ]
    # raised IndexError on those
    max_frecs_by_doc.append( max( Counter( file[ 'words' ] ).values(), default = 0 ) )

In [None]:
# TF-IDF weights per term: { term : { 'word_id' : id, 'tfidf' : { doc_id : weight } } }
#   idf    = log2( number of documents / documents containing the term )
#   weight = ( occurrences in the document / highest term count of that document ) * idf
sparse_matrix_i = dict()
for term, entry in sparse_matrix.items():
    inverse_doc_frec = math.log( num_files / entry[ 'docs' ], 2.0 )
    sparse_matrix_i[ term ] = {
        'word_id' : entry[ 'word_id' ],
        'tfidf' : {
            doc_id : ( frec / max_frecs_by_doc[ doc_id ] ) * inverse_doc_frec
            for doc_id, frec in entry[ 'frecs_by_doc' ].items()
        },
    }

In [None]:
#sparse_matrix_i

In [None]:
# Print 'word_id - weight' for every term of the documents with ids 1 and 2
for doc in range( 0, num_files )[ 1:3 ]:
    print( 'Doc ID: ' + str( doc ) )
    for word, values in sparse_matrix_i.items():
        # Direct key lookup instead of scanning all postings of the term for a match
        if doc in values[ 'tfidf' ]:
            print( str( values[ 'word_id' ] ) + ' - ' + str( values[ 'tfidf' ][ doc ] ) )