In [1]:
from os import listdir
from os.path import isfile, join
from string import punctuation
from collections import Counter
from nltk.stem import PorterStemmer

In [2]:
# Porter stemmer instance from nltk. None of the cells shown in this part of
# the notebook reference it; presumably it is used in a later stemming step.
ps = PorterStemmer()

# Stopwords

In [3]:
# Load the stopword list: one entry per line of the file, with surrounding
# whitespace (including the trailing newline) stripped from each entry.
with open( './data/stopwords.txt', 'r' ) as f:
    stopwords = [ line.strip() for line in f ]

# Reading files

In [4]:
# Directory containing the corpus; every regular file in it is loaded as one
# document. The trailing slash lets paths be formed by plain string
# concatenation as well as by os.path.join.
mypath = './data/citeseer/'

In [5]:
# Names of the regular files (sub-directories skipped) found in the corpus
# directory. listdir() returns entries in arbitrary order, and a document's
# position in this list becomes its doc_id, so the listing is sorted to keep
# doc_ids stable across runs and machines.
onlyfiles = sorted( f for f in listdir( mypath ) if isfile( join( mypath, f ) ) )

In [6]:
# Read every corpus file into memory as { 'doc_id': int, 'content': str }.
# doc_id is the file's position in `onlyfiles`; enumerate() yields it directly
# instead of a linear onlyfiles.index() lookup per file (quadratic overall).
dict_files = []
for doc_id, file in enumerate( onlyfiles ):
    with open( join( mypath, file ), 'r' ) as myfile:
        # NOTE(review): newlines are removed, not replaced by a space, so the
        # last word of a line is fused with the first word of the next line
        # unless the line ends in whitespace -- confirm this is intended.
        dict_files.append( { 'doc_id' : doc_id, 'content' : myfile.read().replace( '\n', '' ) } )

In [7]:
# Number of documents loaded from the corpus directory.
num_files = len( dict_files )
print( num_files )

3186


In [8]:
#dict_files[ 0 ]

# Tokenization and removing punctuation

In [9]:
# Display string.punctuation: the set of characters that the tokenization
# step removes from every document.
punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [10]:
# Deletion table: every character of string.punctuation maps to None.
strip_punct = str.maketrans( '', '', punctuation )

# Tokenize each document: drop punctuation, lower-case, split on whitespace.
# Each result is stored as { 'doc_id': int, 'words': list of str }.
dict_tokenized_files = []
for doc in dict_files:
    no_punct = doc[ 'content' ].translate( strip_punct )
    # Lower-casing is applied character by character before the split.
    words = ''.join( map( str.lower, no_punct ) ).split()
    dict_tokenized_files.append( { 'doc_id' : doc[ 'doc_id' ], 'words' : words } )

In [11]:
# Number of tokenized documents (one entry is produced per loaded file, so
# this should equal num_files).
len( dict_tokenized_files )

3186

In [12]:
# Total number of words (tokens, repetitions included) in the collection
total_words = sum( len( doc[ 'words' ] ) for doc in dict_tokenized_files )
print( total_words )

476198


# Sparse matrix

In [13]:
# Build the term index. For every distinct word in the collection:
#   sparse_matrix[ word ] = {
#       'docs'         : number of documents containing the word,
#       'word_id'      : integer id, assigned in order of first appearance,
#       'frecs_by_doc' : { doc_id : occurrences of the word in that document },
#   }
# A single Counter pass per document replaces the former set + Counter double
# scan. Counter iterates in insertion order, so word_id values are the same on
# every run; iterating a set of strings made them depend on hash randomisation.
sparse_matrix = dict()
word_id = 0
for file in dict_tokenized_files:
    frecs_by_doc = Counter( file[ 'words' ] )
    for word, frec in frecs_by_doc.items():
        entry = sparse_matrix.get( word )
        if entry is None:
            # First time this word is seen anywhere in the collection.
            entry = { 'docs' : 0, 'word_id' : word_id, 'frecs_by_doc' : dict() }
            sparse_matrix[ word ] = entry
            word_id += 1
        # Each word appears once per document in the Counter, so this counts
        # documents, not occurrences.
        entry[ 'docs' ] += 1
        entry[ 'frecs_by_doc' ][ file[ 'doc_id' ] ] = frec

In [15]:
#sparse_matrix

In [16]:
# Vocabulary size: number of distinct words (keys of sparse_matrix) in the
# whole collection.
len( sparse_matrix )

19886

In [17]:
# Top words
# Collection-wide frequency of a word = sum of its per-document counts.
total_frecuencies = [
    ( term, sum( info[ 'frecs_by_doc' ].values() ) )
    for term, info in sparse_matrix.items()
]
# Most frequent first; the sort is stable, so ties keep vocabulary order.
total_frecuencies.sort( key = lambda pair: pair[ 1 ], reverse = True )
top_20 = total_frecuencies[ :20 ]
print( top_20 )

[('the', 25662), ('of', 18638), ('and', 14131), ('a', 13345), ('to', 11536), ('in', 10067), ('for', 7379), ('is', 6577), ('we', 5138), ('that', 4820), ('this', 4446), ('are', 3737), ('on', 3656), ('an', 3281), ('with', 3200), ('as', 3057), ('by', 2765), ('data', 2691), ('be', 2500), ('information', 2322)]


In [18]:
# Which of the 20 most frequent words are stopwords (ranking order kept)
which_stops = [ pair[ 0 ] for pair in top_20 if pair[ 0 ] in stopwords ]
print( which_stops )

['the', 'of', 'and', 'a', 'to', 'in', 'for', 'is', 'we', 'that', 'this', 'are', 'on', 'an', 'with', 'as', 'by', 'be']


In [19]:
# Shortest prefix of the frequency ranking whose counts add up to more than
# 15% of all tokens in the collection.
total_words_15 = total_words * 0.15
acum = 0
words_15 = []
for term, frec in total_frecuencies:
    # The running total never decreases, so once it has passed the threshold
    # no later word can qualify: stop instead of scanning the whole vocabulary.
    if acum > total_words_15:
        break
    words_15.append( term )
    acum += frec
print( words_15 )

['the', 'of', 'and', 'a']
