In [21]:
from os import listdir
from os.path import isfile, join
from string import punctuation
from collections import Counter
from nltk.stem import PorterStemmer
import math

In [2]:
ps = PorterStemmer()

# Stopwords

In [3]:
with open( './data/stopwords.txt', 'r' ) as f:
    stopwords = f.readlines()
stopwords = [ x.strip() for x in stopwords ] 

# Reading files

In [4]:
mypath = './data/citeseer/'

In [5]:
onlyfiles = [ f for f in listdir( mypath ) if isfile( join( mypath, f ) ) ]

In [6]:
dict_files = []
for file in onlyfiles:
    with open( mypath + file, 'r' ) as myfile:
        dict_files.append( { 'doc_id' : onlyfiles.index( file ), 'content' : myfile.read().replace( '\n', '' ) } )

In [7]:
num_files = len( dict_files )
print( num_files )

3186


In [8]:
#dict_files[ 0 ]

# Tokenization and removing punctuation

In [9]:
punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [10]:
dict_tokenized_files = []
for file in dict_files:
    words = ''.join( c.lower() for c in file[ 'content' ] if ( c not in punctuation ) ).split()
    words2 = []
    for word in words:
        word = ps.stem( word )
        if word not in stopwords:
            words2.append( word )
    dict_tokenized_files.append( { 'doc_id' : file[ 'doc_id' ], 'words' : words2 } )

In [11]:
# Number of documents
len( dict_tokenized_files )

3186

In [12]:
# Total number of words in the collection
total_words = 0
for file in dict_tokenized_files:
    total_words += len( file[ 'words' ] )
print( total_words )

280812


# Sparse matrix

In [13]:
sparse_matrix = dict()
word_id = 0
for file in dict_tokenized_files:
    
    uq_in_doc = set( file[ 'words' ] )
    for word in uq_in_doc:
        try:
            sparse_matrix[ word ][ 'docs' ] = sparse_matrix[ word ][ 'docs' ] + 1
        except KeyError:
            sparse_matrix[ word ] = dict()
            sparse_matrix[ word ][ 'docs' ] = 1
            sparse_matrix[ word ][ 'word_id' ] = word_id
            word_id += 1
    
    frecs_by_doc = Counter()
    frecs_by_doc.update( file[ 'words' ] )
    frecs_by_doc = dict( frecs_by_doc )
    for word in frecs_by_doc:
        try:
            sparse_matrix[ word ][ 'frecs_by_doc' ][ file[ 'doc_id' ] ] = frecs_by_doc[ word ]
        except KeyError:
            sparse_matrix[ word ][ 'frecs_by_doc' ] = dict()
            sparse_matrix[ word ][ 'frecs_by_doc' ][ file[ 'doc_id' ] ] = frecs_by_doc[ word ]

In [15]:
#sparse_matrix

In [16]:
# Vocabulary size
len( sparse_matrix )

13589

In [17]:
# Top words
total_frecuencies = []
for word in sparse_matrix:
    total_frecuencies.append( ( word, sum( sparse_matrix[ word ][ 'frecs_by_doc' ].values() ) ) )
total_frecuencies = sorted( total_frecuencies, key = lambda x: x[ 1 ], reverse = True )
top_20 = total_frecuencies[ :20 ]
print( top_20 )

[('thi', 4446), ('system', 3741), ('data', 2691), ('agent', 2688), ('inform', 2398), ('model', 2315), ('paper', 2246), ('queri', 1905), ('user', 1756), ('learn', 1740), ('algorithm', 1584), ('1', 1552), ('approach', 1544), ('problem', 1543), ('applic', 1522), ('present', 1507), ('base', 1486), ('web', 1439), ('databas', 1424), ('comput', 1411)]


In [18]:
# Which are stopwords
which_stops = []
for word in top_20:
    if word[ 0 ] in stopwords:
        which_stops.append( word[ 0 ] )
print( which_stops )

[]


In [19]:
# 15%
total_words_15 = total_words * 0.15
acum = 0
words_15 = []
for word in total_frecuencies:
    if acum <= total_words_15:
        words_15.append( word[ 0 ] )
        acum += word[ 1 ]
print( words_15 )

['thi', 'system', 'data', 'agent', 'inform', 'model', 'paper', 'queri', 'user', 'learn', 'algorithm', '1', 'approach', 'problem', 'applic', 'present', 'base', 'web', 'databas', 'comput', 'method']


# TF-IDF

In [31]:
sparse_matrix_i = dict()
for word, values in sparse_matrix.items():
    df = sparse_matrix[ word ][ 'docs' ]
    idf = math.log( num_files / df, 2.0 )
    sparse_matrix_i[ word ] = dict()
    sparse_matrix_i[ word ][ 'word_id' ] = sparse_matrix[ word ][ 'word_id' ]
    sparse_matrix_i[ word ][ 'tfidf' ] = dict()
    for doc, tf in sparse_matrix[ word ][ 'frecs_by_doc' ].items():
        ifidf = tf * idf
        sparse_matrix_i[ word ][ 'tfidf' ][ doc ] = ifidf        

In [32]:
#sparse_matrix_i

In [34]:
for doc in range( 0, num_files )[ 1:2 ]:
    print( 'Doc ID: ' + str( doc ) )
    for word, values in sparse_matrix_i.items():
        for d, tfidf in sparse_matrix_i[ word ][ 'tfidf' ].items():
            if doc == d:
                print( str( sparse_matrix_i[ word ][ 'word_id' ] ) + ' - ' + str( tfidf ) )

Doc ID: 1
2 - 5.378326639881265
5 - 1.2462869620978676
21 - 1.7733444068710296
27 - 1.445237737054543
31 - 0.6334600052873457
35 - 1.7053157995569246
37 - 1.7733444068710296
43 - 2.2154657853524977
44 - 13.168069643596917
49 - 26.922559823667164
50 - 3.0906360916376734
51 - 18.83993564072168
52 - 2.933626978080646
53 - 9.052568050804155
54 - 7.67326130321001
55 - 6.683334241138436
56 - 6.352128332663061
57 - 4.0081739314457
58 - 3.4526552086170264
59 - 4.082941699847672
60 - 7.550067710274971
61 - 9.315602456637949
62 - 5.067674943194362
63 - 9.63753055152531
64 - 6.779549556397738
65 - 30.157704152412464
66 - 5.145677455195635
67 - 7.63753055152531
68 - 4.951030024342091
69 - 2.2581521844540475
70 - 4.671746266863223
71 - 23.490526888403117
72 - 9.93021041910763
73 - 4.546791793034516
74 - 11.637530551525309
75 - 5.263812004662864
76 - 6.47325086248863
77 - 9.315602456637949
78 - 10.49393759737955
79 - 9.315602456637949
80 - 8.830175629467707
81 - 0.7842209961216352
82 - 7.17809893288