In [6]:
from tika import parser
import pandas as pd
import numpy as np
import string
import time
import re
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

In [5]:
def my_analyzer(s):
    # Synonym List
    """syns = {'veh': 'vehicle', 'car': 'vehicle', 'chev':'cheverolet', \
              'chevy':'cheverolet', 'air bag': 'airbag', \
              'seat belt':'seatbelt', "n't":'not', 'to30':'to 30', \
              'wont':'would not', 'cant':'can not', 'cannot':'can not', \
              'couldnt':'could not', 'shouldnt':'should not', \
              'wouldnt':'would not', 'straightforward': 'straight forward' }
    """
    
    # Preprocess String s
    s = s.lower()
    # Replace special characters with spaces
    s = s.replace('-', ' ')
    s = s.replace('_', ' ')
    s = s.replace(',', '. ')
    # Replace not contraction with not
    s = s.replace("'nt", " not")
    s = s.replace("n't", " not")
    # Tokenize 
    tokens = word_tokenize(s)
    #tokens = [word.replace(',','') for word in tokens ]
    tokens = [word for word in tokens if ('*' not in word) and \
              ("''" != word) and ("``" != word) and \
              (word!='description') and (word !='dtype') \
              and (word != 'object') and (word!="'s")]
    
    # Map synonyms
    """for i in range(len(tokens)):
        if tokens[i] in syns:
            tokens[i] = syns[tokens[i]]
    """
            
    # Remove stop words
    punctuation = list(string.punctuation)+['..', '...']
    pronouns = ['i', 'he', 'she', 'it', 'him', 'they', 'we', 'us', 'them']
    others   = ["'d", "co", "ed", "put", "say", "get", "can", "become",\
                "los", "sta", "la", "use", "iii", "else","honda","vehicle"\
                "dealer","problem"]
    stop = stopwords.words('english') + punctuation + pronouns + others
    filtered_terms = [word for word in tokens if (word not in stop) and \
                  (len(word)>1) and (not word.replace('.','',1).isnumeric()) \
                  and (not word.replace("'",'',2).isnumeric())]
    
    # Lemmatization & Stemming - Stemming with WordNet POS
    # Since lemmatization requires POS need to set POS
    tagged_words = pos_tag(filtered_terms, lang='eng')
    # Stemming with for terms without WordNet POS
    stemmer = SnowballStemmer("english")
    wn_tags = {'N':wn.NOUN, 'J':wn.ADJ, 'V':wn.VERB, 'R':wn.ADV}
    wnl = WordNetLemmatizer()
    stemmed_tokens = []
    for tagged_token in tagged_words:
        term = tagged_token[0]
        pos  = tagged_token[1]
        pos  = pos[0]
        try:
            pos   = wn_tags[pos]
            stemmed_tokens.append(wnl.lemmatize(term, pos=pos))
        except:
            stemmed_tokens.append(stemmer.stem(term))
    return stemmed_tokens

TEST = corpus = [
     'This is the first document.'
]

#TEST = my_analyzer
tfidf_vect = TfidfVectorizer(max_df=1.0,
                             min_df=1, 
                             max_features=None,
                             lowercase=True)

raw = parser.from_file("D:/BOOKS/Ross_Probability10ed_Student_Solutions2010.pdf")

def new_analyzer(s):
    s = s.lower()
    return re.findall(r'[a-z]+', s)
#res = tfidf_vect.fit(list(raw['content']))
#tfidf_vect.get_feature_names()
print(new_analyzer(raw['content']))



In [37]:
def new_analyzer(s):
    s = s.lower()
    tokens = re.findall(r'[a-z]+', s)
    # Remove stop words
    punctuation = list(string.punctuation)+['..', '...']
    pronouns = ['i', 'he', 'she', 'it', 'him', 'they', 'we', 'us', 'them']
    others   = ["'d", "co", "ed", "put", "say", "get", "can", "become",\
                "los", "sta", "la", "use", "iii", "else","honda","vehicle"\
                "dealer","problem"]
    stop = stopwords.words('english') + punctuation + pronouns + others
    tokens = [token for token in tokens if (token not in stop) and (len(token) > 5)]
    return tokens
print(new_analyzer(raw['content']))



In [40]:
tfidf_vect = TfidfVectorizer(max_df=1,
                             min_df=0, 
                             max_features=None,
                             lowercase=False)
con_vect = CountVectorizer()
tfidf_vect.fit(new_analyzer(raw['content']))
con_vect.fit(new_analyzer(raw['content']))
print(len(tfidf_vect.get_feature_names()))
tfidf_vect.get_feature_names()
con_vect.vocabulary_
tfidf_vect.vocabulary_

676


{'instructor': 329,
 'manual': 369,
 'accompany': 4,
 'introduction': 333,
 'models': 386,
 'edition': 214,
 'sheldon': 568,
 'university': 649,
 'southern': 580,
 'angeles': 27,
 'amsterdam': 25,
 'boston': 58,
 'heidelberg': 296,
 'london': 362,
 'francisco': 271,
 'singapore': 574,
 'sydney': 616,
 'corporate': 147,
 'burlington': 63,
 'street': 598,
 'boulevard': 59,
 'langford': 347,
 'kidlington': 342,
 'rights': 554,
 'reserved': 540,
 'publication': 503,
 'reproduced': 534,
 'transmitted': 635,
 'electronic': 216,
 'mechanical': 377,
 'photocopying': 467,
 'recording': 517,
 'storage': 595,
 'retrieval': 550,
 'details': 186,
 'policies': 476,
 'arrangements': 41,
 'organizations': 434,
 'clearance': 91,
 'center': 79,
 'licensing': 359,
 'agency': 16,
 'website': 666,
 'contributions': 141,
 'protected': 500,
 'notices': 413,
 'practice': 482,
 'constantly': 131,
 'broaden': 62,
 'understanding': 645,
 'medical': 378,
 'treatment': 636,
 'necessary': 402,
 'practitioners': 483

In [45]:
set(tfidf_vect.vocabulary_.keys())

{'acceptance',
 'accepted',
 'accessible',
 'accident',
 'accompany',
 'accord',
 'acheck',
 'ackack',
 'actual',
 'addbleedmarks',
 'addcolorbars',
 'addcropmarks',
 'addpageinfo',
 'addregmarks',
 'afdrukken',
 'affects',
 'agency',
 'agrees',
 'aikajk',
 'aikajtcov',
 'allowpsxobjects',
 'allowtransparency',
 'alteracting',
 'alwaysembed',
 'amounts',
 'amsterdam',
 'analyze',
 'angeles',
 'animal',
 'animation',
 'antialiascolorimages',
 'antialiasgrayimages',
 'antialiasmonoimages',
 'appear',
 'applicable',
 'appointment',
 'appropriate',
 'approximate',
 'arbitrary',
 'arranged',
 'arrangement',
 'arrangements',
 'asreaderspreads',
 'assert',
 'attendant',
 'authors',
 'autofiltercolorimages',
 'autofiltergrayimages',
 'autopositionepsfiles',
 'autorotatepages',
 'available',
 'averaging',
 'barefooted',
 'beginning',
 'bilistically',
 'binding',
 'bitimages',
 'blocking',
 'boston',
 'boulevard',
 'boundary',
 'bounds',
 'broaden',
 'burlington',
 'business',
 'calcmykprofile',