In [1]:
import os
import time
import numpy as np
import sys

**How long the program took to acquire the text characteristics?**

-> 0.9574556350708008 s

**How the program handles:**

 **A. Upper and lower case words (e.g. "People", "people", "Apple", "apple")**
 
 -> All the words are converted to lower case and then tokenized
 
 **B. Words with dashes (e.g. "1996-97", "middle-class", "30-year", "tean-ager")**
 
 -> Hyphen is removed from compound words so middle-class becomes middleclass
 
 **C. Possessives (e.g. "sheriff's", "university's")**
 
 -> Apostrophee is also removed from words so sheriff's becomes sherrifs
 
 **D. Acronyms (e.g., "U.S.", "U.N.")**
 
 -> Dot(.) from acronyms are removed so U.S. becomes us
 
-> Numbers and digits are also tokenized

**Briefly discuss your major algorithms and data structures.**

-> To save results of tokenization and stemming dictionaries are used so that the retrieval and insertion of data takes o(1)

-> Dictionary stores data as {string(token or stem):integer(frequency)}

# Tokenization

In [2]:
class Tokenizer:
        
    def __init__(self,path): 
        self.Cranfield_path=path
        self.totalCount=0
        self.dictionary={}
        self.stopwords=self.createStopwordsdict()#Created stop words dictionary
        self.fileCount=0
        self.avgTokensPerdoc=0.0
    
    def createStopwordsdict(self):
        stopwords=['ourselves', 'hers', 'between', 'yourself', 'but', 'again', 'there', 'about', 'once', 'during', 'out', 'very', 'having', 'with', 'they', 'own', 'an', 'be', 'some', 'for', 'do', 'its', 'yours', 'such', 'into', 'of', 'most', 'itself', 'other', 'off', 'is', 's', 'am', 'or', 'who', 'as', 'from', 'him', 'each', 'the', 'themselves', 'until', 'below', 'are', 'we', 'these', 'your', 'his', 'through', 'don', 'nor', 'me', 'were', 'her', 'more', 'himself', 'this', 'down', 'should', 'our', 'their', 'while', 'above', 'both', 'up', 'to', 'ours', 'had', 'she', 'all', 'no', 'when', 'at', 'any', 'before', 'them', 'same', 'and', 'been', 'have', 'in', 'will', 'on', 'does', 'yourselves', 'then', 'that', 'because', 'what', 'over', 'why', 'so', 'can', 'did', 'not', 'now', 'under', 'he', 'you', 'herself', 'has', 'just', 'where', 'too', 'only', 'myself', 'which', 'those', 'i', 'after', 'few', 'whom', 't', 'being', 'if', 'theirs', 'my', 'against', 'a', 'by', 'doing', 'it', 'how', 'further', 'was', 'here', 'than']
        stopwordsDict={}
        for i in stopwords:
            stopwordsDict[i]=1
        return stopwordsDict
    
    def tokenize(self):
        for filename in os.listdir(self.Cranfield_path):
            self.fileCount+=1
            with open('\\'.join((self.Cranfield_path,filename))) as f:
                for currline in f.readlines():
                    tokenizedLine=currline.split()
                    for words in tokenizedLine:
                        if not words.startswith('<'):
                            if not words in self.stopwords:
                                words=words.strip()
                                words=words.lower()#Converted every word to lowercase
                                words=words.replace('.','')#Removed dot(.)
                                words=words.replace('-','')#Removed Hyphen from compound words and merged them
                                words=words.replace(',','')#Removed comma(,)
                                words=words.replace('(','')#Removed opening bracket (
                                words=words.replace(')','')#Removed closing bracket )
                                words=words.replace('/','')#Removed both slashes
                                words=words.replace('/','')
                                words=words.replace("'",'')#Removed apostrophee
                                if words!='' and words!=' ' and len(words)>2:#Removed words which contain less than 2 characters
                                    self.addTodict(words)
        self.dictionary=dict(sorted(self.dictionary.items(), key=lambda x: x[1],reverse=True))                                       
    
    def addTodict(self,word):
        self.totalCount+=1
        if word in self.dictionary:
            self.dictionary[word]+=1
        else:
            self.dictionary[word]=1
    
    def stats(self):
        print('Tokenization Stats')
        print('Number of Tokens: '+ str(self.totalCount))
        print('Number of Unique words: '+str(len(self.dictionary)))
        print('Number of words that occur only once: '+str(len([key for key,val in self.dictionary.items() if val==1])))
        print('30 Frequent words:')
        self.frequentwords(30)
        self.avgTokensPerdoc=self.totalCount/self.fileCount
        print('Average tokens per document: '+str(self.avgTokensPerdoc))
    
    def frequentwords(self,n):
        i=0
        for word,freq in self.dictionary.items():
            if i==n:
                break
            print(word+': '+str(freq))
            i+=1
    
    def getTokenDict(self):
        return self.dictionary
    
    def getfilecount(self):
        return self.fileCount
    
    def outputdict(self):
        np.savetxt('tokens.csv',np.array(list(self.dictionary.items())),delimiter=',',fmt='%s')

# Stemming

In [22]:
class Stemming:
    
    def __init__(self,filecount): 
        self.totalCount=0
        self.dictionary={}
        self.fileCount=filecount
        self.avgTokensPerdoc=0.0
    
    def isvowel(self,char):
        if char=='a' or char=='e' or char=='i' or char=='o' or char=='u':
            return True
        else:
            return False
    
    #*v*
    def containsVowel(self,word):
        for i in word:
            if self.isvowel(i):
                return True
        return False
    
    #*d
    def endsinDoubleCons(self,word):
        if len(word)>=2:
            if self.isConsonant(word,-1) and self.isConsonant(word,-2):
                return True
        return False
    
    #*o i.e. checks for cvc
    def check_O(self,word):
        if len(word)>=3:
            if self.isConsonant(word,-3) and self.isvowel(word[-2]) and self.isConsonant(word,-1):
                char=word[-1]
                if char!='w' and char!='y' and char!='x':
                    return True
        return False
        
    def isConsonant(self,word,i):
        char=word[i]
        if not self.isvowel(char):
            if char=='y':
                if i==0 or abs(i)==len(word):
                    return True
                elif not self.isvowel(i-1):
                    return False
                else:
                    return True
        return False
        
    #calculates m for each word    
    def base_M(self,word):
        base=[]
        basestr=''
        for i in range(len(word)):
            if self.isConsonant(word,i):
                if i!=0:
                    prev=base[-1]
                    if prev!='C':
                        base.append('C')
                else:
                    base.append('C')
            else:
                if i!=0:
                    prev=base[-1]
                    if prev!='V':
                        base.append('V')
                else:
                    base.append('V')
        return ''.join(base).count('VC')
                    
       
    def replace(self,word,original,changed):
        m=word.rfind(original)
        return word[:m]+changed
    
    def step1a(self,word):
        if word.endswith('sses'):
            word=self.replace(word,'sses','ss')
        elif word.endswith('ies'):
            word=self.replace(word,'ies','i')
        elif word.endswith('ss'):
            word=self.replace(word,'ss','ss')
        elif word.endswith('s'):
            word=self.replace(word,'s','')
        return word
    
    def step1b(self,word):
        flag=False
        if word.endswith('eed'):
            m=word.rfind('eed')
            result=word[:m]
            if self.base_M(result)>0:
                word=result
                word+='ee'
        elif word.endswith('ed'):
            m=word.rfind('ed')
            result=word[:m]
            if self.containsVowel(result):
                word=result
                flag=True
        elif word.endswith('ing'):
            m=word.rfind('ing')
            result=word[:m]
            if self.containsVowel(result):
                word=result
                flag=True
        if flag:
            if word.endswith('at') or word.endswith('bl') or word.endswith('iz'):
                word+='e'
            elif self.endsinDoubleCons(word) and not word.endswith('l') and not word.endswith('s') and not word.endswith('z'):
                word=word[:-1]
            elif self.base_M(word)==1 and self.check_O(word):
                word+='e'
        return word
    
    def step1c(self,word):
        if word.endswith('y'):
            result = word[:-1]
            if self.containsVowel(word):
                word=result
                word+='i'
        return word
    
    def step2(self,word):
        if self.base_M(word)>0:
            if word.endswith('ational'):
                word=self.replace(word,'ational','ate')
            elif word.endswith('tional'):
                word=self.replace(word,'tional','tion')
            elif word.endswith('enci'):
                word=self.replace(word,'enci','ence')
            elif word.endswith('anci'):
                word=self.replace(word,'anci','ance')
            elif word.endswith('izer'):
                word=self.replace(word,'izer','ize')
            elif word.endswith('abli'):
                word=self.replace(word,'abli','able')
            elif word.endswith('alli'):
                word=self.replace(word,'alli','al')
            elif word.endswith('entli'):
                word=self.replace(word,'entli','ent')
            elif word.endswith('eli'):
                word=self.replace(word,'eli','e')
            elif word.endswith('ousli'):
                word=self.replace(word,'ousli','ous')
            elif word.endswith('ization'):
                word=self.replace(word,'ization','ize')
            elif word.endswith('ation'):
                word=self.replace(word,'ation','ate')
            elif word.endswith('ator'):
                word=self.replace(word,'ator','ate')
            elif word.endswith('alism'):
                word=self.replace(word,'alism','al')
            elif word.endswith('iveness'):
                word=self.replace(word,'iveness','ive')
            elif word.endswith('fulness'):
                word=self.replace(word,'fulness','ful')
            elif word.endswith('ousness'):
                word=self.replace(word,'ousness','ous')
            elif word.endswith('aliti'):
                word=self.replace(word,'aliti','al')
            elif word.endswith('iviti'):
                word=self.replace(word,'iviti','ive')
            elif word.endswith('bliti'):
                word=self.replace(word,'bliti','ble')
        return word
    
    def step3(self,word):
        if self.base_M(word)>0:
            if word.endswith('icate'):
                word=self.replace(word,'icate','ic')
            elif word.endswith('ative'):
                word=self.replace(word,'ative','')
            elif word.endswith('alize'):
                word=self.replace(word,'alize','al')
            elif word.endswith('iciti'):
                word=self.replace(word,'iciti','ic')
            elif word.endswith('ical'):
                word=self.replace(word,'ical','ic')
            elif word.endswith('ful'):
                word=self.replace(word,'ful','')
            elif word.endswith('ness'):
                word=self.replace(word,'ness','')
        return word
    
    def step4(self,word):
        if self.base_M(word)>1:
            if word.endswith('al'):
                word=self.replace(word,'al','')
            elif word.endswith('ance'):
                word=self.replace(word,'ance','')
            elif word.endswith('ence'):
                word=self.replace(word,'ence','')
            elif word.endswith('er'):
                word=self.replace(word,'er','')
            elif word.endswith('ic'):
                word=self.replace(word,'ic','')
            elif word.endswith('able'):
                word=self.replace(word,'able','')
            elif word.endswith('ible'):
                word=self.replace(word,'ible','')
            elif word.endswith('ant'):
                word=self.replace(word,'ant','')
            elif word.endswith('ement'):
                word=self.replace(word,'ement','')
            elif word.endswith('ment'):
                word=self.replace(word,'ment','')
            elif word.endswith('ent'):
                word=self.replace(word,'ent','')
            elif word.endswith('ou'):
                word=self.replace(word,'ou','')
            elif word.endswith('ism'):
                word=self.replace(word,'ism','')
            elif word.endswith('ate'):
                word=self.replace(word,'ate','')
            elif word.endswith('iti'):
                word=self.replace(word,'iti','')
            elif word.endswith('ous'):
                word=self.replace(word,'ous','')
            elif word.endswith('ive'):
                word=self.replace(word,'ive','')
            elif word.endswith('ize'):
                word=self.replace(word,'ize','')
            elif word.endswith('ion'):
                m=word.rfind('ion')
                result=word[:m]
                if self.base_M(result)>1 and (result.endswith('s') or result.endswith('t')):
                    word=result
        return word
    
    def step5a(self,word):
        if word.endswith('e'):
            result=word[:-1]
            if (self.base_M(result)>1) or (self.base_M(result)==1 and not self.check_O(result)):
                word=result
        return word
    
    def step5b(self,word):
        if self.base_M(word)>1 and self.endsinDoubleCons(word) and word.endswith('l'):
            word=word[:-1]
        return word
    
    def stem(self,tokendict):
        for i,count in tokendict.items():
            word=i
            word=self.step1a(word)
            word=self.step1b(word)
            word=self.step1c(word)
            word=self.step2(word)
            word=self.step3(word)
            word=self.step4(word)
            word=self.step5a(word)
            word=self.step5b(word)
            self.addtodict(word,count)
    
    def addtodict(self,word,count):
        self.totalCount+=count
        if word in self.dictionary:
            self.dictionary[word]+=count
        else:
            self.dictionary[word]=count
            
    def stats(self):
        print('Stemming Stats')
        print('Number of distinct stems: '+ str(len(self.dictionary)))
        print('Number of stems that occur only once: '+str(len([key for key,val in self.dictionary.items() if val==1])))
        print('30 Frequent words:')
        self.frequentwords(30)
        self.avgTokensPerdoc=self.totalCount/self.fileCount
        print('Average stemmed tokens per document: '+str(self.avgTokensPerdoc))
        
    def frequentwords(self,n):
        self.dictionary=dict(sorted(self.dictionary.items(), key=lambda x: x[1],reverse=True))
        i=0
        for word,freq in self.dictionary.items():
            if i==n:
                break
            print(word+': '+str(freq))
            i+=1
            
    def getStemmingDict(self):
        return self.dictionary
    
    def outputdict(self):
        np.savetxt('stems.csv',np.array(list(self.dictionary.items())),delimiter=',',fmt='%s')

In [4]:
path = input('Enter Text\'s path:')

Enter Text's path:C:\Users\dkg27\Desktop\Cranfield


In [5]:
Token=Tokenizer(path)
start_tokenize_time=time.time()
d=Token.tokenize()
end_tokenize_time=time.time()

In [6]:
Token.stats()

Tokenization Stats
Number of Tokens: 132710
Number of Unique words: 12272
Number of words that occur only once: 6221
30 Frequent words:
flow: 1736
pressure: 1132
number: 964
boundary: 897
results: 885
mach: 816
theory: 775
layer: 728
method: 683
shock: 589
surface: 558
obtained: 539
given: 520
effects: 510
solution: 496
heat: 483
velocity: 481
temperature: 478
equations: 477
supersonic: 460
made: 449
ratio: 442
body: 439
wing: 430
presented: 425
found: 422
experimental: 421
laminar: 413
conditions: 411
effect: 402
Average tokens per document: 94.79285714285714


In [7]:
print('Time taken to tokenize: '+str(end_tokenize_time-start_tokenize_time)+' seconds')

Time taken to tokenize: 0.5826845169067383 seconds


In [23]:
stem=Stemming(Token.getfilecount())
start_stemming_time=time.time()
stem.stem(Token.getTokenDict())
end_stemming_time=time.time()

In [24]:
stem.stats()

Stemming Stats
Number of distinct stems: 10562
Number of stems that occur only once: 5499
30 Frequent words:
flow: 1965
number: 1336
pressure: 1279
result: 1069
boundari: 926
effect: 917
method: 883
theori: 868
layer: 859
solution: 847
mach: 817
equation: 774
bodi: 740
wing: 710
present: 685
surface: 661
obtain: 632
shock: 614
distribution: 598
problem: 591
ratio: 588
temperature: 583
velociti: 554
case: 545
given: 520
heat: 518
test: 518
condition: 515
plate: 494
us: 479
Average stemmed tokens per document: 94.79285714285714


In [19]:
print('Time taken for stemming: '+str(end_stemming_time-start_stemming_time)+' seconds')

Time taken for stemming: 0.3847775459289551 seconds


In [20]:
print('Time taken for acqiuring text characteristics(i.e. perfroming both tokenization and stemming): '+str(end_stemming_time-start_stemming_time+end_tokenize_time-start_tokenize_time)+' seconds')

Time taken for acqiuring text characteristics(i.e. perfroming both tokenization and stemming): 0.9674620628356934 seconds


In [21]:
Token.outputdict()
stem.outputdict()