In [73]:

# coding: utf-8

# In[438]:

import re
import numpy as np
import os
import codecs

rootDir = '.'
TOTAL_TRAINING_DOCUMENT_COUNT = 0
DOCUMENT_COUNT_PER_AUTHOR = {}
VOCABULARY = set()
authorBags = {}
authorFeatures = {}

def tokenize(text, choice): #if choice is 1 return the bag of words, else return the unique words with their counts
    bag = re.split('\W+', text)
    counts = {}
    for x in set(bag):
        counts.update({x:bag.count(x)})
    if choice:
        return bag
    else:
        return counts

    
for dirName, subdirList, fileList in os.walk(rootDir+'/trainingSet'):
    if(dirName != './trainingSet'):
        currDir = dirName[14:]
        
        authorBags.update({currDir:{}})
        authorFeatures.update({currDir:{'wordCount':0, #wordCount: total word count for the author.
                                        'sentenceCount':0, #sentenceCount: sentence count per document for the author.
                                        'wordLength':0, #wordLength: average word length for the author.
                                        'commaCount':0, #commaCount: average number of commas per sentence for the author.
                                        'exclamationCount':0, #exclamationCount: average number of exclamation marks per sentence for the author.
                                        'questionCount':0 #questionCount: average number of question marks per sentence for the author.
                                       }}) 



        #print(authorBags)
        fileCount = len(fileList)
        ##print(dirName)
        DOCUMENT_COUNT_PER_AUTHOR.update({currDir:0})
        
        for fname in fileList:
            TOTAL_TRAINING_DOCUMENT_COUNT += 1
            DOCUMENT_COUNT_PER_AUTHOR[currDir] += 1
            text = ''
            with codecs.open(dirName+'/'+fname, 'r', 'ISO-8859-9') as myfile:
                ##print(dirName+'/'+fname)
                text = myfile.read().lower()
                
                authorFeatures[currDir]['sentenceCount'] += text.count('.')
                authorFeatures[currDir]['commaCount'] += text.count(',')
                authorFeatures[currDir]['exclamationCount'] += text.count('!')
                authorFeatures[currDir]['questionCount'] += text.count('?')

                bagCounts = tokenize(text,0)
                for word in bagCounts:
                    authorFeatures[currDir]['wordCount'] += bagCounts[word]
                    authorFeatures[currDir]['wordLength'] += bagCounts[word]*len(word)
                    newWord = ''
                    if word.isdigit(): newWord = 'NUMBER'
                    else: newWord = word
                    
                    if newWord in authorBags[currDir]:
                        authorBags[currDir][newWord] += bagCounts[word]
                    else:
                        authorBags[currDir].update({newWord:bagCounts[word]})
        
        if '' in authorBags[currDir]: del authorBags[currDir]['']
            
        VOCABULARY.update(authorBags[currDir].keys())

        authorFeatures[currDir]['commaCount'] /= authorFeatures[currDir]['sentenceCount']
        authorFeatures[currDir]['exclamationCount'] /= authorFeatures[currDir]['sentenceCount']
        authorFeatures[currDir]['questionCount'] /= authorFeatures[currDir]['sentenceCount']
        authorFeatures[currDir]['sentenceCount'] /= DOCUMENT_COUNT_PER_AUTHOR[currDir]
        authorFeatures[currDir]['wordLength'] /= authorFeatures[currDir]['wordCount']
        authorFeatures[currDir]['wordCount'] /= DOCUMENT_COUNT_PER_AUTHOR[currDir]



# In[ ]:




In [74]:
# In[439]:

def text_class(text): 
    maxProb = -float('Inf')
    maxAuth = ''
    vocabSize = len(VOCABULARY)

    sentenceCount = text.count('.')
    commaCount = text.count(',')
    exclamationCount = text.count('!')
    questionCount = text.count('?')
    
    tokens = tokenize(text,1)
    
    for authName in authorBags:
        n = sum(authorBags[authName].values())
        #print(n , authName)
        prob = 0
        wordLength = 0
        
        for token in tokens:
            wordLength += len(token)
            smoother = 0.004
            if token in authorBags[authName]:
                tokenProb = (authorBags[authName][token]+smoother)/(n+smoother*vocabSize)
            else:
                tokenProb = smoother/(n+smoother*vocabSize)
            prob = prob + np.log(tokenProb)
            #print('tokenProb = ' , tokenProb)
        
        wordLength /= len(tokens)
        
        authorProb = DOCUMENT_COUNT_PER_AUTHOR[authName] / TOTAL_TRAINING_DOCUMENT_COUNT
        #print('authorProb = ', np.log(authorProb))
        prob += np.log(authorProb)
        prob -= abs(authorFeatures[authName]['wordCount'] - len(tokens))
        prob -= abs(authorFeatures[authName]['sentenceCount'] - sentenceCount)
        prob -= 10*abs(authorFeatures[authName]['wordLength'] - wordLength)
        prob -= 100*abs(authorFeatures[authName]['commaCount'] - commaCount/sentenceCount)
        prob -= 1000*abs(authorFeatures[authName]['exclamationCount'] - exclamationCount/sentenceCount)
        prob -= 1000*abs(authorFeatures[authName]['questionCount'] - questionCount/sentenceCount)

        #print(wordLength)
        #print(abs(authorFeatures[authName]['wordLength'] - wordLength))
        if prob > maxProb:
            maxProb = prob
            maxAuth = authName
        #print('Prob for auth ',authName, prob)
    
    #print(maxAuth)
    return maxAuth

In [86]:

# In[440]:

"""with codecs.open('./testSet/ahmetAltan/M7.txt', 'r', 'ISO-8859-9') as myfile:
    text = myfile.read()
    bagCounts = tokenize(text,1)
    text_class(bagCounts)"""

success = 0
totalTrial = 0
for dirName, subdirList, fileList in os.walk(rootDir+'/testSet'):
    if(dirName != './testSet'):
        currDir = dirName[10:]
        for fname in fileList:
            text = ''
            with codecs.open(dirName+'/'+fname, 'r', 'ISO-8859-9') as myfile:
                #print(dirName+'/'+fname)
                text = myfile.read().lower()
                auth = text_class(text)
                totalTrial += 1
                if auth == dirName[10:]:
                    #print('success')
                    success +=1
                #else:
                    #print('fail')

In [87]:
# In[441]:

print('Success rate :', success/ totalTrial)

Success rate : 0.7554945054945055


In [77]:
# In[442]:
#0.7527472527472527
with codecs.open('./testSet/abbasGuclu/13.txt', 'r', 'ISO-8859-9') as myfile:
    text = myfile.read()
    bagCounts = tokenize(text,1)
    text_class(text)

In [85]:
authorFeatures

{'abbasGuclu': {'commaCount': 0.75,
  'exclamationCount': 0.0397196261682243,
  'questionCount': 0.1705607476635514,
  'sentenceCount': 47.55555555555556,
  'wordCount': 560.1111111111111,
  'wordLength': 5.796468954572505},
 'abdullahAymaz': {'commaCount': 0.3299492385786802,
  'exclamationCount': 0.02030456852791878,
  'questionCount': 0.007614213197969543,
  'sentenceCount': 65.66666666666667,
  'wordCount': 549.3333333333334,
  'wordLength': 6.023361650485437},
 'ahmetAltan': {'commaCount': 1.1746724890829694,
  'exclamationCount': 0.0,
  'questionCount': 0.18777292576419213,
  'sentenceCount': 38.166666666666664,
  'wordCount': 542.5,
  'wordLength': 6.282642089093702},
 'ahmetHakan': {'commaCount': 0.19400499583680267,
  'exclamationCount': 0.004995836802664446,
  'questionCount': 0.04829308909242298,
  'sentenceCount': 133.44444444444446,
  'wordCount': 525.0,
  'wordLength': 5.859259259259259},
 'aliBulac': {'commaCount': 0.9336734693877551,
  'exclamationCount': 0.005102040816

In [79]:
currDir

'zekiCol'