# CSA file name classification

### Normalize the input data

In [120]:
import glob
import os

for file_path in glob.glob('./*.txt'):
    with open(file_path, 'r', encoding='ISO-8859-1') as source:
        with open('normal_' + os.path.basename(file_path), 'a') as target:
            for line in source:
                new_line = ''
                char = ' '
                for next_char in line:
                    if char.isdigit() and next_char.isdigit():
                        new_line  += next_char
                    elif char.isalpha() and next_char.isalpha() and not (char.islower() and next_char.isupper()):
                        new_line += next_char
                    elif next_char.isdigit() or next_char.isalpha():
                        new_line += ' ' + next_char
                    char = next_char
                try:
                    target.write(new_line.strip().lower() + '\n')
                except:
                    pass

### Load files into memory

In [1]:
with open('normal_csa.txt', 'r') as file:
    csa_filenames = file.read().splitlines()

with open('normal_home.txt', 'r') as file:
    home_filenames = file.read().splitlines()
    
with open('normal_external_drive.txt', 'r') as file:
    external_drive_filenames = file.read().splitlines()
    
with open('normal_nsrl.txt', 'r') as file:
    nsrl_filenames = file.read().splitlines()[0::50] # Only take every 50th to not skew the input data

### Create training and test data sets

In [2]:
from sklearn.model_selection import train_test_split

filenames = []
target = []

filenames.extend(csa_filenames)
target.extend([0] * len(csa_filenames))

filenames.extend(home_filenames)
target.extend([1] * len(home_filenames))

filenames.extend(external_drive_filenames)
target.extend([1] * len(external_drive_filenames))

filenames.extend(nsrl_filenames)
target.extend([1] * len(nsrl_filenames))

X_train, X_test, y_train, y_test = train_test_split(filenames, 
                                                    target, 
                                                    test_size=0.1, 
                                                    random_state=109)

### Imports

In [10]:
from statistics import median, mean
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.model_selection import GridSearchCV

### Classify filenames using NB and only CSA for training

In [118]:
class NBClassifier:
    def __init__(self, word_count_dict):
        self.word_count_dict = word_count_dict
        self.normalizer = max(word_count_dict.values())
        
    def get_nb_scores(self, string_array):
        result = []
        for string in string_array:
            result.append(self.get_nb_score(string))
        return result

    def get_nb_score(self, string):
        score = 1
        count = 0
        for word in string.split(' '):
            score *= (self.word_count_dict.get(word, 0) + 0.1) / self.normalizer
            count += 1
        return score / count

def print_information(title, scores):
    print('{}\t{}\t{:.10f}\t{:.6f}'.format(title, median(scores), mean(scores), max(scores)))
    
def calc_and_print(word_count_dict):
    nb_csa_clf = NBClassifier(word_count_dict)
    print('Title\t\tMedian\t\t\tMean:\t\tMax:')
    print_information('CSA test', nb_csa_clf.get_nb_scores(csa_filenames_test))
    print_information('CSA all ', nb_csa_clf.get_nb_scores(csa_filenames))
    print_information('Home dir', nb_csa_clf.get_nb_scores(home_filenames))
    print_information('External', nb_csa_clf.get_nb_scores(external_drive_filenames))
    print_information('NSRL list', nb_csa_clf.get_nb_scores(nsrl_filenames))

print('\nAll words')
calc_and_print(word_counts)
print('\nSkip singles')
calc_and_print({key: value for key, value in word_counts.items() if value > 1 })
print('\nSkip single and doubles')
calc_and_print({key: value for key, value in word_counts.items() if value > 2 })
print('\nSkip 10 most common')
tenth_most_common = sorted(word_counts.values(), reverse=True)[10]
calc_and_print({key: value for key, value in word_counts.items() if value <  tenth_most_common})



All words
Title		Median			Mean:		Max:
CSA test	9.32773214538338e-10	0.0001193148	1.000000
CSA all 	1.345064768124739e-09	0.0001007138	1.000000
Home dir	6.808818812584162e-20	0.0000009497	0.007432
External	1.8364258726484017e-14	0.0000221181	0.089162
NSRL list	6.489343397337807e-20	0.0000276151	0.023117

Skip singles
Title		Median			Mean:		Max:
CSA test	6.632623725765875e-10	0.0001193072	1.000000
CSA all 	9.607419179837564e-10	0.0001006755	1.000000
Home dir	5.902072501651128e-20	0.0000009489	0.007432
External	1.7179839371188966e-14	0.0000221180	0.089162
NSRL list	4.7275307102777694e-20	0.0000276128	0.023117

Skip single and doubles
Title		Median			Mean:		Max:
CSA test	5.255420704766012e-10	0.0001193019	1.000000
CSA all 	6.50305337159031e-10	0.0001006617	1.000000
Home dir	5.902072501651128e-20	0.0000009480	0.007432
External	1.2010584306914509e-14	0.0000221178	0.089162
NSRL list	3.55298891890441e-20	0.0000276027	0.023117

Skip 10 most common
Title		Median			Mean:		Max:
CSA test	5.0167628

### Classify using Naive Bayes

In [16]:
parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
              'tfidf__use_idf': (True, False),
              'clf__alpha': (1, 1e-1, 1e-2, 1e-3)}
              
nb_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB())])

nb_gs_clf = GridSearchCV(nb_clf, parameters, n_jobs=-1, cv=3)
nb_gs_clf.fit(X_train, y_train)

print(nb_gs_clf.best_params_)

{'clf__alpha': 0.1, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 2)}


In [17]:
print(nb_gs_clf.score(X_test, y_test))
print(nb_gs_clf.score(csa_filenames, [0]*len(csa_filenames)))
print(nb_gs_clf.score(home_filenames, [0]*len(home_filenames)))
print(nb_gs_clf.score(external_drive_filenames, [0]*len(external_drive_filenames)))
print(nb_gs_clf.score(nsrl_filenames, [0]*len(nsrl_filenames)))

0.9709848356533252
0.9953585614100415
0.0038658161737064193
0.23845101793599313
0.017322799240090052


### Classify using SVM

In [26]:
from sklearn.linear_model import SGDClassifier

svm_clf = Pipeline([('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                    ('clf-svm', SGDClassifier(loss='hinge', tol=0.001))])

svm_clf.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip..._state=None, shuffle=True, tol=0.001,
       validation_fraction=0.1, verbose=0, warm_start=False))])

In [27]:
print(svm_clf.score(X_test, y_test))
print(svm_clf.score(csa_filenames, [0]*len(csa_filenames)))
print(svm_clf.score(home_filenames, [0]*len(home_filenames)))
print(svm_clf.score(external_drive_filenames, [0]*len(external_drive_filenames)))
print(svm_clf.score(nsrl_filenames, [0]*len(nsrl_filenames)))

0.9649737121366623
0.9903610438776144
0.002830509190108496
0.5168926575749286
0.024250488228855935


In [21]:
from sklearn.linear_model import SGDClassifier

svm_clf = Pipeline([('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                    ('clf-svm', SGDClassifier(loss='hinge', tol=0.001))])

parameters_svm = {'vect__ngram_range': [(1, 1), (1, 2)],
                  'tfidf__use_idf': (True, False),
                  'clf-svm__alpha': (1, 1e-1, 1e-2)}

gs_clf_svm = GridSearchCV(svm_clf, parameters_svm, n_jobs=-1, cv=3)
gs_clf_svm.fit(X_train, y_train)

print(gs_clf_svm.best_params_)

{'clf-svm__alpha': 0.01, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 1)}


In [23]:
print(gs_clf_svm.score(X_test, y_test))
print(gs_clf_svm.score(csa_filenames, [0]*len(csa_filenames)))
print(gs_clf_svm.score(home_filenames, [0]*len(home_filenames)))
print(gs_clf_svm.score(external_drive_filenames, [0]*len(external_drive_filenames)))
print(gs_clf_svm.score(nsrl_filenames, [0]*len(nsrl_filenames)))

0.9288896877603293
0.9315893998634642
0.0035662807551509034
0.7203079207533899
0.025893757721987008
