In [54]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from os import listdir
from os.sys import argv
from os.path import exists
def reportPerformance(y_pred, test_labels, model_name):
    print(model_name)
    print('Accuracy:  ',accuracy_score(y_pred, test_labels),'\n',
          'Precision: ',precision_score(y_pred, test_labels),'\n',
          'Recall:    ',recall_score(y_pred, test_labels),'\n',
          'F-Score:   ',f1_score(y_pred, test_labels),'\n', sep='')

ModuleNotFoundError: No module named 'os.sys'; 'os' is not a package

In [None]:
if len(argv) != 7:
    print('Error: Insufficient amount of arguments')
    exit(1)
train_dir = argv[1] + '/'
test_dir  = argv[2] + '/'
representation = argv[3]
classifier = argv[4]
stop_words = argv[5]
regularization = argv[6]
if not exists(train_dir):
    print('Error: train directory \'', train_dir,
          '\' does not exist.', sep='')
    exit(1)
if not exists(test_dir):
    print('Error: test directory \'', test_dir,
          '\' does not exist.', sep='')
    exit(1)
if representation not in ['bow', 'tfidf']:
    print('Error: representation \'', representation,
          '\' is not supported', sep='')
    exit(1)
if classifier not in ['nbayes', 'regression']:
    print('Error: classifier \'', classifier, 
          '\' is not supported', sep='')
    exit(1)
if stop_words not in ['0', '1']:
    print('Error: stop words must be 0 or 1')
    exit(1)
if regularization not in ['no', 'l1', 'l2']:
    print('Error: regularization must be no, l1 or l2')
    exit(1)
stop_words = int(stop_words)

In [49]:
train_labels = []
train_files = []
test_labels = []
test_files = []
label_dir = ['neg', 'pos']

CV = CountVectorizer(input='filename', stop_words='english')
TV = TfidfVectorizer(input='filename', stop_words='english')
for l in label_dir:
    for file in listdir(train_dir + l + '/'):
        train_labels.append(0 if l == 'neg' else 1)
        train_files.append(train_dir + l + '/' + file)
    for file in listdir(test_dir + l + '/'):
        test_labels.append(0 if l == 'neg' else 1)
        test_files.append(test_dir + l + '/' + file)
CVF = CV.fit_transform(train_files)
TVF = TV.fit_transform(train_files)
CVT = CV.transform(test_files)
TVT = TV.transform(test_files)

In [51]:
LR1 = LogisticRegression()
X1 = LR1.fit(CVF, train_labels)
LR2 = LogisticRegression()
X2 = LR2.fit(TVF, train_labels)

In [52]:
NB1 = MultinomialNB()
X3 = NB1.fit(CVF, train_labels)
NB2 = MultinomialNB()
X4 = NB2.fit(TVF, train_labels)

In [53]:
reportPerformance(X1.predict(CVT), test_labels, 'Logistic Regression with CountVectorizer')
reportPerformance(X2.predict(TVT), test_labels, 'Logistic Regression with TfidfVectorizer')
reportPerformance(X3.predict(CVT), test_labels, 'Naive Bayes with CountVectorizer')
reportPerformance(X4.predict(TVT), test_labels, 'Naive Bayes with TfidfVectorizer')



Logistic Regression with CountVectorizer
Accuracy:  0.85908
Precision: 0.85016
Recall:    0.8656023458499633
F-Score:   0.8578116801872704

Logistic Regression with TfidfVectorizer
Accuracy:  0.879
Precision: 0.88048
Recall:    0.877881470846295
F-Score:   0.8791788153532771

Naive Bayes with CountVectorizer
Accuracy:  0.81968
Precision: 0.75976
Recall:    0.8632066896927831
F-Score:   0.8081865373159731

Naive Bayes with TfidfVectorizer
Accuracy:  0.82992
Precision: 0.78152
Recall:    0.8652790079716564
F-Score:   0.8212694409415722

