In [1]:
import pandas
import numpy
from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, recall_score, precision_score

In [2]:
number_of_folds =64

In [3]:
maximum_iterations = 50

In [4]:
# Read in the data
data_frame = pandas.read_csv('../data/sentiment.csv')  # read_csv returns a pandas.DataFrame

In [5]:
# Visually verify that we read in the data correctly
print('Dimensions of data:', data_frame.shape)
print('First few items in data:')
print(data_frame.head())

Dimensions of data: (2000, 3)
First few items in data:
     id                                             review     label
0  1045  my giant is two movies for the price of one , ...  negative
1   366   ( note : there are spoilers regarding the fil...  positive
2   628  note : some may consider portions of the follo...  positive
3   606  in october of 1962 the united states found its...  positive
4   668  contrary to the title ,  the boxer  is not ano...  positive


In [6]:
# Convert from pandas.DataFrame to numpy.array
# because KFold.split() takes "plain" arrays as parameters
data_array = numpy.array(data_frame['review'])
label_array = numpy.array(data_frame['label'])

In [7]:
def learn_vocabulary(tfidf_vectorizer, data_train):
    # Learn vocabulary, learn tf-idf matrix with training data, and return matrix
    tfidf_train = tfidf_vectorizer.fit_transform(data_train)
    # Now vocabulary is fixed
    
    return tfidf_vectorizer, tfidf_train

In [8]:
def apply_tfidf(tfidf_vectorizer, data_test):
    # Apply tf-idf matrix to test data, and return matrix
    tfidf_test = tfidf_vectorizer.transform(data_test)
    
    return tfidf_vectorizer, tfidf_test

In [9]:
def initialize_classifier():
    # Initialize a PassiveAggressiveClassifier, that's our machine learning method
    # Documentation:
    # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.PassiveAggressiveClassifier.html
    classifier = PassiveAggressiveClassifier(max_iter=maximum_iterations)  # Max iterations
    
    return classifier

In [10]:
def train_classifier(classifier, tfidf_train, label_train):
    # Train our classifier, i.e. train our model, i.e. run the machine learning method
    classifier.fit(tfidf_train, label_train)
    
    return classifier

In [11]:
def apply_classifier(classifier, tfidf_test):
    # Apply classifier to test data, i.e. predict labels for test data
    label_predicted = classifier.predict(tfidf_test)
    
    return label_predicted

In [12]:
def print_results(label_test, label_predicted):
    # Print results
    accuracy = accuracy_score(label_test, label_predicted)
    precision_positive = precision_score(label_test, label_predicted, pos_label='positive')
    precision_negative = precision_score(label_test, label_predicted, pos_label='negative')
    recall_positive = recall_score(label_test, label_predicted, pos_label='positive')
    recall_negative = recall_score(label_test, label_predicted, pos_label='negative')
    f1_positive = f1_score(label_test, label_predicted, pos_label='positive')
    f1_negative = f1_score(label_test, label_predicted, pos_label='negative')
    print()
    print('Accuracy:', accuracy)
    print('Precision positive:', precision_positive, 'negative:', precision_negative)
    print('Recall    positive:', recall_positive, 'negative:', recall_negative)
    print('F1        positive:', f1_positive, 'negative:', f1_negative)
    print()
    print('Confusion matrix:')
    print(confusion_matrix(label_test, label_predicted, labels=['positive', 'negative']))
    
    return accuracy

In [13]:
def run():
    # Split the dataset into k train + test folds
    kfold = KFold(n_splits=number_of_folds)
    fold = 1
    accuracy_accumulated = 0.0

    for train_index, test_index in kfold.split(data_array):
        # Beginning of one fold
        print('---------- fold', fold, '----------')
        print()
        data_train, data_test = data_array[train_index], data_array[test_index]
        label_train, label_test = label_array[train_index], label_array[test_index]
        print('Train:', train_index, ' --- Test:', test_index)
        tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)  # Max document frequency
        tfidf_vectorizer, tfidf_train = learn_vocabulary(tfidf_vectorizer, data_train)
        tfidf_vectorizer, tfidf_test = apply_tfidf(tfidf_vectorizer, data_test)
        classifier = initialize_classifier()
        classifier = train_classifier(classifier, tfidf_train, label_train)
        label_predicted = apply_classifier(classifier, tfidf_test)
        accuracy = print_results(label_test, label_predicted)
        accuracy_accumulated += accuracy
        
        print()
        fold += 1
    
    # Report average accuracy
    accuracy_average = accuracy_accumulated / number_of_folds
    print('Average accuracy:', accuracy_average)

In [14]:
run()#first run with maximum_iterations = 50

---------- fold 1 ----------

Train: [  32   33   34 ... 1997 1998 1999]  --- Test: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31]

Accuracy: 0.8125
Precision positive: 0.8125 negative: 0.8125
Recall    positive: 0.8125 negative: 0.8125
F1        positive: 0.8125 negative: 0.8125

Confusion matrix:
[[13  3]
 [ 3 13]]

---------- fold 2 ----------

Train: [   0    1    2 ... 1997 1998 1999]  --- Test: [32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55
 56 57 58 59 60 61 62 63]

Accuracy: 0.90625
Precision positive: 0.9 negative: 0.9166666666666666
Recall    positive: 0.9473684210526315 negative: 0.8461538461538461
F1        positive: 0.9230769230769231 negative: 0.8799999999999999

Confusion matrix:
[[18  1]
 [ 2 11]]

---------- fold 3 ----------

Train: [   0    1    2 ... 1997 1998 1999]  --- Test: [64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87
 88 89 90 91 92 93 94 95]

Accuracy: 0.90


Accuracy: 0.8064516129032258
Precision positive: 0.75 negative: 0.8666666666666667
Recall    positive: 0.8571428571428571 negative: 0.7647058823529411
F1        positive: 0.7999999999999999 negative: 0.8125

Confusion matrix:
[[12  2]
 [ 4 13]]

---------- fold 20 ----------

Train: [   0    1    2 ... 1997 1998 1999]  --- Test: [605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622
 623 624 625 626 627 628 629 630 631 632 633 634 635]

Accuracy: 0.8387096774193549
Precision positive: 0.8823529411764706 negative: 0.7857142857142857
Recall    positive: 0.8333333333333334 negative: 0.8461538461538461
F1        positive: 0.8571428571428571 negative: 0.8148148148148148

Confusion matrix:
[[15  3]
 [ 2 11]]

---------- fold 21 ----------

Train: [   0    1    2 ... 1997 1998 1999]  --- Test: [636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653
 654 655 656 657 658 659 660 661 662 663 664 665 666]

Accuracy: 0.8387096774193549
Precision positive: 0.6 ne


Accuracy: 0.8387096774193549
Precision positive: 0.9166666666666666 negative: 0.7894736842105263
Recall    positive: 0.7333333333333333 negative: 0.9375
F1        positive: 0.8148148148148148 negative: 0.8571428571428572

Confusion matrix:
[[11  4]
 [ 1 15]]

---------- fold 38 ----------

Train: [   0    1    2 ... 1997 1998 1999]  --- Test: [1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176
 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190
 1191 1192 1193]

Accuracy: 0.8387096774193549
Precision positive: 0.7692307692307693 negative: 0.8888888888888888
Recall    positive: 0.8333333333333334 negative: 0.8421052631578947
F1        positive: 0.8 negative: 0.8648648648648649

Confusion matrix:
[[10  2]
 [ 3 16]]

---------- fold 39 ----------

Train: [   0    1    2 ... 1997 1998 1999]  --- Test: [1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207
 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221
 1222 1223


Accuracy: 0.8709677419354839
Precision positive: 0.8823529411764706 negative: 0.8571428571428571
Recall    positive: 0.8823529411764706 negative: 0.8571428571428571
F1        positive: 0.8823529411764706 negative: 0.8571428571428571

Confusion matrix:
[[15  2]
 [ 2 12]]

---------- fold 55 ----------

Train: [   0    1    2 ... 1997 1998 1999]  --- Test: [1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703
 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717
 1718 1719 1720]

Accuracy: 0.8064516129032258
Precision positive: 0.8235294117647058 negative: 0.7857142857142857
Recall    positive: 0.8235294117647058 negative: 0.7857142857142857
F1        positive: 0.8235294117647058 negative: 0.7857142857142857

Confusion matrix:
[[14  3]
 [ 3 11]]

---------- fold 56 ----------

Train: [   0    1    2 ... 1997 1998 1999]  --- Test: [1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734
 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 174

In [15]:
maximum_iterations = 200
run()#first run with maximum_iterations = 200

---------- fold 1 ----------

Train: [  32   33   34 ... 1997 1998 1999]  --- Test: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31]

Accuracy: 0.8125
Precision positive: 0.8125 negative: 0.8125
Recall    positive: 0.8125 negative: 0.8125
F1        positive: 0.8125 negative: 0.8125

Confusion matrix:
[[13  3]
 [ 3 13]]

---------- fold 2 ----------

Train: [   0    1    2 ... 1997 1998 1999]  --- Test: [32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55
 56 57 58 59 60 61 62 63]

Accuracy: 0.875
Precision positive: 0.8947368421052632 negative: 0.8461538461538461
Recall    positive: 0.8947368421052632 negative: 0.8461538461538461
F1        positive: 0.8947368421052632 negative: 0.8461538461538461

Confusion matrix:
[[17  2]
 [ 2 11]]

---------- fold 3 ----------

Train: [   0    1    2 ... 1997 1998 1999]  --- Test: [64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87
 88 89 90 91 92 93 94 95]

A


Accuracy: 0.8387096774193549
Precision positive: 0.8 negative: 0.875
Recall    positive: 0.8571428571428571 negative: 0.8235294117647058
F1        positive: 0.8275862068965518 negative: 0.8484848484848485

Confusion matrix:
[[12  2]
 [ 3 14]]

---------- fold 20 ----------

Train: [   0    1    2 ... 1997 1998 1999]  --- Test: [605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622
 623 624 625 626 627 628 629 630 631 632 633 634 635]

Accuracy: 0.8387096774193549
Precision positive: 0.8823529411764706 negative: 0.7857142857142857
Recall    positive: 0.8333333333333334 negative: 0.8461538461538461
F1        positive: 0.8571428571428571 negative: 0.8148148148148148

Confusion matrix:
[[15  3]
 [ 2 11]]

---------- fold 21 ----------

Train: [   0    1    2 ... 1997 1998 1999]  --- Test: [636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653
 654 655 656 657 658 659 660 661 662 663 664 665 666]

Accuracy: 0.8064516129032258
Precision positive: 0.545454


Accuracy: 0.8064516129032258
Precision positive: 0.8461538461538461 negative: 0.7777777777777778
Recall    positive: 0.7333333333333333 negative: 0.875
F1        positive: 0.7857142857142856 negative: 0.823529411764706

Confusion matrix:
[[11  4]
 [ 2 14]]

---------- fold 38 ----------

Train: [   0    1    2 ... 1997 1998 1999]  --- Test: [1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176
 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190
 1191 1192 1193]

Accuracy: 0.8064516129032258
Precision positive: 0.75 negative: 0.8421052631578947
Recall    positive: 0.75 negative: 0.8421052631578947
F1        positive: 0.75 negative: 0.8421052631578947

Confusion matrix:
[[ 9  3]
 [ 3 16]]

---------- fold 39 ----------

Train: [   0    1    2 ... 1997 1998 1999]  --- Test: [1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207
 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221
 1222 1223 1224]

Accuracy: 0.935483870


Accuracy: 0.9032258064516129
Precision positive: 0.8888888888888888 negative: 0.9230769230769231
Recall    positive: 0.9411764705882353 negative: 0.8571428571428571
F1        positive: 0.9142857142857143 negative: 0.888888888888889

Confusion matrix:
[[16  1]
 [ 2 12]]

---------- fold 55 ----------

Train: [   0    1    2 ... 1997 1998 1999]  --- Test: [1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703
 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717
 1718 1719 1720]

Accuracy: 0.8064516129032258
Precision positive: 0.8235294117647058 negative: 0.7857142857142857
Recall    positive: 0.8235294117647058 negative: 0.7857142857142857
F1        positive: 0.8235294117647058 negative: 0.7857142857142857

Confusion matrix:
[[14  3]
 [ 3 11]]

---------- fold 56 ----------

Train: [   0    1    2 ... 1997 1998 1999]  --- Test: [1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734
 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745

In [16]:
maximum_iterations = 3200
run()#first run with maximum_iterations = 3200

---------- fold 1 ----------

Train: [  32   33   34 ... 1997 1998 1999]  --- Test: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31]

Accuracy: 0.8125
Precision positive: 0.8125 negative: 0.8125
Recall    positive: 0.8125 negative: 0.8125
F1        positive: 0.8125 negative: 0.8125

Confusion matrix:
[[13  3]
 [ 3 13]]

---------- fold 2 ----------

Train: [   0    1    2 ... 1997 1998 1999]  --- Test: [32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55
 56 57 58 59 60 61 62 63]

Accuracy: 0.875
Precision positive: 0.8947368421052632 negative: 0.8461538461538461
Recall    positive: 0.8947368421052632 negative: 0.8461538461538461
F1        positive: 0.8947368421052632 negative: 0.8461538461538461

Confusion matrix:
[[17  2]
 [ 2 11]]

---------- fold 3 ----------

Train: [   0    1    2 ... 1997 1998 1999]  --- Test: [64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87
 88 89 90 91 92 93 94 95]

A


Accuracy: 0.8387096774193549
Precision positive: 0.8 negative: 0.875
Recall    positive: 0.8571428571428571 negative: 0.8235294117647058
F1        positive: 0.8275862068965518 negative: 0.8484848484848485

Confusion matrix:
[[12  2]
 [ 3 14]]

---------- fold 20 ----------

Train: [   0    1    2 ... 1997 1998 1999]  --- Test: [605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622
 623 624 625 626 627 628 629 630 631 632 633 634 635]

Accuracy: 0.8387096774193549
Precision positive: 0.8823529411764706 negative: 0.7857142857142857
Recall    positive: 0.8333333333333334 negative: 0.8461538461538461
F1        positive: 0.8571428571428571 negative: 0.8148148148148148

Confusion matrix:
[[15  3]
 [ 2 11]]

---------- fold 21 ----------

Train: [   0    1    2 ... 1997 1998 1999]  --- Test: [636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653
 654 655 656 657 658 659 660 661 662 663 664 665 666]

Accuracy: 0.8387096774193549
Precision positive: 0.6 nega


Accuracy: 0.8387096774193549
Precision positive: 0.9166666666666666 negative: 0.7894736842105263
Recall    positive: 0.7333333333333333 negative: 0.9375
F1        positive: 0.8148148148148148 negative: 0.8571428571428572

Confusion matrix:
[[11  4]
 [ 1 15]]

---------- fold 38 ----------

Train: [   0    1    2 ... 1997 1998 1999]  --- Test: [1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176
 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190
 1191 1192 1193]

Accuracy: 0.8709677419354839
Precision positive: 0.8333333333333334 negative: 0.8947368421052632
Recall    positive: 0.8333333333333334 negative: 0.8947368421052632
F1        positive: 0.8333333333333334 negative: 0.8947368421052632

Confusion matrix:
[[10  2]
 [ 2 17]]

---------- fold 39 ----------

Train: [   0    1    2 ... 1997 1998 1999]  --- Test: [1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207
 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 


Accuracy: 0.9032258064516129
Precision positive: 0.8888888888888888 negative: 0.9230769230769231
Recall    positive: 0.9411764705882353 negative: 0.8571428571428571
F1        positive: 0.9142857142857143 negative: 0.888888888888889

Confusion matrix:
[[16  1]
 [ 2 12]]

---------- fold 55 ----------

Train: [   0    1    2 ... 1997 1998 1999]  --- Test: [1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703
 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717
 1718 1719 1720]

Accuracy: 0.8064516129032258
Precision positive: 0.8235294117647058 negative: 0.7857142857142857
Recall    positive: 0.8235294117647058 negative: 0.7857142857142857
F1        positive: 0.8235294117647058 negative: 0.7857142857142857

Confusion matrix:
[[14  3]
 [ 3 11]]

---------- fold 56 ----------

Train: [   0    1    2 ... 1997 1998 1999]  --- Test: [1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734
 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745

1st:
    maximum_iterations: 50
    Average accuracy: 0.8580676663306449
2nd:
    maximum_iterations: 200
    Average accuracy: 0.8595325100806448
3rd:
    maximum_iterations: 3200
    Average accuracy: 0.8576423891129029

i dont know if i changed the ammount of max iterations incorrectly, but it does not seem like the average accuracy changes for the better or worse