In [1]:
import pandas
import numpy
from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, recall_score, precision_score

In [2]:
number_of_folds = 4

In [3]:
# Read in the data
data_frame = pandas.read_csv('../data/sentiment.csv')  # read_csv returns a pandas.DataFrame

In [4]:
# Visually verify that we read in the data correctly
print('Dimensions of data:', data_frame.shape)
print('First few items in data:')
print(data_frame.head())

Dimensions of data: (2000, 3)
First few items in data:
     id                                             review     label
0  1045  my giant is two movies for the price of one , ...  negative
1   366   ( note : there are spoilers regarding the fil...  positive
2   628  note : some may consider portions of the follo...  positive
3   606  in october of 1962 the united states found its...  positive
4   668  contrary to the title ,  the boxer  is not ano...  positive


In [5]:
# Convert from pandas.DataFrame to numpy.array
# because KFold.split() takes "plain" arrays as parameters
data_array = numpy.array(data_frame['review'])
label_array = numpy.array(data_frame['label'])

In [6]:
def learn_vocabulary(tfidf_vectorizer, data_train):
    # Learn vocabulary, learn tf-idf matrix with training data, and return matrix
    tfidf_train = tfidf_vectorizer.fit_transform(data_train)
    # Now vocabulary is fixed
    
    return tfidf_vectorizer, tfidf_train

In [7]:
def apply_tfidf(tfidf_vectorizer, data_test):
    # Apply tf-idf matrix to test data, and return matrix
    tfidf_test = tfidf_vectorizer.transform(data_test)
    
    return tfidf_vectorizer, tfidf_test

In [8]:
def initialize_classifier():
    # Initialize a PassiveAggressiveClassifier, that's our machine learning method
    # Documentation:
    # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.PassiveAggressiveClassifier.html
    classifier = PassiveAggressiveClassifier(max_iter=50)  # Max iterations
    
    return classifier

In [9]:
def train_classifier(classifier, tfidf_train, label_train):
    # Train our classifier, i.e. train our model, i.e. run the machine learning method
    classifier.fit(tfidf_train, label_train)
    
    return classifier

In [10]:
def apply_classifier(classifier, tfidf_test):
    # Apply classifier to test data, i.e. predict labels for test data
    label_predicted = classifier.predict(tfidf_test)
    
    return label_predicted

In [11]:
def print_results(label_test, label_predicted):
    # Print results
    accuracy = accuracy_score(label_test, label_predicted)
    precision_positive = precision_score(label_test, label_predicted, pos_label='positive')
    precision_negative = precision_score(label_test, label_predicted, pos_label='negative')
    recall_positive = recall_score(label_test, label_predicted, pos_label='positive')
    recall_negative = recall_score(label_test, label_predicted, pos_label='negative')
    f1_positive = f1_score(label_test, label_predicted, pos_label='positive')
    f1_negative = f1_score(label_test, label_predicted, pos_label='negative')
    print()
    print('Accuracy:', accuracy)
    print('Precision positive:', precision_positive, 'negative:', precision_negative)
    print('Recall    positive:', recall_positive, 'negative:', recall_negative)
    print('F1        positive:', f1_positive, 'negative:', f1_negative)
    print()
    print('Confusion matrix:')
    print(confusion_matrix(label_test, label_predicted, labels=['positive', 'negative']))
    
    return accuracy

In [12]:
def run():
    # Split the dataset into k train + test folds
    kfold = KFold(n_splits=number_of_folds)
    fold = 1
    accuracy_accumulated = 0.0

    for train_index, test_index in kfold.split(data_array):
        # Beginning of one fold
        print('---------- fold', fold, '----------')
        print()
        data_train, data_test = data_array[train_index], data_array[test_index]
        label_train, label_test = label_array[train_index], label_array[test_index]
        print('Train:', train_index, ' --- Test:', test_index)
        tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)  # Max document frequency
        tfidf_vectorizer, tfidf_train = learn_vocabulary(tfidf_vectorizer, data_train)
        tfidf_vectorizer, tfidf_test = apply_tfidf(tfidf_vectorizer, data_test)
        classifier = initialize_classifier()
        classifier = train_classifier(classifier, tfidf_train, label_train)
        label_predicted = apply_classifier(classifier, tfidf_test)
        accuracy = print_results(label_test, label_predicted)
        accuracy_accumulated += accuracy
        
        print()
        fold += 1
    
    # Report average accuracy
    accuracy_average = accuracy_accumulated / number_of_folds
    print('Average accuracy:', accuracy_average)

In [13]:
run()

---------- fold 1 ----------

Train: [ 500  501  502 ... 1997 1998 1999]  --- Test: [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161
 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179
 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197
 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215
 216 217 218 219 220 221 222 223 224 225


Accuracy: 0.854
Precision positive: 0.8715953307392996 negative: 0.8353909465020576
Recall    positive: 0.8484848484848485 negative: 0.8601694915254238
F1        positive: 0.8598848368522073 negative: 0.8475991649269311

Confusion matrix:
[[224  40]
 [ 33 203]]

Average accuracy: 0.8405


In [14]:
number_of_folds = 16
run()

---------- fold 1 ----------

Train: [ 125  126  127 ... 1997 1998 1999]  --- Test: [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124]

Accuracy: 0.864
Precision positive: 0.8461538461538461 negative: 0.8833333333333333
Recall    positive: 0.8870967741935484 negative: 0.8412698412698413
F1        positive: 0.8661417322834646 negative: 0.8617886178861788

Confusion matrix:
[[55  7]
 [10 53]]

---------- fold 2 ----------

Train: [   0    1    2 ... 1997 1998 1999]  --- Test: [125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140


Accuracy: 0.816
Precision positive: 0.7719298245614035 negative: 0.8529411764705882
Recall    positive: 0.8148148148148148 negative: 0.8169014084507042
F1        positive: 0.7927927927927928 negative: 0.8345323741007195

Confusion matrix:
[[44 10]
 [13 58]]

---------- fold 11 ----------

Train: [   0    1    2 ... 1997 1998 1999]  --- Test: [1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263
 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277
 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291
 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305
 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319
 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333
 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347
 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361
 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374]

Accuracy: 0.848
Pre

In [15]:
number_of_folds = 256
run()

---------- fold 1 ----------

Train: [   8    9   10 ... 1997 1998 1999]  --- Test: [0 1 2 3 4 5 6 7]

Accuracy: 0.875
Precision positive: 0.8 negative: 1.0
Recall    positive: 1.0 negative: 0.75
F1        positive: 0.888888888888889 negative: 0.8571428571428571

Confusion matrix:
[[4 0]
 [1 3]]

---------- fold 2 ----------

Train: [   0    1    2 ... 1997 1998 1999]  --- Test: [ 8  9 10 11 12 13 14 15]

Accuracy: 0.875
Precision positive: 1.0 negative: 0.8
Recall    positive: 0.75 negative: 1.0
F1        positive: 0.8571428571428571 negative: 0.888888888888889

Confusion matrix:
[[3 1]
 [0 4]]

---------- fold 3 ----------

Train: [   0    1    2 ... 1997 1998 1999]  --- Test: [16 17 18 19 20 21 22 23]

Accuracy: 0.625
Precision positive: 0.6666666666666666 negative: 0.5
Recall    positive: 0.8 negative: 0.3333333333333333
F1        positive: 0.7272727272727272 negative: 0.4

Confusion matrix:
[[4 1]
 [2 1]]

---------- fold 4 ----------

Train: [   0    1    2 ... 1997 1998 1999]  -

  _warn_prf(average, modifier, msg_start, len(result))



Accuracy: 0.875
Precision positive: 0.0 negative: 1.0
Recall    positive: 0.0 negative: 0.875
F1        positive: 0.0 negative: 0.9333333333333333

Confusion matrix:
[[0 0]
 [1 7]]

---------- fold 21 ----------

Train: [   0    1    2 ... 1997 1998 1999]  --- Test: [160 161 162 163 164 165 166 167]

Accuracy: 0.875
Precision positive: 0.75 negative: 1.0
Recall    positive: 1.0 negative: 0.8
F1        positive: 0.8571428571428571 negative: 0.888888888888889

Confusion matrix:
[[3 0]
 [1 4]]

---------- fold 22 ----------

Train: [   0    1    2 ... 1997 1998 1999]  --- Test: [168 169 170 171 172 173 174 175]

Accuracy: 0.875
Precision positive: 0.8333333333333334 negative: 1.0
Recall    positive: 1.0 negative: 0.6666666666666666
F1        positive: 0.9090909090909091 negative: 0.8

Confusion matrix:
[[5 0]
 [1 2]]

---------- fold 23 ----------

Train: [   0    1    2 ... 1997 1998 1999]  --- Test: [176 177 178 179 180 181 182 183]

Accuracy: 0.875
Precision positive: 0.75 negative: 1


Accuracy: 1.0
Precision positive: 1.0 negative: 1.0
Recall    positive: 1.0 negative: 1.0
F1        positive: 1.0 negative: 1.0

Confusion matrix:
[[3 0]
 [0 5]]

---------- fold 48 ----------

Train: [   0    1    2 ... 1997 1998 1999]  --- Test: [376 377 378 379 380 381 382 383]

Accuracy: 0.875
Precision positive: 1.0 negative: 0.8
Recall    positive: 0.75 negative: 1.0
F1        positive: 0.8571428571428571 negative: 0.888888888888889

Confusion matrix:
[[3 1]
 [0 4]]

---------- fold 49 ----------

Train: [   0    1    2 ... 1997 1998 1999]  --- Test: [384 385 386 387 388 389 390 391]

Accuracy: 1.0
Precision positive: 1.0 negative: 1.0
Recall    positive: 1.0 negative: 1.0
F1        positive: 1.0 negative: 1.0

Confusion matrix:
[[5 0]
 [0 3]]

---------- fold 50 ----------

Train: [   0    1    2 ... 1997 1998 1999]  --- Test: [392 393 394 395 396 397 398 399]

Accuracy: 0.875
Precision positive: 0.8 negative: 1.0
Recall    positive: 1.0 negative: 0.75
F1        positive: 0.888


Accuracy: 0.75
Precision positive: 0.75 negative: 0.75
Recall    positive: 0.75 negative: 0.75
F1        positive: 0.75 negative: 0.75

Confusion matrix:
[[3 1]
 [1 3]]

---------- fold 75 ----------

Train: [   0    1    2 ... 1997 1998 1999]  --- Test: [592 593 594 595 596 597 598 599]

Accuracy: 1.0
Precision positive: 1.0 negative: 1.0
Recall    positive: 1.0 negative: 1.0
F1        positive: 1.0 negative: 1.0

Confusion matrix:
[[4 0]
 [0 4]]

---------- fold 76 ----------

Train: [   0    1    2 ... 1997 1998 1999]  --- Test: [600 601 602 603 604 605 606 607]

Accuracy: 0.875
Precision positive: 1.0 negative: 0.75
Recall    positive: 0.8 negative: 1.0
F1        positive: 0.888888888888889 negative: 0.8571428571428571

Confusion matrix:
[[4 1]
 [0 3]]

---------- fold 77 ----------

Train: [   0    1    2 ... 1997 1998 1999]  --- Test: [608 609 610 611 612 613 614 615]

Accuracy: 0.75
Precision positive: 0.75 negative: 0.75
Recall    positive: 0.75 negative: 0.75
F1        positi


Accuracy: 0.875
Precision positive: 0.8 negative: 1.0
Recall    positive: 1.0 negative: 0.75
F1        positive: 0.888888888888889 negative: 0.8571428571428571

Confusion matrix:
[[4 0]
 [1 3]]

---------- fold 102 ----------

Train: [   0    1    2 ... 1997 1998 1999]  --- Test: [808 809 810 811 812 813 814 815]

Accuracy: 1.0
Precision positive: 1.0 negative: 1.0
Recall    positive: 1.0 negative: 1.0
F1        positive: 1.0 negative: 1.0

Confusion matrix:
[[3 0]
 [0 5]]

---------- fold 103 ----------

Train: [   0    1    2 ... 1997 1998 1999]  --- Test: [816 817 818 819 820 821 822 823]

Accuracy: 0.75
Precision positive: 0.8333333333333334 negative: 0.5
Recall    positive: 0.8333333333333334 negative: 0.5
F1        positive: 0.8333333333333334 negative: 0.5

Confusion matrix:
[[5 1]
 [1 1]]

---------- fold 104 ----------

Train: [   0    1    2 ... 1997 1998 1999]  --- Test: [824 825 826 827 828 829 830 831]

Accuracy: 0.75
Precision positive: 0.5 negative: 1.0
Recall    positi

  _warn_prf(average, modifier, msg_start, len(result))



Accuracy: 0.75
Precision positive: 0.0 negative: 0.75
Recall    positive: 0.0 negative: 1.0
F1        positive: 0.0 negative: 0.8571428571428571

Confusion matrix:
[[0 2]
 [0 6]]

---------- fold 120 ----------

Train: [   0    1    2 ... 1997 1998 1999]  --- Test: [952 953 954 955 956 957 958 959]

Accuracy: 0.75
Precision positive: 0.7142857142857143 negative: 1.0
Recall    positive: 1.0 negative: 0.3333333333333333
F1        positive: 0.8333333333333333 negative: 0.5

Confusion matrix:
[[5 0]
 [2 1]]

---------- fold 121 ----------

Train: [   0    1    2 ... 1997 1998 1999]  --- Test: [960 961 962 963 964 965 966 967]

Accuracy: 0.875
Precision positive: 0.75 negative: 1.0
Recall    positive: 1.0 negative: 0.8
F1        positive: 0.8571428571428571 negative: 0.888888888888889

Confusion matrix:
[[3 0]
 [1 4]]

---------- fold 122 ----------

Train: [   0    1    2 ... 1997 1998 1999]  --- Test: [968 969 970 971 972 973 974 975]

Accuracy: 1.0
Precision positive: 1.0 negative: 1.0



Accuracy: 0.625
Precision positive: 0.75 negative: 0.5
Recall    positive: 0.6 negative: 0.6666666666666666
F1        positive: 0.6666666666666665 negative: 0.5714285714285715

Confusion matrix:
[[3 2]
 [1 2]]

---------- fold 146 ----------

Train: [   0    1    2 ... 1997 1998 1999]  --- Test: [1160 1161 1162 1163 1164 1165 1166 1167]

Accuracy: 1.0
Precision positive: 1.0 negative: 1.0
Recall    positive: 1.0 negative: 1.0
F1        positive: 1.0 negative: 1.0

Confusion matrix:
[[3 0]
 [0 5]]

---------- fold 147 ----------

Train: [   0    1    2 ... 1997 1998 1999]  --- Test: [1168 1169 1170 1171 1172 1173 1174 1175]

Accuracy: 1.0
Precision positive: 1.0 negative: 1.0
Recall    positive: 1.0 negative: 1.0
F1        positive: 1.0 negative: 1.0

Confusion matrix:
[[2 0]
 [0 6]]

---------- fold 148 ----------

Train: [   0    1    2 ... 1997 1998 1999]  --- Test: [1176 1177 1178 1179 1180 1181 1182 1183]

Accuracy: 0.875
Precision positive: 1.0 negative: 0.6666666666666666
Recall


Accuracy: 0.75
Precision positive: 1.0 negative: 0.6
Recall    positive: 0.6 negative: 1.0
F1        positive: 0.7499999999999999 negative: 0.7499999999999999

Confusion matrix:
[[3 2]
 [0 3]]

---------- fold 172 ----------

Train: [   0    1    2 ... 1997 1998 1999]  --- Test: [1368 1369 1370 1371 1372 1373 1374 1375]

Accuracy: 1.0
Precision positive: 1.0 negative: 1.0
Recall    positive: 1.0 negative: 1.0
F1        positive: 1.0 negative: 1.0

Confusion matrix:
[[5 0]
 [0 3]]

---------- fold 173 ----------

Train: [   0    1    2 ... 1997 1998 1999]  --- Test: [1376 1377 1378 1379 1380 1381 1382 1383]

Accuracy: 1.0
Precision positive: 1.0 negative: 1.0
Recall    positive: 1.0 negative: 1.0
F1        positive: 1.0 negative: 1.0

Confusion matrix:
[[6 0]
 [0 2]]

---------- fold 174 ----------

Train: [   0    1    2 ... 1997 1998 1999]  --- Test: [1384 1385 1386 1387 1388 1389 1390 1391]

Accuracy: 0.875
Precision positive: 0.8 negative: 1.0
Recall    positive: 1.0 negative: 0.75


Accuracy: 1.0
Precision positive: 1.0 negative: 1.0
Recall    positive: 1.0 negative: 1.0
F1        positive: 1.0 negative: 1.0

Confusion matrix:
[[4 0]
 [0 4]]

---------- fold 198 ----------

Train: [   0    1    2 ... 1997 1998 1999]  --- Test: [1576 1577 1578 1579 1580 1581 1582 1583]

Accuracy: 0.625
Precision positive: 0.5 negative: 0.75
Recall    positive: 0.6666666666666666 negative: 0.6
F1        positive: 0.5714285714285715 negative: 0.6666666666666665

Confusion matrix:
[[2 1]
 [2 3]]

---------- fold 199 ----------

Train: [   0    1    2 ... 1997 1998 1999]  --- Test: [1584 1585 1586 1587 1588 1589 1590 1591]

Accuracy: 1.0
Precision positive: 1.0 negative: 1.0
Recall    positive: 1.0 negative: 1.0
F1        positive: 1.0 negative: 1.0

Confusion matrix:
[[5 0]
 [0 3]]

---------- fold 200 ----------

Train: [   0    1    2 ... 1997 1998 1999]  --- Test: [1592 1593 1594 1595 1596 1597 1598 1599]

Accuracy: 1.0
Precision positive: 1.0 negative: 1.0
Recall    positive: 1.0

  _warn_prf(average, modifier, msg_start, len(result))



Accuracy: 0.8571428571428571
Precision positive: 1.0 negative: 0.0
Recall    positive: 0.8571428571428571 negative: 0.0
F1        positive: 0.923076923076923 negative: 0.0

Confusion matrix:
[[6 1]
 [0 0]]

---------- fold 218 ----------

Train: [   0    1    2 ... 1997 1998 1999]  --- Test: [1727 1728 1729 1730 1731 1732 1733]

Accuracy: 1.0
Precision positive: 1.0 negative: 1.0
Recall    positive: 1.0 negative: 1.0
F1        positive: 1.0 negative: 1.0

Confusion matrix:
[[2 0]
 [0 5]]

---------- fold 219 ----------

Train: [   0    1    2 ... 1997 1998 1999]  --- Test: [1734 1735 1736 1737 1738 1739 1740]

Accuracy: 0.7142857142857143
Precision positive: 1.0 negative: 0.3333333333333333
Recall    positive: 0.6666666666666666 negative: 1.0
F1        positive: 0.8 negative: 0.5

Confusion matrix:
[[4 2]
 [0 1]]

---------- fold 220 ----------

Train: [   0    1    2 ... 1997 1998 1999]  --- Test: [1741 1742 1743 1744 1745 1746 1747]

Accuracy: 1.0
Precision positive: 1.0 negative: 1


Accuracy: 0.8571428571428571
Precision positive: 0.8 negative: 1.0
Recall    positive: 1.0 negative: 0.6666666666666666
F1        positive: 0.888888888888889 negative: 0.8

Confusion matrix:
[[4 0]
 [1 2]]

---------- fold 244 ----------

Train: [   0    1    2 ... 1997 1998 1999]  --- Test: [1909 1910 1911 1912 1913 1914 1915]

Accuracy: 0.5714285714285714
Precision positive: 0.8 negative: 0.0
Recall    positive: 0.6666666666666666 negative: 0.0
F1        positive: 0.7272727272727272 negative: 0.0

Confusion matrix:
[[4 2]
 [1 0]]

---------- fold 245 ----------

Train: [   0    1    2 ... 1997 1998 1999]  --- Test: [1916 1917 1918 1919 1920 1921 1922]

Accuracy: 1.0
Precision positive: 1.0 negative: 1.0
Recall    positive: 1.0 negative: 1.0
F1        positive: 1.0 negative: 1.0

Confusion matrix:
[[5 0]
 [0 2]]

---------- fold 246 ----------

Train: [   0    1    2 ... 1997 1998 1999]  --- Test: [1923 1924 1925 1926 1927 1928 1929]

Accuracy: 1.0
Precision positive: 1.0 negative: 1

1st run:
    number of folds: 4
    Average accuracy: 0.8405
2nd run:
    number of folds: 16
    Average accuracy: ~0.8540
    increase by %: ~1.6%
3rd run:
    number of folds: 256
    Average accuracy: ~0.8616
    increase by % over 1st run: ~2.5%
    increase by % over 2nd run: ~0.9%


By jumping from 4 folds to 16 folds the accuracy only increases by a small ammount, if we take that even further and take 256 folds it only increases its accuracy over the second run by about 0.9%.
Considdering it takes some time to calculate 256 folds and the improvement is less than 1% i would not deem it worth it to increase the number of folds to more than 16 in this case.