# Logistic models for blog post

This notebook works up some quick and dirty bag-of-words models, to see how much this approach suffers when we cut whole documents into 128- or 256-word chunks.

We're going to use LogisticRegression from scikit-learn, and apply it in three ways:

1. To whole documents.

2. To BERT-sized chunks.

3. Aggregating the votes from BERT-sized chunks to produce a document-level prediction.

In [1]:
# Things that will come in handy

import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score
from collections import Counter
from scipy.stats import pearsonr
import random, glob, csv

# Modeling whole movie reviews from the IMDb dataset

@InProceedings{maas-EtAl:2011:ACL-HLT2011,
  author    = {Maas, Andrew L.  and  Daly, Raymond E.  and  Pham, Peter T.  and  Huang, Dan  and  Ng, Andrew Y.  and  Potts, Christopher},
  title     = {Learning Word Vectors for Sentiment Analysis},
  booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies},
  month     = {June},
  year      = {2011},
  address   = {Portland, Oregon, USA},
  publisher = {Association for Computational Linguistics},
  pages     = {142--150},
  url       = {http://www.aclweb.org/anthology/P11-1015}


In [12]:
raw = pd.read_csv('sentimentdata.tsv', sep = '\t')

fullname = 'sentiment'

raw = raw.sample(frac = 1)
# that is in effect a shuffle

cut = round(len(raw) * .75)

train = raw.iloc[0: cut, : ]
test = raw.iloc[cut : , : ]

In [46]:
lex = Counter()

delchars = ''.join(c for c in map(chr, range(256)) if not c.isalpha())
spaces = ' ' * len(delchars)
punct2space = str.maketrans(delchars, spaces)

def getwords(text):
    global punct2space
    text = text.replace('<br />', ' ')
    words = text.translate(punct2space).split()
    return words

def get_dataset(rootfolder):
    
    negpaths = glob.glob(rootfolder + '/neg/*.txt')
    pospaths = glob.glob(rootfolder + '/pos/*.txt')
    paths = [(0, x) for x in negpaths] + [(1, x) for x in pospaths]
    
    index = 0
    lines = []
    lex = Counter()
    labels = []
    texts = []
    
    for label, p in paths:
        
        with open(p) as f:
            text = f.read().strip().lower()
            words = getwords(text)
            for w in words:
                lex[w] += 1
            labels.append(label)
            texts.append(text)

    vocab = [x[0] for x in lex.most_common()]
    print(vocab[0:10])
    
    df = pd.DataFrame.from_dict({'sent': labels, 'text': texts})
    df = df.sample(frac = 1)
    # shuffle
    
    return vocab, df

In [47]:
def make_matrix(df, vocab, cut):
    
    lexicon = dict()
    for i in range(cut):
        lexicon[vocab[i]] = i
    
    y = []
    x = []
    
    for i, row in df.iterrows():
        y.append(int(row['sent']))
        x_row = np.zeros(cut)
        words = getwords(row.text)
        for w in words:
            if w in lexicon:
                idx = lexicon[w]
                x_row[idx] = x_row[idx] + 1
        
        x_row = x_row / np.sum(len(words))
        
        x.append(x_row)
    
    x = np.array(x)
    
    return x, y
    

In [63]:
triplets = []

vocab, train_df = get_dataset('/Volumes/TARDIS/aclImdb/train')
print('got training')
dummy, test_df = get_dataset('/Volumes/TARDIS/aclImdb/test')
print('got test')

for cut in range(3200, 5200, 200):

    for reg_const in [.00001, .0001, .0003, .001, .01, .1]:
        
        trainingset, train_y = make_matrix(train_df, vocab, cut)
        testset, test_y = make_matrix(test_df, vocab, cut)
        
        model = LogisticRegression(C = reg_const)
        stdscaler = StandardScaler()
        stdscaler.fit(trainingset)
        scaledtraining = stdscaler.transform(trainingset)
        model.fit(scaledtraining, train_y)

        scaledtest = stdscaler.transform(testset)
        predictions = [x[1] for x in model.predict_proba(scaledtest)]
        predictions = np.round(predictions)
        accuracy = accuracy_score(predictions, test_y)
        f1 = f1_score(predictions, test_y)
        print(cut, reg_const, f1, accuracy)
        triplets.append((accuracy, cut, reg_const))

random.shuffle(triplets)
triplets.sort(key = lambda x: x[0])
print(triplets[-1])

['the', 'and', 'a', 'of', 'to', 'is', 'it', 'in', 'i', 'this']
got training
['the', 'and', 'a', 'of', 'to', 'is', 'it', 'in', 'i', 'this']
got test
3200 1e-05 0.8492538504508818 0.84888
3200 0.0001 0.8725717483411942 0.87248
3200 0.0003 0.8759947214779862 0.87596
3200 0.001 0.8708091908091908 0.87068
3200 0.01 0.8593687574910108 0.8592
3200 0.1 0.8512644321041908 0.85108
3400 1e-05 0.8499640488935049 0.84976
3400 0.0001 0.8742203742203742 0.87416
3400 0.0003 0.8748 0.8748
3400 0.001 0.8709380870538391 0.87084
3400 0.01 0.8584463921239045 0.85852
3400 0.1 0.8480916336256958 0.84828
3600 1e-05 0.8521301254895692 0.852
3600 0.0001 0.8736008954269269 0.87352
3600 0.0003 0.8757154865308411 0.8758
3600 0.001 0.8704771053474224 0.87056
3600 0.01 0.8545301407772832 0.85492
3600 0.1 0.8419151583256411 0.84244
3800 1e-05 0.8524065027628734 0.85256
3800 0.0001 0.8751649934002641 0.87516
3800 0.0003 0.8758065162505511 0.87604
3800 0.001 0.8714886796233221 0.87172
3800 0.01 0.8534115566985454 0.854

### Cut down the reviews to 128-word chunks; how does it perform?

Here I'm using the same data files that were given to BERT.

In [96]:
def get_datachunks(filepath):
    
    data = pd.read_csv(filepath, sep = '\t', header = None, names = ['idx', 'sent', 'dummy', 'text'], quoting = csv.QUOTE_NONE)
    
    lex = Counter()
    
    for i, row in data.iterrows():
        text = row['text'].strip().lower()
        words = getwords(text)
        for w in words:
            lex[w] += 1

    vocab = [x[0] for x in lex.most_common()]
    print(vocab[0:10])
    
    df = data.loc[ : , ['sent', 'text']]
    
    return vocab, df

triplets = []

vocab, train_df = get_datachunks('/Users/tunder/Dropbox/fiction/bert/bertdata/train_sentiment.tsv')
print('got training')
dummy, test_df = get_datachunks('/Users/tunder/Dropbox/fiction/bert/bertdata/dev_sentiment.tsv')
print('got test')

for cut in range(2200, 6200, 400):

    for reg_const in [.00001, .00005, .0001, .0003, .001]:
        
        trainingset, train_y = make_matrix(train_df, vocab, cut)
        testset, test_y = make_matrix(test_df, vocab, cut)
        
        model = LogisticRegression(C = reg_const)
        stdscaler = StandardScaler()
        stdscaler.fit(trainingset)
        scaledtraining = stdscaler.transform(trainingset)
        model.fit(scaledtraining, train_y)

        scaledtest = stdscaler.transform(testset)
        predictions = [x[1] for x in model.predict_proba(scaledtest)]
        predictions = np.round(predictions)
        accuracy = accuracy_score(predictions, test_y)
        f1 = f1_score(predictions, test_y)
        print(cut, reg_const, f1, accuracy)
        triplets.append((accuracy, cut, reg_const))

random.shuffle(triplets)
triplets.sort(key = lambda x: x[0])
print(triplets[-1])

['the', 'and', 'a', 'of', 'to', 'is', 'in', 'it', 'i', 'this']
got training
['the', 'and', 'a', 'of', 'to', 'is', 'in', 'it', 'i', 'this']
got test
2200 1e-05 0.7959050511868602 0.792454125291722
2200 5e-05 0.8080544755826236 0.8045806067816775
2200 0.0001 0.8107440429402385 0.8071889443097058
2200 0.0003 0.8112095722189735 0.8079439893836087
2200 0.001 0.8087246808318672 0.8059762961607102
2600 1e-05 0.7992070107459056 0.796069189584954
2600 5e-05 0.8111916445324809 0.8080812703061364
2600 0.0001 0.8132145107570438 0.8100947238365441
2600 0.0003 0.8128761750185974 0.8100718436827895
2600 0.001 0.8105947703563201 0.8080812703061364
3000 1e-05 0.8009198719393966 0.7979682423465886
3000 5e-05 0.8134257800189095 0.8103692856815998
3000 0.0001 0.8152501914155745 0.8122912185969889
3000 0.0003 0.8147196682891653 0.8118793758294056
3000 0.001 0.8115392429064807 0.8089735963025672
3400 1e-05 0.8019611830362187 0.7994554523406397
3400 5e-05 0.813970803907854 0.8113531322930491
3400 0.0001 0.81

### How much can we improve our chunk-level results by aggregating them?

In [88]:
trainingset, train_y = make_matrix(train_df, vocab, 5200)
testset, test_y = make_matrix(test_df, vocab, 5200)
model = LogisticRegression(C = .0001)
stdscaler = StandardScaler()
stdscaler.fit(trainingset)
scaledtraining = stdscaler.transform(trainingset)
model.fit(scaledtraining, train_y)

scaledtest = stdscaler.transform(testset)
predictions = [x[1] for x in model.predict_proba(scaledtest)]

In [89]:
# make a dataframe
meta = pd.read_csv('bertmeta/dev_rows_sentiment.tsv', sep = '\t')
pred = pd.DataFrame.from_dict({'idx': meta['idx'], 'pred': predictions, 'real': test_y})
pred = pred.set_index('idx')
pred.head()

Unnamed: 0_level_0,pred,real
idx,Unnamed: 1_level_1,Unnamed: 2_level_1
996,0.161673,0
38639,0.28418,1
5648,0.055561,0
43581,0.590964,1
36158,0.660262,1


In [93]:
right = 0

for idx, row in pred.iterrows():
    if row['pred'] >= 0.5:
        predclass = 1
    else:
        predclass = 0
        
    if predclass == row['real']:
        right += 1

print(right / len(pred))

0.8162266050427859


In [94]:
byvol = meta.groupby('docid')
rightvols = 0
allvols = 0
bertprobs = dict()

for vol, df in byvol:
    total = 0
    right = 0
    positive = 0
    df.set_index('idx', inplace = True)
    predicted = []
    for idx, row in df.iterrows():
        predict = pred.loc[idx, 'pred']
        predicted.append(predict)
        true_class = row['class']
    
    volmean = sum(predicted) / len(predicted)
    if volmean >= 0.5:
        predicted_class = 1
    else:
        predicted_class = 0
    
    if true_class == predicted_class:
        rightvols += 1
    allvols += 1

print()
print('Overall accuracy:', rightvols / allvols)


Overall accuracy: 0.86454402849027


# What about the parallel problem for genre?

We use the same data that was passed to BERT.

In [102]:
triplets = []

vocab, train_df = get_datachunks('/Users/tunder/Dropbox/fiction/bert/bertdata/train_Mystery256.tsv')
print('got training')
dummy, test_df = get_datachunks('/Users/tunder/Dropbox/fiction/bert/bertdata/dev_Mystery256.tsv')
print('got test')

for cut in range(2000, 6200, 400):

    for reg_const in [.00001, .00005, .0001, .0003, .001]:
        
        trainingset, train_y = make_matrix(train_df, vocab, cut)
        testset, test_y = make_matrix(test_df, vocab, cut)
        
        model = LogisticRegression(C = reg_const)
        stdscaler = StandardScaler()
        stdscaler.fit(trainingset)
        scaledtraining = stdscaler.transform(trainingset)
        model.fit(scaledtraining, train_y)

        scaledtest = stdscaler.transform(testset)
        predictions = [x[1] for x in model.predict_proba(scaledtest)]
        predictions = np.round(predictions)
        accuracy = accuracy_score(predictions, test_y)
        f1 = f1_score(predictions, test_y)
        print(cut, reg_const, f1, accuracy)
        triplets.append((accuracy, cut, reg_const))

random.shuffle(triplets)
triplets.sort(key = lambda x: x[0])
print(triplets[-1])

['the', 'and', 'to', 'a', 'of', 'i', 'he', 'in', 'was', 'it']
got training
['the', 'and', 'to', 'of', 'a', 'i', 'he', 'in', 'was', '”']
got test
2000 1e-05 0.7380035630136199 0.744955805019953
2000 5e-05 0.7374129774222086 0.7453847014507888
2000 0.0001 0.7321996032586715 0.7407041360534069
2000 0.0003 0.7232408229436059 0.7325923992093387
2000 0.001 0.7135843513220967 0.7236788125163167
2400 1e-05 0.7414472798600026 0.7492820646701227
2400 5e-05 0.7369196471842591 0.7469324581359789
2400 0.0001 0.73058597016667 0.7416738149405139
2400 0.0003 0.7198266606804739 0.732349979487562
2400 0.001 0.7091865673101371 0.7227277813001156
2800 1e-05 0.7440613026819923 0.7508671166971246
2800 5e-05 0.7382283753036907 0.7468392197814493
2800 0.0001 0.7325671089170009 0.7413940998769254
2800 0.0003 0.7211278485901893 0.7307276321187484
2800 0.001 0.7093805583481555 0.7196882109424533
3200 1e-05 0.7449090804795502 0.7512214224443367
3200 5e-05 0.7397687260693052 0.7477902509976504
3200 0.0001 0.733396

### and now aggregating the genre chunks

In [104]:
# best model

trainingset, train_y = make_matrix(train_df, vocab, 6000)
testset, test_y = make_matrix(test_df, vocab, 6000)
model = LogisticRegression(C = .00001)
stdscaler = StandardScaler()
stdscaler.fit(trainingset)
scaledtraining = stdscaler.transform(trainingset)
model.fit(scaledtraining, train_y)

scaledtest = stdscaler.transform(testset)
predictions = [x[1] for x in model.predict_proba(scaledtest)]

# make a dataframe
meta = pd.read_csv('bertmeta/dev_rows_Mystery256.tsv', sep = '\t')
pred = pd.DataFrame.from_dict({'idx': meta['idx'], 'pred': predictions, 'real': test_y})
pred = pred.set_index('idx')
pred.head()

Unnamed: 0_level_0,pred,real
idx,Unnamed: 1_level_1,Unnamed: 2_level_1
42274,0.372493,0
47664,0.248213,0
834,0.545889,1
17125,0.713855,1
33412,0.247855,0


In [105]:
byvol = meta.groupby('docid')
rightvols = 0
allvols = 0
bertprobs = dict()

for vol, df in byvol:
    total = 0
    right = 0
    positive = 0
    df.set_index('idx', inplace = True)
    predicted = []
    for idx, row in df.iterrows():
        predict = pred.loc[idx, 'pred']
        predicted.append(predict)
        true_class = row['class']
    
    volmean = sum(predicted) / len(predicted)
    if volmean >= 0.5:
        predicted_class = 1
    else:
        predicted_class = 0
    
    if true_class == predicted_class:
        rightvols += 1
    allvols += 1

print()
print('Overall accuracy:', rightvols / allvols)


Overall accuracy: 0.8770491803278688


**Aside:** It's really remarkable how powerful binary voting can be. In this case models of genre at 256-word scale are pretty awful (75.5% accuracy) but aggregate up to 87.7% accuracy. But that's still not quite in the same league with models that can see whole novels; in that case the detective/mystery genre can be modeled with more than 91% accuracy. Something is lost when we can't see the whole elephant at once.