# Testing Political Bias with Neural Networks (Stacking LSTM)

##### Importing libraries and loading data

In [117]:
import pandas as pd
import numpy as np
from tqdm import tqdm
# Do not truncate long text cells when displaying frames. `None` is the supported
# "no limit" value; the previous `-1` is deprecated since pandas 1.0.
pd.set_option('display.max_colwidth', None)

  after removing the cwd from sys.path.


In [118]:
# Read the three article CSV files, in order, into df1, df2 and df3.
df1, df2, df3 = map(
    pd.read_csv,
    (
        '/Users/feihuyan/Downloads/news/articles1.csv',
        '/Users/feihuyan/Downloads/news/articles2.csv',
        '/Users/feihuyan/Downloads/news/articles3.csv',
    ),
)

##### Preparing data

In [119]:
# Stack the three article frames into one. ignore_index=True gives every row a
# unique label; without it each file's own 0..n-1 index repeats in the result.
df = pd.concat([df1, df2, df3], ignore_index=True)

In [125]:
# Publication name -> label code. Rows coded 0 go into `left`, rows coded 1 go
# into `right`; code 2 is not selected by either filter below.
bias1 = {
    'Atlantic': 2,
    'Breitbart': 1,
    'Business Insider': 2,
    'Buzzfeed News': 2,
    'CNN': 0,
    'Fox News': 1,
    'Guardian': 2,
    'NPR': 2,
    'National Review': 2,
    'New York Post': 2,
    'New York Times': 0,
    'Reuters': 2,
    'Talking Points Memo': 2,
    'Vox': 2,
    'Washington Post': 2,
}
df['bias1'] = df['publication'].map(bias1)
left = df.loc[df['bias1'].eq(0)]
right = df.loc[df['bias1'].eq(1)]

In [126]:
from nltk.tokenize import sent_tokenize
def passage_to_sentences(df, political_affiliation):
    """Split every passage into sentences and label each sentence.

    Args:
        df: Iterable of passage strings (the callers pass a 'content' column).
        political_affiliation: Value written to the 'target' column of every row.

    Returns:
        DataFrame with one row per sentence: 'text' holds the sentence and
        'target' holds ``political_affiliation``.
    """
    sentences = [
        sentence
        for passage in tqdm(df)
        for sentence in sent_tokenize(passage)
    ]
    return pd.DataFrame({'text': sentences, 'target': political_affiliation})

In [127]:
# Sentence-level rows from the `left` articles, each tagged with target 0.
left_sent = passage_to_sentences(left['content'], political_affiliation=0)

100%|██████████| 19291/19291 [00:31<00:00, 618.89it/s]


In [128]:
# Sentence-level rows from the `right` articles, each tagged with target 1.
right_sent = passage_to_sentences(right['content'], political_affiliation=1)

100%|██████████| 28135/28135 [00:23<00:00, 1198.71it/s]


In [129]:
# Imported in this cell because the notebook's other Keras imports live in a
# later cell; without this line `Tokenizer` is undefined on a fresh kernel.
from keras.preprocessing.text import Tokenizer

train_df = pd.concat([left_sent, right_sent])
# Shuffle all rows; the fixed seed makes the row order repeatable between runs.
train_df = train_df.sample(frac=1, random_state=42)
# Keep only the 30000 most frequent words when converting text to sequences.
tokenizer = Tokenizer(num_words=30000)
tokenizer.fit_on_texts(list(train_df['text']))

##### Testing

In [223]:
from nltk.tokenize import sent_tokenize
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
# Load the previously saved model stored at this path into `stacked_lstm`.
stacked_lstm = keras.models.load_model(
    '/Users/feihuyan/news_stack_lstm',
)

In [226]:
def test(text, model, true_class, verbose=True, verbose_r=False, passage=True, check=True, maxlen=25):
    """Score a passage or a single sentence with a binary left/right model.

    The text is converted with the notebook-level ``tokenizer``, padded to
    ``maxlen`` tokens and passed to ``model.predict``. A mean score below 0.5
    is reported as 'left', anything else as 'right'.

    Args:
        text: Raw text to classify.
        model: Object exposing ``predict``. Assumed to return one score per
            input row (shape ``(n, 1)``) -- confirm for any new model.
        true_class: Expected label, 0 (left) or 1 (right).
        verbose: Unused; kept so existing calls keep working.
        verbose_r: When True, print a short human-readable report.
        passage: When True, split ``text`` into sentences with
            ``sent_tokenize`` and average their scores; when False, treat
            ``text`` as one sentence.
        check: When True, the report states whether the prediction matches
            ``true_class``.
        maxlen: Sequence length passed to ``pad_sequences``.

    Returns:
        Tuple ``(pred_class, pred_score, naive_loss, bias)``:
        ``pred_class`` is the int 0/1 for the whole passage when
        ``passage=True``, otherwise the int32 array of per-row scores
        thresholded at 0.5; ``pred_score`` is the raw ``model.predict``
        output; ``naive_loss`` is ``abs(true_class - pred_score)``; ``bias``
        is the distance of the mean score from 0.5 as a percentage (float).
    """
    if passage:
        test_text = list(sent_tokenize(text))
    else:
        test_text = [text]

    text_token = tokenizer.texts_to_sequences(test_text)
    text_pad = pad_sequences(text_token, maxlen=maxlen)

    # Single forward pass. `predict_classes` no longer exists in current
    # TensorFlow/Keras; thresholding the scores at 0.5 yields the same classes
    # for a single-score output without running the model a second time.
    pred_score = model.predict(text_pad)
    pred_class = (pred_score > 0.5).astype('int32')
    naive_loss = abs(true_class - pred_score)

    pol_class = 'right'
    content = 'sentence'
    success = 'succeeds'

    # Mean score over all rows. With passage=False there is exactly one row, so
    # this is that row's score as a scalar, which avoids calling float() on a
    # (1, 1) array.
    pred_score_passage = np.sum(pred_score)/int(len(text_pad))
    if pred_score_passage < 0.5:
        pol_class = 'left'
    bias = float(abs(0.5-pred_score_passage)/0.5*100)

    if passage:
        content = 'passage'
        # One class for the whole passage, decided by the mean score.
        pred_class = 0 if pred_score_passage < 0.5 else 1
        mismatch = pred_class != true_class
    else:
        mismatch = bool(np.any(pred_class != true_class))
    if check and mismatch:
        success = 'fails'

    if verbose_r:
        print('=======================================================================')
        print('Your answer:')
        print('------------')
        print(f'  - This {content} leans toward the {pol_class} in the political spectrum.')
        print(f'  - It is approximately {bias: .2f}% percent biased to the {pol_class}.')
        print(f'  - This model {success} in predicting the political bias.')
        print('=======================================================================')

    return pred_class, pred_score, naive_loss, bias

In [244]:
def batch_testing(df, publication, model, true_class, num_rows):
    """Run ``test`` on the first ``num_rows`` articles of one publication.

    Args:
        df: DataFrame with 'publication' and 'content' columns.
        publication: Value of the 'publication' column to select.
        model: Model forwarded to ``test``.
        true_class: Expected label for the publication, 0 (left) or 1 (right).
        num_rows: Maximum number of articles to score.

    Returns:
        Tuple ``(result, bias_total)``: ``result`` is the fraction of articles
        whose passage-level class equals ``true_class``; ``bias_total`` is the
        mean of the per-article mean scores (same array-like type as the
        per-article ``sum(scores)/len(scores)``).

    Raises:
        ValueError: If no article matches ``publication`` (or ``num_rows``
            selects none), since every statistic below divides by the count.
    """
    df_temp = df[df['publication'] == publication]['content']
    df_input = df_temp[:num_rows]
    n_articles = len(df_input)
    if n_articles == 0:
        # Fail before any work is done instead of printing a message and then
        # dividing by zero a few lines later.
        raise ValueError(
            f'No articles to test for publication {publication!r} '
            f'(num_rows={num_rows}); review your input please...'
        )

    right_votes = 0      # number of articles classified as class 1
    score_total = 0      # running sum of per-article mean scores
    article_scores = []
    for row in tqdm(df_input):
        x = test(row, model, true_class)
        right_votes += int(x[0])
        score = sum(x[1])/len(x[1])
        score_total += score
        article_scores.append(score)

    bias_total = score_total/n_articles
    if true_class == 0:
        result = 1-right_votes/n_articles
        pol_class = 'left'
    else:
        result = right_votes/n_articles
        pol_class = 'right'
    std = np.std(article_scores)

    print('========================================================================')
    # Report the number of articles actually scored, which can be lower than
    # num_rows when the publication has fewer articles.
    print(f'Testing {n_articles} articles from {publication}')
    # round() hides binary floating-point noise such as 63.74999999999999.
    print(f'Articles in batch are generally ({round(result*100, 2)}%) towards the {pol_class}')
    print(f'Standard deviation : {std: .4f}')
    print(f'Total bias score: {bias_total}')
    print('========================================================================')
    return result, bias_total

In [245]:
# Load the four other saved models, in order, into convnn, lstm, rnn and nn.
convnn, lstm, rnn, nn = map(
    keras.models.load_model,
    (
        '/Users/feihuyan/news_convnn',
        '/Users/feihuyan/news_lstm',
        '/Users/feihuyan/news_rnn',
        '/Users/feihuyan/news_nn',
    ),
)

### Testing Breitbart

In [278]:
# `stacked_lstm` on up to 2000 'Breitbart' articles; expected class 1 (right).
breitbart = batch_testing(df, 'Breitbart', model=stacked_lstm, true_class=1, num_rows=2000)

100%|██████████| 2000/2000 [04:30<00:00,  7.40it/s]

Testing 2000 articles from Breitbart
Articles in batch are generally (89.9%) towards the right
Standard deviation :  0.1129
Total bias score: [0.6448016]





In [279]:
# `convnn` on up to 2000 'Breitbart' articles; expected class 1 (right).
breitbart = batch_testing(df, 'Breitbart', model=convnn, true_class=1, num_rows=2000)

100%|██████████| 2000/2000 [01:57<00:00, 17.04it/s]


Testing 2000 articles from Breitbart
Articles in batch are generally (92.9%) towards the right
Standard deviation :  0.1152
Total bias score: [0.6703297]


In [280]:
# `lstm` on up to 2000 'Breitbart' articles; expected class 1 (right).
breitbart = batch_testing(df, 'Breitbart', model=lstm, true_class=1, num_rows=2000)

100%|██████████| 2000/2000 [02:24<00:00, 13.83it/s]

Testing 2000 articles from Breitbart
Articles in batch are generally (84.3%) towards the right
Standard deviation :  0.1220
Total bias score: [0.6176962]





In [281]:
# `rnn` on up to 2000 'Breitbart' articles; expected class 1 (right).
breitbart = batch_testing(df, 'Breitbart', model=rnn, true_class=1, num_rows=2000)

100%|██████████| 2000/2000 [01:59<00:00, 16.67it/s]

Testing 2000 articles from Breitbart
Articles in batch are generally (79.05%) towards the right
Standard deviation :  0.1122
Total bias score: [0.5884283]





In [282]:
# `nn` on up to 2000 'Breitbart' articles; expected class 1 (right).
breitbart = batch_testing(df, 'Breitbart', model=nn, true_class=1, num_rows=2000)

100%|██████████| 2000/2000 [01:53<00:00, 17.62it/s]

Testing 2000 articles from Breitbart
Articles in batch are generally (79.0%) towards the right
Standard deviation :  0.1118
Total bias score: [0.5856423]





### Testing Guardian

In [283]:
# `stacked_lstm` on up to 2000 'Guardian' articles; expected class 0 (left).
guard = batch_testing(df, 'Guardian', model=stacked_lstm, true_class=0, num_rows=2000)

100%|██████████| 2000/2000 [06:40<00:00,  4.99it/s]

Testing 2000 articles from Guardian
Articles in batch are generally (66.75%) towards the left
Standard deviation :  0.0857
Total bias score: [0.46471673]





In [284]:
# `convnn` on up to 2000 'Guardian' articles; expected class 0 (left).
guard = batch_testing(df, 'Guardian', model=convnn, true_class=0, num_rows=2000)

100%|██████████| 2000/2000 [02:03<00:00, 16.19it/s]

Testing 2000 articles from Guardian
Articles in batch are generally (63.74999999999999%) towards the left
Standard deviation :  0.0904
Total bias score: [0.4697565]





In [285]:
# `lstm` on up to 2000 'Guardian' articles; expected class 0 (left).
guard = batch_testing(df, 'Guardian', model=lstm, true_class=0, num_rows=2000)

100%|██████████| 2000/2000 [02:58<00:00, 11.21it/s]


Testing 2000 articles from Guardian
Articles in batch are generally (82.05%) towards the left
Standard deviation :  0.0837
Total bias score: [0.4277601]


In [286]:
# `rnn` on up to 2000 'Guardian' articles; expected class 0 (left).
guard = batch_testing(df, 'Guardian', model=rnn, true_class=0, num_rows=2000)

100%|██████████| 2000/2000 [02:11<00:00, 15.20it/s]

Testing 2000 articles from Guardian
Articles in batch are generally (79.45%) towards the left
Standard deviation :  0.0795
Total bias score: [0.43806446]





In [287]:
# `nn` on up to 2000 'Guardian' articles; expected class 0 (left).
guard = batch_testing(df, 'Guardian', model=nn, true_class=0, num_rows=2000)

100%|██████████| 2000/2000 [01:57<00:00, 17.07it/s]

Testing 2000 articles from Guardian
Articles in batch are generally (83.8%) towards the left
Standard deviation :  0.0781
Total bias score: [0.4258068]





### Testing Washington Post

In [288]:
# `stacked_lstm` on up to 2000 'Washington Post' articles; expected class 0 (left).
wp = batch_testing(df, 'Washington Post', model=stacked_lstm, true_class=0, num_rows=2000)

100%|██████████| 2000/2000 [07:59<00:00,  4.17it/s]


Testing 2000 articles from Washington Post
Articles in batch are generally (52.0%) towards the left
Standard deviation :  0.0889
Total bias score: [0.49565268]


In [289]:
# `convnn` on up to 2000 'Washington Post' articles; expected class 0 (left).
wp = batch_testing(df, 'Washington Post', model=convnn, true_class=0, num_rows=2000)

100%|██████████| 2000/2000 [02:06<00:00, 15.87it/s]

Testing 2000 articles from Washington Post
Articles in batch are generally (52.900000000000006%) towards the left
Standard deviation :  0.0884
Total bias score: [0.49322283]





In [290]:
# `lstm` on up to 2000 'Washington Post' articles; expected class 0 (left).
wp = batch_testing(df, 'Washington Post', model=lstm, true_class=0, num_rows=2000)

100%|██████████| 2000/2000 [03:02<00:00, 10.94it/s]

Testing 2000 articles from Washington Post
Articles in batch are generally (68.8%) towards the left
Standard deviation :  0.0893
Total bias score: [0.45934796]





In [291]:
# `rnn` on up to 2000 'Washington Post' articles; expected class 0 (left).
wp = batch_testing(df, 'Washington Post', model=rnn, true_class=0, num_rows=2000)

100%|██████████| 2000/2000 [02:20<00:00, 14.25it/s]


Testing 2000 articles from Washington Post
Articles in batch are generally (71.45%) towards the left
Standard deviation :  0.0831
Total bias score: [0.45616978]


In [292]:
# `nn` on up to 2000 'Washington Post' articles; expected class 0 (left).
wp = batch_testing(df, 'Washington Post', model=nn, true_class=0, num_rows=2000)

100%|██████████| 2000/2000 [02:02<00:00, 16.28it/s]

Testing 2000 articles from Washington Post
Articles in batch are generally (72.39999999999999%) towards the left
Standard deviation :  0.0811
Total bias score: [0.45311067]





### Testing National Review

In [296]:
# `stacked_lstm` on up to 1000 'National Review' articles; expected class 1 (right).
nr = batch_testing(df, 'National Review', model=stacked_lstm, true_class=1, num_rows=1000)

100%|██████████| 1000/1000 [03:33<00:00,  4.69it/s]

Testing 1000 articles from National Review
Articles in batch are generally (66.4%) towards the right
Standard deviation :  0.0758
Total bias score: [0.53111553]





In [297]:
# `convnn` on up to 1000 'National Review' articles; expected class 1 (right).
nr = batch_testing(df, 'National Review', model=convnn, true_class=1, num_rows=1000)

100%|██████████| 1000/1000 [01:01<00:00, 16.37it/s]

Testing 1000 articles from National Review
Articles in batch are generally (71.2%) towards the right
Standard deviation :  0.0779
Total bias score: [0.5403635]





In [298]:
# `lstm` on up to 1000 'National Review' articles; expected class 1 (right).
nr = batch_testing(df, 'National Review', model=lstm, true_class=1, num_rows=1000)

100%|██████████| 1000/1000 [01:25<00:00, 11.69it/s]

Testing 1000 articles from National Review
Articles in batch are generally (48.199999999999996%) towards the right
Standard deviation :  0.0754
Total bias score: [0.49454114]





In [299]:
# `rnn` on up to 1000 'National Review' articles; expected class 1 (right).
nr = batch_testing(df, 'National Review', model=rnn, true_class=1, num_rows=1000)

100%|██████████| 1000/1000 [01:04<00:00, 15.51it/s]

Testing 1000 articles from National Review
Articles in batch are generally (46.7%) towards the right
Standard deviation :  0.0696
Total bias score: [0.49423274]





In [300]:
# `nn` on up to 1000 'National Review' articles; expected class 1 (right).
nr = batch_testing(df, 'National Review', model=nn, true_class=1, num_rows=1000)

100%|██████████| 1000/1000 [00:58<00:00, 17.17it/s]

Testing 1000 articles from National Review
Articles in batch are generally (46.300000000000004%) towards the right
Standard deviation :  0.0692
Total bias score: [0.49429196]





### Testing Talking Points Memo

In [301]:
# `stacked_lstm` on up to 1000 'Talking Points Memo' articles; expected class 0 (left).
tpm = batch_testing(df, 'Talking Points Memo', model=stacked_lstm, true_class=0, num_rows=1000)

100%|██████████| 1000/1000 [02:12<00:00,  7.54it/s]

Testing 1000 articles from Talking Points Memo
Articles in batch are generally (44.49999999999999%) towards the left
Standard deviation :  0.1395
Total bias score: [0.52180046]





In [302]:
# `convnn` on up to 1000 'Talking Points Memo' articles; expected class 0 (left).
tpm = batch_testing(df, 'Talking Points Memo', model=convnn, true_class=0, num_rows=1000)

100%|██████████| 1000/1000 [01:00<00:00, 16.43it/s]

Testing 1000 articles from Talking Points Memo
Articles in batch are generally (44.199999999999996%) towards the left
Standard deviation :  0.1496
Total bias score: [0.5237587]





In [303]:
# `lstm` on up to 1000 'Talking Points Memo' articles; expected class 0 (left).
tpm = batch_testing(df, 'Talking Points Memo', model=lstm, true_class=0, num_rows=1000)

100%|██████████| 1000/1000 [01:11<00:00, 13.94it/s]

Testing 1000 articles from Talking Points Memo
Articles in batch are generally (61.199999999999996%) towards the left
Standard deviation :  0.1467
Total bias score: [0.47595504]





In [304]:
# `rnn` on up to 1000 'Talking Points Memo' articles; expected class 0 (left).
tpm = batch_testing(df, 'Talking Points Memo', model=rnn, true_class=0, num_rows=1000)

100%|██████████| 1000/1000 [01:00<00:00, 16.53it/s]

Testing 1000 articles from Talking Points Memo
Articles in batch are generally (61.1%) towards the left
Standard deviation :  0.1365
Total bias score: [0.47636384]





In [305]:
# `nn` on up to 1000 'Talking Points Memo' articles; expected class 0 (left).
tpm = batch_testing(df, 'Talking Points Memo', model=nn, true_class=0, num_rows=1000)

100%|██████████| 1000/1000 [00:56<00:00, 17.72it/s]

Testing 1000 articles from Talking Points Memo
Articles in batch are generally (58.3%) towards the left
Standard deviation :  0.1277
Total bias score: [0.48654646]





### Testing CNN

In [306]:
# `stacked_lstm` on up to 2000 'CNN' articles; expected class 0 (left).
cnn = batch_testing(df, 'CNN', model=stacked_lstm, true_class=0, num_rows=2000)

100%|██████████| 2000/2000 [05:36<00:00,  5.95it/s]

Testing 2000 articles from CNN
Articles in batch are generally (94.10000000000001%) towards the left
Standard deviation :  0.0911
Total bias score: [0.36568773]





In [307]:
# `convnn` on up to 2000 'CNN' articles; expected class 0 (left).
cnn = batch_testing(df, 'CNN', model=convnn, true_class=0, num_rows=2000)

100%|██████████| 2000/2000 [01:59<00:00, 16.74it/s]

Testing 2000 articles from CNN
Articles in batch are generally (94.25%) towards the left
Standard deviation :  0.0974
Total bias score: [0.34854847]





In [308]:
# `lstm` on up to 2000 'CNN' articles; expected class 0 (left).
cnn = batch_testing(df, 'CNN', model=lstm, true_class=0, num_rows=2000)

100%|██████████| 2000/2000 [02:36<00:00, 12.74it/s]

Testing 2000 articles from CNN
Articles in batch are generally (97.2%) towards the left
Standard deviation :  0.0898
Total bias score: [0.3165043]





In [309]:
# `rnn` on up to 2000 'CNN' articles; expected class 0 (left).
cnn = batch_testing(df, 'CNN', model=rnn, true_class=0, num_rows=2000)

100%|██████████| 2000/2000 [02:04<00:00, 16.01it/s]

Testing 2000 articles from CNN
Articles in batch are generally (96.5%) towards the left
Standard deviation :  0.0845
Total bias score: [0.34322616]





In [310]:
# `nn` on up to 2000 'CNN' articles; expected class 0 (left).
cnn = batch_testing(df, 'CNN', model=nn, true_class=0, num_rows=2000)

100%|██████████| 2000/2000 [01:56<00:00, 17.17it/s]


Testing 2000 articles from CNN
Articles in batch are generally (94.39999999999999%) towards the left
Standard deviation :  0.0875
Total bias score: [0.36225712]
