# Classification models 
- Train BERT models to classify each topic. Start off as binary classification and not multiclass/multilabel 
- Training locally on a CPU, therefore these models are a first-pass experimental stage 
- Models are used to predict on all the data and then manually inspect of the predicted topics make sense

In [12]:
labelled_data_path = "../Data/labelling/news_and_twitter_labelling_1.xlsx"

## Prep the labels for training

In [13]:
import os 
import pandas as pd 
import numpy as np
import math
import pickle 

In [14]:
# labelled data 
df = pd.read_excel(labelled_data_path, sheet_name="guardian", index_col=0)
# all the data with all the fields 
df_all = pd.read_csv("resources/df_guardian_lem_nov14.csv", index_col=0)

Count the number of labelled exampled in each category 

In [15]:
categories = [
    'economy', 'case_reporting', 'treatments_vaccines',
    'education', 'travel_lockdown', 'healthcare',
    'other', 'politics', 'environment', 'social_issues'
]

In [16]:
total_labelled = 0
for category in categories: 
    df_category = df.dropna(subset=[category])
    print("Number of examples for {}: {}".format(category, len(df_category)))
    total_labelled += len(df_category)
print(total_labelled)

Number of examples for economy: 211
Number of examples for case_reporting: 109
Number of examples for treatments_vaccines: 32
Number of examples for education: 51
Number of examples for travel_lockdown: 182
Number of examples for healthcare: 84
Number of examples for other: 139
Number of examples for politics: 105
Number of examples for environment: 21
Number of examples for social_issues: 153
1087


### Create training and val data for each 
- also include testing data from the full dataset 
- take all the positive examples for that category 
- for the negative examples, take as many or up to 2 times as many examples from other categories

In [17]:
def get_train_data(category, prop_negative=1, test_size=500):
    df_positive = df.dropna(subset=[category])[['webPublicationDate', 'title_subtitle'] + [category]]
    df_negative = df[df[category].isna()].sample(n = math.floor(prop_negative * len(df_positive)))[['webPublicationDate', 'title_subtitle'] + [category]]
    df_negative = df_negative.fillna("no")
    df_category = pd.concat([df_positive, df_negative]).sample(frac=1)
    # finally replace yes and no with 1 and 0 
    df_category[category] = df_category[category].replace({"yes": 1, 'no':0})
    # match to the full data to get all the columns 
    df_category = df_category[[category]].join(df_all, how="inner")
    # get test data 
    df_test = df_all[~df_all.index.isin(df_category.index)]#.sample(test_size)
    return df_category, df_test

In [18]:
prop_negative = 1
test_size = 1000
dfs_train, dfs_test = {}, {}
for category in categories: 
    df_train, df_test =get_train_data(category, prop_negative, test_size)
    dfs_train[category] = df_train
    dfs_test[category] = df_test

# Models 

In [19]:
from sklearn.model_selection import train_test_split

from tensorflow import keras

from transformers import TFBertModel,  BertConfig, BertTokenizerFast
from tensorflow.keras.layers import Input, Dropout, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.metrics import BinaryAccuracy
from tensorflow.keras.utils import to_categorical

### Set up the model 

In [20]:
def get_model(max_length):
    model_name = 'bert-base-uncased'
    config = BertConfig.from_pretrained(model_name)
    config.output_hidden_states = False
    # Load tokenizer
    tokenizer = BertTokenizerFast.from_pretrained(pretrained_model_name_or_path = model_name, config = config)
    # Load the Transformers model
    transformer_model = TFBertModel.from_pretrained(model_name, config = config)
    
    bert = transformer_model.layers[0]
    input_ids = Input(shape=(max_length,), name='input_ids', dtype='int32')
    inputs = {'input_ids': input_ids}
    bert_model = bert(inputs)[1]
    dropout = Dropout(config.hidden_dropout_prob, name='pooled_output')
    pooled_output = dropout(bert_model, training=False)

    issue = Dense(
        activation='softmax',
        units=2, 
        kernel_initializer=TruncatedNormal(stddev=config.initializer_range), 
        name='issue')(pooled_output)
    outputs = {'issue': issue}

    model = Model(inputs=inputs, outputs=outputs, name='BERT')

    model.summary()
    
    # Set an optimizer
    optimizer = Adam(
        learning_rate=5e-05,
        epsilon=1e-08,
        decay=0.01,
        clipnorm=1.0)
    
    # Set loss and metrics
    loss = {'issue': BinaryCrossentropy(from_logits = True)}
    metric = {'issue': BinaryAccuracy('accuracy')}
    
    # Compile the model
    model.compile(
        optimizer = optimizer,
        loss = loss, 
        metrics = metric)
    
    return model, tokenizer

In [21]:
def transform_train_data(data, label, field, max_length):
    # Ready output data for the model
    y = to_categorical(data[label])
    
    # Tokenize the input
    x = tokenizer(
        text=data[field].to_list(),
        add_special_tokens=True,
        max_length=max_length,
        truncation=True,
        padding=True, 
        return_tensors='tf',
        return_token_type_ids = False,
        return_attention_mask = False,
        verbose = True)
    return x, y

In [22]:
def fit_model(model, data, label, field, max_length, batch_size=64, epochs=5):
    x, y = transform_train_data(data, label, field, max_length)
    # Fit the model
    history = model.fit(
        x={'input_ids': x['input_ids']},
        y={'issue': y},
        validation_split=0.2,
        batch_size=batch_size,
        epochs=epochs,
    )
    return history, model

### set up evaluation and prediction 

In [23]:
def evaluate(data_val, label, field, max_length, model):
    val_y = to_categorical(data_val[label])
    val_x = tokenizer(
        text=data_val[field].to_list(),
        add_special_tokens=True,
        max_length=max_length,
        truncation=True,
        padding=True, 
        return_tensors='tf',
        return_token_type_ids = False,
        return_attention_mask = False,
        verbose = True)
    # Run evaluation
    model_eval = model.evaluate(
        x={'input_ids': val_x['input_ids'], }
    )
    return model_eval

In [24]:
def predict(data_test, field, max_length, model, tokenizer):
    test_x = tokenizer(
        text=data_test[field].to_list(),
        add_special_tokens=True,
        max_length=max_length,
        truncation=True,
        padding=True, 
        return_tensors='tf',
        return_token_type_ids = False,
        return_attention_mask = False,
        verbose = True)
    preds = model.predict(test_x['input_ids'])
    
    preds_df = pd.DataFrame(preds['issue'])
    preds_df.columns = ['pred_0', 'pred_1']
    preds_df.index = data_test.index
    preds_df = preds_df.join(df, how="inner")
    return preds_df

## --> Train and predict 

### economy

In [15]:
label = "economy"
field = "title_subtitle_bow"
# Split into train and val
data, data_val = train_test_split(dfs_train['economy'], test_size = 0.2, stratify = dfs_train['economy'][[label]])
max_length = 100

In [16]:
model, tokenizer = get_model(max_length = max_length)
history, model = fit_model(model, data, label, field, max_length)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "BERT"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_ids (InputLayer)       [(None, 100)]             0         
_________________________________________________________________
bert (TFBertMainLayer)       ((None, 100, 768), (None, 109482240 
_________________________________________________________________
pooled_output (Dropout)      (None, 768)               0         
_________________________________________________________________
issue (Dense)                (None, 2)                 1538      
Total params: 109,483,778
Trainable params: 109,483,778
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [17]:
# save the model 
model.save('models/model_economy')

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: models/model_economy/assets


In [18]:
# load the model
reconstructed_model = keras.models.load_model("models/model_economy")

In [19]:
preds_df = predict(data_test=dfs_test['economy'], field=field, max_length=max_length, model=reconstructed_model, tokenizer=tokenizer)

In [20]:
preds_df.nlargest(30, "pred_1")['title_subtitle'].tolist()

["Coronavirus profiteers' condemned as polluters gain bailout billions.  Leading figures condemn bailouts backing sectors that disregard green economy goals in recovery after pandemic ",
 'Covid-19 could cause permanent shift towards home working. Tech firms will benefit, but some companies could find employees don’t want to return to the office',
 'Will investing in our newfound sense of community bring returns?. Local firms offering high interest rates are selling a better way to rebuild the UK after Covid. But there are risks',
 'Billion-dollar wildlife industry in Vietnam under assault as law drafted to halt trading. Move aimed at street markets, online traders and farms with links to illegal wildlife trading',
 'Africa leads calls for debt relief in face of coronavirus crisis. IMF and World Bank back moves to put pressure on creditors in bid to strengthen health systems',
 "Mike Ashley's Frasers Group buys fitness chain DW Sports. Frasers Group to pay initial £37m for ‘certain ass

In [21]:
preds_df.to_csv("models/preds_economy.csv")

### case_reporting

In [15]:
label = "case_reporting"
field = "title_subtitle_bow"
# Split into train and val
data, data_val = train_test_split(dfs_train[label], test_size = 0.2, stratify = dfs_train[label][[label]])
max_length = 100

In [16]:
model, tokenizer = get_model(max_length = max_length)
history, model = fit_model(model, data, label, field, max_length)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "BERT"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_ids (InputLayer)       [(None, 100)]             0         
_________________________________________________________________
bert (TFBertMainLayer)       ((None, 100, 768), (None, 109482240 
_________________________________________________________________
pooled_output (Dropout)      (None, 768)               0         
_________________________________________________________________
issue (Dense)                (None, 2)                 1538      
Total params: 109,483,778
Trainable params: 109,483,778
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [17]:
# save the model 
model.save('models/model_{}'.format(label))

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: models/model_case_reporting/assets


In [18]:
# load the model
reconstructed_model = keras.models.load_model('models/model_{}'.format(label))

In [19]:
preds_df = predict(data_test=dfs_test[label], field=field, max_length=max_length, model=reconstructed_model, tokenizer=tokenizer)

In [20]:
preds_df.nlargest(30, "pred_1")['title_subtitle'].tolist()

['UK manufacturing shows hint of recovery after coronavirus. IHS Markit says worst of downturn is probably over after fall in factory output slows<br>',
 'The coronavirus panic is turning the UK into a hostile environment for east Asians | Sam Phan. Stereotypes are spreading as quickly as the virus. On the bus, in the street, people have started treating us as if we’re infected, says student Sam Phan',
 'Coronavirus: EU states enact tough measures to stem spread. Several states ban mass events and close schools, as infection and death rates fall in China',
 'Britain wins rare praise for leading race to test life-saving Covid drugs. UK’s high infection rate and centralised NHS have enabled Recovery team to help victims across the world',
 '‘We’ve learned how we need to act’: Spain braces for second wave of Covid. An increase in infections, particularly among younger age groups, is causing a spike centred on the area around Madrid',
 'Coronavirus: is this the start of a second wave and i

In [21]:
preds_df.to_csv("models/preds_{}.csv".format(label))

### treatments_vaccines

In [16]:
label = "treatments_vaccines"
field = "title_subtitle_bow"
# Split into train and val
data, data_val = train_test_split(dfs_train[label], test_size = 0.2, stratify = dfs_train[label][[label]])
max_length = 100

In [17]:
model, tokenizer = get_model(max_length = max_length)
history, model = fit_model(model, data, label, field, max_length)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "BERT"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_ids (InputLayer)       [(None, 100)]             0         
_________________________________________________________________
bert (TFBertMainLayer)       ((None, 100, 768), (None, 109482240 
_________________________________________________________________
pooled_output (Dropout)      (None, 768)               0         
_________________________________________________________________
issue (Dense)                (None, 2)                 1538      
Total params: 109,483,778
Trainable params: 109,483,778
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [18]:
# save the model 
model.save('models/model_{}'.format(label))

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: models/model_treatments_vaccines/assets


In [19]:
# load the model
reconstructed_model = keras.models.load_model('models/model_{}'.format(label))

In [20]:
preds_df = predict(data_test=dfs_test[label], field=field, max_length=max_length, model=reconstructed_model, tokenizer=tokenizer)

In [21]:
preds_df.nlargest(30, "pred_1")['title_subtitle'].tolist()

['Why does Covid-19 affect ethnic minorities so badly? It isn’t to do with biology | Zubaida Haque. All the evidence points to race inequality as an urgent aspect of the epidemic, but the government refuses to act, says Zubaida Haque of the Runnymede Trust',
 'After coronavirus, focus on the climate emergency | Letters. <strong>Letters: </strong>Signatories including <strong>Dr Wolfgang Knorr</strong><strong> </strong>say it is game over for preventing dangerous climate change, <strong>Colin Hines</strong> says a green infrastructure should be prioritised in a post-Covid-19 world, and <strong>Andy Radford</strong> on why we should consider permanent changes to the way we live ',
 'Myths and realities of public sector pay hikes. <strong>Letters: </strong><strong>George Binette </strong>says less than a quarter of workers will benefit, while<strong> </strong><strong>Vic Rayner </strong>decries the fact that care home staff have been forgotten and <strong>Angela Pickering</strong> believe

In [23]:
preds_df.to_csv("models/preds_{}.csv".format(label))

### education

In [15]:
label = "education"
field = "title_subtitle_bow"
# Split into train and val
data, data_val = train_test_split(dfs_train[label], test_size = 0.2, stratify = dfs_train[label][[label]])
max_length = 100

In [16]:
model, tokenizer = get_model(max_length = max_length)
history, model = fit_model(model, data, label, field, max_length)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "BERT"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_ids (InputLayer)       [(None, 100)]             0         
_________________________________________________________________
bert (TFBertMainLayer)       ((None, 100, 768), (None, 109482240 
_________________________________________________________________
pooled_output (Dropout)      (None, 768)               0         
_________________________________________________________________
issue (Dense)                (None, 2)                 1538      
Total params: 109,483,778
Trainable params: 109,483,778
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [17]:
# save the model 
model.save('models/model_{}'.format(label))

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: models/model_education/assets


In [18]:
# load the model
reconstructed_model = keras.models.load_model('models/model_{}'.format(label))

In [19]:
preds_df = predict(data_test=dfs_test[label], field=field, max_length=max_length, model=reconstructed_model, tokenizer=tokenizer)

In [20]:
preds_df.nlargest(30, "pred_1")['title_subtitle'].tolist()

['We must not let the row over British schools descend into an identity war | Zoe Williams. It’s dispiriting to watch the debate break down into meaningless caricatures, says Guardian columnist Zoe Williams',
 "England's libraries begin to reopen but grave fears remain over long-term futures. As branches prepare to start restoring services, experts warn a ‘perfect financial storm’ will cause further closures",
 'Children over 12 should wear face masks to combat Covid, says WHO.  But UK government says masks are not recommended for use by primary or secondary pupils <br>',
 'Give NHS workers a pay rise as well as a clap | Letters. <strong>Letters: </strong>Readers respond to an article by Ian Macdonald, a psychologist who wrote about the dangers of describing health workers as saints',
 'Workers’ health and safety must be key | Letters. Employers who fail to prioritise the wellbeing of their staff when they return to work should be held to account',
 'Donations to be quarantined as UK c

In [21]:
preds_df.to_csv("models/preds_{}.csv".format(label))

### travel_lockdown

In [15]:
label = "travel_lockdown"
field = "title_subtitle_bow"
# Split into train and val
data, data_val = train_test_split(dfs_train[label], test_size = 0.2, stratify = dfs_train[label][[label]])
max_length = 100

In [16]:
model, tokenizer = get_model(max_length = max_length)
history, model = fit_model(model, data, label, field, max_length)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "BERT"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_ids (InputLayer)       [(None, 100)]             0         
_________________________________________________________________
bert (TFBertMainLayer)       ((None, 100, 768), (None, 109482240 
_________________________________________________________________
pooled_output (Dropout)      (None, 768)               0         
_________________________________________________________________
issue (Dense)                (None, 2)                 1538      
Total params: 109,483,778
Trainable params: 109,483,778
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [17]:
# save the model 
model.save('models/model_{}'.format(label))

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: models/model_travel_lockdown/assets


In [18]:
# load the model
reconstructed_model = keras.models.load_model('models/model_{}'.format(label))

In [19]:
preds_df = predict(data_test=dfs_test[label], field=field, max_length=max_length, model=reconstructed_model, tokenizer=tokenizer)

In [20]:
preds_df.nlargest(30, "pred_1")['title_subtitle'].tolist()

["Police call for end to 'lockdown-shaming' as a weapon in feuds. Forces receiving thousands of complaints about rule-breaking, fear many are being used to settle scores",
 'Flexible working will be norm after lockdown, say Barclays and WPP bosses. Crowded office buildings ‘may be a thing of the past’ as staff safety prioritised after Covid-19 crisis',
 'Chancellor extends UK furlough scheme until end of October. Rishi Sunak says programme will run for further four months as Britain exits lockdown',
 'Low demand for power causes problems for National Grid. Energy system operator says lower usage during lockdown could put network under stress',
 'As UK lockdowns ease, fears grow of return to pre-pandemic crime and pollution levels. Carbon emissions, crime and air pollution all fell but are now starting to rebound',
 'Sales of alcohol, tea and coffee soar in shops since UK lockdown.  Consumers continue to largely eat and drink at home even as Covid-19 lockdown loosens ',
 "UK police rece

In [22]:
preds_df.to_csv("models/preds_{}.csv".format(label))

### healthcare

In [25]:
label = "healthcare"
field = "title_subtitle_bow"
# Split into train and val
data, data_val = train_test_split(dfs_train[label], test_size = 0.2, stratify = dfs_train[label][[label]])
max_length = 100

In [26]:
model, tokenizer = get_model(max_length = max_length)
history, model = fit_model(model, data, label, field, max_length, epochs = 10)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "BERT"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_ids (InputLayer)       [(None, 100)]             0         
_________________________________________________________________
bert (TFBertMainLayer)       ((None, 100, 768), (None, 109482240 
_________________________________________________________________
pooled_output (Dropout)      (None, 768)               0         
_________________________________________________________________
issue (Dense)                (None, 2)                 1538      
Total params: 109,483,778
Trainable params: 109,483,778
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [27]:
# save the model 
model.save('models/model_{}'.format(label))

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: models/model_healthcare/assets


In [28]:
# load the model
reconstructed_model = keras.models.load_model('models/model_{}'.format(label))

In [29]:
preds_df = predict(data_test=dfs_test[label], field=field, max_length=max_length, model=reconstructed_model, tokenizer=tokenizer)

In [30]:
preds_df.nlargest(30, "pred_1")['title_subtitle'].tolist()

["NHS groups 'nervous' about lockdown easing without contact tracing. Experts say reliance on human tracking without app could expose England to rise in Covid-19 cases<br>",
 'Isolation period for those with Covid symptoms may be increased to 10 days. Proposal comes as concern over second wave ‘very high’ in NHS',
 'Migrant healthcare staff still paying NHS fee despite Johnson U-turn. Many NHS workers charged £400 to use health service after PM said he would axe surcharge',
 'Emergency law would safeguard jobs of NHS volunteers as virus crisis deepens. Proposals include four-week job guarantee, banning over-70s from big gatherings and stadium closures in battle against Covid-19',
 'Critical mass of Android users crucial for NHS contact-tracing app. Experts say NHS relying on ‘Android herd immunity’ to overcome Apple-related issues',
 'NHS coronavirus crisis volunteers frustrated at lack of tasks. Vast majority of 750,000 people who signed up to help are yet to be called into action',
 

In [31]:
preds_df.to_csv("models/preds_{}.csv".format(label))

### other

### politics

In [14]:
label = "politics"
field = "title_subtitle_bow"
# Split into train and val
data, data_val = train_test_split(dfs_train[label], test_size = 0.2, stratify = dfs_train[label][[label]])
max_length = 100

In [15]:
model, tokenizer = get_model(max_length = max_length)
history, model = fit_model(model, data, label, field, max_length, epochs = 7)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "BERT"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_ids (InputLayer)       [(None, 100)]             0         
_________________________________________________________________
bert (TFBertMainLayer)       ((None, 100, 768), (None, 109482240 
_________________________________________________________________
pooled_output (Dropout)      (None, 768)               0         
_________________________________________________________________
issue (Dense)                (None, 2)                 1538      
Total params: 109,483,778
Trainable params: 109,483,778
Non-trainable params: 0
_________________________________________________________________
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


In [16]:
# save the model 
model.save('models/model_{}'.format(label))

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: models/model_politics/assets


In [17]:
# load the model
reconstructed_model = keras.models.load_model('models/model_{}'.format(label))

In [18]:
preds_df = predict(data_test=dfs_test[label], field=field, max_length=max_length, model=reconstructed_model, tokenizer=tokenizer)

In [25]:
preds_df.nlargest(30, "pred_1")['title_subtitle'].tolist()

['The Guardian view on Covid and the north-south divide: inequality kills | Editorial. <strong>Editorial: </strong>Boris Johnson won a landslide promising to ‘level up’ the UK. Yet the pandemic threatens to level down the country',
 'We must not let the government seize back control from doctors | Andrew Lansley. Boris Johnson wants to restructure the NHS. But the lesson of Covid-19 is there is not too little central power, but too much, says former health secretary Andrew Lansley',
 'Boris Johnson says four nations working on family Christmas plan. Prime minister concedes test-and-trace system ‘hasn’t had as much impact’ as desired',
 'Bereaved relatives call for immediate inquiry into Covid-19 crisis. Lawyers for 450 people call on Boris Johnson to start public inquiry to help prevent deaths',
 'As business bigwigs fight to end lockdown, the hero fending them off is … Boris Johnson? | Joel Golby. Tory donors and a few bored people may want the shops to reopen, but let’s not pack away

In [20]:
preds_df.to_csv("models/preds_{}.csv".format(label))

### environment

In [14]:
label = "environment"
field = "title_subtitle_bow"
# Split into train and val
data, data_val = train_test_split(dfs_train[label], test_size = 0.2, stratify = dfs_train[label][[label]])
max_length = 100

In [15]:
model, tokenizer = get_model(max_length = max_length)
history, model = fit_model(model, data, label, field, max_length, epochs = 7)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "BERT"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_ids (InputLayer)       [(None, 100)]             0         
_________________________________________________________________
bert (TFBertMainLayer)       ((None, 100, 768), (None, 109482240 
_________________________________________________________________
pooled_output (Dropout)      (None, 768)               0         
_________________________________________________________________
issue (Dense)                (None, 2)                 1538      
Total params: 109,483,778
Trainable params: 109,483,778
Non-trainable params: 0
_________________________________________________________________
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


In [16]:
# save the model 
model.save('models/model_{}'.format(label))

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: models/model_environment/assets


In [17]:
# load the model
reconstructed_model = keras.models.load_model('models/model_{}'.format(label))

In [18]:
preds_df = predict(data_test=dfs_test[label], field=field, max_length=max_length, model=reconstructed_model, tokenizer=tokenizer)

In [19]:
preds_df.nlargest(30, "pred_1")['title_subtitle'].tolist()

['The coronavirus has exposed the imbalances in modern Britain. A bigger, smarter state is now needed, with devolved decisions, a greener economy and a stronger safety net',
 "In 12 weeks we can turn the tide': now we can ask – was Boris Johnson right?. Coronavirus deaths and infections are dropping, but experts fear lockdown is being eased too soon",
 "The perfect time to start': how book clubs are enduring and flourishing during Covid-19. As the world goes into lockdown, more reading groups are moving on to Zoom, Twitter and Instagram to bring readers together",
 'What will coronavirus mean for the British economy?.  As the UK faces what may be its worst ever recession, we begin a monthly series exploring the financial shock to business and living standards',
 'Our blueprint for a post-coronavirus future | Letters. As we recover from the pandemic, here’s how we must create a more caring and united society',
 'Coronavirus: looking for good news – Run for Heroes and an opera-singing do

In [20]:
preds_df.to_csv("models/preds_{}.csv".format(label))

### social_issues

In [14]:
label = "social_issues"
field = "title_subtitle_bow"
# Split into train and val
data, data_val = train_test_split(dfs_train[label], test_size = 0.2, stratify = dfs_train[label][[label]])
max_length = 100

In [15]:
model, tokenizer = get_model(max_length = max_length)
history, model = fit_model(model, data, label, field, max_length, epochs = 7)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "BERT"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_ids (InputLayer)       [(None, 100)]             0         
_________________________________________________________________
bert (TFBertMainLayer)       ((None, 100, 768), (None, 109482240 
_________________________________________________________________
pooled_output (Dropout)      (None, 768)               0         
_________________________________________________________________
issue (Dense)                (None, 2)                 1538      
Total params: 109,483,778
Trainable params: 109,483,778
Non-trainable params: 0
_________________________________________________________________
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


In [16]:
# save the model 
model.save('models/model_{}'.format(label))

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: models/model_social_issues/assets


In [17]:
# load the model
reconstructed_model = keras.models.load_model('models/model_{}'.format(label))

In [18]:
preds_df = predict(data_test=dfs_test[label], field=field, max_length=max_length, model=reconstructed_model, tokenizer=tokenizer)

In [19]:
preds_df.nlargest(30, "pred_1")['title_subtitle'].tolist()

['It is time we made masks compulsory | Letters. <strong>Letters: </strong>Countries where face mask use is widespread have seen fewer Covid-19 deaths, writes <strong>Prof David Smith</strong>, while <strong>Philip Rundall</strong> thinks shops should do more to encourage customers to wear them',
 'Care home crisis is the result of years of neglect | Letters. Letters: <strong>Bill Shaw</strong> says the Thatcher government systematically dismantled social care provision, while <strong>Les Bright </strong>highlights what went wrong with private services. Plus letters from <strong>Tom Wilson</strong> and <strong>J</strong><strong>anet Broadmore</strong>',
 'Universities have let their students down | Letters. <strong>Letters: </strong>Current difficulties were predictable and avoidable, writes <strong>one academic</strong>, while <strong>Maria Gajewska </strong>wonders when a fee refund might be due. Meanwhile, <strong>William Proctor </strong>believes it is the government, not instituti

In [20]:
preds_df.to_csv("models/preds_{}.csv".format(label))

## Inspecting all the preds 

In [14]:
preds_case_reporting = pd.read_csv("./models/preds_case_reporting.csv", index_col=0)
# not great 

In [15]:
preds_case_reporting.nlargest(30, "pred_1")

Unnamed: 0_level_0,pred_0,pred_1,webPublicationDate,title_subtitle,economy,case_reporting,treatments_vaccines,education,travel_lockdown,healthcare,other,politics,environment,social_issues
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
business/2020/jun/01/uk-manufacturing-shows-hint-of-recovery-after-coronavirus,0.136602,0.863398,2020-06-01T17:13:17Z,UK manufacturing shows hint of recovery after ...,yes,,,,,,,,,
commentisfree/2020/jan/27/coronavirus-panic-uk-hostile-environment-east-asians,0.138551,0.861449,2020-01-27T16:15:51Z,The coronavirus panic is turning the UK into a...,,,,,,,,,,yes
world/2020/mar/10/coronavirus-several-eu-states-ban-mass-events-after-italian-lockdown,0.138692,0.861308,2020-03-10T17:59:56Z,Coronavirus: EU states enact tough measures to...,,,,,yes,,,,,
world/2020/jul/26/britain-wins-rare-praise-for-leading-race-to-test-life-saving-coronavirus-drugs,0.14121,0.858791,2020-07-26T06:00:16Z,Britain wins rare praise for leading race to t...,,,yes,,,,,,,
world/2020/sep/13/weve-learned-how-we-need-to-act-spain-braces-for-second-wave-of-covid,0.142081,0.857919,2020-09-13T07:03:52Z,‘We’ve learned how we need to act’: Spain brac...,,,,,,,yes,,,
world/2020/sep/13/is-this-the-start-of-a-second-wave-and-is-the-uk-prepared,0.143937,0.856063,2020-09-13T08:03:55Z,Coronavirus: is this the start of a second wav...,,,,,yes,,,,,
world/2020/nov/08/uk-scientists-seek-mutant-covid-samples-from-danish-mink-farms,0.145518,0.854482,2020-11-08T15:41:29Z,UK scientists seek mutant Covid samples from ...,,,yes,,,,,,,
travel/2020/may/29/britain-left-off-safe-list-of-countries-free-to-holiday-in-greece,0.148028,0.851972,2020-05-29T17:10:31Z,Britain left off 'safe list' of countries free...,,,,,yes,,,,,
politics/2020/sep/08/uks-public-spending-watchdog-estimates-210bn-coronavirus-bill,0.148437,0.851563,2020-09-08T17:28:22Z,Coronavirus bill has cost UK government £210bn...,yes,,,,,,,,,
world/2020/may/21/did-the-uk-government-prepare-for-the-wrong-kind-of-pandemic,0.149262,0.850738,2020-05-21T12:17:46Z,Covid-19: did the UK government prepare for th...,,,,,,,,yes,,


In [25]:
preds_economy = pd.read_csv("./models/preds_economy.csv", index_col=0)
# pretty good 

In [26]:
preds_economy.nlargest(30, "pred_1")

Unnamed: 0_level_0,pred_0,pred_1,webPublicationDate,title_subtitle,economy,case_reporting,treatments_vaccines,education,travel_lockdown,healthcare,other,politics,environment,social_issues
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
environment/2020/apr/17/coronavirus-profiteers-condemned-as-polluters-gain-bailout-billions,0.022296,0.977704,2020-04-17T09:00:09Z,Coronavirus profiteers' condemned as polluters...,,,,,,,,,yes,
technology/2020/mar/13/covid-19-could-cause-permanent-shift-towards-home-working,0.022448,0.977552,2020-03-13T17:11:50Z,Covid-19 could cause permanent shift towards h...,,,,,,,,,,yes
money/2020/may/31/will-investing-in-our-newfound-sense-of-community-bring-returns,0.023188,0.976811,2020-05-31T07:00:06Z,Will investing in our newfound sense of commun...,,,,,,,yes,,,
environment/2020/mar/18/billion-dollar-wildlife-industry-in-vietnam-under-assault-as-law-drafted-to-halt-trading,0.02347,0.97653,2020-03-18T10:16:19Z,Billion-dollar wildlife industry in Vietnam un...,,,,,,,yes,,,
global-development/2020/mar/25/africa-leads-calls-for-debt-relief-in-face-of-coronavirus-crisis,0.023953,0.976047,2020-03-25T05:00:49Z,Africa leads calls for debt relief in face of ...,,,,,,,yes,,,
business/2020/aug/24/mike-ashleys-frasers-group-buys-fitness-chain-dw-sports,0.024186,0.975814,2020-08-24T08:27:39Z,Mike Ashley's Frasers Group buys fitness chain...,,,,,,,yes,,,
business/2020/jun/10/goldman-sachs-closes-marcus-account-to-new-savers-coronavirus,0.026429,0.973571,2020-06-10T11:44:53Z,Goldman Sachs closes Marcus account to new UK ...,,,,,,,yes,,,
environment/2020/mar/24/covid-19-economic-rescue-plans-must-be-green-say-environmentalists,0.026835,0.973165,2020-03-24T10:06:28Z,"Covid-19 economic rescue plans must be green, ...",,,,,,,,,yes,
money/2020/nov/07/uk--stamp-duty-holiday-deadline-covid-mortgages,0.026956,0.973044,2020-11-07T07:00:04Z,UK homebuyers told to act fast to beat stamp d...,,,,,,,yes,,,
business/2020/nov/05/future-market-for-covid-vaccines-could-be-worth-more-than-10bn-a-year,0.027954,0.972046,2020-11-05T16:11:14Z,Future market for Covid vaccines 'could be wor...,,,yes,,,,,,,


In [27]:
preds_education = pd.read_csv("./models/preds_education.csv", index_col=0)
# not good 

In [28]:
preds_education.nlargest(30, "pred_1")

Unnamed: 0_level_0,pred_0,pred_1,webPublicationDate,title_subtitle,economy,case_reporting,treatments_vaccines,education,travel_lockdown,healthcare,other,politics,environment,social_issues
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
commentisfree/2020/may/19/british-schools-identity-war-parents-unions,0.351586,0.648414,2020-05-19T06:00:50Z,We must not let the row over British schools d...,,,,,,,,,,yes
books/2020/jul/03/uk-libraries-reopen-grave-fears-for-long-term-future,0.353205,0.646795,2020-07-03T09:49:44Z,England's libraries begin to reopen but grave ...,yes,,,,,,,,,
world/2020/aug/22/children-should-wear-face-masks-to-combat-covid-says-who,0.353831,0.646169,2020-08-22T21:33:28Z,Children over 12 should wear face masks to com...,,,,,yes,,,,,
society/2020/apr/27/give-nhs-staff-a-pay-rise-not-just-a-clap,0.358877,0.641123,2020-04-27T16:30:10Z,Give NHS workers a pay rise as well as a clap ...,,,,,,yes,,,,
commentisfree/2020/may/09/workers-health-and-safety-is-key-letters,0.362044,0.637956,2020-05-09T21:32:07Z,Workers’ health and safety must be key | Lette...,,,,,,yes,,,,
society/2020/may/22/donations-to-be-quarantined-as-uk-charity-shops-plan-to-reopen,0.363467,0.636533,2020-05-22T16:16:19Z,Donations to be quarantined as UK charity shop...,yes,,,,,,,,,
world/2020/mar/24/arts-council-england-promises-160m-to-buoy-public-during-lockdown,0.366015,0.633985,2020-03-24T17:58:32Z,Arts Council England promises £160m to 'buoy p...,yes,,,,,,,,,
society/2020/may/07/mental-health-patients-in-crisis-because-of-coronavirus-cutbacks,0.369543,0.630457,2020-05-06T23:01:18Z,Mental health patients in crisis because of co...,,,,,,yes,,,,
education/2020/apr/20/imperial-college-london-warns-of-cuts-in-face-of-coronavirus,0.3715,0.6285,2020-04-20T16:38:43Z,Imperial College London warns of cuts in face ...,yes,,,,,,,,,
commentisfree/2020/nov/03/website-independent-bookshops-high-street,0.371813,0.628187,2020-11-03T07:00:49Z,A new website for independent bookshops is jus...,yes,,,,,,,,,


In [31]:
preds_environment = pd.read_csv("./models/preds_environment.csv", index_col=0)
# not good 

In [30]:
preds_environment.nlargest(30, "pred_1")

Unnamed: 0_level_0,pred_0,pred_1,webPublicationDate,title_subtitle,economy,case_reporting,treatments_vaccines,education,travel_lockdown,healthcare,other,politics,environment,social_issues
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
business/2020/may/03/the-coronavirus-has-exposed-the-imbalances-in-modern-britain-life,0.237704,0.762296,2020-05-03T10:09:31Z,The coronavirus has exposed the imbalances in ...,,,,,,,,,,yes
world/2020/jun/11/in-12-weeks-we-can-turn-the-tide-now-we-can-ask-was-pm-right,0.258558,0.741442,2020-06-11T05:00:41Z,In 12 weeks we can turn the tide': now we can ...,,yes,,,yes,,,,,
books/2020/mar/26/the-perfect-time-to-start-how-book-clubs-are-enduring-and-flourishing-during-covid-19,0.263092,0.736908,2020-03-26T13:45:29Z,The perfect time to start': how book clubs are...,,,,,yes,,,,,
world/2020/apr/24/what-will-coronavirus-mean-for-the-british-economy,0.279559,0.720441,2020-04-24T06:00:16Z,What will coronavirus mean for the British eco...,yes,,,,,,,,,
commentisfree/2020/may/24/letters-our-blueprint-for-a-post-coronavirus-future,0.292224,0.707776,2020-05-24T04:59:13Z,Our blueprint for a post-coronavirus future | ...,,,,,,,,,,yes
news/2020/apr/13/coronavirus-looking-for-good-news-run-for-heroes-and-an-opera-singing-doctor,0.300122,0.699878,2020-04-13T17:43:51Z,Coronavirus: looking for good news – Run for H...,,,,,,,yes,,,
world/2020/mar/20/after-work-drinks-by-video-how-to-survive-100-days-of-solitude-coronavirus,0.304546,0.695454,2020-03-20T07:30:16Z,After-work drinks by video: how to survive 100...,,,,,yes,,,,,
world/2020/sep/28/shielding-in-the-uk-how-are-you-feeling-about-the-winter-months-coronavirus,0.304891,0.695109,2020-09-28T10:53:26Z,Shielding in the UK: how are you feeling about...,,,,,,,,,,yes
music/2020/jul/03/socially-distanced-outdoor-music-venue-to-open-in-newcastle-in-august,0.307119,0.692881,2020-07-03T11:41:45Z,Socially distanced outdoor music venue to open...,,,,,,,yes,,,
world/2020/may/03/i-feel-like-a-1950s-housewife-how-lockdown-has-exposed-the-gender-divide,0.311269,0.688731,2020-05-03T08:10:46Z,‘I feel like a 1950s housewife’: how lockdown ...,,,,,,,,,,yes


In [32]:
preds_healthcare = pd.read_csv("./models/preds_healthcare.csv", index_col=0)
# so-so; re-examine 

In [36]:
preds_healthcare[preds_healthcare['pred_1']>0.5]['title_subtitle'].tolist()

['Pupils begin studying at home – a challenge for them, parents and teachers. Schools provide online classes, but some pupils struggle to stay motivated after exams were cancelled',
 'No 10 tells holidaymakers to claim universal credit for Covid-19 quarantine. Unions urge government to ensure statutory sick pay for those forced to self-isolate after travelling in Spain',
 'Face-mask wearers do not stop washing their hands, study suggests. Scientists say people unlikely to reduce one Covid-19 measure when adopting another',
 'Hancock says Covid testing crisis may last weeks as UK hospitals plug gaps. Admission comes as more people with Covid symptoms turn up at A&amp;E and call 111',
 'Three more Midlands meat factory workers test positive for coronavirus. Health officials in England and Wales monitoring new cases linked to food processing plants ',
 'Yasmin Qureshi MP in hospital with pneumonia after positive Covid test. Bolton MP and shadow international development minister being tre

In [34]:
preds_politics = pd.read_csv("./models/preds_politics.csv", index_col=0)
# not bad but may be worth doing it in a different way or dropping the topic 

In [36]:
preds_politics.nlargest(60, "pred_1")['title_subtitle'].tolist()

['The Guardian view on Covid and the north-south divide: inequality kills | Editorial. <strong>Editorial: </strong>Boris Johnson won a landslide promising to ‘level up’ the UK. Yet the pandemic threatens to level down the country',
 'We must not let the government seize back control from doctors | Andrew Lansley. Boris Johnson wants to restructure the NHS. But the lesson of Covid-19 is there is not too little central power, but too much, says former health secretary Andrew Lansley',
 'Boris Johnson says four nations working on family Christmas plan. Prime minister concedes test-and-trace system ‘hasn’t had as much impact’ as desired',
 'Bereaved relatives call for immediate inquiry into Covid-19 crisis. Lawyers for 450 people call on Boris Johnson to start public inquiry to help prevent deaths',
 'As business bigwigs fight to end lockdown, the hero fending them off is … Boris Johnson? | Joel Golby. Tory donors and a few bored people may want the shops to reopen, but let’s not pack away

In [37]:
preds_social_issues = pd.read_csv("./models/preds_social_issues.csv", index_col=0)
# so-so 

In [38]:
preds_social_issues.nlargest(60, "pred_1")['title_subtitle'].tolist()

['It is time we made masks compulsory | Letters. <strong>Letters: </strong>Countries where face mask use is widespread have seen fewer Covid-19 deaths, writes <strong>Prof David Smith</strong>, while <strong>Philip Rundall</strong> thinks shops should do more to encourage customers to wear them',
 'Care home crisis is the result of years of neglect | Letters. Letters: <strong>Bill Shaw</strong> says the Thatcher government systematically dismantled social care provision, while <strong>Les Bright </strong>highlights what went wrong with private services. Plus letters from <strong>Tom Wilson</strong> and <strong>J</strong><strong>anet Broadmore</strong>',
 'Universities have let their students down | Letters. <strong>Letters: </strong>Current difficulties were predictable and avoidable, writes <strong>one academic</strong>, while <strong>Maria Gajewska </strong>wonders when a fee refund might be due. Meanwhile, <strong>William Proctor </strong>believes it is the government, not instituti

In [39]:
preds_travel_lockdown = pd.read_csv("./models/preds_travel_lockdown.csv", index_col=0)
# good but it's mostly just picking the word lockdown 

In [40]:
preds_travel_lockdown.nlargest(60, "pred_1")['title_subtitle'].tolist()

["Police call for end to 'lockdown-shaming' as a weapon in feuds. Forces receiving thousands of complaints about rule-breaking, fear many are being used to settle scores",
 'Flexible working will be norm after lockdown, say Barclays and WPP bosses. Crowded office buildings ‘may be a thing of the past’ as staff safety prioritised after Covid-19 crisis',
 'Chancellor extends UK furlough scheme until end of October. Rishi Sunak says programme will run for further four months as Britain exits lockdown',
 'Low demand for power causes problems for National Grid. Energy system operator says lower usage during lockdown could put network under stress',
 'As UK lockdowns ease, fears grow of return to pre-pandemic crime and pollution levels. Carbon emissions, crime and air pollution all fell but are now starting to rebound',
 'Sales of alcohol, tea and coffee soar in shops since UK lockdown.  Consumers continue to largely eat and drink at home even as Covid-19 lockdown loosens ',
 "UK police rece

In [41]:
preds_treatments_vaccines = pd.read_csv("./models/preds_treatments_vaccines.csv", index_col=0)
# not very good 

In [42]:
preds_treatments_vaccines.nlargest(60, "pred_1")['title_subtitle'].tolist()

['Why does Covid-19 affect ethnic minorities so badly? It isn’t to do with biology | Zubaida Haque. All the evidence points to race inequality as an urgent aspect of the epidemic, but the government refuses to act, says Zubaida Haque of the Runnymede Trust',
 'After coronavirus, focus on the climate emergency | Letters. <strong>Letters: </strong>Signatories including <strong>Dr Wolfgang Knorr</strong><strong> </strong>say it is game over for preventing dangerous climate change, <strong>Colin Hines</strong> says a green infrastructure should be prioritised in a post-Covid-19 world, and <strong>Andy Radford</strong> on why we should consider permanent changes to the way we live ',
 'Myths and realities of public sector pay hikes. <strong>Letters: </strong><strong>George Binette </strong>says less than a quarter of workers will benefit, while<strong> </strong><strong>Vic Rayner </strong>decries the fact that care home staff have been forgotten and <strong>Angela Pickering</strong> believe