! If you have already prepared the data, you can load it in the section marked 'pre-loaded data - start here!'

# Preparation

In [5]:
import pickle
import numpy as np
import pandas as pd
import os, random, glob, json, nltk, re
from nltk.corpus import stopwords

from sentence_transformers import SentenceTransformer

from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neural_network import MLPClassifier

# Filter Gigaword

## Read in the full corpus 
This is the full corpus, reduced to the date_location information and the concatenated headline and sentences. Loading it requires a lot of RAM, but it resolves the overfitting problem and makes it quick to choose a balanced corpus.

When I ran this, I added the words 'poverty', 'slave' and ' aid ' (with surrounding spaces) to Jan's list.

I have turned the entire corpus into a single json file ~ 7GB. It has 2 columns - name/date and text - text includes headline + all 5 sentences in 1 column.

In [None]:
# Read the full-corpus JSON file into a single DataFrame.
big_df = pd.read_json(
    './pickles/frame_to_rule2.json',
    encoding='utf-8',
)

# Dataframe Construction

## Keyword Lookup

### The threshold set to 4 produces 3000+ articles / set to 5 produces 880
I also experimented with threshold 3: it produces 21000 articles, but the scores were worse on both the baseline and BERT, so it does not seem to be a high-quality search.

In [None]:
def keyword_search(keywords, big_df, threshold = 0):
    """Select the texts that contain at least `threshold` distinct keywords.

    Matching is a plain, case-sensitive substring test, so surrounding spaces
    in a keyword (e.g. ' aid ') are significant and are never stripped.

    Parameters
    ----------
    keywords : iterable of str
        Search terms. Empty strings (e.g. from blank lines in a keyword file)
        and repeated terms are ignored.
    big_df : mapping or DataFrame
        Anything whose ``['text']`` entry iterates over the article strings.
    threshold : int, default 0
        Minimum number of distinct keywords a text must contain. With the
        default of 0 every text is returned.

    Returns
    -------
    (list of str, list of set of str)
        The selected texts and, in the same order, the keywords found in each.
    """
    # '' is a substring of every string; left in, it would count as a hit for
    # every article and silently lower the effective threshold by one.
    # dict.fromkeys drops repeated terms while keeping their order.
    search_terms = [keyword for keyword in dict.fromkeys(keywords) if keyword]

    articles = []
    article_keywords = []
    for text in big_df['text']:
        keywords_found = {keyword for keyword in search_terms if keyword in text}
        if len(keywords_found) >= threshold:
            articles.append(text)
            article_keywords.append(keywords_found)
    return articles, article_keywords

In [None]:
# Topic name; it selects which file under keywords/ is read in the next step.
topic = "poverty"

In [None]:
# Read the topic's keyword file: one keyword per line.
with open(f'keywords/{topic}.txt', 'r', encoding='utf-8') as keyword_file:
    keywords = keyword_file.read().splitlines()

# Fine-grained search: `threshold` is the number of keywords that must be
# present in a text for it to be selected.
articles, keywords_per_article = keyword_search(keywords, big_df, threshold=4)

print('Number of articles found:', len(articles))

### add 'text', 'labels' column

In [None]:
# One-column frame holding the texts returned by the keyword search.
columns = ['text']
filtered_df = pd.DataFrame(data=articles, columns=columns)

In [None]:
# Every keyword-matched text is labelled as related to the topic.
filtered_df = filtered_df.assign(label='related')

In [None]:
# The dataset contains duplicate texts; keep only the first occurrence of each.
filtered_df.drop_duplicates(subset=['text'], keep='first', inplace=True)

In [None]:
# To clear memory before constructing the balancing - unrelated - dataset,
# the frame is dumped to a pickle file, after which the kernel can be restarted.
# The context manager closes the file, so the pickle is completely written
# to disk before any restart.
with open('./saved_models/interim_threshold4_related_only_delete_immediately.pkl', 'wb') as outfile:
    pickle.dump(filtered_df, outfile)

#### potential memory wipe point

## random selection of non-related articles to balance the training set 

remember to clear the kernel before continuing:
this method takes a long time - watch a vid; play a tune. 

For some reason if I just randomly sample the full dataframe (very fast) the stats consistently drop 10+ points!!

In [None]:
# Reload the related-only frame that was pickled before the memory wipe.
# NOTE: unpickling can execute arbitrary code - only load files you created yourself.
# The context manager closes the file handle once the frame is loaded.
with open('./saved_models/interim_threshold4_related_only_delete_immediately.pkl', 'rb') as infile:
    filtered_df = pickle.load(infile)

In [None]:
#Tip: to save memory write the line texts to a csv file instead of yield (I think it should work)

def random_list (filename, exclude_keyword = None):
    """Yield one randomly chosen row of a JSON file unless it mentions the keyword.

    This is a generator that produces either one row or nothing, so its result
    can be handed directly to ``list.extend``.

    Parameters
    ----------
    filename : str
        Path of a JSON file that ``pd.read_json(..., orient='index')`` can load
        and that has a ``text`` column.
    exclude_keyword : str, optional
        A row whose text contains this substring is rejected. When omitted, the
        module-level variable ``keyword`` is used, which must then be defined
        before the generator is consumed.

    Yields
    ------
    pandas.Series
        The sampled row, only if its text does not contain the keyword.
    """
    if exclude_keyword is None:
        # backward compatible: fall back on the global set by the calling cell
        exclude_keyword = keyword
    with open (filename, 'r', encoding = 'utf-8') as infile:
        file = pd.read_json(infile, orient = 'index')
    row_count = file.shape[0]
    if row_count == 0:
        # an empty file has nothing to sample from
        return
    # randrange excludes its upper bound, so every row is equally likely.
    # (randint(0, n) followed by `index - 1` picked the last row twice as often.)
    row = file.iloc[random.randrange(row_count)]
    if exclude_keyword not in row['text']:
        # Tip: if memory is an issue, append row['text'] to a csv file here
        # instead of yielding the row.
        yield row

In [None]:
# Collect about 10% more candidate texts than there are related ones.
num = round(filtered_df.shape[0] * 1.1)
# random_list reads this module-level name to reject texts that mention it
keyword = 'poverty'
syphon = []

# List the directory once; the listing does not change while sampling, so
# re-reading it on every iteration was wasted work.
candidate_files = os.listdir('pickles/jsons2')

while len(syphon) < num:
    filename = f'pickles/jsons2/{random.choice(candidate_files)}'
    # each call contributes at most one row
    syphon.extend(random_list(filename))
    

In [None]:
# Copy the sampled rows into a fresh list.
list_for_balancing = list(syphon)

In [None]:
# One frame row per sampled article.
balance_df = pd.DataFrame(data=list_for_balancing)

In [None]:
# Remove the date column from the sampled rows.
balance_df.drop(columns='date', inplace=True)

### add label

In [None]:
# Label the sampled texts and keep the same two columns as the related frame.
balance_df = balance_df.assign(label='unrelated')[['text', 'label']]

In [None]:
# Drop duplicate texts from the sampled set, then show how many rows remain -
# make sure this is larger than len(filtered_df).
balance_df.drop_duplicates(subset=['text'], keep='first', inplace=True)
balance_df.shape[0]

### Concat related and unrelated

In [None]:
# Stack the related texts first, followed by the unrelated ones.
balanced_labeled_df = pd.concat((filtered_df, balance_df))

In [None]:
# The missing id column is not an issue since ids are not used for representation.
# Renumber the rows, then check for duplicates again; keep='first' retains the
# 'related' copy because the related texts come first in the frame.
balanced_labeled_df = (
    balanced_labeled_df
    .reset_index(drop=True)
    .drop_duplicates(subset='text', keep='first')
)

In [None]:
# Prune the excess unrelated texts to make a 50:50 balanced dataset:
# keep twice as many rows as there are related texts.
balanced_labeled_df = balanced_labeled_df.iloc[: 2 * len(filtered_df)]

## Save for portable version

In [None]:
# Change the threshold value in the file name to match the threshold that was
# used for the keyword search (the example here is threshold 4).
balanced_labeled_df.to_json(
    './saved_models/balanced_df_WITH_aid_with_slave_threshold_4_keep.json'
)

#### potential memory wipe point

## Load portable version - of df to save time intensive problems above

In [None]:
import pandas as pd

# Load the saved balanced frame and take its labels as the training targets.
balanced_labeled_df = pd.read_json(
    './saved_models/balanced_df_WITH_aid_with_slave_threshold_4_keep.json',
    encoding='utf-8',
)
train_y = balanced_labeled_df['label'].tolist()

# BERT

## Train - BERT 

### not necessary if you have pickled embeddings already - this step is performed below 

If you use previously saved embeddings, make sure they match the dataframe loaded above. If you re-populate the dataframe, you have to re-encode the embeddings.

### load model and encode sentences

In [None]:
# Sentence encoder used for the training texts.
model = SentenceTransformer('roberta-large-nli-mean-tokens')

In [None]:
# Training texts as a plain Python list.
texts = balanced_labeled_df['text'].tolist()

In [None]:
# Represent every training text by the embeddings of exactly six sentences.
embedded_texts = []
for text in texts:
    # An empty text has no sentences; fall back on the raw text so that the
    # padding below cannot fail with an IndexError.
    sentences = nltk.sent_tokenize(text) or [text]
    # Pad texts shorter than six sentences by repeating their first sentence
    # (a non-positive repeat count adds nothing).
    sentences.extend([sentences[0]] * (6 - len(sentences)))
    # Encode once per text; previously the model was re-run on every padding step.
    sentence_embeddings = model.encode(sentences[:6])
    embedded_texts.append(sentence_embeddings)

### Concatenate embeddings

In [None]:
# Concatenate each text's sentence embeddings to get one vector per document.
train_X = [np.concatenate(sentence_vectors) for sentence_vectors in embedded_texts]

### save for portable version

In [None]:
# Once the above is done, dumping the embeddings to disk here means the
# encoding steps can be skipped next time.
# The document vectors are held in `train_X` (the name they are loaded back
# into later); `concatenated_X` was never defined and raised a NameError.
with open('./saved_models/balanced_df_WITH_aid_with_slave_threshold_4_keep_RoBERTa.pkl', 'wb') as outfile:
    pickle.dump(train_X, outfile)

# pre-loaded data - start here!

#### potential memory wipe point
feel free to purge your overburdened memory here. 

In [6]:
import pickle

import pandas as pd

# Load the saved document vectors.
# NOTE: unpickling can execute arbitrary code - only load files you created yourself.
# The context manager closes the file handle once the vectors are loaded.
with open('./saved_models/balanced_df_WITH_aid_with_slave_threshold_4_keep_RoBERTa.pkl', 'rb') as infile:
    train_X = pickle.load(infile)

# The labels come from the saved frame; it has to be the same frame that the
# embeddings were computed from.
balanced_labeled_df = pd.read_json('./saved_models/balanced_df_WITH_aid_with_slave_threshold_4_keep.json', encoding = 'utf-8')
train_y = list(balanced_labeled_df.label)

### load test data and encode sentences

In [7]:
# The sheet 'Related_SDG1' holds both the costa and the starbucks aggregated data.
dfs = pd.read_excel(
    './classify_only/combined_testset.xlsx',
    sheet_name='Related_SDG1',
)
# Build the text the same way as for the corpus: headline followed by the sentences.
headline_as_text = dfs['Headline'].astype(str)
dfs['text'] = headline_as_text + ' ' + dfs['First 5 sentences']
dfs = dfs[['text', 'label']]
test_text = dfs['text'].tolist()

In [8]:
# Sentence encoder used for the test texts.
model = SentenceTransformer('roberta-large-nli-mean-tokens')

In [9]:
# Use the same model as for the training data - RoBERTa - and represent every
# test text by the embeddings of exactly six sentences.
embedded_test = []
for text in test_text:
    # An empty text has no sentences; fall back on the raw text so that the
    # padding below cannot fail with an IndexError.
    sentences = nltk.sent_tokenize(text) or [text]
    # Pad texts shorter than six sentences by repeating their first sentence
    # (a non-positive repeat count adds nothing).
    sentences.extend([sentences[0]] * (6 - len(sentences)))
    # Encode once per text; previously the model was re-run on every padding step.
    sentence_embeddings = model.encode(sentences[:6])
    embedded_test.append(sentence_embeddings)

### concatenate test embeddings

In [10]:
# One concatenated vector per test document.
BERT_test_X = [np.concatenate(sentence_vectors) for sentence_vectors in embedded_test]

In [11]:
# Gold labels of the test set.
BERT_test_y = dfs['label'].tolist()

## SVM Classifier

In [12]:
# Linear SVM trained on the concatenated sentence embeddings.
BERT_classifier = LinearSVC(tol=1e-5, random_state=0)
BERT_classifier.fit(train_X, train_y)

# Predicted labels plus the signed decision-function score for each test text.
SVM_predictions = list(BERT_classifier.predict(BERT_test_X))
predicted_test_scores = BERT_classifier.decision_function(BERT_test_X)

print(classification_report(BERT_test_y, SVM_predictions))



              precision    recall  f1-score   support

     related       0.88      0.90      0.89        41
   unrelated       0.82      0.78      0.80        23

    accuracy                           0.86        64
   macro avg       0.85      0.84      0.85        64
weighted avg       0.86      0.86      0.86        64



In [13]:
# Drop the sign of the decision score and round it to three decimals.
predicted_test_scores = np.round(
    abs(predicted_test_scores),
    decimals=3,
).tolist()

##  Multi-layer Perceptron Classifier

In [14]:
# Multi-layer perceptron with one hidden layer of 15 units, trained with SGD.
MLPclf = MLPClassifier(
    solver='sgd',
    alpha=1e-5,
    hidden_layer_sizes=(15,),
    random_state=1,
    max_iter=500,
)
MLPclf.fit(train_X, train_y)

# Predicted labels and per-class probabilities for the test texts.
predictionsMLP = MLPclf.predict(BERT_test_X)
predicted_NN_scores = MLPclf.predict_proba(BERT_test_X)

print(classification_report(BERT_test_y, predictionsMLP))

              precision    recall  f1-score   support

     related       0.90      0.93      0.92        41
   unrelated       0.86      0.83      0.84        23

    accuracy                           0.89        64
   macro avg       0.88      0.88      0.88        64
weighted avg       0.89      0.89      0.89        64



In [15]:
# Confidence of a prediction = its highest class probability, rounded to three decimals.
predicted_NN_scores_percent = [
    round(max(class_probabilities), 3)
    for class_probabilities in predicted_NN_scores
]

# Baseline System

## prepare BOW model from training data

In [16]:
# Training texts for the bag-of-words baseline.
texts = balanced_labeled_df['text'].tolist()

In [17]:
# Clean the texts: lower case, non-word characters replaced by spaces,
# runs of whitespace collapsed. At most the first six sentences are used.
BOW_texts_list = []

for text in texts:
    new_text = []
    # slicing to six also covers texts that have fewer than six sentences
    for sent in nltk.sent_tokenize(text)[:6]:
        without_symbols = re.sub(r'\W', ' ', sent.lower())
        new_text.append(re.sub(r'\s+', ' ', without_symbols))
    BOW_texts_list.append(new_text)

# After this each text is a single string again.
concatenated_sents = [''.join(item) for item in BOW_texts_list]

In [18]:
BOW_train_y = balanced_labeled_df['label'].tolist()

# Vectorizer that ignores the NLTK English stop words.
english_stopwords = stopwords.words('english')
count_vec = CountVectorizer(stop_words=english_stopwords)

# This is the model: the document-term matrix of the cleaned training texts.
BOW_model = count_vec.fit_transform(concatenated_sents)

## Classify - BoW

In [19]:
# Reload the test sheet and rebuild the text and label lists for the baseline.
dfs = pd.read_excel(
    './classify_only/combined_testset.xlsx',
    sheet_name='Related_SDG1',
)
headline_as_text = dfs['Headline'].astype(str)
dfs['text'] = headline_as_text + ' ' + dfs['First 5 sentences']
dfs = dfs[['text', 'label']]
test_text = dfs['text'].tolist()
test_y = dfs['label'].tolist()

In [20]:
# Map the test texts onto the vocabulary learned from the training data.
test_X = count_vec.transform(raw_documents=test_text)

In [21]:
# Baseline: linear SVM on the bag-of-words counts.
BOW_classifier = LinearSVC(tol=1e-5, random_state=0)
BOW_classifier.fit(BOW_model, BOW_train_y)

predicted_label = BOW_classifier.predict(test_X)
print(classification_report(test_y, predicted_label))



              precision    recall  f1-score   support

     related       1.00      0.61      0.76        41
   unrelated       0.59      1.00      0.74        23

    accuracy                           0.75        64
   macro avg       0.79      0.80      0.75        64
weighted avg       0.85      0.75      0.75        64



# output results

In [23]:
# Attach the predictions and confidences of every system to the test frame,
# in this column order, and write the result as a tab-separated file.
result_columns = {
    'SVM_predictions': SVM_predictions,
    'SVM_confidence': predicted_test_scores,
    'NN_predictions': predictionsMLP,
    'NN_confidence': predicted_NN_scores_percent,
    'Baseline_predictions': predicted_label,
}
for column_name, column_values in result_columns.items():
    dfs[column_name] = column_values

dfs.to_csv('results.tsv', sep='\t')