# ASSIGNMENT1


# Trained classifiers for Hillary Clinton in the Stance SemEval 2016 dataset.

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!python -m spacy download en_core_web_sm

Collecting en_core_web_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 7.2 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [3]:
import spacy
import csv
import random
import time
import numpy as np
import pandas as pd
import re
import string

from spacy.util import minibatch, compounding
import sys
from spacy import displacy
from itertools import chain

from sklearn.metrics import classification_report

In [4]:
def load_data(fnames):
    data = []
    for fname in fnames:
        data.append(pd.read_csv(fname, sep='\t', encoding='utf-8'))
    data = pd.concat(data)
    targets = set(data['Target'])
    return data, list(targets)

In [5]:
def cleanup(tweet):
    """we remove urls, hashtags and user symbols"""
    tweet = re.sub(r"http\S+", "", tweet.replace("#", "").replace("@", "").replace('\n', ' ').replace('\t', ' '))
    return tweet

In [6]:
trial_file = "/content/drive/My Drive/Colab Notebooks/2022-ILTAPP/datasets/stance-semeval2016/semeval2016-task6-trialdata.utf-8.txt"
train_file = "/content/drive/My Drive/Colab Notebooks/2022-ILTAPP/datasets/stance-semeval2016/semeval2016-task6-trainingdata.utf-8.txt"
test_file = "/content/drive/My Drive/Colab Notebooks/2022-ILTAPP/datasets/stance-semeval2016/SemEval2016-Task6-subtaskA-testdata-gold.txt"

training_data, targets = load_data([trial_file, train_file])
training_data['Clean_tweet'] = training_data['Tweet'].apply(cleanup)

test_data, _ = load_data([test_file])
test_data['Clean_tweet'] = test_data['Tweet'].apply(cleanup)
display(training_data)

Unnamed: 0,ID,Target,Tweet,Stance,Clean_tweet
0,1,Hillary Clinton,"@tedcruz And, #HandOverTheServer she wiped cle...",AGAINST,"tedcruz And, HandOverTheServer she wiped clean..."
1,2,Hillary Clinton,Hillary is our best choice if we truly want to...,FAVOR,Hillary is our best choice if we truly want to...
2,3,Hillary Clinton,@TheView I think our country is ready for a fe...,AGAINST,TheView I think our country is ready for a fem...
3,4,Hillary Clinton,I just gave an unhealthy amount of my hard-ear...,AGAINST,I just gave an unhealthy amount of my hard-ear...
4,5,Hillary Clinton,@PortiaABoulger Thank you for adding me to you...,NONE,PortiaABoulger Thank you for adding me to your...
...,...,...,...,...,...
2809,2910,Legalization of Abortion,"There's a law protecting unborn eagles, but no...",AGAINST,"There's a law protecting unborn eagles, but no..."
2810,2911,Legalization of Abortion,I am 1 in 3... I have had an abortion #Abortio...,AGAINST,I am 1 in 3... I have had an abortion Abortion...
2811,2912,Legalization of Abortion,How dare you say my sexual preference is a cho...,AGAINST,How dare you say my sexual preference is a cho...
2812,2913,Legalization of Abortion,"Equal rights for those 'born that way', no rig...",AGAINST,"Equal rights for those 'born that way', no rig..."


In [7]:
for target in targets:
  training_data[training_data['Target'] == target][['Stance', 'Clean_tweet']].to_csv(f"/content/drive/My Drive/Colab Notebooks/2022-ILTAPP/datasets/stance-semeval2016/train.{target}.tsv",
          sep="\t", index=False, quoting=csv.QUOTE_NONE, quotechar="", escapechar="")
  test_data[test_data['Target'] == target][['Stance', 'Clean_tweet']].to_csv(f"/content/drive/My Drive/Colab Notebooks/2022-ILTAPP/datasets/stance-semeval2016/test.{target}.tsv",
          sep="\t", index=False, quoting=csv.QUOTE_NONE, quotechar="", escapechar="")

In [8]:
def load_data_spacy(fname):
  training_data = pd.read_csv(fname, sep='\t', encoding='utf-8')
  #train_data.dropna(axis = 0, how ='any',inplace=True)
  #train_data['Num_words_text'] = train_data['text'].apply(lambda x:len(str(x).split())) 
  #mask = train_data['Num_words_text'] >2
  #train_data = train_data[mask]
  print(training_data['Stance'].value_counts())
   
  train_texts = training_data['Clean_tweet'].tolist()
  train_cats = training_data['Stance'].tolist()
  final_train_cats=[]
  for cat in train_cats:
    cat_list = {}
    if cat == 'AGAINST':
      cat_list['AGAINST'] =  1
      cat_list['FAVOR'] =  0
      cat_list['NONE'] =  0
    elif cat == 'FAVOR':
      cat_list['AGAINST'] =  0
      cat_list['FAVOR'] =  1
      cat_list['NONE'] =  0
    else:
      cat_list['AGAINST'] =  0
      cat_list['FAVOR'] =  0
      cat_list['NONE'] =  1
    final_train_cats.append(cat_list)
    
  train_data = list(zip(train_texts, [{"cats": cats} for cats in final_train_cats]))
  return train_data, train_texts, train_cats

In [9]:
training_data, train_texts, train_cats = load_data_spacy('/content/drive/My Drive/Colab Notebooks/2022-ILTAPP/datasets/stance-semeval2016/train.Hillary Clinton.tsv')
print(training_data[:10])
print(len(training_data))
test_data, test_texts, test_cats = load_data_spacy('/content/drive/My Drive/Colab Notebooks/2022-ILTAPP/datasets/stance-semeval2016/test.Hillary Clinton.tsv')
print(len(test_data))

AGAINST    393
NONE       178
FAVOR      118
Name: Stance, dtype: int64
[('tedcruz And, HandOverTheServer she wiped clean + 30k deleted emails, explains dereliction of duty/lies re Benghazi,etc tcot SemST', {'cats': {'AGAINST': 1, 'FAVOR': 0, 'NONE': 0}}), ('Hillary is our best choice if we truly want to continue being a progressive nation. Ohio SemST', {'cats': {'AGAINST': 0, 'FAVOR': 1, 'NONE': 0}}), ("TheView I think our country is ready for a female pres, it can't ever be Hillary SemST", {'cats': {'AGAINST': 1, 'FAVOR': 0, 'NONE': 0}}), ("I just gave an unhealthy amount of my hard-earned money away to the big gov't & untrustworthy IRS. WhyImNotVotingForHillary SemST", {'cats': {'AGAINST': 1, 'FAVOR': 0, 'NONE': 0}}), ('PortiaABoulger Thank you for adding me to your list SemST', {'cats': {'AGAINST': 0, 'FAVOR': 0, 'NONE': 1}}), ("Hillary can not win. Here's hoping the Dems offer a real candidate like Warren. Warren2016 SemST", {'cats': {'AGAINST': 1, 'FAVOR': 0, 'NONE': 0}}), ('Resp

In [10]:
def Sort(sub_li):
  # reverse = True (Soresulting_list = list(first_list)rts in Descending  order) 
  # key is set to sort using second element of  
  # sublist lambda has been used 
  return(sorted(sub_li, key = lambda x: x[1],reverse=True))  

# run the predictions on each sentence in the evaluation  dataset, and return the metrics
def evaluate(tokenizer, textcat, test_texts, test_cats ):
  docs = (tokenizer(text) for text in test_texts)
  preds = []
  for i, doc in enumerate(textcat.pipe(docs)):
    #print(doc.cats.items())
    scores = Sort(doc.cats.items())
    #print(scores)
    catList=[]
    for score in scores:
      catList.append(score[0])
    preds.append(catList[0])
        
  labels = ['AGAINST', 'FAVOR']
  print(classification_report(test_cats, preds,labels=labels))

In [11]:
def train_spacy(  train_data, iterations,test_texts,test_cats, model_arch, dropout = 0.3, model=None, init_tok2vec=None):
    ''' Train a spacy NER model, which can be queried against with test data
   
    train_data : training data in the format of (sentence, {cats: ['AGAINST'|'FAVOR'|'NONE']})
    labels : a list of unique annotations
    iterations : number of training iterations
    dropout : dropout proportion for training
    display_freq : number of epochs between logging losses to console
    '''
    
    nlp = spacy.load('en_core_web_sm')
    

    # add the text classifier to the pipeline if it doesn't exist
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "textcat" not in nlp.pipe_names:
        textcat = nlp.create_pipe(
            "textcat", config={"exclusive_classes": True, "architecture": model_arch}
        )
        nlp.add_pipe(textcat, last=True)
         # otherwise, get it, so we can add labels to it
    else:
        textcat = nlp.get_pipe("textcat")

    # add label to text classifier
    textcat.add_label("AGAINST")
    textcat.add_label("FAVOR")
    textcat.add_label("NONE")


    # get names of other pipes to disable them during training
    pipe_exceptions = ["textcat", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    with nlp.disable_pipes(*other_pipes):  # only train textcat
        optimizer = nlp.begin_training()
        if init_tok2vec is not None:
            with init_tok2vec.open("rb") as file_:
                textcat.model.tok2vec.from_bytes(file_.read())
        print("Training the model...")
        print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
        batch_sizes = compounding(16.0, 64.0, 1.5)
        for i in range(iterations):
            print('Iteration: '+str(i))
            start_time = time.clock()
            losses = {}
            # batch up the examples using spaCy's minibatch
            random.shuffle(train_data)
            batches = minibatch(train_data, size=batch_sizes)
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=dropout, losses=losses)
            with textcat.model.use_params(optimizer.averages):
                # evaluate on the test data 
                evaluate(nlp.tokenizer, textcat, test_texts,test_cats)
            print ('Elapsed time'+str(time.clock() - start_time)+  "seconds")
        with nlp.use_params(optimizer.averages):
            model_name = model_arch + "_Hillary_2016"
            filepath = "/content/drive/My Drive/Colab Notebooks/2022-ILTAPP/resources/" + model_name 
            nlp.to_disk(filepath)
    return nlp 

In [12]:
nlp = train_spacy(training_data, 20, test_texts, test_cats, "bow")

Training the model...
LOSS 	  P  	  R  	  F  
Iteration: 0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

     AGAINST       0.58      1.00      0.74       172
       FAVOR       0.00      0.00      0.00        45

   micro avg       0.58      0.79      0.67       217
   macro avg       0.29      0.50      0.37       217
weighted avg       0.46      0.79      0.58       217

Elapsed time0.5200769999999997seconds
Iteration: 1


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

     AGAINST       0.58      1.00      0.74       172
       FAVOR       0.00      0.00      0.00        45

   micro avg       0.58      0.79      0.67       217
   macro avg       0.29      0.50      0.37       217
weighted avg       0.46      0.79      0.58       217

Elapsed time0.2540710000000006seconds
Iteration: 2


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

     AGAINST       0.58      1.00      0.74       172
       FAVOR       0.00      0.00      0.00        45

   micro avg       0.58      0.79      0.67       217
   macro avg       0.29      0.50      0.37       217
weighted avg       0.46      0.79      0.58       217

Elapsed time0.26079700000000017seconds
Iteration: 3


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

     AGAINST       0.58      1.00      0.74       172
       FAVOR       0.00      0.00      0.00        45

   micro avg       0.58      0.79      0.67       217
   macro avg       0.29      0.50      0.37       217
weighted avg       0.46      0.79      0.58       217

Elapsed time0.24650600000000011seconds
Iteration: 4


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

     AGAINST       0.58      1.00      0.74       172
       FAVOR       0.00      0.00      0.00        45

   micro avg       0.58      0.79      0.67       217
   macro avg       0.29      0.50      0.37       217
weighted avg       0.46      0.79      0.58       217

Elapsed time0.24869699999999995seconds
Iteration: 5


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

     AGAINST       0.58      1.00      0.74       172
       FAVOR       0.00      0.00      0.00        45

   micro avg       0.58      0.79      0.67       217
   macro avg       0.29      0.50      0.37       217
weighted avg       0.46      0.79      0.58       217

Elapsed time0.2517719999999999seconds
Iteration: 6


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

     AGAINST       0.58      1.00      0.74       172
       FAVOR       0.00      0.00      0.00        45

   micro avg       0.58      0.79      0.67       217
   macro avg       0.29      0.50      0.37       217
weighted avg       0.46      0.79      0.58       217

Elapsed time0.2474210000000001seconds
Iteration: 7


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

     AGAINST       0.59      0.99      0.74       172
       FAVOR       0.00      0.00      0.00        45

   micro avg       0.59      0.79      0.67       217
   macro avg       0.29      0.50      0.37       217
weighted avg       0.47      0.79      0.59       217

Elapsed time0.2504880000000007seconds
Iteration: 8


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

     AGAINST       0.59      0.99      0.74       172
       FAVOR       0.00      0.00      0.00        45

   micro avg       0.59      0.79      0.67       217
   macro avg       0.29      0.50      0.37       217
weighted avg       0.47      0.79      0.59       217

Elapsed time0.24470800000000104seconds
Iteration: 9


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

     AGAINST       0.59      0.99      0.74       172
       FAVOR       0.00      0.00      0.00        45

   micro avg       0.59      0.78      0.67       217
   macro avg       0.30      0.49      0.37       217
weighted avg       0.47      0.78      0.59       217

Elapsed time0.2570699999999988seconds
Iteration: 10
              precision    recall  f1-score   support

     AGAINST       0.61      0.98      0.75       172
       FAVOR       1.00      0.02      0.04        45

   micro avg       0.61      0.78      0.68       217
   macro avg       0.80      0.50      0.40       217
weighted avg       0.69      0.78      0.60       217

Elapsed time0.24511399999999917seconds
Iteration: 11




              precision    recall  f1-score   support

     AGAINST       0.61      0.98      0.75       172
       FAVOR       1.00      0.02      0.04        45

   micro avg       0.61      0.78      0.69       217
   macro avg       0.81      0.50      0.40       217
weighted avg       0.69      0.78      0.61       217

Elapsed time0.2676380000000016seconds
Iteration: 12




              precision    recall  f1-score   support

     AGAINST       0.61      0.98      0.75       172
       FAVOR       1.00      0.02      0.04        45

   micro avg       0.61      0.78      0.69       217
   macro avg       0.81      0.50      0.40       217
weighted avg       0.69      0.78      0.61       217

Elapsed time0.2356309999999997seconds
Iteration: 13




              precision    recall  f1-score   support

     AGAINST       0.61      0.98      0.75       172
       FAVOR       1.00      0.02      0.04        45

   micro avg       0.61      0.78      0.69       217
   macro avg       0.81      0.50      0.40       217
weighted avg       0.69      0.78      0.61       217

Elapsed time0.2390220000000003seconds
Iteration: 14




              precision    recall  f1-score   support

     AGAINST       0.62      0.98      0.76       172
       FAVOR       1.00      0.04      0.09        45

   micro avg       0.62      0.78      0.69       217
   macro avg       0.81      0.51      0.42       217
weighted avg       0.70      0.78      0.62       217

Elapsed time0.25788400000000067seconds
Iteration: 15




              precision    recall  f1-score   support

     AGAINST       0.62      0.97      0.75       172
       FAVOR       1.00      0.04      0.09        45

   micro avg       0.62      0.78      0.69       217
   macro avg       0.81      0.51      0.42       217
weighted avg       0.70      0.78      0.62       217

Elapsed time0.2447329999999983seconds
Iteration: 16




              precision    recall  f1-score   support

     AGAINST       0.62      0.97      0.76       172
       FAVOR       0.75      0.07      0.12        45

   micro avg       0.63      0.78      0.69       217
   macro avg       0.69      0.52      0.44       217
weighted avg       0.65      0.78      0.63       217

Elapsed time0.24421800000000005seconds
Iteration: 17




              precision    recall  f1-score   support

     AGAINST       0.62      0.96      0.76       172
       FAVOR       0.75      0.07      0.12        45

   micro avg       0.62      0.77      0.69       217
   macro avg       0.69      0.51      0.44       217
weighted avg       0.65      0.77      0.62       217

Elapsed time0.24588999999999928seconds
Iteration: 18




              precision    recall  f1-score   support

     AGAINST       0.63      0.96      0.76       172
       FAVOR       0.67      0.09      0.16        45

   micro avg       0.63      0.78      0.70       217
   macro avg       0.65      0.52      0.46       217
weighted avg       0.64      0.78      0.63       217

Elapsed time0.24234899999999904seconds
Iteration: 19




              precision    recall  f1-score   support

     AGAINST       0.64      0.96      0.77       172
       FAVOR       0.80      0.18      0.29        45

   micro avg       0.65      0.80      0.72       217
   macro avg       0.72      0.57      0.53       217
weighted avg       0.68      0.80      0.67       217

Elapsed time0.24412399999999934seconds




In [16]:
textcat_bow = spacy.load("/content/drive/MyDrive/Colab Notebooks/2022-ILTAPP/resources/bow_Hillary_2016")
tweets = textcat_bow(test_texts[10])
print("Text: "+ test_texts[10])
print("Gold Label:"+ test_cats[10])
print(" Predicted Label:") 
print(tweets.cats)
print("=======================================")

Text: The government has given no explanation of why the law was changed macedonia HRCtte SemST
Gold Label:AGAINST
 Predicted Label:
{'AGAINST': 0.5614825487136841, 'FAVOR': 0.11962402611970901, 'NONE': 0.3188934624195099}


# Trained classifiers for Legalization of abortion in the Stance SemEval 2016 dataset.

In [13]:
training_data, train_texts, train_cats = load_data_spacy('/content/drive/My Drive/Colab Notebooks/2022-ILTAPP/datasets/stance-semeval2016/train.Legalization of Abortion.tsv')
print(training_data[:10])
print(len(training_data))
test_data, test_texts, test_cats = load_data_spacy('/content/drive/My Drive/Colab Notebooks/2022-ILTAPP/datasets/stance-semeval2016/test.Legalization of Abortion.tsv')
print(len(test_data))

AGAINST    355
NONE       177
FAVOR      121
Name: Stance, dtype: int64
[('Just laid down the law on abortion in my bioethics class. Catholic SemST', {'cats': {'AGAINST': 1, 'FAVOR': 0, 'NONE': 0}}), ("tooprettyclub Are you OK with GOP males telling you what you can and can't do with your own body? SemST", {'cats': {'AGAINST': 0, 'FAVOR': 1, 'NONE': 0}}), ("If you don't want your kid, put it up for adoption. sorrynotsorry SemST", {'cats': {'AGAINST': 1, 'FAVOR': 0, 'NONE': 0}}), ('RedAlert -there should be a "stigma" to butchering pre-born children - its a horrendous crime against humanity.  murder SemST', {'cats': {'AGAINST': 1, 'FAVOR': 0, 'NONE': 0}}), ("But isn't that the problem then. Not enough faith. gaystapo socialism SemST", {'cats': {'AGAINST': 0, 'FAVOR': 0, 'NONE': 1}}), ('Life is our first and most basic human right. SemST', {'cats': {'AGAINST': 1, 'FAVOR': 0, 'NONE': 0}}), ("Rise & Shine its a new day & you're alive. Thank God 4 another day of precious life. Christian Cat

In [14]:
def train_spacy(  train_data, iterations,test_texts,test_cats, model_arch, dropout = 0.3, model=None, init_tok2vec=None):
    ''' Train a spacy NER model, which can be queried against with test data
   
    train_data : training data in the format of (sentence, {cats: ['AGAINST'|'FAVOR'|'NONE']})
    labels : a list of unique annotations
    iterations : number of training iterations
    dropout : dropout proportion for training
    display_freq : number of epochs between logging losses to console
    '''
    
    nlp = spacy.load('en_core_web_sm')
    

    # add the text classifier to the pipeline if it doesn't exist
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "textcat" not in nlp.pipe_names:
        textcat = nlp.create_pipe(
            "textcat", config={"exclusive_classes": True, "architecture": model_arch}
        )
        nlp.add_pipe(textcat, last=True)
         # otherwise, get it, so we can add labels to it
    else:
        textcat = nlp.get_pipe("textcat")

    # add label to text classifier
    textcat.add_label("AGAINST")
    textcat.add_label("FAVOR")
    textcat.add_label("NONE")


    # get names of other pipes to disable them during training
    pipe_exceptions = ["textcat", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    with nlp.disable_pipes(*other_pipes):  # only train textcat
        optimizer = nlp.begin_training()
        if init_tok2vec is not None:
            with init_tok2vec.open("rb") as file_:
                textcat.model.tok2vec.from_bytes(file_.read())
        print("Training the model...")
        print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
        batch_sizes = compounding(16.0, 64.0, 1.5)
        for i in range(iterations):
            print('Iteration: '+str(i))
            start_time = time.clock()
            losses = {}
            # batch up the examples using spaCy's minibatch
            random.shuffle(train_data)
            batches = minibatch(train_data, size=batch_sizes)
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=dropout, losses=losses)
            with textcat.model.use_params(optimizer.averages):
                # evaluate on the test data 
                evaluate(nlp.tokenizer, textcat, test_texts,test_cats)
            print ('Elapsed time'+str(time.clock() - start_time)+  "seconds")
        with nlp.use_params(optimizer.averages):
            model_name = model_arch + "_Legalization_of_abortion_Stance_Semeval2016"
            filepath = "/content/drive/My Drive/Colab Notebooks/2022-ILTAPP/resources/" + model_name 
            nlp.to_disk(filepath)
    return nlp  

In [15]:
nlp = train_spacy(training_data, 20, test_texts, test_cats, "bow")

Training the model...
LOSS 	  P  	  R  	  F  
Iteration: 0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

     AGAINST       0.68      1.00      0.81       189
       FAVOR       0.00      0.00      0.00        46

   micro avg       0.68      0.80      0.73       235
   macro avg       0.34      0.50      0.40       235
weighted avg       0.54      0.80      0.65       235

Elapsed time0.4854029999999998seconds
Iteration: 1


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

     AGAINST       0.68      1.00      0.81       189
       FAVOR       0.00      0.00      0.00        46

   micro avg       0.68      0.80      0.73       235
   macro avg       0.34      0.50      0.40       235
weighted avg       0.54      0.80      0.65       235

Elapsed time0.24044799999999888seconds
Iteration: 2


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

     AGAINST       0.68      1.00      0.81       189
       FAVOR       0.00      0.00      0.00        46

   micro avg       0.68      0.80      0.73       235
   macro avg       0.34      0.50      0.40       235
weighted avg       0.54      0.80      0.65       235

Elapsed time0.24001100000000086seconds
Iteration: 3


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

     AGAINST       0.68      1.00      0.81       189
       FAVOR       0.00      0.00      0.00        46

   micro avg       0.68      0.80      0.74       235
   macro avg       0.34      0.50      0.40       235
weighted avg       0.54      0.80      0.65       235

Elapsed time0.23876100000000022seconds
Iteration: 4


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

     AGAINST       0.68      1.00      0.81       189
       FAVOR       0.00      0.00      0.00        46

   micro avg       0.68      0.80      0.74       235
   macro avg       0.34      0.50      0.40       235
weighted avg       0.54      0.80      0.65       235

Elapsed time0.24001999999999946seconds
Iteration: 5


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

     AGAINST       0.68      1.00      0.81       189
       FAVOR       0.00      0.00      0.00        46

   micro avg       0.68      0.80      0.74       235
   macro avg       0.34      0.50      0.40       235
weighted avg       0.54      0.80      0.65       235

Elapsed time0.24592100000000094seconds
Iteration: 6
              precision    recall  f1-score   support

     AGAINST       0.68      1.00      0.81       189
       FAVOR       1.00      0.02      0.04        46

   micro avg       0.68      0.81      0.74       235
   macro avg       0.84      0.51      0.43       235
weighted avg       0.74      0.81      0.66       235

Elapsed time0.24170200000000008seconds
Iteration: 7




              precision    recall  f1-score   support

     AGAINST       0.68      0.99      0.81       189
       FAVOR       0.67      0.04      0.08        46

   micro avg       0.68      0.81      0.74       235
   macro avg       0.67      0.52      0.45       235
weighted avg       0.68      0.81      0.67       235

Elapsed time0.24260799999999882seconds
Iteration: 8




              precision    recall  f1-score   support

     AGAINST       0.68      0.97      0.80       189
       FAVOR       0.33      0.04      0.08        46

   micro avg       0.67      0.79      0.73       235
   macro avg       0.51      0.51      0.44       235
weighted avg       0.61      0.79      0.66       235

Elapsed time0.23719999999999963seconds
Iteration: 9




              precision    recall  f1-score   support

     AGAINST       0.69      0.97      0.81       189
       FAVOR       0.50      0.09      0.15        46

   micro avg       0.68      0.80      0.74       235
   macro avg       0.59      0.53      0.48       235
weighted avg       0.65      0.80      0.68       235

Elapsed time0.23941999999999908seconds
Iteration: 10




              precision    recall  f1-score   support

     AGAINST       0.69      0.96      0.81       189
       FAVOR       0.50      0.11      0.18        46

   micro avg       0.68      0.80      0.74       235
   macro avg       0.60      0.54      0.49       235
weighted avg       0.65      0.80      0.68       235

Elapsed time0.23933900000000108seconds
Iteration: 11




              precision    recall  f1-score   support

     AGAINST       0.70      0.95      0.81       189
       FAVOR       0.50      0.15      0.23        46

   micro avg       0.69      0.79      0.74       235
   macro avg       0.60      0.55      0.52       235
weighted avg       0.66      0.79      0.70       235

Elapsed time0.24096400000000173seconds
Iteration: 12




              precision    recall  f1-score   support

     AGAINST       0.71      0.94      0.81       189
       FAVOR       0.53      0.20      0.29        46

   micro avg       0.70      0.80      0.74       235
   macro avg       0.62      0.57      0.55       235
weighted avg       0.67      0.80      0.71       235

Elapsed time0.2482830000000007seconds
Iteration: 13




              precision    recall  f1-score   support

     AGAINST       0.71      0.94      0.81       189
       FAVOR       0.56      0.22      0.31        46

   micro avg       0.70      0.80      0.75       235
   macro avg       0.63      0.58      0.56       235
weighted avg       0.68      0.80      0.71       235

Elapsed time0.2397229999999979seconds
Iteration: 14




              precision    recall  f1-score   support

     AGAINST       0.71      0.93      0.81       189
       FAVOR       0.56      0.22      0.31        46

   micro avg       0.70      0.79      0.74       235
   macro avg       0.63      0.57      0.56       235
weighted avg       0.68      0.79      0.71       235

Elapsed time0.25185300000000055seconds
Iteration: 15




              precision    recall  f1-score   support

     AGAINST       0.72      0.93      0.81       189
       FAVOR       0.60      0.26      0.36        46

   micro avg       0.71      0.80      0.75       235
   macro avg       0.66      0.59      0.59       235
weighted avg       0.70      0.80      0.72       235

Elapsed time0.2337239999999987seconds
Iteration: 16




              precision    recall  f1-score   support

     AGAINST       0.72      0.91      0.80       189
       FAVOR       0.57      0.26      0.36        46

   micro avg       0.70      0.78      0.74       235
   macro avg       0.64      0.59      0.58       235
weighted avg       0.69      0.78      0.72       235

Elapsed time0.22728199999999887seconds
Iteration: 17




              precision    recall  f1-score   support

     AGAINST       0.72      0.89      0.79       189
       FAVOR       0.52      0.28      0.37        46

   micro avg       0.70      0.77      0.73       235
   macro avg       0.62      0.59      0.58       235
weighted avg       0.68      0.77      0.71       235

Elapsed time0.23790500000000137seconds
Iteration: 18




              precision    recall  f1-score   support

     AGAINST       0.72      0.87      0.79       189
       FAVOR       0.52      0.30      0.38        46

   micro avg       0.70      0.76      0.73       235
   macro avg       0.62      0.59      0.59       235
weighted avg       0.68      0.76      0.71       235

Elapsed time0.24111999999999867seconds
Iteration: 19




              precision    recall  f1-score   support

     AGAINST       0.72      0.85      0.78       189
       FAVOR       0.50      0.33      0.39        46

   micro avg       0.69      0.75      0.72       235
   macro avg       0.61      0.59      0.59       235
weighted avg       0.68      0.75      0.70       235

Elapsed time0.22512500000000202seconds




In [17]:
textcat_bow = spacy.load("/content/drive/MyDrive/Colab Notebooks/2022-ILTAPP/resources/bow_Legalization_of_abortion_Stance_Semeval2016")
tweets = textcat_bow(test_texts[10])
print("Text: "+ test_texts[10])
print("Gold Label:"+ test_cats[10])
print(" Predicted Label:") 
print(tweets.cats)
print("=======================================")

Text: The government has given no explanation of why the law was changed macedonia HRCtte SemST
Gold Label:AGAINST
 Predicted Label:
{'AGAINST': 0.5957818627357483, 'FAVOR': 0.1288515031337738, 'NONE': 0.2753666341304779}


# Trained classifiers for Climate change in the Stance SemEval 2016 dataset

In [18]:
training_data, train_texts, train_cats = load_data_spacy('/content/drive/My Drive/Colab Notebooks/2022-ILTAPP/datasets/stance-semeval2016/train.Climate Change is a Real Concern.tsv')
print(training_data[:10])
print(len(training_data))
test_data, test_texts, test_cats = load_data_spacy('/content/drive/My Drive/Colab Notebooks/2022-ILTAPP/datasets/stance-semeval2016/test.Climate Change is a Real Concern.tsv')
print(len(test_data))

FAVOR      212
NONE       168
AGAINST     15
Name: Stance, dtype: int64
[('We cant deny it, its really happening.  SemST', {'cats': {'AGAINST': 0, 'FAVOR': 1, 'NONE': 0}}), ('RT cderworiz: Timelines are short. Strategy must be in place by climate change conference in Paris by December. ableg SemST', {'cats': {'AGAINST': 0, 'FAVOR': 1, 'NONE': 0}}), ('SO EXCITING! Meaningful climate change action is on the way! abpoli GHG SemST', {'cats': {'AGAINST': 0, 'FAVOR': 1, 'NONE': 0}}), ('Delivering good jobs for Albertans, maintaining a stable economy & meeting climate change strategy. Good goals. abpoli GHG SemST', {'cats': {'AGAINST': 0, 'FAVOR': 1, 'NONE': 0}}), ('davidswann says he wants carbon fund to be spent on public transportation and renewable energy. ejlive ableg SemST', {'cats': {'AGAINST': 0, 'FAVOR': 1, 'NONE': 0}}), ('Questions about the LancetGH report?  asklancet tweet chat happening now! actonclimate ClimateHealth SemST', {'cats': {'AGAINST': 0, 'FAVOR': 1, 'NONE': 0}}), ('We

In [19]:
def train_spacy(  train_data, iterations,test_texts,test_cats, model_arch, dropout = 0.3, model=None, init_tok2vec=None):
    ''' Train a spacy NER model, which can be queried against with test data
   
    train_data : training data in the format of (sentence, {cats: ['AGAINST'|'FAVOR'|'NONE']})
    labels : a list of unique annotations
    iterations : number of training iterations
    dropout : dropout proportion for training
    display_freq : number of epochs between logging losses to console
    '''
    
    nlp = spacy.load('en_core_web_sm')
    

    # add the text classifier to the pipeline if it doesn't exist
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "textcat" not in nlp.pipe_names:
        textcat = nlp.create_pipe(
            "textcat", config={"exclusive_classes": True, "architecture": model_arch}
        )
        nlp.add_pipe(textcat, last=True)
         # otherwise, get it, so we can add labels to it
    else:
        textcat = nlp.get_pipe("textcat")

    # add label to text classifier
    textcat.add_label("AGAINST")
    textcat.add_label("FAVOR")
    textcat.add_label("NONE")


    # get names of other pipes to disable them during training
    pipe_exceptions = ["textcat", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    with nlp.disable_pipes(*other_pipes):  # only train textcat
        optimizer = nlp.begin_training()
        if init_tok2vec is not None:
            with init_tok2vec.open("rb") as file_:
                textcat.model.tok2vec.from_bytes(file_.read())
        print("Training the model...")
        print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
        batch_sizes = compounding(16.0, 64.0, 1.5)
        for i in range(iterations):
            print('Iteration: '+str(i))
            start_time = time.clock()
            losses = {}
            # batch up the examples using spaCy's minibatch
            random.shuffle(train_data)
            batches = minibatch(train_data, size=batch_sizes)
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=dropout, losses=losses)
            with textcat.model.use_params(optimizer.averages):
                # evaluate on the test data 
                evaluate(nlp.tokenizer, textcat, test_texts,test_cats)
            print ('Elapsed time'+str(time.clock() - start_time)+  "seconds")
        with nlp.use_params(optimizer.averages):
            model_name = model_arch + "_Climate_change_2016"
            filepath = "/content/drive/My Drive/Colab Notebooks/2022-ILTAPP/resources/" + model_name 
            nlp.to_disk(filepath)
    return nlp 

In [20]:
nlp = train_spacy(training_data, 20, test_texts, test_cats, "bow")

Training the model...
LOSS 	  P  	  R  	  F  
Iteration: 0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

     AGAINST       0.00      0.00      0.00        11
       FAVOR       0.77      1.00      0.87       123

   micro avg       0.77      0.92      0.84       134
   macro avg       0.38      0.50      0.43       134
weighted avg       0.71      0.92      0.80       134

Elapsed time0.34443299999999866seconds
Iteration: 1
              precision    recall  f1-score   support

     AGAINST       0.00      0.00      0.00        11
       FAVOR       0.76      1.00      0.86       123

   micro avg       0.76      0.92      0.83       134
   macro avg       0.38      0.50      0.43       134
weighted avg       0.70      0.92      0.79       134

Elapsed time0.14806999999999704seconds
Iteration: 2


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

     AGAINST       0.00      0.00      0.00        11
       FAVOR       0.76      0.99      0.86       123

   micro avg       0.76      0.91      0.83       134
   macro avg       0.38      0.50      0.43       134
weighted avg       0.70      0.91      0.79       134

Elapsed time0.1523779999999988seconds
Iteration: 3
              precision    recall  f1-score   support

     AGAINST       0.00      0.00      0.00        11
       FAVOR       0.76      0.98      0.86       123

   micro avg       0.76      0.90      0.82       134
   macro avg       0.38      0.49      0.43       134
weighted avg       0.70      0.90      0.79       134

Elapsed time0.1443860000000008seconds
Iteration: 4


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

     AGAINST       0.00      0.00      0.00        11
       FAVOR       0.76      0.97      0.85       123

   micro avg       0.76      0.89      0.82       134
   macro avg       0.38      0.48      0.43       134
weighted avg       0.70      0.89      0.78       134

Elapsed time0.15335899999999825seconds
Iteration: 5
              precision    recall  f1-score   support

     AGAINST       0.00      0.00      0.00        11
       FAVOR       0.76      0.96      0.85       123

   micro avg       0.76      0.88      0.82       134
   macro avg       0.38      0.48      0.42       134
weighted avg       0.70      0.88      0.78       134

Elapsed time0.15073399999999992seconds
Iteration: 6


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

     AGAINST       0.00      0.00      0.00        11
       FAVOR       0.76      0.96      0.85       123

   micro avg       0.76      0.88      0.82       134
   macro avg       0.38      0.48      0.42       134
weighted avg       0.70      0.88      0.78       134

Elapsed time0.16266900000000106seconds
Iteration: 7
              precision    recall  f1-score   support

     AGAINST       0.00      0.00      0.00        11
       FAVOR       0.76      0.95      0.85       123

   micro avg       0.76      0.87      0.82       134
   macro avg       0.38      0.48      0.42       134
weighted avg       0.70      0.87      0.78       134

Elapsed time0.14487399999999795seconds
Iteration: 8


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

     AGAINST       0.00      0.00      0.00        11
       FAVOR       0.76      0.94      0.84       123

   micro avg       0.76      0.87      0.81       134
   macro avg       0.38      0.47      0.42       134
weighted avg       0.70      0.87      0.77       134

Elapsed time0.15792700000000082seconds
Iteration: 9
              precision    recall  f1-score   support

     AGAINST       0.00      0.00      0.00        11
       FAVOR       0.76      0.94      0.84       123

   micro avg       0.76      0.87      0.81       134
   macro avg       0.38      0.47      0.42       134
weighted avg       0.70      0.87      0.77       134

Elapsed time0.14664700000000153seconds
Iteration: 10


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

     AGAINST       0.00      0.00      0.00        11
       FAVOR       0.76      0.93      0.84       123

   micro avg       0.76      0.86      0.81       134
   macro avg       0.38      0.47      0.42       134
weighted avg       0.70      0.86      0.77       134

Elapsed time0.15389300000000006seconds
Iteration: 11
              precision    recall  f1-score   support

     AGAINST       0.00      0.00      0.00        11
       FAVOR       0.77      0.93      0.84       123

   micro avg       0.77      0.86      0.81       134
   macro avg       0.38      0.47      0.42       134
weighted avg       0.70      0.86      0.77       134

Elapsed time0.1380669999999995seconds
Iteration: 12


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

     AGAINST       0.00      0.00      0.00        11
       FAVOR       0.77      0.92      0.84       123

   micro avg       0.77      0.84      0.80       134
   macro avg       0.38      0.46      0.42       134
weighted avg       0.71      0.84      0.77       134

Elapsed time0.15397799999999862seconds
Iteration: 13
              precision    recall  f1-score   support

     AGAINST       0.00      0.00      0.00        11
       FAVOR       0.77      0.92      0.84       123

   micro avg       0.77      0.84      0.80       134
   macro avg       0.38      0.46      0.42       134
weighted avg       0.71      0.84      0.77       134

Elapsed time0.14362700000000217seconds
Iteration: 14


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

     AGAINST       0.00      0.00      0.00        11
       FAVOR       0.80      0.92      0.85       123

   micro avg       0.80      0.84      0.82       134
   macro avg       0.40      0.46      0.43       134
weighted avg       0.73      0.84      0.78       134

Elapsed time0.15479900000000058seconds
Iteration: 15
              precision    recall  f1-score   support

     AGAINST       0.00      0.00      0.00        11
       FAVOR       0.79      0.91      0.85       123

   micro avg       0.79      0.84      0.81       134
   macro avg       0.40      0.46      0.42       134
weighted avg       0.73      0.84      0.78       134

Elapsed time0.1344949999999976seconds
Iteration: 16


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

     AGAINST       0.00      0.00      0.00        11
       FAVOR       0.79      0.91      0.85       123

   micro avg       0.79      0.84      0.81       134
   macro avg       0.40      0.46      0.42       134
weighted avg       0.73      0.84      0.78       134

Elapsed time0.14112099999999828seconds
Iteration: 17
              precision    recall  f1-score   support

     AGAINST       0.00      0.00      0.00        11
       FAVOR       0.79      0.91      0.85       123

   micro avg       0.79      0.84      0.81       134
   macro avg       0.40      0.46      0.42       134
weighted avg       0.73      0.84      0.78       134

Elapsed time0.14011400000000052seconds
Iteration: 18


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

     AGAINST       0.00      0.00      0.00        11
       FAVOR       0.79      0.90      0.84       123

   micro avg       0.79      0.83      0.81       134
   macro avg       0.40      0.45      0.42       134
weighted avg       0.73      0.83      0.77       134

Elapsed time0.16240800000000277seconds
Iteration: 19
              precision    recall  f1-score   support

     AGAINST       0.00      0.00      0.00        11
       FAVOR       0.79      0.90      0.84       123

   micro avg       0.79      0.83      0.81       134
   macro avg       0.40      0.45      0.42       134
weighted avg       0.73      0.83      0.77       134

Elapsed time0.14564199999999872seconds


In [21]:
textcat_bow = spacy.load("/content/drive/MyDrive/Colab Notebooks/2022-ILTAPP/resources/bow_Climate_change_2016")
tweets = textcat_bow(test_texts[10])
print("Text: "+ test_texts[10])
print("Gold Label:"+ test_cats[10])
print(" Predicted Label:") 
print(tweets.cats)
print("=======================================")

Text: Interesting speaker Maarten Van Aalst Federation how their humanitarian experience can help Climate scientists CCFC15 SemST
Gold Label:FAVOR
 Predicted Label:
{'AGAINST': 0.15667614340782166, 'FAVOR': 0.4745143949985504, 'NONE': 0.36880940198898315}


# Trained classifiers for Atheism in the Stance SemEval 2016 dataset.

In [22]:
training_data, train_texts, train_cats = load_data_spacy('/content/drive/My Drive/Colab Notebooks/2022-ILTAPP/datasets/stance-semeval2016/train.Atheism.tsv')
print(training_data[:10])
print(len(training_data))
test_data, test_texts, test_cats = load_data_spacy('/content/drive/My Drive/Colab Notebooks/2022-ILTAPP/datasets/stance-semeval2016/test.Atheism.tsv')
print(len(test_data))

AGAINST    304
NONE       117
FAVOR       92
Name: Stance, dtype: int64
[('dear lord thank u for all of ur blessings forgive my sins lord give me strength and energy for this busy day ahead blessed hope SemST', {'cats': {'AGAINST': 1, 'FAVOR': 0, 'NONE': 0}}), ('Blessed are the peacemakers, for they shall be called children of God. Matthew 5:9 scripture peace SemST', {'cats': {'AGAINST': 1, 'FAVOR': 0, 'NONE': 0}}), ('I am not conformed to this world. I am transformed by the renewing of my mind. ISpeakLife God 2014 SemST', {'cats': {'AGAINST': 1, 'FAVOR': 0, 'NONE': 0}}), ('Salah should be prayed with focus and understanding. Allah warns against lazy prayers done just for show Surah Al-Maoon 107:4-6 SemST', {'cats': {'AGAINST': 1, 'FAVOR': 0, 'NONE': 0}}), ('And stay in your houses and do not display yourselves like that of the times of ignorance." [Quran 33:33].islam SemST', {'cats': {'AGAINST': 1, 'FAVOR': 0, 'NONE': 0}}), ('If we are unsure whether something is halal or haram, we sh

In [24]:
def train_spacy(  train_data, iterations,test_texts,test_cats, model_arch, dropout = 0.3, model=None, init_tok2vec=None):
    ''' Train a spacy NER model, which can be queried against with test data
   
    train_data : training data in the format of (sentence, {cats: ['AGAINST'|'FAVOR'|'NONE']})
    labels : a list of unique annotations
    iterations : number of training iterations
    dropout : dropout proportion for training
    display_freq : number of epochs between logging losses to console
    '''
    
    nlp = spacy.load('en_core_web_sm')
    

    # add the text classifier to the pipeline if it doesn't exist
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "textcat" not in nlp.pipe_names:
        textcat = nlp.create_pipe(
            "textcat", config={"exclusive_classes": True, "architecture": model_arch}
        )
        nlp.add_pipe(textcat, last=True)
         # otherwise, get it, so we can add labels to it
    else:
        textcat = nlp.get_pipe("textcat")

    # add label to text classifier
    textcat.add_label("AGAINST")
    textcat.add_label("FAVOR")
    textcat.add_label("NONE")


    # get names of other pipes to disable them during training
    pipe_exceptions = ["textcat", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    with nlp.disable_pipes(*other_pipes):  # only train textcat
        optimizer = nlp.begin_training()
        if init_tok2vec is not None:
            with init_tok2vec.open("rb") as file_:
                textcat.model.tok2vec.from_bytes(file_.read())
        print("Training the model...")
        print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
        batch_sizes = compounding(16.0, 64.0, 1.5)
        for i in range(iterations):
            print('Iteration: '+str(i))
            start_time = time.clock()
            losses = {}
            # batch up the examples using spaCy's minibatch
            random.shuffle(train_data)
            batches = minibatch(train_data, size=batch_sizes)
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=dropout, losses=losses)
            with textcat.model.use_params(optimizer.averages):
                # evaluate on the test data 
                evaluate(nlp.tokenizer, textcat, test_texts,test_cats)
            print ('Elapsed time'+str(time.clock() - start_time)+  "seconds")
        with nlp.use_params(optimizer.averages):
            model_name = model_arch + "_Atheism_2016"
            filepath = "/content/drive/My Drive/Colab Notebooks/2022-ILTAPP/resources/" + model_name 
            nlp.to_disk(filepath)
    return nlp

In [25]:
nlp = train_spacy(training_data, 20, test_texts, test_cats, "bow")

Training the model...
LOSS 	  P  	  R  	  F  
Iteration: 0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

     AGAINST       0.73      1.00      0.84       160
       FAVOR       0.00      0.00      0.00        32

   micro avg       0.73      0.83      0.78       192
   macro avg       0.36      0.50      0.42       192
weighted avg       0.61      0.83      0.70       192

Elapsed time0.4114159999999991seconds
Iteration: 1


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

     AGAINST       0.73      1.00      0.84       160
       FAVOR       0.00      0.00      0.00        32

   micro avg       0.73      0.83      0.78       192
   macro avg       0.36      0.50      0.42       192
weighted avg       0.61      0.83      0.70       192

Elapsed time0.21418800000000005seconds
Iteration: 2
              precision    recall  f1-score   support

     AGAINST       0.73      1.00      0.84       160
       FAVOR       0.00      0.00      0.00        32

   micro avg       0.73      0.83      0.78       192
   macro avg       0.36      0.50      0.42       192
weighted avg       0.61      0.83      0.70       192

Elapsed time0.20915600000000012seconds
Iteration: 3


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

     AGAINST       0.73      1.00      0.84       160
       FAVOR       0.00      0.00      0.00        32

   micro avg       0.73      0.83      0.78       192
   macro avg       0.36      0.50      0.42       192
weighted avg       0.61      0.83      0.70       192

Elapsed time0.27724800000000016seconds
Iteration: 4


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

     AGAINST       0.73      1.00      0.84       160
       FAVOR       0.00      0.00      0.00        32

   micro avg       0.73      0.83      0.78       192
   macro avg       0.36      0.50      0.42       192
weighted avg       0.61      0.83      0.70       192

Elapsed time0.2232050000000001seconds
Iteration: 5


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

     AGAINST       0.73      1.00      0.84       160
       FAVOR       0.00      0.00      0.00        32

   micro avg       0.73      0.83      0.78       192
   macro avg       0.36      0.50      0.42       192
weighted avg       0.61      0.83      0.70       192

Elapsed time0.2240940000000009seconds
Iteration: 6


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

     AGAINST       0.73      1.00      0.84       160
       FAVOR       0.00      0.00      0.00        32

   micro avg       0.73      0.83      0.78       192
   macro avg       0.36      0.50      0.42       192
weighted avg       0.61      0.83      0.70       192

Elapsed time0.23111499999999907seconds
Iteration: 7


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

     AGAINST       0.73      1.00      0.84       160
       FAVOR       0.00      0.00      0.00        32

   micro avg       0.73      0.83      0.78       192
   macro avg       0.36      0.50      0.42       192
weighted avg       0.61      0.83      0.70       192

Elapsed time0.23402300000000054seconds
Iteration: 8


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

     AGAINST       0.73      1.00      0.84       160
       FAVOR       0.00      0.00      0.00        32

   micro avg       0.73      0.83      0.78       192
   macro avg       0.36      0.50      0.42       192
weighted avg       0.61      0.83      0.70       192

Elapsed time0.21144299999999916seconds
Iteration: 9
              precision    recall  f1-score   support

     AGAINST       0.73      1.00      0.84       160
       FAVOR       1.00      0.03      0.06        32

   micro avg       0.73      0.84      0.78       192
   macro avg       0.87      0.52      0.45       192
weighted avg       0.78      0.84      0.71       192

Elapsed time0.1982299999999988seconds
Iteration: 10
              precision    recall  f1-score   support

     AGAINST       0.73      0.99      0.84       160
       FAVOR       0.50      0.03      0.06        32

   micro avg       0.73      0.83      0.78       192
   macro avg       0.62 



              precision    recall  f1-score   support

     AGAINST       0.75      0.99      0.85       160
       FAVOR       0.40      0.06      0.11        32

   micro avg       0.74      0.83      0.78       192
   macro avg       0.57      0.53      0.48       192
weighted avg       0.69      0.83      0.73       192

Elapsed time0.19523200000000074seconds
Iteration: 13
              precision    recall  f1-score   support

     AGAINST       0.75      0.98      0.85       160
       FAVOR       0.40      0.06      0.11        32

   micro avg       0.74      0.83      0.78       192
   macro avg       0.58      0.52      0.48       192
weighted avg       0.69      0.83      0.73       192

Elapsed time0.191240999999998seconds
Iteration: 14




              precision    recall  f1-score   support

     AGAINST       0.75      0.97      0.85       160
       FAVOR       0.33      0.06      0.11        32

   micro avg       0.74      0.82      0.78       192
   macro avg       0.54      0.52      0.48       192
weighted avg       0.68      0.82      0.72       192

Elapsed time0.205171seconds
Iteration: 15
              precision    recall  f1-score   support

     AGAINST       0.75      0.96      0.84       160
       FAVOR       0.25      0.06      0.10        32

   micro avg       0.73      0.81      0.77       192
   macro avg       0.50      0.51      0.47       192
weighted avg       0.67      0.81      0.72       192

Elapsed time0.17935199999999796seconds
Iteration: 16




              precision    recall  f1-score   support

     AGAINST       0.75      0.94      0.84       160
       FAVOR       0.25      0.06      0.10        32

   micro avg       0.73      0.80      0.76       192
   macro avg       0.50      0.50      0.47       192
weighted avg       0.67      0.80      0.71       192

Elapsed time0.20119199999999893seconds
Iteration: 17
              precision    recall  f1-score   support

     AGAINST       0.75      0.94      0.83       160
       FAVOR       0.22      0.06      0.10        32

   micro avg       0.73      0.79      0.76       192
   macro avg       0.49      0.50      0.47       192
weighted avg       0.66      0.79      0.71       192

Elapsed time0.18795300000000026seconds
Iteration: 18




              precision    recall  f1-score   support

     AGAINST       0.76      0.93      0.84       160
       FAVOR       0.27      0.09      0.14        32

   micro avg       0.73      0.79      0.76       192
   macro avg       0.52      0.51      0.49       192
weighted avg       0.68      0.79      0.72       192

Elapsed time0.19033899999999804seconds
Iteration: 19
              precision    recall  f1-score   support

     AGAINST       0.76      0.93      0.84       160
       FAVOR       0.25      0.09      0.14        32

   micro avg       0.73      0.79      0.76       192
   macro avg       0.51      0.51      0.49       192
weighted avg       0.68      0.79      0.72       192

Elapsed time0.19753100000000146seconds




In [26]:
textcat_bow = spacy.load("/content/drive/MyDrive/Colab Notebooks/2022-ILTAPP/resources/bow_Atheism_2016")
tweets = textcat_bow(test_texts[10])
print("Text: "+ test_texts[10])
print("Gold Label:"+ test_cats[10])
print(" Predicted Label:") 
print(tweets.cats)
print("=======================================")

Text: If only dreams were real, now it's gone. SingleBecause getonyourfeet SemST
Gold Label:AGAINST
 Predicted Label:
{'AGAINST': 0.39675581455230713, 'FAVOR': 0.3052373230457306, 'NONE': 0.29800689220428467}


# ASSIGNMENT 2

# Reused the above code to train a new classifier for fake news dataset.

In [34]:
from sklearn.model_selection import train_test_split

In [27]:
df_fake = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/2022-ILTAPP/datasets/fake_rada/fake_news_full.tsv",sep='\t')

In [28]:
df_fake.columns =["label","News"]

In [29]:
df_fake.head(2)

Unnamed: 0,label,News
0,fake,THE BIG DATA CONSPIRACY Government and Silicon...
1,fake,California Surprisingly Lenient on Auto Emissi...


In [31]:
df_fake.shape

(476, 2)

In [32]:
data = df_fake

In [35]:
train,test = train_test_split(data, test_size=0.20, random_state=0)
#save the data
train.to_csv('train.csv',index=False)
test.to_csv('test.csv',index=False)

In [36]:
print('train:',len(train))
print('test:',len(test))

train: 380
test: 96


In [37]:
def load_data_spacy(fname):
  train_data = pd.read_csv(fname,)
  train_data.dropna(axis = 0, how ='any',inplace=True)
  train_data['Num_words_text'] = train_data['News'].apply(lambda x:len(str(x).split())) 
  mask = train_data['Num_words_text'] >2
  train_data = train_data[mask]
  print(train_data['label'].value_counts())

  train_data['News'] = train_data['News'].apply(cleanup)
   
  train_texts = train_data['News'].tolist()
  train_cats = train_data['label'].tolist()
  final_train_cats=[]
  for cat in train_cats:
    cat_list = {}
    if cat == 'fake':
      cat_list['fake'] =  1
      cat_list['legit'] =  0
    else:
      cat_list['fake'] =  0
      cat_list['legit'] =  1
      
    final_train_cats.append(cat_list)
    
  training_data = list(zip(train_texts, [{"cats": cats} for cats in final_train_cats]))
  return training_data, train_texts, train_cats

In [38]:
training_data, train_texts, train_cats = load_data_spacy('/content/train.csv')
#training_data, train_texts, train_cats = load_data_spacy('')
#training_data, train_texts, train_cats = load_data_spacy('')
print(training_data[:10])
print(len(training_data))
test_data, test_texts, test_cats = load_data_spacy('/content/test.csv')
print(len(test_data))

fake     194
legit    186
Name: label, dtype: int64
[("New Nintendo Switch game console to launch in March for $99 Nintendo plans a promotional roll out of it's new Nintendo switch game console. For a limited time, the console will roll out for an introductory price of $99. Nintendo promises to pack the new console with fun features not present in past machines. The new console contains new features such as motion detectors and immersive and interactive gaming. The new introductory price will be available for two months to show the public the new advances in gaming. However, initial quantities will be limited to 250,000 units available at the sales price. So rush out and get yours today while the promotional offer is running.", {'cats': {'fake': 1, 'legit': 0}}), ('State Department relaxes travel restrictions for certain foreigners. Washington (CNN) Secretary of State Rex Tillerson sent out a memorandum today to various worldwide embassies directing them to ease restrictions on foreign

In [40]:
def Sort(sub_li):
  # reverse = True (Soresulting_list = list(first_list)rts in Descending  order) 
  # key is set to sort using second element of  
  # sublist lambda has been used 
  return(sorted(sub_li, key = lambda x: x[1],reverse=True))  

# run the predictions on each sentence in the evaluation  dataset, and return the metrics
def evaluate(tokenizer, textcat, test_texts, test_cats ):
  docs = (tokenizer(text) for text in test_texts)
  preds = []
  for i, doc in enumerate(textcat.pipe(docs)):
    #print(doc.cats.items())
    scores = Sort(doc.cats.items())
    #print(scores)
    catList=[]
    for score in scores:
      catList.append(score[0])
    preds.append(catList[0])
        
  labels = ['fake', 'legit']
  print(classification_report(test_cats, preds,labels=labels))

In [41]:
def train_spacy(  train_data, iterations,test_texts,test_cats, model_arch, dropout = 0.3, model=None, init_tok2vec=None):
    ''' Train a spacy NER model, which can be queried against with test data
   
    train_data : training data in the format of (sentence, {cats: ['legit'|'fake']})
    labels : a list of unique annotations
    iterations : number of training iterations
    dropout : dropout proportion for training
    display_freq : number of epochs between logging losses to console
    '''
    
    nlp = spacy.load('en_core_web_sm')
    

    # add the text classifier to the pipeline if it doesn't exist
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "textcat" not in nlp.pipe_names:
        textcat = nlp.create_pipe(
            "textcat", config={"exclusive_classes": True, "architecture": model_arch}
        )
        nlp.add_pipe(textcat, last=True)
        
    # otherwise, get it, so we can add labels to it
    else:
        textcat = nlp.get_pipe("textcat")

    # add label to text classifier
    textcat.add_label("legit")
    textcat.add_label("fake")
  


    # get names of other pipes to disable them during training
    pipe_exceptions = ["textcat", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    with nlp.disable_pipes(*other_pipes):  # only train textcat
        optimizer = nlp.begin_training()
        if init_tok2vec is not None:
            with init_tok2vec.open("rb") as file_:
                textcat.model.tok2vec.from_bytes(file_.read())
        print("Training the model...")
        print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
        batch_sizes = compounding(16.0, 64.0, 1.5)
        for i in range(iterations):
            print('Iteration: '+str(i))
            start_time = time.clock()
            losses = {}
            # batch up the examples using spaCy's minibatch
            random.shuffle(train_data)
            batches = minibatch(train_data, size=batch_sizes)
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=dropout, losses=losses)
            with textcat.model.use_params(optimizer.averages):
                # evaluate on the test data 
                evaluate(nlp.tokenizer, textcat, test_texts,test_cats)
            print ('Elapsed time'+str(time.clock() - start_time)+  "seconds")
        with nlp.use_params(optimizer.averages):
            model_name = model_arch + "Fake_rada"
            filepath = "/content/drive/MyDrive/Colab Notebooks/2022-ILTAPP/resources"+ model_name 
            nlp.to_disk(filepath)
    return nlp

In [42]:
nlp = train_spacy(training_data, 20, test_texts, test_cats, "bow")

Training the model...
LOSS 	  P  	  R  	  F  
Iteration: 0




              precision    recall  f1-score   support

        fake       0.43      0.90      0.58        42
       legit       0.43      0.06      0.10        54

    accuracy                           0.43        96
   macro avg       0.43      0.48      0.34        96
weighted avg       0.43      0.43      0.31        96

Elapsed time0.8524280000000033seconds
Iteration: 1




              precision    recall  f1-score   support

        fake       0.42      0.45      0.44        42
       legit       0.55      0.52      0.53        54

    accuracy                           0.49        96
   macro avg       0.49      0.49      0.49        96
weighted avg       0.49      0.49      0.49        96

Elapsed time0.39022900000000504seconds
Iteration: 2




              precision    recall  f1-score   support

        fake       0.35      0.45      0.40        42
       legit       0.45      0.35      0.40        54

    accuracy                           0.40        96
   macro avg       0.40      0.40      0.40        96
weighted avg       0.41      0.40      0.40        96

Elapsed time0.2978299999999976seconds
Iteration: 3




              precision    recall  f1-score   support

        fake       0.36      0.52      0.43        42
       legit       0.43      0.28      0.34        54

    accuracy                           0.39        96
   macro avg       0.39      0.40      0.38        96
weighted avg       0.40      0.39      0.38        96

Elapsed time0.31122799999999984seconds
Iteration: 4




              precision    recall  f1-score   support

        fake       0.36      0.52      0.43        42
       legit       0.43      0.28      0.34        54

    accuracy                           0.39        96
   macro avg       0.39      0.40      0.38        96
weighted avg       0.40      0.39      0.38        96

Elapsed time0.28789700000000096seconds
Iteration: 5




              precision    recall  f1-score   support

        fake       0.37      0.52      0.43        42
       legit       0.44      0.30      0.36        54

    accuracy                           0.40        96
   macro avg       0.41      0.41      0.39        96
weighted avg       0.41      0.40      0.39        96

Elapsed time0.29946199999999834seconds
Iteration: 6




              precision    recall  f1-score   support

        fake       0.36      0.52      0.43        42
       legit       0.43      0.28      0.34        54

    accuracy                           0.39        96
   macro avg       0.39      0.40      0.38        96
weighted avg       0.40      0.39      0.38        96

Elapsed time0.3001640000000023seconds
Iteration: 7




              precision    recall  f1-score   support

        fake       0.36      0.52      0.43        42
       legit       0.43      0.28      0.34        54

    accuracy                           0.39        96
   macro avg       0.39      0.40      0.38        96
weighted avg       0.40      0.39      0.38        96

Elapsed time0.3125270000000029seconds
Iteration: 8




              precision    recall  f1-score   support

        fake       0.37      0.52      0.43        42
       legit       0.44      0.30      0.36        54

    accuracy                           0.40        96
   macro avg       0.41      0.41      0.39        96
weighted avg       0.41      0.40      0.39        96

Elapsed time0.2999409999999969seconds
Iteration: 9




              precision    recall  f1-score   support

        fake       0.37      0.52      0.43        42
       legit       0.44      0.30      0.36        54

    accuracy                           0.40        96
   macro avg       0.41      0.41      0.39        96
weighted avg       0.41      0.40      0.39        96

Elapsed time0.30399200000000093seconds
Iteration: 10




              precision    recall  f1-score   support

        fake       0.37      0.52      0.43        42
       legit       0.44      0.30      0.36        54

    accuracy                           0.40        96
   macro avg       0.41      0.41      0.39        96
weighted avg       0.41      0.40      0.39        96

Elapsed time0.3098990000000015seconds
Iteration: 11




              precision    recall  f1-score   support

        fake       0.36      0.50      0.42        42
       legit       0.43      0.30      0.35        54

    accuracy                           0.39        96
   macro avg       0.39      0.40      0.38        96
weighted avg       0.40      0.39      0.38        96

Elapsed time0.3218490000000003seconds
Iteration: 12




              precision    recall  f1-score   support

        fake       0.38      0.55      0.45        42
       legit       0.46      0.30      0.36        54

    accuracy                           0.41        96
   macro avg       0.42      0.42      0.40        96
weighted avg       0.42      0.41      0.40        96

Elapsed time0.2893939999999944seconds
Iteration: 13




              precision    recall  f1-score   support

        fake       0.35      0.48      0.40        42
       legit       0.44      0.31      0.37        54

    accuracy                           0.39        96
   macro avg       0.39      0.40      0.38        96
weighted avg       0.40      0.39      0.38        96

Elapsed time0.27981100000000225seconds
Iteration: 14




              precision    recall  f1-score   support

        fake       0.36      0.50      0.42        42
       legit       0.45      0.31      0.37        54

    accuracy                           0.40        96
   macro avg       0.40      0.41      0.39        96
weighted avg       0.41      0.40      0.39        96

Elapsed time0.32068399999999997seconds
Iteration: 15




              precision    recall  f1-score   support

        fake       0.37      0.52      0.43        42
       legit       0.44      0.30      0.36        54

    accuracy                           0.40        96
   macro avg       0.41      0.41      0.39        96
weighted avg       0.41      0.40      0.39        96

Elapsed time0.6098440000000025seconds
Iteration: 16




              precision    recall  f1-score   support

        fake       0.38      0.55      0.45        42
       legit       0.46      0.30      0.36        54

    accuracy                           0.41        96
   macro avg       0.42      0.42      0.40        96
weighted avg       0.42      0.41      0.40        96

Elapsed time0.49428100000000086seconds
Iteration: 17




              precision    recall  f1-score   support

        fake       0.38      0.55      0.45        42
       legit       0.46      0.30      0.36        54

    accuracy                           0.41        96
   macro avg       0.42      0.42      0.40        96
weighted avg       0.42      0.41      0.40        96

Elapsed time0.4988430000000008seconds
Iteration: 18




              precision    recall  f1-score   support

        fake       0.39      0.57      0.46        42
       legit       0.47      0.30      0.36        54

    accuracy                           0.42        96
   macro avg       0.43      0.43      0.41        96
weighted avg       0.43      0.42      0.41        96

Elapsed time0.48729900000000015seconds
Iteration: 19




              precision    recall  f1-score   support

        fake       0.39      0.57      0.46        42
       legit       0.47      0.30      0.36        54

    accuracy                           0.42        96
   macro avg       0.43      0.43      0.41        96
weighted avg       0.43      0.42      0.41        96

Elapsed time0.49754399999999777seconds




In [43]:
textcat_bow = spacy.load("/content/drive/MyDrive/Colab Notebooks/2022-ILTAPP/resourcesbowFake_rada")
tweets = textcat_bow(test_texts[10])
print("Text: "+ test_texts[10])
print("Gold Label:"+ test_cats[10])
print(" Predicted Label:") 
print(tweets.cats)
print("=======================================")

Text: Chris Pratt responds to body shamers telling him he's too thin Big or small Chris Pratt has heard it all. These days the "Guardians of the Galaxy" star 37 is taking flak for being too thin but he's not taking it lying down. Pratt who has been documenting the healthy snacks he's eating while filming "Jurassic World 2" in a series of "What's My Snack" Instagram videos fired back -- in his usual tongue-in-cheek manner -- after some followers apparently suggested he looked too thin. "So many people have said I look too thin in my recent episodes of WHATSMYSNACK he wrote on Instagram Thursday. Some have gone as far as to say I look 'skeletal.' Well just because I am a male doesn't mean I'm impervious to your whispers. Body shaming hurts."
Gold Label:legit
 Predicted Label:
{'legit': 0.2109808325767517, 'fake': 0.7890191674232483}


# Reused the above code to train a new classifier for celebrity dataset.

In [44]:
df_cleb = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/2022-ILTAPP/datasets/fake_rada/celebrity_full.tsv",sep='\t')

In [45]:
df_cleb.columns =["label","News"]

In [46]:
df_cleb.head(2)

Unnamed: 0,label,News
0,legit,This Is What Brad Pitt Has Been Texting Jennif...
1,legit,Jennifer Aniston's spokesman denies reports th...


In [47]:
df_cleb.shape

(499, 2)

In [48]:
data = df_cleb

In [49]:
c_train,c_test = train_test_split(data, test_size=0.20, random_state=0)
#save the data
c_train.to_csv('c_train.csv',index=False)
c_test.to_csv('c_test.csv',index=False)

In [50]:
print('train:',len(c_train))
print('test:',len(c_test))

train: 399
test: 100


In [51]:
def load_data_spacy(fname):
  train_data = pd.read_csv(fname,)
  train_data.dropna(axis = 0, how ='any',inplace=True)
  train_data['Num_words_text'] = train_data['News'].apply(lambda x:len(str(x).split())) 
  mask = train_data['Num_words_text'] >2
  train_data = train_data[mask]
  print(train_data['label'].value_counts())

  train_data['News'] = train_data['News'].apply(cleanup)
   
  train_texts = train_data['News'].tolist()
  train_cats = train_data['label'].tolist()
  final_train_cats=[]
  for cat in train_cats:
    cat_list = {}
    if cat == 'fake':
      cat_list['fake'] =  1
      cat_list['legit'] =  0
    else:
      cat_list['fake'] =  0
      cat_list['legit'] =  1
      
    final_train_cats.append(cat_list)
    
  training_data = list(zip(train_texts, [{"cats": cats} for cats in final_train_cats]))
  return training_data, train_texts, train_cats

In [52]:
training_data, train_texts, train_cats = load_data_spacy('/content/c_train.csv')
#training_data, train_texts, train_cats = load_data_spacy('')
#training_data, train_texts, train_cats = load_data_spacy('')
print(training_data[:10])
print(len(training_data))
test_data, test_texts, test_cats = load_data_spacy('/content/c_test.csv')
print(len(test_data))

legit    200
fake     199
Name: label, dtype: int64
[(' Kim Kardashian on How the Aftermath of Her Robbery Inspired Her New Crystal Gardenia Fragrance (Exclusive) There’s more to Kim Kardashian West’s newest fragrance, KKW Crystal Gardenia, than meets the eye! The 37-year-old reality star and businesswoman is launching her new scent on Wednesday, and this time it’s personal. “After my whole Paris situation over a year ago, all my friends would come over and bring me healing crystals,” Kim told ET’s Keltie Knight in an exclusive interview. “I wasn\'t really into them. I didn\'t know much about them. Enough people brought them over that I love the shape, I loved holding it, and I felt like, ‘OK, this is it. I want something that I feel like calms me, is healing...’ And I wanted the bottle, especially the smaller one.” The mother of two wanted the crystal-shaped bottles to bring her fans a similar calm. “It felt so pure to me," she noted.\xa0"I just loved [how] the properties of the rose 

In [None]:
def Sort(sub_li):
  # reverse = True (Soresulting_list = list(first_list)rts in Descending  order) 
  # key is set to sort using second element of  
  # sublist lambda has been used 
  return(sorted(sub_li, key = lambda x: x[1],reverse=True))  

# run the predictions on each sentence in the evaluation  dataset, and return the metrics
def evaluate(tokenizer, textcat, test_texts, test_cats ):
  docs = (tokenizer(text) for text in test_texts)
  preds = []
  for i, doc in enumerate(textcat.pipe(docs)):
    #print(doc.cats.items())
    scores = Sort(doc.cats.items())
    #print(scores)
    catList=[]
    for score in scores:
      catList.append(score[0])
    preds.append(catList[0])
        
  labels = ['fake', 'legit']
  print(classification_report(test_cats, preds,labels=labels))

In [53]:
def train_spacy(  train_data, iterations,test_texts,test_cats, model_arch, dropout = 0.3, model=None, init_tok2vec=None):
    ''' Train a spacy NER model, which can be queried against with test data
   
    train_data : training data in the format of (sentence, {cats: ['legit'|'fake']})
    labels : a list of unique annotations
    iterations : number of training iterations
    dropout : dropout proportion for training
    display_freq : number of epochs between logging losses to console
    '''
    
    nlp = spacy.load('en_core_web_sm')
    

    # add the text classifier to the pipeline if it doesn't exist
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "textcat" not in nlp.pipe_names:
        textcat = nlp.create_pipe(
            "textcat", config={"exclusive_classes": True, "architecture": model_arch}
        )
        nlp.add_pipe(textcat, last=True)
        
    # otherwise, get it, so we can add labels to it
    else:
        textcat = nlp.get_pipe("textcat")

    # add label to text classifier
    textcat.add_label("legit")
    textcat.add_label("fake")
  


    # get names of other pipes to disable them during training
    pipe_exceptions = ["textcat", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    with nlp.disable_pipes(*other_pipes):  # only train textcat
        optimizer = nlp.begin_training()
        if init_tok2vec is not None:
            with init_tok2vec.open("rb") as file_:
                textcat.model.tok2vec.from_bytes(file_.read())
        print("Training the model...")
        print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
        batch_sizes = compounding(16.0, 64.0, 1.5)
        for i in range(iterations):
            print('Iteration: '+str(i))
            start_time = time.clock()
            losses = {}
            # batch up the examples using spaCy's minibatch
            random.shuffle(train_data)
            batches = minibatch(train_data, size=batch_sizes)
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=dropout, losses=losses)
            with textcat.model.use_params(optimizer.averages):
                # evaluate on the test data 
                evaluate(nlp.tokenizer, textcat, test_texts,test_cats)
            print ('Elapsed time'+str(time.clock() - start_time)+  "seconds")
        with nlp.use_params(optimizer.averages):
            model_name = model_arch + "_clebrity"
            filepath = "/content/drive/MyDrive/Colab Notebooks/2022-ILTAPP/resources"+ model_name 
            nlp.to_disk(filepath)
    return nlp

In [54]:
nlp = train_spacy(training_data, 10, test_texts, test_cats, "bow")

Training the model...
LOSS 	  P  	  R  	  F  
Iteration: 0




              precision    recall  f1-score   support

        fake       0.66      0.75      0.70        51
       legit       0.69      0.59      0.64        49

    accuracy                           0.67       100
   macro avg       0.67      0.67      0.67       100
weighted avg       0.67      0.67      0.67       100

Elapsed time2.2905490000000057seconds
Iteration: 1




              precision    recall  f1-score   support

        fake       0.81      0.57      0.67        51
       legit       0.66      0.86      0.74        49

    accuracy                           0.71       100
   macro avg       0.73      0.71      0.71       100
weighted avg       0.73      0.71      0.70       100

Elapsed time1.4542520000000039seconds
Iteration: 2




              precision    recall  f1-score   support

        fake       0.74      0.76      0.75        51
       legit       0.74      0.71      0.73        49

    accuracy                           0.74       100
   macro avg       0.74      0.74      0.74       100
weighted avg       0.74      0.74      0.74       100

Elapsed time1.459142seconds
Iteration: 3




              precision    recall  f1-score   support

        fake       0.80      0.71      0.75        51
       legit       0.73      0.82      0.77        49

    accuracy                           0.76       100
   macro avg       0.76      0.76      0.76       100
weighted avg       0.76      0.76      0.76       100

Elapsed time1.4309450000000012seconds
Iteration: 4




              precision    recall  f1-score   support

        fake       0.83      0.69      0.75        51
       legit       0.72      0.86      0.79        49

    accuracy                           0.77       100
   macro avg       0.78      0.77      0.77       100
weighted avg       0.78      0.77      0.77       100

Elapsed time1.4061339999999944seconds
Iteration: 5




              precision    recall  f1-score   support

        fake       0.77      0.71      0.73        51
       legit       0.72      0.78      0.75        49

    accuracy                           0.74       100
   macro avg       0.74      0.74      0.74       100
weighted avg       0.74      0.74      0.74       100

Elapsed time1.4275209999999987seconds
Iteration: 6




              precision    recall  f1-score   support

        fake       0.80      0.71      0.75        51
       legit       0.73      0.82      0.77        49

    accuracy                           0.76       100
   macro avg       0.76      0.76      0.76       100
weighted avg       0.76      0.76      0.76       100

Elapsed time1.3850010000000026seconds
Iteration: 7




              precision    recall  f1-score   support

        fake       0.75      0.71      0.73        51
       legit       0.71      0.76      0.73        49

    accuracy                           0.73       100
   macro avg       0.73      0.73      0.73       100
weighted avg       0.73      0.73      0.73       100

Elapsed time1.4625529999999998seconds
Iteration: 8




              precision    recall  f1-score   support

        fake       0.77      0.71      0.73        51
       legit       0.72      0.78      0.75        49

    accuracy                           0.74       100
   macro avg       0.74      0.74      0.74       100
weighted avg       0.74      0.74      0.74       100

Elapsed time1.4823140000000024seconds
Iteration: 9




              precision    recall  f1-score   support

        fake       0.72      0.71      0.71        51
       legit       0.70      0.71      0.71        49

    accuracy                           0.71       100
   macro avg       0.71      0.71      0.71       100
weighted avg       0.71      0.71      0.71       100

Elapsed time1.4605649999999955seconds




In [55]:
textcat_bow = spacy.load("/content/drive/MyDrive/Colab Notebooks/2022-ILTAPP/resourcesbow_clebrity")
tweets = textcat_bow(test_texts[20])
print("Text: "+ test_texts[20])
print("Gold Label:"+ test_cats[20])
print(" Predicted Label:") 
print(tweets.cats)
print("=======================================")

Text: Gwen Stefani Shockingly Ruptures Eardrum & Is Forced To Cancel Show — Is She Okay? Poor Gwen Stefani! The singer shockingly ruptured her eardrum and was forced to cancel an April 27 concert in Las Vegas because of her medical emergency. This is so scary. Is Gwen’s hearing going to be okay? Gwen Stefani, 47, went through a terrifying ordeal on April 25 when her eardrum ruptured while on a flight. The painful injury landed Gwen in Cedars-Sinai hospital in Los Angeles, where doctors told her not to fly anywhere or sing until it gets healed. The brutal injury came at exactly the wrong time for Gwen. She was supposed to sing at the Power f Love gala in Las Vegas just two days later. The pop star/The Voice judge had to bow out of the prestigious event at the MGM Grand where she was set to perform a 30-minute live set, because of her limitations. That’s so awful! Guests at the event, according to the Las Vegas Review-Journal, were told about the cancellation right before the event. Gwen