<a href="https://colab.research.google.com/github/cicl-iscl/FramingDetection/blob/main/BertModel_Biene.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Subtask 1 - News Genre Categorisation**

In [35]:
!pip install transformers
!pip install swifter
!git clone https://github.com/cicl-iscl/FramingDetection.git
import pandas as pd
from tqdm import tqdm
import numpy as np
import os

import torch
import transformers as ppb  # pytorch transformers

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import MultinomialNB

from sklearn.pipeline import Pipeline
from sklearn import svm
from sklearn.metrics import classification_report as report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier

import warnings

import swifter

tqdm.pandas()

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
fatal: destination path 'FramingDetection' already exists and is not an empty directory.


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#try:

warnings.filterwarnings('ignore')
#except Exception as e:
#    pass

def make_dataframe(input_folder, labels_folder=None):
    # MAKE TXT DATAFRAME
    text = []

    for fil in tqdm(filter(lambda x: x.endswith('.txt'), os.listdir(input_folder))):
        iD, txt = fil[7:].split('.')[0], open(input_folder + fil, 'r', encoding='utf-8').read()
        text.append((iD, txt))

    df_text = pd.DataFrame(text, columns=['id', 'text']).set_index('id')

    df = df_text

    # MAKE LABEL DATAFRAME
    if labels_folder:
        labels = pd.read_csv(labels_folder, sep='\t', header=None)
        labels = labels.rename(columns={0: 'id', 1: 'type'})
        labels.id = labels.id.apply(str)
        labels = labels.set_index('id')

        # JOIN
        df = labels.join(df_text)[['text', 'type']]

    return df

class BertTokenizer(object):

    def __init__(self, text=[]):
        self.text = text

        # For DistilBERT:
        self.model_class, self.tokenizer_class, self.pretrained_weights = (
        ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')

        # Load pretrained model/tokenizer
        self.tokenizer = self.tokenizer_class.from_pretrained(self.pretrained_weights)

        self.model = self.model_class.from_pretrained(self.pretrained_weights)

    def get(self):

        df = pd.DataFrame(data={"text": self.text})
        tokenized = df["text"].swifter.apply((lambda x: self.tokenizer.encode(x, add_special_tokens=True)))

        max_len = 0
        for i in tokenized.values:
            if len(i) > max_len:
                max_len = len(i)

        padded = np.array([i + [0] * (max_len - len(i)) for i in tokenized.values])

        attention_mask = np.where(padded != 0, 1, 0)
        input_ids = torch.tensor(padded)
        attention_mask = torch.tensor(attention_mask)

        with torch.no_grad():
            last_hidden_states = self.model(input_ids, attention_mask=attention_mask)

        features = last_hidden_states[0][:, 0, :].numpy()

        return features

def main():
    print("Read Data from disk:")
    #loaddata.load_trainingdata()
    #os.chdir('/Volumes/Elements/Computerlinguistics/Subtask1/')

    language = "en"
    folder_train = "/content/FramingDetection/Data/data/" + language + "/train-articles-subtask-1/"
    folder_dev = "/content/FramingDetection/Data/data/" + language + "/dev-articles-subtask-1/"
    labels_train_fn = "/content/FramingDetection/Data/data/" + language + "/train-labels-subtask-1.txt"
    out_fn = "/content/FramingDetection/Data/results/output-subtask-1-dev-" + language + ".txt"

    # Read Data
    print('Loading training...')
    train = make_dataframe(folder_train, labels_train_fn)
    print('Loading dev...')
    test = make_dataframe(folder_dev)

    X_train = train['text'].values
    X_test = test['text'].values
    Y_train = train['type'].values


    encoder = LabelEncoder()
    Y_train = encoder.fit_transform(Y_train)
    #x_train, x_test, y_train, y_test = train_test_split(X_train, Y_train, test_size=0.3)

    _instance = BertTokenizer(text=X_train)
    tokens = _instance.get()

    #lr_clf = LogisticRegression()
    #lr_clf.fit(tokens, Y_train)

    pipe = Pipeline([('vectorizer', CountVectorizer(ngram_range=(10, 10),
                                                     analyzer='char')),
                      ('RandomForestClassifier', DecisionTreeClassifier(class_weight='balanced', max_depth=None,
                                 min_samples_split=2, random_state=0))])

    pipe.fit(tokens, Y_train)

    print('In-sample Acc: \t\t', pipe.score(X_train, Y_train))

    Y_pred = pipe.predict(X_test)

    out = pd.DataFrame(Y_pred, test.index)
    out.to_csv(out_fn, sep='\t', header=None)
    print('Results on: ', out_fn)

    #_instance = BertTokenizer(text=x_test)
    #tokensTest = _instance.get()

    #predicted = lr_clf.predict(tokensTest)

    #np.mean(predicted == y_test)


if __name__ == "__main__":
    main()



Read Data from disk:
Loading training...


433it [00:00, 3455.45it/s]


Loading dev...


83it [00:00, 5381.14it/s]
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Pandas Apply:   0%|          | 0/433 [00:00<?, ?it/s]

# **Subtask 2 - Framing Detection**

In [1]:
import pandas as pd
from tqdm import tqdm
import os
import sys
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn import svm
from sklearn.metrics import classification_report as report
from sklearn.feature_extraction.text import CountVectorizer
import argparse
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np
import warnings


In [5]:
#try

warnings.filterwarnings('ignore')
#except Exception as e:
#    pass

def make_dataframe(input_folder, labels_folder=None):
    #MAKE TXT DATAFRAME
    text = []

    for fil in tqdm(filter(lambda x: x.endswith('.txt'), os.listdir(input_folder))):
        
        iD, txt = fil[7:].split('.')[0], open(input_folder +fil, 'r', encoding='utf-8').read()
        text.append((iD, txt))

    df_text = pd.DataFrame(text, columns=['id','text']).set_index('id')
    df = df_text
    
    
    #MAKE LABEL DATAFRAME
    if labels_folder:
        labels = pd.read_csv(labels_folder, sep='\t', header=None)
        labels = labels.rename(columns={0:'id',1:'frames'})
        labels.id = labels.id.apply(str)
        labels = labels.set_index('id')

        #JOIN
        df = labels.join(df_text)[['text','frames']]
        
    
    return df

def main():
    print("Read Data from disk:")
    language = "en"
    folder_train = "/content/drive/MyDrive/Colab Notebooks/Data/data/" + language + "/train-articles-subtask-2/"
    folder_dev = "/content/drive/MyDrive/Colab Notebooks/Data/data/" + language + "/dev-articles-subtask-2/"
    labels_train_fn = "/content/drive/MyDrive/Colab Notebooks/Data/data/" + language + "/train-labels-subtask-2.txt"
    out_fn = "/content/FramingDetection/Data/results/output-subtask-2-dev-" + language + ".txt"    

    print()
    folder_train 
    folder_dev 
    labels_train_fn 
    out_fn 

    #Read Data
    print('Loading training...')
    train = make_dataframe(folder_train, labels_train_fn)
    print('Loading dev...')
    test = make_dataframe(folder_dev)

    X_train = train['text'].values
    X_test = test['text'].values

    encoder = MultiLabelBinarizer() #use sklearn binarizer

    Y_train = train['frames'].str.split(',').values
    Y_train = encoder.fit_transform(Y_train)


    pipe = Pipeline([('vectorizer',CountVectorizer(ngram_range = (1, 2), 
                                               analyzer='word')),
                ('SVM_multiclass', MultiOutputClassifier(svm.SVC(class_weight= None, C=1, kernel='linear'),n_jobs=1))])

    pipe.fit(X_train,Y_train)

    print('In-sample Acc: \t\t', pipe.score(X_train,Y_train))
    
    Y_pred = pipe.predict(X_test)
    out = encoder.inverse_transform(Y_pred)
    out = list(map(lambda x: ','.join(x), out))
    out = pd.DataFrame(out, test.index)
    print("dataframe out: " + out)
    out.to_csv(out_fn, sep='\t', header=None)
    print('Results on: ', out_fn)

if __name__ == "__main__":
    main()

Read Data from disk:

Loading training...


433it [00:00, 648.91it/s]


Loading dev...


83it [00:00, 580.13it/s]


In-sample Acc: 		 0.9953810623556582
                                                           0
id                                                          
813494037  dataframe out: Legality_Constitutionality_and_...
830359136  dataframe out: Legality_Constitutionality_and_...
833050243  dataframe out: Crime_and_punishment,External_r...
813547724  dataframe out: Legality_Constitutionality_and_...
833028680  dataframe out: Crime_and_punishment,External_r...
...                                                      ...
822942601  dataframe out: External_regulation_and_reputat...
832269185  dataframe out: Legality_Constitutionality_and_...
814251296  dataframe out: Legality_Constitutionality_and_...
813992175                            dataframe out: Morality
833053676  dataframe out: Crime_and_punishment,Legality_C...

[83 rows x 1 columns]
Results on:  /content/FramingDetection/Data/results/output-subtask-2-dev-en.txt


In [None]:
    parser = argparse.ArgumentParser(description="Subtask-2")
    parser.add_argument("/content/FramingDetection/Data/data/en/dev-articles-subtask-2",  type=str, nargs=1,
                    help='Path to dev articles')
    args, unknown = parser.parse_known_args()

In [6]:
import pandas as pd
from tqdm import tqdm
import os
import sys
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn import svm
from sklearn.metrics import classification_report as report
from sklearn.feature_extraction.text import CountVectorizer
import argparse
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np
import warnings


warnings.filterwarnings('ignore')
#except Exception as e:
#    pass

def make_dataframe(input_folder, labels_folder=None):
    #MAKE TXT DATAFRAME
    text = []

    for fil in tqdm(filter(lambda x: x.endswith('.txt'), os.listdir(input_folder))):
        
        iD, txt = fil[7:].split('.')[0], open(input_folder +fil, 'r', encoding='utf-8').read()
        text.append((iD, txt))

    df_text = pd.DataFrame(text, columns=['id','text']).set_index('id')
    df = df_text
    
    
    #MAKE LABEL DATAFRAME
    if labels_folder:
        labels = pd.read_csv(labels_folder, sep='\t', header=None)
        labels = labels.rename(columns={0:'id',1:'frames'})
        labels.id = labels.id.apply(str)
        labels = labels.set_index('id')

        #JOIN
        df = labels.join(df_text)[['text','frames']]
        
    
    return df

def main():
    print("Read Data from disk:")

    language = "en"
    train_folder = "/content/FramingDetection/Data/data/" + language + "/train-articles-subtask-2/"
    dev_folder = "/content/FramingDetection/Data/data/" + language + "/dev-articles-subtask-2/"
    train_labels = "/content/FramingDetection/Data/data/" + language + "/train-labels-subtask-2.txt"
    output = "results/output-subtask-2-dev-" + language + ".txt"

    parser = argparse.ArgumentParser(description='Subtask-2')
    parser.add_argument('train_folder',  type=str, nargs=1,
                        help='Path to training articles')
    parser.add_argument('dev_folder',  type=str, nargs=1,
                    help='Path to dev articles')
    parser.add_argument('train_labels',  type=str, nargs=1,
                    help='Path to training labels')
    parser.add_argument('-o', "--output",  type=str, nargs=1,
                help='Path to output predictions on dev (mandatory)')

    
    args = parser.parse_args()
    if not args.output:
        print("argument -o is mandatory")
        sys.exit(1)
    
    print(args)
    folder_train = args.train_folder[0]
    folder_dev = args.dev_folder[0]
    labels_train_fn = args.train_labels[0]
    out_fn = args.output[0]


    #Read Data
    print('Loading training...')
    train = make_dataframe(folder_train, labels_train_fn)
    print('Loading dev...')
    test = make_dataframe(folder_dev)

    X_train = train['text'].values
    X_test = test['text'].values

    encoder = MultiLabelBinarizer() #use sklearn binarizer

    Y_train = train['frames'].str.split(',').values
    Y_train = encoder.fit_transform(Y_train)


    pipe = Pipeline([('vectorizer',CountVectorizer(ngram_range = (1, 2), 
                                               analyzer='word')),
                ('SVM_multiclass', MultiOutputClassifier(svm.SVC(class_weight= None, C=1, kernel='linear'),n_jobs=1))])

    pipe.fit(X_train,Y_train)

    print('In-sample Acc: \t\t', pipe.score(X_train,Y_train))
    
    Y_pred = pipe.predict(X_test)
    out = encoder.inverse_transform(Y_pred)
    out = list(map(lambda x: ','.join(x), out))
    out = pd.DataFrame(out, test.index)
    out.to_csv(out_fn, sep='\t', header=None)
    print('Results on: ', out_fn)

if __name__ == "__main__":
    main()

Read Data from disk:


usage: ipykernel_launcher.py [-h] train_folder
ipykernel_launcher.py: error: unrecognized arguments: -f


SystemExit: ignored

In [26]:
!pip install bert-tensorflow
!pip install bert_repo --upgrade
#import optimization 
#import run_classifier 
#import tokenization 
import tensorflow as tf 
import tensorflow_hub
#from bert_repo import run_classifier_with_tfhub
#import run_classifier_with_tfhub.create_tokenizer_from_hub_module from bert-tensflow
#!wget https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/4
Bert_Model_Hub = "https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/4"
tokenizer = run_classifier_with_tfhub.create_tokenizer_from_hub_module(Bert_Model_Hub)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[31mERROR: Could not find a version that satisfies the requirement bert_repo (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for bert_repo[0m[31m
[0m--2023-01-18 14:55:38--  https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/4
Resolving tfhub.dev (tfhub.dev)... 74.125.31.101, 74.125.31.139, 74.125.31.102, ...
Connecting to tfhub.dev (tfhub.dev)|74.125.31.101|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
Saving to: ‘4’

4                       [ <=>                ]   1.21K  --.-KB/s    in 0s      

2023-01-18 14:55:38 (13.6 MB/s) - ‘4’ saved [1237]



NameError: ignored

In [33]:
!pip install -q -U "tensorflow-text==2.8.*"
!pip install -q -U tf-models-official==2.7.0
!pip install -U tfds-nightly
!pip install tensorflow-text

import os
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_datasets as tfds
!python -m pip install tensorflow_text
#import tensorflow_text #as text  # A dependency of the preprocessing model
import tensorflow_addons as tfa
from official.nlp import optimization
import numpy as np

tf.get_logger().setLevel('ERROR')

os.environ["TFHUB_MODEL_LOAD_FORMAT"]="UNCOMPRESSED"

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


AlreadyExistsError: ignored