In [2]:
import numpy as np
import json
import pandas as pd
# Annotation value counted as a positive vote (Persian, roughly "trend-setting").
IMPACT_TAG = 'جریان‌ساز'


def _impact_ratio(annotations):
    """Return the share of `annotations` equal to IMPACT_TAG, or NaN when it is empty."""
    if len(annotations) == 0:
        return np.nan
    return np.sum([tag == IMPACT_TAG for tag in annotations]) / len(annotations)


def _load_split(raw, split):
    """Build the DataFrame for one split of `raw` and attach its 'probability' column.

    The annotations are read from the second column *by position* (index 1),
    exactly as the per-row `iloc[i, 1]` access did before.
    """
    frame = pd.DataFrame(raw[split])
    # An empty split has no columns at all, so positional access must be skipped.
    annotation_column = frame.iloc[:, 1] if len(frame) else []
    frame['probability'] = [_impact_ratio(tags) for tags in annotation_column]
    return frame


# Close the file deterministically and decode explicitly as UTF-8: the default
# encoding is platform dependent and the file contains Persian text.
with open('../input/news-annotated/dataset_annotated_impact.json', encoding='utf-8') as f:
    data = json.load(f)

df_train = _load_split(data, 'train')
df_eval = _load_split(data, 'eval')
df_test = _load_split(data, 'test')
df_train

In [3]:
# Install the third-party packages imported further down the notebook:
# transformers, hazm, and clean-text (which provides the `cleantext` module).
# NOTE(review): versions are not pinned, so a fresh run may pull different releases.
!pip install -q transformers
!pip install -q hazm
!pip install -q clean-text[gpl]

In [4]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.utils import shuffle

import hazm
from cleantext import clean

import plotly.express as px
import plotly.graph_objects as go

from tqdm.notebook import tqdm

import os
import re
import json
import copy
import collections

In [5]:
# Drop every row that contains a missing value in any column; this includes
# rows whose 'probability' was set to NaN because they had no annotations.
df_train, df_eval, df_test = (
    split.dropna() for split in (df_train, df_eval, df_test)
)

In [6]:
def prob_to_label(rate, threshold=0.5):
    """Map a numeric rate to a class name.

    Returns 'negative' when `rate <= threshold` and 'positive' otherwise
    (so a NaN rate, for which the comparison is False, yields 'positive').
    """
    return 'negative' if rate <= threshold else 'positive'

# Binarise the annotation share of every split with the same 0.5 threshold.
for split in (df_train, df_eval, df_test):
    split['label'] = split['probability'].apply(lambda t: prob_to_label(t, 0.5))

# Build the label vocabulary from all splits rather than from the test split
# alone: if the test split happened to miss a class, the later
# `labels.index(...)` lookups on train/eval rows would raise ValueError.
labels = sorted(
    set(df_train['label'].unique())
    | set(df_eval['label'].unique())
    | set(df_test['label'].unique())
)
df_train

In [7]:
def cleanhtml(raw_html):
    """Return `raw_html` with every `<...>` span (shortest match) removed."""
    return re.sub('<.*?>', '', raw_html)


# Characters stripped by `cleaning`: emoji, pictographs, directional marks, etc.
# Compiled once here instead of on every call.
# NOTE(review): the range U+24C2..U+1F251 is very wide (it also spans CJK and the
# Arabic presentation forms) -- confirm that this is intended.
_WEIRD_PATTERN = re.compile("["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    u"\U00002702-\U000027B0"
    u"\U000024C2-\U0001F251"
    u"\U0001f926-\U0001f937"
    u'\U00010000-\U0010ffff'
    u"\u200d"
    u"\u2640-\u2642"
    u"\u2600-\u2B55"
    u"\u23cf"
    u"\u23e9"
    u"\u231a"
    u"\u3030"
    u"\ufe0f"
    u"\u2069"
    u"\u2066"
    # u"\u200c"
    u"\u2068"
    u"\u2067"
    "]+", flags=re.UNICODE)


def _get_normalizer():
    """Return a shared `hazm.Normalizer`, creating it on first use only."""
    normalizer = getattr(_get_normalizer, "_instance", None)
    if normalizer is None:
        normalizer = _get_normalizer._instance = hazm.Normalizer()
    return normalizer


def cleaning(text):
    """Clean one raw text string and return the cleaned string.

    Steps, in order: strip surrounding whitespace, run `cleantext.clean`
    (lower-casing, removing line breaks, URLs, e-mails, phone numbers and
    currency symbols), remove `<...>` tags via `cleanhtml`, apply the hazm
    normalizer, delete the characters matched by `_WEIRD_PATTERN`, delete
    '#' characters, and collapse every whitespace run into a single space.

    Args:
        text: the raw text (a `str`).

    Returns:
        The cleaned text.
    """
    text = text.strip()

    # regular cleaning
    text = clean(text,
        fix_unicode=True,
        to_ascii=False,
        lower=True,
        no_line_breaks=True,
        no_urls=True,
        no_emails=True,
        no_phone_numbers=True,
        no_numbers=False,
        no_digits=False,
        no_currency_symbols=True,
        no_punct=False,
        replace_with_url="",
        replace_with_email="",
        replace_with_phone_number="",
        replace_with_number="",
        replace_with_digit="0",
        replace_with_currency_symbol="",
    )

    # cleaning htmls
    text = cleanhtml(text)

    # normalizing (the normalizer is built once and reused across calls)
    text = _get_normalizer().normalize(text)

    # removing weird patterns
    text = _WEIRD_PATTERN.sub(r'', text)

    # removing hashtags, then collapsing extra whitespace
    text = text.replace("#", "")
    # Raw string: "\s" in a plain string literal is an invalid escape sequence.
    text = re.sub(r"\s+", " ", text)

    return text

In [8]:
# Add a 'cleaned_text' column to each split by running `cleaning` on 'text'.
for split in (df_train, df_eval, df_test):
    split['cleaned_text'] = split['text'].apply(cleaning)
df_train

In [9]:
# Keep only the model input and target, then separate the two classes.
data = df_train.loc[:, ['cleaned_text', 'label']]
is_negative = data['label'] == 'negative'
is_positive = data['label'] == 'positive'
negative_data = data[is_negative]
positive_data = data[is_positive]
len(negative_data), len(positive_data)

In [10]:
cutting_point = 600
# Fixed seed so the class subsample and the final shuffle are identical on
# every run; without it the training set changes on each execution.
SAMPLING_SEED = 42


def _cap_rows(frame, limit, seed):
    """Return a seeded random sample of `limit` rows, or `frame` itself if it has fewer rows."""
    if limit <= len(frame):
        return frame.sample(n=limit, random_state=seed).reset_index(drop=True)
    return frame


# Cap each class at `cutting_point` rows so neither class dominates.
negative_data = _cap_rows(negative_data, cutting_point, SAMPLING_SEED)
positive_data = _cap_rows(positive_data, cutting_point, SAMPLING_SEED)

# Concatenate the two classes and shuffle them together.
data = pd.concat([negative_data, positive_data])
data = data.sample(frac=1, random_state=SAMPLING_SEED).reset_index(drop=True)
data

In [11]:
# Encode each label name as its position in `labels`.
for split in (data, df_eval, df_test):
    split['label_id'] = split['label'].apply(labels.index)


def _texts_and_ids(frame):
    """Return the 'cleaned_text' and 'label_id' columns of `frame` as two plain lists."""
    return frame['cleaned_text'].values.tolist(), frame['label_id'].values.tolist()


x_train, y_train = _texts_and_ids(data)
x_valid, y_valid = _texts_and_ids(df_eval)
x_test, y_test = _texts_and_ids(df_test)

# Sizes of the train / validation / test inputs, one per line.
for texts in (x_train, x_valid, x_test):
    print(len(texts))

## Training

In [12]:
from transformers import BertConfig, BertTokenizer
from transformers import TFBertModel, TFBertForSequenceClassification
from transformers import glue_convert_examples_to_features

import tensorflow as tf

In [24]:
# general config
# Batch sizes for the training, validation and test passes.
TRAIN_BATCH_SIZE, VALID_BATCH_SIZE, TEST_BATCH_SIZE = 4, 4, 4

# Optimisation settings.
# NOTE(review): EEVERY_EPOCH and CLIP are not referenced by any cell visible here.
EPOCHS = 1
EEVERY_EPOCH = 1000
LEARNING_RATE = 2e-5
CLIP = 0.0

# Pretrained checkpoint to fine-tune and the path reserved for the output.
MODEL_NAME_OR_PATH = 'HooshvareLab/bert-fa-base-uncased'
OUTPUT_PATH = './bert-fa-base-uncased/model.bin'

# Make sure the directory part of OUTPUT_PATH exists.
os.makedirs(os.path.split(OUTPUT_PATH)[0], exist_ok=True)

In [14]:
# Two-way mapping between label names and their integer ids.
label2id = dict(zip(labels, range(len(labels))))
id2label = dict(zip(label2id.values(), label2id.keys()))

print(f'label2id: {label2id}')
print(f'id2label: {id2label}')

In [15]:
# Load the tokenizer and the model configuration of the pretrained checkpoint,
# passing the label mappings as configuration overrides.
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME_OR_PATH)
config = BertConfig.from_pretrained(
    MODEL_NAME_OR_PATH,
    label2id=label2id,
    id2label=id2label,
)

print(config.to_json_string())

In [16]:
class InputExample:
    """Container for one sequence-classification example.

    Attributes:
        guid: identifier of the example.
        text_a: first text segment.
        text_b: optional second text segment (defaults to None).
        label: optional label of the example (defaults to None).
    """

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Store the given values on the instance unchanged."""
        self.guid, self.text_a = guid, text_a
        self.text_b, self.label = text_b, label


def make_examples(tokenizer, x, y=None, maxlen=128, output_mode="classification", is_tf_dataset=True,
                  label_list=None):
    """Tokenize texts (and optional labels) into model inputs.

    Args:
        tokenizer: tokenizer handed to `glue_convert_examples_to_features`.
        x: sequence of inputs; each item is either a string or a pair
            `(text_a, text_b)`.
        y: labels aligned with `x`, or None for unlabeled data. Each label is
            converted with `int(...)`.
        maxlen: maximum sequence length passed to the feature converter.
        output_mode: forwarded to `glue_convert_examples_to_features`.
            NOTE(review): labels are always cast with `int`, which only suits
            classification -- confirm before using another mode.
        is_tf_dataset: when True return a `tf.data.Dataset`, otherwise arrays.
        label_list: optional explicit label vocabulary. When omitted it is the
            sorted unique values of `y`, as before; pass it explicitly when a
            split may not contain every class, since the converter maps each
            label to its position in this list.

    Returns:
        `(dataset, features)` when `is_tf_dataset` is True. The dataset yields
        `(inputs, label)` pairs, or only `inputs` when `y` is None.
        `([xdata, ydata], features)` otherwise, where `xdata` is
        `[input_ids, attention_mask, token_type_ids]` as numpy arrays and
        `ydata` is the list of labels (all None when `y` is None).
    """
    # Previously any `y` that was not a list/ndarray was replaced by None
    # placeholders and the loop then crashed on `int(None)`.
    has_labels = y is not None
    row_labels = y if has_labels else [None] * len(x)

    examples = []
    for i, (_x, _y) in tqdm(enumerate(zip(x, row_labels)), position=0, total=len(x)):
        guid = "%s" % i
        label = int(_y) if _y is not None else None

        if isinstance(_x, str):
            text_a = _x
            text_b = None
        else:
            assert len(_x) == 2
            text_a = _x[0]
            text_b = _x[1]

        examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))

    if label_list is None:
        # Unlabeled input gets an empty vocabulary: there is nothing to map.
        label_list = list(np.unique(y)) if has_labels else []

    features = glue_convert_examples_to_features(
        examples,
        tokenizer,
        maxlen,
        output_mode=output_mode,
        label_list=label_list)

    all_input_ids = []
    all_attention_masks = []
    all_token_type_ids = []
    all_labels = []

    for f in tqdm(features, position=0, total=len(examples)):
        if is_tf_dataset:
            all_input_ids.append(tf.constant(f.input_ids))
            all_attention_masks.append(tf.constant(f.attention_mask))
            all_token_type_ids.append(tf.constant(f.token_type_ids))
            if has_labels:
                # A None label cannot be turned into a tensor.
                all_labels.append(tf.constant(f.label))
        else:
            all_input_ids.append(f.input_ids)
            all_attention_masks.append(f.attention_mask)
            all_token_type_ids.append(f.token_type_ids)
            all_labels.append(f.label)

    if is_tf_dataset:
        inputs = {
            'input_ids': all_input_ids,
            'attention_mask': all_attention_masks,
            'token_type_ids': all_token_type_ids
        }
        tensors = (inputs, all_labels) if has_labels else inputs
        dataset = tf.data.Dataset.from_tensor_slices(tensors)

        return dataset, features

    xdata = [np.array(all_input_ids), np.array(all_attention_masks), np.array(all_token_type_ids)]
    ydata = all_labels

    return [xdata, ydata], features

In [17]:
# Every split is tokenized to the same fixed length.
MAX_SEQ_LEN = 512

train_dataset_base, train_examples = make_examples(tokenizer, x_train, y_train, maxlen=MAX_SEQ_LEN)
valid_dataset_base, valid_examples = make_examples(tokenizer, x_valid, y_valid, maxlen=MAX_SEQ_LEN)

# The test split is built twice: once as a tf.data.Dataset (for `evaluate`)
# and once as plain arrays (for `predict` and the sklearn metrics).
test_dataset_base, test_examples = make_examples(tokenizer, x_test, y_test, maxlen=MAX_SEQ_LEN)
(xtest, ytest), test_examples = make_examples(
    tokenizer, x_test, y_test, maxlen=MAX_SEQ_LEN, is_tf_dataset=False
)

In [18]:
def get_training_dataset(dataset, batch_size):
    """Return `dataset` repeated, shuffled with a 2048-element buffer, and batched."""
    return dataset.repeat().shuffle(2048).batch(batch_size)

def get_validation_dataset(dataset, batch_size):
    """Return `dataset` batched, with no repeating or shuffling."""
    return dataset.batch(batch_size)

In [19]:
train_dataset = get_training_dataset(train_dataset_base, TRAIN_BATCH_SIZE)
# Validation uses the plain batched pipeline. It was previously built with
# `get_training_dataset`, i.e. repeated and shuffled, so each validation pass
# scored a random draw of examples instead of a fixed set.
valid_dataset = get_validation_dataset(valid_dataset_base, VALID_BATCH_SIZE)

# Whole batches per pass; the floor division means a final partial batch is
# not counted in either number.
train_steps = len(train_examples) // TRAIN_BATCH_SIZE
valid_steps = len(valid_examples) // VALID_BATCH_SIZE

train_steps, valid_steps

In [20]:
def build_model(model_name, config, learning_rate=3e-5):
    """Load a `TFBertForSequenceClassification` and compile it for training.

    The model is compiled with an Adam optimizer at `learning_rate`, a sparse
    categorical cross-entropy loss computed from logits, and a sparse
    categorical accuracy metric named 'accuracy'.

    Args:
        model_name: checkpoint name or path passed to `from_pretrained`.
        config: model configuration passed to `from_pretrained`.
        learning_rate: learning rate of the Adam optimizer.

    Returns:
        The compiled model.
    """
    model = TFBertForSequenceClassification.from_pretrained(model_name, config=config)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')],
    )
    return model

In [21]:
# Build and compile the classifier from the pretrained checkpoint, overriding
# build_model's default learning rate (3e-5) with the configured LEARNING_RATE.
model = build_model(MODEL_NAME_OR_PATH, config, learning_rate=LEARNING_RATE)

In [25]:
%%time

# Fine-tune for EPOCHS epochs; the step counts bound each training and
# validation pass.
r = model.fit(
    train_dataset,
    epochs=EPOCHS,
    steps_per_epoch=train_steps,
    validation_data=valid_dataset,
    validation_steps=valid_steps,
    verbose=1,
)

# Per-epoch validation accuracies recorded by Keras, averaged over the epochs.
final_accuracy = r.history['val_accuracy']
print('FINAL ACCURACY MEAN: ', np.mean(final_accuracy))

In [26]:
# Loss and metrics on the batched test dataset.
test_batches = test_dataset_base.batch(TEST_BATCH_SIZE)
ev = model.evaluate(test_batches)
print()
print(f'Evaluation: {ev}')
print()

# Predicted class per test example: arg-max over the last axis of the first
# element of the model output.
predictions = model.predict(xtest)
ypred = np.argmax(predictions[0], axis=-1).tolist()

print()
print(classification_report(ytest, ypred, target_names=labels))
print()

weighted_f1 = f1_score(ytest, ypred, average="weighted")
print(f'F1: {weighted_f1}')