In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
!pip install torch

In [None]:
import torch

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))
    !nvidia-smi

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline, AutoModelForTokenClassification

In [None]:
model_name = 'aubmindlab/bert-base-arabertv2'

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
classifier = pipeline('ner', model=model_name, tokenizer=tokenizer)

In [None]:
import pandas as pd
import os

In [None]:
data = os.listdir('../../data/facebook_posts')

In [None]:
data

In [None]:
df_list = []
for filename in data:
    df = pd.read_csv(f"../../data/facebook_posts/{filename}")
    df_list.append(df)

In [None]:
df_list

In [None]:
df = pd.concat(df_list, axis=0, ignore_index=True)

In [None]:
df.columns

In [None]:
df_message = df[["Facebook Id","Total Interactions", "Message"]]

In [None]:
# 1114 posts with no message
df_message.dropna(subset= ["Message"], inplace= True)

In [None]:
# Applies the untrained model to every message. No longer needed.
# df_message = df_message.assign(NER = lambda x: x["Message"].apply(lambda s: classifier(s)))

In [None]:
df_message

In [None]:
# Tag the 25-post sample here so that the "NER" column read by the following
# cells exists on a fresh kernel; the only other code creating it is commented
# out. assign() returns a new frame, so df_message itself is left untouched.
df_message_small = df_message[:25].assign(NER=lambda x: x["Message"].apply(lambda s: classifier(s)))

In [None]:
df_message_small

In [None]:
def expand_dict_series(
    df: pd.DataFrame,
    dict_column: str,
    keys = None,
    col_prefix = None,
) -> pd.DataFrame:
    """Expand a column of dicts into one column per dict key.

    Args:
        df: Frame holding the dict column. It is not modified; the result is
            built on a copy whose index is reset to 0..n-1.
        dict_column: Name of the column whose values are dicts. It must not
            contain missing values.
        keys: Columns to keep from the expanded dicts (list, pandas Index,
            array, ...). ``None`` or an empty collection keeps every key.
            When ``col_prefix`` is given, the prefix is applied before this
            selection, so ``keys`` must name the prefixed columns.
        col_prefix: Optional prefix added to every expanded column. Use it
            when dict keys collide with existing column names.

    Returns:
        ``df`` without ``dict_column``, joined with the selected expanded
        columns. A key that is absent from a given row's dict but present in
        other rows yields NaN for that row.

    Raises:
        AssertionError: if ``dict_column`` contains missing values.
        KeyError: raised by pandas when requested ``keys`` are not among the
            expanded columns.

    ref:
    https://stackoverflow.com/questions/54344114/expand-pandas-dataframe-column-of-dict-into-dataframe-columns
    """
    assert not df[dict_column].isnull().any(), "na found in dict column"
    df = df.reset_index(drop=True)  # get a clean index for the join
    expanded_df = pd.DataFrame(df[dict_column].values.tolist())
    if col_prefix is not None:
        expanded_df = expanded_df.add_prefix(col_prefix)
    # Test for "no selection" explicitly: `keys or ...` evaluates the truth
    # value of `keys`, which raises for a pandas Index or a numpy array.
    if keys is None or (hasattr(keys, "__len__") and len(keys) == 0):
        keys = expanded_df.columns
    return df.drop(dict_column, axis=1).join(expanded_df[keys])


In [None]:
NER_keys_df = expand_dict_series(df_message_small.explode("NER"), "NER")

In [None]:
NER_keys_df.iloc[0].Message

In [None]:
NER_keys_df["score"].hist()

In [None]:
NER_keys_df.groupby("entity").count()

In [None]:
word_entity_df = NER_keys_df.groupby(["word", "entity"]).count().sort_values(by="score", ascending=False)

In [None]:
word_entity_df[:30]

In [None]:
# After the groupby above "word" is an index level, not a column, so restore
# it with reset_index() before reading x["word"].
# NOTE(review): `translator` is not defined anywhere in the visible notebook -
# it has to be created (or imported) before this cell can run.
word_entity_df.reset_index().assign(word_en =  lambda x: x["word"].apply(lambda s: translator.translate(s)))

In [None]:
from data.data_path import DATA_DIRECTORY

In [None]:
# Tag inventory of the NER model. The order is significant: the label2ent
# mapping defined further down pairs LABEL_<i> with the i-th entry.
label_list = [
    'B-LOC', 'O', 'B-PERS',
    'I-PERS', 'B-ORG', 'I-LOC',
    'I-ORG', 'B-MISC', 'I-MISC',
]

In [None]:
# PROBABLY NOT NEEDED
class NERDataset:
    """Indexable collection that encodes pre-split sentences as fixed-length tensors.

    Every entry of ``texts`` is iterated word by word. Each word is cleaned
    with the AraBERT preprocessor, split into sub-tokens by the tokenizer, and
    the resulting sequence is wrapped in the tokenizer's CLS/SEP tokens and
    truncated or padded to ``max_length``.

    ``tags`` and ``label_map`` are stored but never read by ``__getitem__``:
    every position of the returned ``labels`` tensor carries the padding label
    id, i.e. ``torch.nn.CrossEntropyLoss().ignore_index``.
    """

    def __init__(self, texts, tags, label_list, model_name, max_length):
        """Store the inputs and build the preprocessor and tokenizer for ``model_name``."""
        self.texts = texts
        self.tags = tags
        self.max_length = max_length
        self.label_map = {name: position for position, name in enumerate(label_list)}
        # The preprocessor is keyed by the last path component of the model name.
        self.preprocessor = ArabertPreprocessor(model_name.split("/")[-1])
        # Positions labelled with the loss' ignore_index do not contribute to
        # the cross-entropy loss, which makes it a convenient padding label.
        self.pad_token_label_id = torch.nn.CrossEntropyLoss().ignore_index
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def __len__(self):
        """Number of entries in ``texts``."""
        return len(self.texts)

    def __getitem__(self, item):
        """Encode ``self.texts[item]``.

        Returns a dict with ``input_ids``, ``attention_mask``,
        ``token_type_ids`` and ``labels``, each a ``torch.long`` tensor of
        length ``max_length``.
        """
        tok = self.tokenizer
        ignore_id = self.pad_token_label_id

        # Clean and sub-tokenize word by word; a word that produces no
        # sub-tokens simply adds nothing to the sequence.
        pieces = []
        for word in self.texts[item]:
            pieces.extend(tok.tokenize(self.preprocessor.preprocess(word)))

        # Leave room for the special tokens the tokenizer adds around a sequence.
        room = self.max_length - tok.num_special_tokens_to_add()
        if len(pieces) > room:
            pieces = pieces[:room]

        # [CLS] sub-tokens [SEP]
        tokens = [tok.cls_token] + pieces + [tok.sep_token]
        input_ids = tok.convert_tokens_to_ids(tokens)

        n_real = len(input_ids)
        n_pad = self.max_length - n_real

        # Real tokens get attention 1, padding gets 0; segment ids are all 0.
        # Every label (real and padded positions alike) is the ignore id, as
        # no real tags are attached here.
        input_ids += [tok.pad_token_id] * n_pad
        attention_mask = [1] * n_real + [0] * n_pad
        token_type_ids = [0] * len(tokens) + [0] * n_pad
        label_ids = [ignore_id] * len(tokens) + [ignore_id] * n_pad

        for sequence in (input_ids, attention_mask, token_type_ids, label_ids):
            assert len(sequence) == self.max_length

        encoded = {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'token_type_ids': token_type_ids,
            'labels': label_ids,
        }
        return {
            field: torch.tensor(values, dtype=torch.long)
            for field, values in encoded.items()
        }

In [None]:
# model_name = DATA_DIRECTORY +"/retrained_ner_arabertv2"
model_name = 'aubmindlab/bert-base-arabertv2'

In [None]:
!pip install pyarabic
# !git clone https://github.com/aub-mind/arabert
!pip install farasapy
from arabert.preprocess import ArabertPreprocessor

In [None]:
## retrained: 
model_rt = AutoModelForTokenClassification.from_pretrained(DATA_DIRECTORY +"/retrained_ner_arabertv2")
# tokenizer_rt = AutoTokenizer.from_pretrained(DATA_DIRECTORY +"/retrained_ner_arabertv2")


In [None]:
classifier_rt = pipeline('token-classification', model=model_rt, tokenizer=tokenizer)

In [None]:
df_rt = df_message[:100].assign(NER_rt =  lambda x: x["Message"].apply(lambda s: classifier_rt(s)))

In [None]:
# Same tag inventory as defined earlier in the notebook, repeated here next to
# the label2ent mapping, which pairs LABEL_<i> with the i-th entry.
label_list = [
    'B-LOC', 'O', 'B-PERS',
    'I-PERS', 'B-ORG', 'I-LOC',
    'I-ORG', 'B-MISC', 'I-MISC',
]

In [None]:

# Translate the generic "LABEL_<n>" names found in the model output into
# readable tags; the n-th tag of the tuple belongs to LABEL_<n>.
label2ent = {
    f"LABEL_{position}": tag
    for position, tag in enumerate(
        ('B-LOC', 'O', 'B-PERS', 'I-PERS', 'B-ORG', 'I-LOC', 'I-ORG', 'B-MISC', 'I-MISC')
    )
}

In [None]:
df_indexed = df_rt.reset_index()

In [None]:
df_indexed= df_indexed.rename(columns={"index": "post_id"})

In [None]:
NER_keys_df_rt = expand_dict_series(df_indexed.explode("NER_rt"), "NER_rt")

In [None]:
NER_keys_df_rt["entity"] = NER_keys_df_rt["entity"].replace(label2ent)

In [None]:
NER_keys_df_rt

In [None]:
### concat entities together
NER_keys_df_rt_entities = NER_keys_df_rt[NER_keys_df_rt["entity"]!="O"].copy()

In [None]:
NER_keys_df_rt_entities= NER_keys_df_rt_entities.reset_index(drop=True)

In [None]:
# Quick look at the entity-type suffix of the first row. An explicit row label
# is used because `i` is not defined at this point on a fresh kernel.
NER_keys_df_rt_entities.loc[0, "entity"][-3:]

In [None]:
NER_keys_df_rt_entities["word_plus_1"] = NER_keys_df_rt_entities["word"].shift(-1)
NER_keys_df_rt_entities["ent_plus_1"] = NER_keys_df_rt_entities["entity"].shift(-1)

In [None]:
# Vectorised one-step merge: append the following word only when the following
# row continues the same entity (an "I" tag with the same type suffix);
# otherwise keep the word as it is. The original line ended in a dangling
# `if`, which is a syntax error.
NER_keys_df_rt_entities["word_new"] = (
    NER_keys_df_rt_entities["word"] + NER_keys_df_rt_entities["word_plus_1"]
).where(
    NER_keys_df_rt_entities["ent_plus_1"].str.startswith("I", na=False)
    & (NER_keys_df_rt_entities["ent_plus_1"].str[-3:] == NER_keys_df_rt_entities["entity"].str[-3:]),
    NER_keys_df_rt_entities["word"],
)

In [None]:
def concat_condition(i:int, df) -> bool:
    """Tell whether row ``i + 1`` continues the entity found in row ``i``.

    The answer is truthy only when all of the following hold:
    row ``i`` carries a "B" or "I" tag, row ``i + 1`` carries an "I" tag,
    both tags end in the same three characters, and row ``i + 1`` starts at
    the character offset where row ``i`` ends.

    ``df`` is looked up by label (``df.loc``) and needs the columns
    ``entity``, ``start`` and ``end``.
    """
    current_tag = df.loc[i, "entity"]
    if not current_tag.startswith(("I", "B")):
        return False

    # Row i + 1 is only consulted once row i qualifies.
    next_tag = df.loc[i + 1, "entity"]
    if not next_tag.startswith("I"):
        return False
    if next_tag[-3:] != current_tag[-3:]:
        return False

    return df.loc[i + 1, "start"] == df.loc[i, "end"]

In [None]:
df_2 = NER_keys_df_rt_entities.copy()

In [None]:
## Concat words if they're B and I's of the same entity type
# Walk backwards so that a chain B, I, I, ... is folded from its tail into the
# leading row. The walk has to reach row 0: starting the range at 1 left the
# very first entity without its continuation, and that text was then lost when
# the "I" rows are dropped further down.
for i in reversed(range(len(df_2) - 1)):
    if concat_condition(i, df_2):
        df_2.loc[i, "word"] = df_2.loc[i, "word"] + df_2.loc[i + 1, "word"]

In [None]:
df_2

In [None]:
df_post = df_2.groupby("post_id").first()

In [None]:
NER_keys_df_rt_entities_1 = df_2.drop(df_2[df_2["entity"].str.startswith("I")].index)

In [None]:
NER_keys_df_rt_entities_1["word"] = NER_keys_df_rt_entities_1["word"].str.replace("#", "", regex=False)

In [None]:
NER_keys_df_rt_entities_1

In [None]:
entities = NER_keys_df_rt_entities_1[NER_keys_df_rt_entities_1["entity"]!="O"].groupby(["post_id", "entity"])["word"].apply(list)



In [None]:
NER_keys_df_rt.iloc[0]["Message"]

In [None]:
entities

In [None]:
ent_df = entities.unstack()

In [None]:
ent_df = ent_df.fillna("[]")

In [None]:
ent_df

In [None]:
df_post = df_post[["Facebook Id","Total Interactions", "Message"]]

In [None]:
df_post.merge(ent_df, left_index=True, right_index=True, how="left").to_csv("Facebook_May.csv")

In [None]:
# After unstack() the columns of ent_df are the entity tags, so there is no
# "word" column to select; preview the table itself instead.
ent_df.head()