# NER

Use a NER model to tag tweets and use entities as input to train a NLG model.

In [1]:
from transformers import BertForTokenClassification, BertTokenizer, T5Tokenizer
import torch
from ast import literal_eval
import os
import json
import numpy as np
import pandas as pd
import sys

sys.path.append('..')
from src.utils.utils import *

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_dir=os.path.join('D:\\Projects\\AIbrizioRomano\\model\\Bert')

with open(model_dir+'/config.json', 'r', encoding='utf-8') as f:
    datastore = json.load(f)

label_list = dict((int(k), v) for k,v in datastore['id2label'].items())
special_tokens = dict((v, f'<|{v}|>') for k,v in datastore['id2label'].items())

model = BertForTokenClassification.from_pretrained(model_dir)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [3]:
df=pd.read_csv('D:\\Projects\\AIbrizioRomano\\fAIbrizioRomano.csv', sep=',', index_col=0)
df['Entities'] = df['Entities'].apply(literal_eval)

df=df[np.isnan(df['In_reply_to_status_id'])]
df=df[df['Language']=='en']
df=df[~df.Text.str.contains("b'RT")]
df=df[~df.Text.str.contains("RT")]
df['Text']=df.apply(lambda x: trim_text(x), axis=1)
df['Text']=df.apply(lambda x: clean_text(x), axis=1)
df.reset_index
df

Unnamed: 0,Time,Text,Entities,In_reply_to_status_id,Language
1,2022-07-26 08:23:08+00:00,Cristiano Ronaldo will meet with Erik ten Hag ...,"{'hashtags': [{'text': 'MUFC', 'indices': [162...",,en
3,2022-07-26 07:19:31+00:00,Napoli are set to sign Kim Min Jae as new cent...,"{'hashtags': [{'text': 'Napoli', 'indices': [1...",,en
4,2022-07-26 07:11:35+00:00,Nottingham Forest are set to sign Orel Mangala...,"{'hashtags': [{'text': 'NFFC', 'indices': [157...",,en
6,2022-07-26 06:07:00+00:00,"Youri Tielemans deal, stalling - as Leicester ...","{'hashtags': [{'text': 'LCFC', 'indices': [147...",,en
7,2022-07-26 05:51:12+00:00,Paris Saint-Germain and RB Leipzig have comple...,"{'hashtags': [{'text': 'PSG', 'indices': [176,...",,en
...,...,...,...,...,...
3239,2022-05-06 10:40:49+00:00,Gavi update. Meeting just finished between Bar...,"{'hashtags': [{'text': 'FCB', 'indices': [174,...",,en
3240,2022-05-06 10:28:56+00:00,Meeting today as expected for Gavi. Barcelona ...,"{'hashtags': [{'text': 'FCB', 'indices': [161,...",,en
3241,2022-05-06 10:16:35+00:00,Nagelsmann on Lewandowski deal: The conversati...,"{'hashtags': [{'text': 'FCBayern', 'indices': ...",,en
3247,2022-05-06 10:03:26+00:00,Emerson Palmieri will leave OL at the end of c...,"{'hashtags': [{'text': 'OL', 'indices': [139, ...",,en


In [4]:
df['Entities']=df.apply(lambda x: get_entities(model, tokenizer, x['Text'], label_list), axis=1)
df

Unnamed: 0,Time,Text,Entities,In_reply_to_status_id,Language
1,2022-07-26 08:23:08+00:00,Cristiano Ronaldo will meet with Erik ten Hag ...,"[{'text': 'cristiano ronaldo', 'label': 'PERSO...",,en
3,2022-07-26 07:19:31+00:00,Napoli are set to sign Kim Min Jae as new cent...,"[{'text': 'napoli', 'label': 'CLUB', 'score': ...",,en
4,2022-07-26 07:11:35+00:00,Nottingham Forest are set to sign Orel Mangala...,"[{'text': 'nottingham forest', 'label': 'CLUB'...",,en
6,2022-07-26 06:07:00+00:00,"Youri Tielemans deal, stalling - as Leicester ...","[{'text': 'youri tielemans', 'label': 'PERSON'...",,en
7,2022-07-26 05:51:12+00:00,Paris Saint-Germain and RB Leipzig have comple...,"[{'text': 'paris saint - germain', 'label': 'C...",,en
...,...,...,...,...,...
3239,2022-05-06 10:40:49+00:00,Gavi update. Meeting just finished between Bar...,"[{'text': 'gavi', 'label': 'PERSON', 'score': ...",,en
3240,2022-05-06 10:28:56+00:00,Meeting today as expected for Gavi. Barcelona ...,"[{'text': 'today', 'label': 'DATE', 'score': 0...",,en
3241,2022-05-06 10:16:35+00:00,Nagelsmann on Lewandowski deal: The conversati...,"[{'text': 'nagelsmann', 'label': 'PERSON', 'sc...",,en
3247,2022-05-06 10:03:26+00:00,Emerson Palmieri will leave OL at the end of c...,"[{'text': 'emerson palmieri', 'label': 'PERSON...",,en


In [8]:
def ent_to_string(row, special_tokens):
    string=''
    for e in row['Entities']:
        label=special_tokens[e['label']]
        text=e['text']
        string=string + f'{label} {text} {label} '

    return string

In [6]:
df['Input String']=df.apply(lambda x: ent_to_string(x, special_tokens), axis=1)

In [7]:
df

Unnamed: 0,Time,Text,Entities,In_reply_to_status_id,Language,Input String
1,2022-07-26 08:23:08+00:00,Cristiano Ronaldo will meet with Erik ten Hag ...,"[{'text': 'cristiano ronaldo', 'label': 'PERSO...",,en,<|PERSON|> cristiano ronaldo <|PERSON|> <|PERS...
3,2022-07-26 07:19:31+00:00,Napoli are set to sign Kim Min Jae as new cent...,"[{'text': 'napoli', 'label': 'CLUB', 'score': ...",,en,<|CLUB|> napoli <|CLUB|> <|PERSON|> kim min ja...
4,2022-07-26 07:11:35+00:00,Nottingham Forest are set to sign Orel Mangala...,"[{'text': 'nottingham forest', 'label': 'CLUB'...",,en,<|CLUB|> nottingham forest <|CLUB|> <|PERSON|>...
6,2022-07-26 06:07:00+00:00,"Youri Tielemans deal, stalling - as Leicester ...","[{'text': 'youri tielemans', 'label': 'PERSON'...",,en,<|PERSON|> youri tielemans <|PERSON|> <|CLUB|>...
7,2022-07-26 05:51:12+00:00,Paris Saint-Germain and RB Leipzig have comple...,"[{'text': 'paris saint - germain', 'label': 'C...",,en,<|CLUB|> paris saint - germain <|CLUB|> <|CLUB...
...,...,...,...,...,...,...
3239,2022-05-06 10:40:49+00:00,Gavi update. Meeting just finished between Bar...,"[{'text': 'gavi', 'label': 'PERSON', 'score': ...",,en,<|PERSON|> gavi <|PERSON|> <|CLUB|> barcelona ...
3240,2022-05-06 10:28:56+00:00,Meeting today as expected for Gavi. Barcelona ...,"[{'text': 'today', 'label': 'DATE', 'score': 0...",,en,<|DATE|> today <|DATE|> <|PERSON|> gavi <|PERS...
3241,2022-05-06 10:16:35+00:00,Nagelsmann on Lewandowski deal: The conversati...,"[{'text': 'nagelsmann', 'label': 'PERSON', 'sc...",,en,<|PERSON|> nagelsmann <|PERSON|> <|PERSON|> le...
3247,2022-05-06 10:03:26+00:00,Emerson Palmieri will leave OL at the end of c...,"[{'text': 'emerson palmieri', 'label': 'PERSON...",,en,<|PERSON|> emerson palmieri <|PERSON|> <|TEAM_...


In [10]:
df.to_csv(os.path.join('D:\\Projects\\Lab.Research.FootballNewsGenerator\\data_mapped.csv'))