#### Load the model, define the nlp pipeline as ner

In [1]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("Jean-Baptiste/roberta-large-ner-english")
model = AutoModelForTokenClassification.from_pretrained("Jean-Baptiste/roberta-large-ner-english")

from transformers import pipeline

nlp = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")

#### Specify channel name (for saving output)

In [66]:
channel = 'Nippon_TV'

#### Specify how channel is named in 'world_news_720_df.csv'

In [67]:
channel_in_df = 'Nippon TV (JP)'

#### Import Pandas and read in the world news df, then filter the rows for the specified channel

In [68]:
import pandas as pd
df = pd.read_csv('world_news_720_df.csv')
df = df[df['channel'] == channel_in_df] 
df

Unnamed: 0,channel,month,video_id,text
6126,Nippon TV (JP),12,-2nUSCe3oWY,a traffic accident involving multiple vehicles...
6127,Nippon TV (JP),8,-B-Z7bJkdPM,a research group of japan's health ministry ha...
6128,Nippon TV (JP),1,-JhA3vkeMuA,japanese automakers are reducing production am...
6129,Nippon TV (JP),11,-LNQS6KNGSk,japanese defense minister kishi nobuo inspecte...
6130,Nippon TV (JP),4,-PpNZTc8fJg,japanese princess mako's boyfriend komoro k is...
...,...,...,...,...
6841,Nippon TV (JP),11,zfo7so0YLaU,a car crashed into a ramen shop in tokyo injur...
6842,Nippon TV (JP),2,zhjHVf3Jtk8,a case of a damaged japanese cargo flight was ...
6843,Nippon TV (JP),2,zo4xD2ls7Vc,life-size hina dolls are on display at a histo...
6844,Nippon TV (JP),6,zsHgDV6rG8g,the australian softball team arrived in japan ...


#### Define 'docs' as the text column in the df

In [69]:
docs = df['text']
docs

6126    a traffic accident involving multiple vehicles...
6127    a research group of japan's health ministry ha...
6128    japanese automakers are reducing production am...
6129    japanese defense minister kishi nobuo inspecte...
6130    japanese princess mako's boyfriend komoro k is...
                              ...                        
6841    a car crashed into a ramen shop in tokyo injur...
6842    a case of a damaged japanese cargo flight was ...
6843    life-size hina dolls are on display at a histo...
6844    the australian softball team arrived in japan ...
6845    whiteout conditions have likely caused a massi...
Name: text, Length: 720, dtype: object

#### Extract the items tagged as named entity recognition (ner) items in the docs
#### (LOC: locations, PER: people, ORG: organizations, MISC: miscellaneous)
#### This will probably take a long time

In [70]:
docs_ner = nlp([doc for doc in docs])
docs_ner

[[{'entity_group': 'LOC',
   'score': 0.9999228,
   'word': ' japan',
   'start': 67,
   'end': 72},
  {'entity_group': 'ORG',
   'score': 0.83233464,
   'word': ' fire department',
   'start': 175,
   'end': 190}],
 [{'entity_group': 'LOC',
   'score': 0.9999117,
   'word': ' japan',
   'start': 20,
   'end': 25},
  {'entity_group': 'ORG',
   'score': 0.5957899,
   'word': ' health ministry',
   'start': 28,
   'end': 43},
  {'entity_group': 'MISC',
   'score': 0.9448408,
   'word': ' moderna',
   'start': 73,
   'end': 80},
  {'entity_group': 'LOC',
   'score': 0.9996625,
   'word': ' u',
   'start': 232,
   'end': 233},
  {'entity_group': 'LOC',
   'score': 0.9997414,
   'word': 's',
   'start': 234,
   'end': 235},
  {'entity_group': 'MISC',
   'score': 0.99859655,
   'word': ' covid-19',
   'start': 261,
   'end': 269},
  {'entity_group': 'MISC',
   'score': 0.9925898,
   'word': ' madurana',
   'start': 338,
   'end': 346},
  {'entity_group': 'PER',
   'score': 0.9985202,
   'wor

#### Create lists to store the data from each docs_ner dictionary item
#### the file list records the number of the document containing ner items
##### - this is necessary to calculate document frequency later

#### store the lists in a pandas dataframe

In [71]:
file = []
pnoun = []
entity_group = []
score = []
start = []
end = []


for i, doc in enumerate(docs_ner):
    for item in doc:
        file.append(i)
        pnoun.append(item['word'])
        entity_group.append(item['entity_group'])
        score.append(item['score'])
        start.append(item['start'])
        end.append(item['end'])
        
data = {'file': file, 'pnoun': pnoun, 'entity_group': entity_group,
        'score': score, 'start': start, 'end': end}

ner_df = pd.DataFrame(data)

ner_df

Unnamed: 0,file,pnoun,entity_group,score,start,end
0,0,japan,LOC,0.999923,67,72
1,0,fire department,ORG,0.832335,175,190
2,1,japan,LOC,0.999912,20,25
3,1,health ministry,ORG,0.595790,28,43
4,1,moderna,MISC,0.944841,73,80
...,...,...,...,...,...,...
4777,719,miyagi,LOC,0.997664,95,101
4778,719,japan,LOC,0.999764,126,131
4779,719,tohoku expressway,LOC,0.949357,254,271
4780,719,tokyo,LOC,0.998095,312,317


#### Import the frequency distibution from nltk and calculate the frequency of of each ner item (proper noun) 
#### - directly from the 'pnoun' list compiled above

In [72]:
from nltk.probability import FreqDist
fd = FreqDist(pnoun)
fd

FreqDist({' japan': 561, ' tokyo': 340, ' japanese': 318, ' covid-19': 79, ' tokyo olympics': 69, ' u.s': 58, ' olympic': 50, ' paralympics': 46, ' okinawa': 43, ' osaka': 39, ...})

#### Calculate total number of tokens in this part of the corpus to calculate normalised frequency (per million words)
#### I tried to calculate this by summing the tokens that were tokenized by the autotokenizer
#### - but it was difficult to check how the text was tokenized (punctuation, etc)
#### And I got this message too:
###### Token indices sequence length is longer than the specified maximum sequence length for this model (581 > 512). Running this sequence through the model will result in indexing errors

In [12]:
# Commented out for the reason above

# tokens = tokenizer([doc for doc in docs])
# total_tokens = sum([len(token) for token in tokens['input_ids']])
# total_tokens

#### So I used spaCy instead to calculate the number of tokens 
##### because I used spaCy to produce frequency lists for other parts of speech

In [74]:
import spacy

doc_lengths = []

spacy_nlp = spacy.load("en_core_web_sm")

spacy_docs = spacy_nlp.pipe(docs)

for doc in spacy_docs:
    doc_tokens = [token.text for token in doc if not token.is_punct]
    doc_lengths.append(len(doc_tokens))
    total_tokens = sum(doc_lengths)

total_tokens

80038

#### Create a pandas dataframe with the words and frequency information from the nltk FreqDist
#### - then add a column with the normalized frequency (per million)

In [75]:
freq_df = pd.DataFrame(fd.most_common(), columns=['word', 'frequency'])
freq_df['per_million'] = freq_df['frequency'] / total_tokens * 1000000
freq_df

Unnamed: 0,word,frequency,per_million
0,japan,561,7009.170644
1,tokyo,340,4247.982208
2,japanese,318,3973.112771
3,covid-19,79,987.031160
4,tokyo olympics,69,862.090507
...,...,...,...
1561,boeing 747f,1,12.494065
1562,transport ministry,1,12.494065
1563,hina,1,12.494065
1564,yamamoto ichita,1,12.494065


#### rearrange the ner_df dataframe to show type frequencies per document

In [76]:
grouped = ner_df.groupby(['file', 'pnoun']).count()
grouped

Unnamed: 0_level_0,Unnamed: 1_level_0,entity_group,score,start,end
file,pnoun,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,fire department,1,1,1,1
0,japan,1,1,1,1
1,covid-19,1,1,1,1
1,health ministry,1,1,1,1
1,ito suminobu,1,1,1,1
...,...,...,...,...,...
718,yamamoto ichita,1,1,1,1
719,japan,2,2,2,2
719,miyagi,1,1,1,1
719,tohoku expressway,1,1,1,1


#### Assign the list of types per document to a variable (types) by using .reset_index()
#### This can be used to calculate the document frequency of each proper noun

In [77]:
types = grouped.reset_index()['pnoun']
types

0          fire department
1                    japan
2                 covid-19
3          health ministry
4             ito suminobu
               ...        
3992       yamamoto ichita
3993                 japan
3994                miyagi
3995     tohoku expressway
3996                 tokyo
Name: pnoun, Length: 3997, dtype: object

#### Calculate the document frequency from the list of types

In [78]:
doc_freq = FreqDist(types)
doc_freq

FreqDist({' japan': 375, ' tokyo': 248, ' japanese': 246, ' covid-19': 70, ' tokyo olympics': 67, ' paralympics': 40, ' olympic': 37, ' u.s': 36, ' osaka': 31, ' tokyo games': 31, ...})

#### Calculate the number of docs in the subcorpus using the variable 'docs'
##### - this was used at the beginning to extract each text in the subcorpus from world_news_720_df.csv
#### Then create a pandas dataframe with the information from the 'doc_freq' nltk FreqDist created above
#### Then add a column calculating the percentage of documents featuring the proper noun

In [79]:
total_docs = len(docs)
doc_freq_df = pd.DataFrame(doc_freq.most_common(), columns=['word', 'doc_freq'])
doc_freq_df['doc_percentage'] = doc_freq_df['doc_freq'] / total_docs * 100
doc_freq_df

Unnamed: 0,word,doc_freq,doc_percentage
0,japan,375,52.083333
1,tokyo,248,34.444444
2,japanese,246,34.166667
3,covid-19,70,9.722222
4,tokyo olympics,67,9.305556
...,...,...,...
1561,boeing 747f,1,0.138889
1562,transport ministry,1,0.138889
1563,hina,1,0.138889
1564,yamamoto ichita,1,0.138889


#### Merge the dataframes together on the word

In [80]:
df_merged = pd.merge(left=freq_df, right=doc_freq_df, how='left', left_on=['word'],
         right_on = ['word'])

df_merged.to_csv(channel+'_ner_frequency.csv', encoding='utf8', index=False)

#### I created a different script for creating plots (ner_freq_plot.py)
#### So i could play around with parameters