In [2]:
import nltk
from nltk.tokenize import word_tokenize
from nltk. corpus import stopwords
from nltk.stem import WordNetLemmatizer
import spacy
import pandas as pd
import re
import matplotlib.pyplot as plt

In [6]:
# Load the BBC news CSV (relative path, so it is resolved against the
# current working directory) and preview the first five rows.
bbc_data = pd.read_csv(filepath_or_buffer='bbc_news.csv')
bbc_data.head(n=5)

Unnamed: 0.1,Unnamed: 0,index,title,pubDate,guid,link,description
0,0,6684,Can I refuse to work?,"Wed, 10 Aug 2022 15:46:18 GMT",https://www.bbc.co.uk/news/business-62147992,https://www.bbc.co.uk/news/business-62147992?a...,With much of the UK enduring another period of...
1,1,9267,'Liz Truss the Brief?' World reacts to UK poli...,"Mon, 17 Oct 2022 11:35:12 GMT",https://www.bbc.co.uk/news/world-63285480,https://www.bbc.co.uk/news/world-63285480?at_m...,The UK's political chaos has been watched arou...
2,2,7387,Rationing energy is nothing new for off-grid c...,"Wed, 31 Aug 2022 05:20:18 GMT",https://www.bbc.co.uk/news/uk-scotland-highlan...,https://www.bbc.co.uk/news/uk-scotland-highlan...,Scoraig in the north west Highlands has long h...
3,3,767,The hunt for superyachts of sanctioned Russian...,"Tue, 22 Mar 2022 14:37:01 GMT",https://www.bbc.co.uk/news/60739336,https://www.bbc.co.uk/news/60739336?at_medium=...,"Wealthy Russians sanctioned by the US, EU and ..."
4,4,3712,Platinum Jubilee: 70 years of the Queen in 70 ...,"Wed, 01 Jun 2022 23:17:33 GMT",https://www.bbc.co.uk/news/uk-61660128,https://www.bbc.co.uk/news/uk-61660128?at_medi...,A quick look back at the Queen's 70 years on t...


In [8]:
# Pull the headline text out into its own single-column DataFrame so that
# derived text columns can be attached next to it.
titles = bbc_data['title'].to_frame()

titles.head(n=5)

Unnamed: 0,title
0,Can I refuse to work?
1,'Liz Truss the Brief?' World reacts to UK poli...
2,Rationing energy is nothing new for off-grid c...
3,The hunt for superyachts of sanctioned Russian...
4,Platinum Jubilee: 70 years of the Queen in 70 ...


# Text Preprocessing

In [16]:
# Headline cleaning pipeline. Every stage is stored in its own column so the
# intermediate results stay inspectable:
#   title -> lowercase -> no_stopwords -> no_punct -> tokens -> lemmatized

# 1. Lowercase so the stopword comparison below is case-insensitive.
titles['lowercase'] = titles['title'].str.lower()

# 2. Drop English stopwords. Words are compared exactly as produced by a
#    whitespace split, i.e. before punctuation is stripped, so a word with
#    punctuation attached (e.g. "work?") is compared with it still attached.
en_stopwords = stopwords.words('english')
# Set copy for constant-time membership tests; `en_stopwords` itself is left
# as the original list so any other cell using it is unaffected.
en_stopwords_set = frozenset(en_stopwords)
titles['no_stopwords'] = titles['lowercase'].apply(
    lambda text: ' '.join(word for word in text.split() if word not in en_stopwords_set)
)

# 3. Delete every character that is neither a word character nor whitespace.
#    Characters are removed, not replaced by a space ("off-grid" -> "offgrid").
#    The pattern is compiled once instead of being looked up per row.
punct_pattern = re.compile(r"[^\w\s]")
titles['no_punct'] = titles['no_stopwords'].apply(lambda text: punct_pattern.sub('', text))

# 4. Split each cleaned headline into a list of tokens.
titles['tokens'] = titles['no_punct'].apply(word_tokenize)

# 5. Lemmatize token by token (lemmatize() is called without a POS argument).
lemmatizer = WordNetLemmatizer()
titles['lemmatized'] = titles['tokens'].apply(
    lambda tokens: [lemmatizer.lemmatize(token) for token in tokens]
)

titles.head()

Unnamed: 0,title,lowercase,no_stopwords,no_punct,tokens,lemmatized
0,Can I refuse to work?,can i refuse to work?,refuse work?,refuse work,"[refuse, work]","[refuse, work]"
1,'Liz Truss the Brief?' World reacts to UK poli...,'liz truss the brief?' world reacts to uk poli...,'liz truss brief?' world reacts uk political t...,liz truss brief world reacts uk political turmoil,"[liz, truss, brief, world, reacts, uk, politic...","[liz, truss, brief, world, reacts, uk, politic..."
2,Rationing energy is nothing new for off-grid c...,rationing energy is nothing new for off-grid c...,rationing energy nothing new off-grid community,rationing energy nothing new offgrid community,"[rationing, energy, nothing, new, offgrid, com...","[rationing, energy, nothing, new, offgrid, com..."
3,The hunt for superyachts of sanctioned Russian...,the hunt for superyachts of sanctioned russian...,hunt superyachts sanctioned russian oligarchs,hunt superyachts sanctioned russian oligarchs,"[hunt, superyachts, sanctioned, russian, oliga...","[hunt, superyachts, sanctioned, russian, oliga..."
4,Platinum Jubilee: 70 years of the Queen in 70 ...,platinum jubilee: 70 years of the queen in 70 ...,platinum jubilee: 70 years queen 70 seconds,platinum jubilee 70 years queen 70 seconds,"[platinum, jubilee, 70, years, queen, 70, seco...","[platinum, jubilee, 70, year, queen, 70, second]"


In [17]:
# Flatten the per-headline token lists into two corpus-wide lists.
# sum(list_of_lists, []) builds a brand-new list for every row it adds
# (quadratic in the number of tokens); a single comprehension is linear and
# yields the tokens in the same order.
tokens_raw_list = [token for row in titles['tokens'] for token in row]
tokens_cleaned_list = [token for row in titles['lemmatized'] for token in row]

# Print the size and a short preview rather than dumping every token into
# the notebook output.
PREVIEW_SIZE = 20
print(len(tokens_raw_list), tokens_raw_list[:PREVIEW_SIZE])
print(len(tokens_cleaned_list), tokens_cleaned_list[:PREVIEW_SIZE])



In [18]:
# Load the `en_core_web_sm` spaCy pipeline; `nlp` is the callable that turns
# text into a processed document.
nlp = spacy.load(name="en_core_web_sm")

In [23]:
# Run spaCy over the whole corpus as one space-joined string, producing a
# single document covering every headline.
# NOTE(review): joining drops the boundaries between headlines, which may
# affect tagging quality — confirm this is intended.
corpus_text = ' '.join(tokens_raw_list)
spacy_doc = nlp(corpus_text)

# Part-of-Speech (POS) Tagging

In [24]:
# One row per token in the spaCy document: its text and its POS tag.
# All rows are collected first and the frame is built once; calling
# pd.concat inside the loop re-copies the accumulated frame for every token,
# which is quadratic in the number of tokens.
pos_records = [(token.text, token.pos_) for token in spacy_doc]
pos_df = pd.DataFrame(pos_records, columns=['Token', 'POS_tag'])

In [29]:
# Count how often each (token, POS tag) pair occurs, most frequent first.
# The same token can appear on several rows, once per tag it was given.
pos_pair_sizes = pos_df.groupby(['Token', 'POS_tag']).size()
pos_df_counts = (
    pos_pair_sizes
    .reset_index(name='counts')
    .sort_values(by='counts', ascending=False)
)
pos_df_counts.head(n=10)

Unnamed: 0,Token,POS_tag,counts
31,2022,NUM,47
1237,england,PROPN,40
935,cup,PROPN,36
3946,uk,PROPN,33
2544,new,ADJ,32
4096,war,NOUN,32
3262,says,VERB,30
3951,ukraine,VERB,28
4210,world,NOUN,28
4211,world,PROPN,26


In [39]:
# First ten rows of pos_df_counts tagged NOUN, then the first ten tagged VERB.
# Rows are taken in pos_df_counts' existing order, so these are the most
# frequent ones as long as that frame is sorted by descending count.
is_noun = pos_df_counts['POS_tag'].eq('NOUN')
nouns = pos_df_counts.loc[is_noun].iloc[:10]
print(nouns)

print("\n")

is_verb = pos_df_counts['POS_tag'].eq('VERB')
verbs = pos_df_counts.loc[is_verb].iloc[:10]
print(verbs)

       Token POS_tag  counts
4096     war    NOUN      32
4210   world    NOUN      28
2275     man    NOUN      23
2707  papers    NOUN      18
3049  record    NOUN      17
1233  energy    NOUN      17
2845  police    NOUN      16
971      day    NOUN      15
4128    week    NOUN      15
934      cup    NOUN      14


        Token POS_tag  counts
3262     says    VERB      30
3951  ukraine    VERB      28
1471    found    VERB      13
4174      win    VERB      10
1562      get    VERB       9
2271     make    VERB       8
4184     wins    VERB       8
1763     hits    VERB       8
3691     take    VERB       8
3261      say    VERB       8


# Named Entity Recognition (NER)

In [43]:
# One row per named entity in the spaCy document: the entity text and its
# label. `spacy_doc.ents` yields entity spans (which may cover several
# tokens), hence the loop variable name `ent`.
# Rows are gathered in a single pass and the frame is built once, instead of
# growing it with pd.concat on every iteration (quadratic copying).
# The missing-label guard from the original loop is kept, written with
# pd.notna rather than `== False`.
ner_records = [
    (ent.text, ent.label_)
    for ent in spacy_doc.ents
    if pd.notna(ent.label_)
]
ner_df = pd.DataFrame(ner_records, columns=['token', 'ner_tag'])

ner_df.head()


Unnamed: 0,token,ner_tag
0,russian,NORP
1,70 years,DATE
2,70 seconds,TIME
3,bull,ORG
4,1s,CARDINAL


In [44]:
# Count how often each (entity text, label) pair occurs, most frequent first.
# The same text can appear on several rows, once per label it was given.
ner_pair_sizes = ner_df.groupby(['token', 'ner_tag']).size()
ner_df_counts = (
    ner_pair_sizes
    .reset_index(name='counts')
    .sort_values(by='counts', ascending=False)
)
ner_df_counts.head(n=10)

Unnamed: 0,token,ner_tag,counts
34,2022,CARDINAL,30
427,russian,NORP,22
219,first,ORDINAL,15
35,2022,DATE,10
207,england,GPE,10
426,russia,GPE,10
498,uk,GPE,10
154,china,GPE,9
391,one,CARDINAL,9
227,france,GPE,9
