In [2]:
import pandas as pd
import spacy
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import matplotlib.pyplot as plt

In [3]:
bbc_data=pd.read_csv("bbc_news.csv")

In [4]:
bbc_data.head()

Unnamed: 0.1,Unnamed: 0,index,title,pubDate,guid,link,description
0,0,6684,Can I refuse to work?,"Wed, 10 Aug 2022 15:46:18 GMT",https://www.bbc.co.uk/news/business-62147992,https://www.bbc.co.uk/news/business-62147992?a...,With much of the UK enduring another period of...
1,1,9267,'Liz Truss the Brief?' World reacts to UK poli...,"Mon, 17 Oct 2022 11:35:12 GMT",https://www.bbc.co.uk/news/world-63285480,https://www.bbc.co.uk/news/world-63285480?at_m...,The UK's political chaos has been watched arou...
2,2,7387,Rationing energy is nothing new for off-grid c...,"Wed, 31 Aug 2022 05:20:18 GMT",https://www.bbc.co.uk/news/uk-scotland-highlan...,https://www.bbc.co.uk/news/uk-scotland-highlan...,Scoraig in the north west Highlands has long h...
3,3,767,The hunt for superyachts of sanctioned Russian...,"Tue, 22 Mar 2022 14:37:01 GMT",https://www.bbc.co.uk/news/60739336,https://www.bbc.co.uk/news/60739336?at_medium=...,"Wealthy Russians sanctioned by the US, EU and ..."
4,4,3712,Platinum Jubilee: 70 years of the Queen in 70 ...,"Wed, 01 Jun 2022 23:17:33 GMT",https://www.bbc.co.uk/news/uk-61660128,https://www.bbc.co.uk/news/uk-61660128?at_medi...,A quick look back at the Queen's 70 years on t...


In [5]:
bbc_data.info

<bound method DataFrame.info of      Unnamed: 0  index                                              title  \
0             0   6684                              Can I refuse to work?   
1             1   9267  'Liz Truss the Brief?' World reacts to UK poli...   
2             2   7387  Rationing energy is nothing new for off-grid c...   
3             3    767  The hunt for superyachts of sanctioned Russian...   
4             4   3712  Platinum Jubilee: 70 years of the Queen in 70 ...   
..          ...    ...                                                ...   
995         995  13238  Dominic Raab: Third senior civil servant gives...   
996         996   4730                Highlights: Radacanu beats Uytvanck   
997         997   1871    In pictures: Mountain bikers descend snowy peak   
998         998   4346  Companies must help cut living costs, says new...   
999         999   8482     Beware online car sale scams, consumers warned   

                           pubDate  \
0    

In [6]:
titles=pd.DataFrame(bbc_data['title'])

In [7]:
titles.head()

Unnamed: 0,title
0,Can I refuse to work?
1,'Liz Truss the Brief?' World reacts to UK poli...
2,Rationing energy is nothing new for off-grid c...
3,The hunt for superyachts of sanctioned Russian...
4,Platinum Jubilee: 70 years of the Queen in 70 ...


Clean Data

In [None]:
#lowercase
titles['lowercase']=titles['title'].str.lower()

In [None]:
#stopword removal
en_stopwords=stopwords.words('english')
titles['no_stopwords']=titles['lowercase'].apply(lambda x: ' '.join([word for word in x.split() if word not in (en_stopwords)]))

In [10]:
#punctuation removal
titles['no_stopwords_no_punct']=titles.apply(lambda x: re.sub(r"([^\w\s])", "", x['no_stopwords']), axis=1)

In [11]:
#tokenize
titles['tokens_raw']=titles.apply(lambda x: word_tokenize(x['title']), axis=1)
titles['tokens_clean']=titles.apply(lambda x: word_tokenize(x['no_stopwords_no_punct']), axis=1)

In [12]:
#lemmatizing
lemmatizer=WordNetLemmatizer()
titles["tokens_clean_lemmatizer"]=titles["tokens_clean"].apply(lambda tokens: [lemmatizer.lemmatize(token) for token in tokens ])

In [13]:
titles.head()

Unnamed: 0,title,lowercase,no_stopwords,no_stopwords_no_punct,tokens_raw,tokens_clean,tokens_clean_lemmatizer
0,Can I refuse to work?,can i refuse to work?,refuse work?,refuse work,"[Can, I, refuse, to, work, ?]","[refuse, work]","[refuse, work]"
1,'Liz Truss the Brief?' World reacts to UK poli...,'liz truss the brief?' world reacts to uk poli...,'liz truss brief?' world reacts uk political t...,liz truss brief world reacts uk political turmoil,"['Liz, Truss, the, Brief, ?, ', World, reacts,...","[liz, truss, brief, world, reacts, uk, politic...","[liz, truss, brief, world, reacts, uk, politic..."
2,Rationing energy is nothing new for off-grid c...,rationing energy is nothing new for off-grid c...,rationing energy nothing new off-grid community,rationing energy nothing new offgrid community,"[Rationing, energy, is, nothing, new, for, off...","[rationing, energy, nothing, new, offgrid, com...","[rationing, energy, nothing, new, offgrid, com..."
3,The hunt for superyachts of sanctioned Russian...,the hunt for superyachts of sanctioned russian...,hunt superyachts sanctioned russian oligarchs,hunt superyachts sanctioned russian oligarchs,"[The, hunt, for, superyachts, of, sanctioned, ...","[hunt, superyachts, sanctioned, russian, oliga...","[hunt, superyachts, sanctioned, russian, oliga..."
4,Platinum Jubilee: 70 years of the Queen in 70 ...,platinum jubilee: 70 years of the queen in 70 ...,platinum jubilee: 70 years queen 70 seconds,platinum jubilee 70 years queen 70 seconds,"[Platinum, Jubilee, :, 70, years, of, the, Que...","[platinum, jubilee, 70, years, queen, 70, seco...","[platinum, jubilee, 70, year, queen, 70, second]"


In [15]:
#create lists for just our tokens
tokens_raw_list=sum(titles['tokens_raw'], []) #unpack list into a single list
tokens_clean_list=sum(titles['tokens_clean_lemmatizer'], [])

POS Tagging

In [16]:
nlp=spacy.load("en_core_web_sm")

In [17]:
spacy_doc=nlp(''.join(tokens_raw_list))

In [18]:
pos_df=pd.DataFrame(columns=['token', 'pos_tag'])

In [20]:
for token in spacy_doc:
    pos_df = pd.concat([pos_df, pd.DataFrame.from_records([{'token':token.text, 'pos_tag': token.pos_}])], ignore_index=True)

In [22]:
pos_df_counts=pos_df.groupby(['token', 'pos_tag']).size().reset_index(name='counts').sort_values(by='counts', ascending=False)
print(pos_df_counts.head())

    token pos_tag  counts
39      :   PUNCT     498
2       -   PUNCT     202
0       ,   PUNCT      81
1       -    NOUN       6
791  year    NOUN       4


In [23]:
nouns=pos_df_counts[pos_df_counts.pos_tag=='NOUN'][0:10]

In [24]:
verbs=pos_df_counts[pos_df_counts.pos_tag=='VERB'][0:10]
verbs

Unnamed: 0,token,pos_tag,counts
13,0SheffieldUnited,VERB,1
36,3SheffieldUnited,VERB,1
56,Aniceclimber'sclose,VERB,1
80,BestshotsfromAliceCapsey's51againstIrelandEnca...,VERB,1
161,EnglandfansfrustratedafterdisappointingUSAmatc...,VERB,1
171,FacingtheRussianArmyonthefrontlineinDonbasWhat...,VERB,1
279,KeepingUpAppearancesactressdiesat91HowMargaret...,VERB,1
286,LagoandotherTrump,VERB,1
366,PM’slockdownpartyfineTheHundred,VERB,1
454,SouthAsianfacesonTV'makesmehappy'PrinceHarryre...,VERB,1


In [25]:
adj=pos_df_counts[pos_df_counts.pos_tag=='ADJ'][0:10]
adj

Unnamed: 0,token,pos_tag,counts
151,England'sMarliePackerwillleadmuch,ADJ,1
300,Liverpoolmanagerhasnoplanstoquit'unlesssomeone...,ADJ,1
302,Long,ADJ,1
362,OnekilledasstrongwindscausestagecollapseinSpai...,ADJ,1
409,RivalryisfierceonthepitchbutfriendlyoffitChild...,ADJ,1
455,SouthKoreareachlast16withdramaticlast,ADJ,1
519,USOpenchampiondiscusses'tough'splitwithcoachTo...,ADJ,1
568,Whattolookoutforinthequarter,ADJ,1
607,WorkstartstocutdownBrettonoaktreeOnlinepredato...,ADJ,1
613,Youngcancerpatients'inadesperatesituation'Riha...,ADJ,1


NER

In [26]:
ner_df=pd.DataFrame(columns=['token', 'ner_tag'])

for token in spacy_doc.ents:
    if pd.isna(token.label_)is False:
        ner_df=pd.concat([ner_df, pd.DataFrame.from_records([{'token':token.text, 'ner_tag':token.label_}])], ignore_index=True)

In [27]:
ner_df.head()

Unnamed: 0,token,ner_tag
0,DemocratselevatedthisRepublican,ORG
1,Dutch17-year-oldIsaacBabadiscoresPanenkapenalt...,DATE
2,Leicester16-6Edinburgh-JasperWiesetrysendsTige...,ORG
3,Gakpo,GPE
4,sayLibDemsChrisKaba,NORP


In [None]:
ner_df_counts=ner_df_counts.groubpy(['token', 'ner_tag'])