In [None]:
# Dataset source: https://www.kaggle.com/hgultekin/bbcnewsarchive

In [1]:
# Standard Libraries
import pandas as pd
import numpy as np


# Data Preprocessing
import re
from nltk.corpus import stopwords
import nltk
from nltk.stem import WordNetLemmatizer


import warnings
warnings.filterwarnings('ignore')

In [2]:
import os

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
%cd /content/drive/MyDrive/GMBM_TextDataset_20ns_BBC/20ns_BBC_NIPS/text_data

/content/drive/MyDrive/GMBM_TextDataset_20ns_BBC/20ns_BBC_NIPS/text_data


In [8]:
os.listdir('./')

['news-classification-with-ml-deep-learning.ipynb',
 'bbc-news-data.csv',
 '20newsgroup',
 '20news_vocabulary_61188.txt',
 'bbc']

In [9]:
# Load the BBC news table. The file is tab-separated (sep='\t') even though
# it carries a .csv extension.
news_df = pd.read_csv("./bbc-news-data.csv",sep='\t')

In [10]:
news_df.head()

Unnamed: 0,category,filename,title,content
0,business,001.txt,Ad sales boost Time Warner profit,Quarterly profits at US media giant TimeWarne...
1,business,002.txt,Dollar gains on Greenspan speech,The dollar has hit its highest level against ...
2,business,003.txt,Yukos unit buyer faces loan claim,The owners of embattled Russian oil giant Yuk...
3,business,004.txt,High fuel prices hit BA's profits,British Airways has blamed high fuel prices f...
4,business,005.txt,Pernod takeover talk lifts Domecq,Shares in UK drinks and food firm Allied Dome...


In [11]:
news_df.shape

(2225, 4)

In [12]:
# Drop the per-article file name column; the later cells only use category,
# title and content.
# errors='ignore' plus rebinding (instead of inplace=True) keeps this cell
# re-runnable: executing it a second time no longer raises KeyError for the
# already-removed column.
news_df = news_df.drop(columns='filename', errors='ignore')
news_df.head()

Unnamed: 0,category,title,content
0,business,Ad sales boost Time Warner profit,Quarterly profits at US media giant TimeWarne...
1,business,Dollar gains on Greenspan speech,The dollar has hit its highest level against ...
2,business,Yukos unit buyer faces loan claim,The owners of embattled Russian oil giant Yuk...
3,business,High fuel prices hit BA's profits,British Airways has blamed high fuel prices f...
4,business,Pernod takeover talk lifts Domecq,Shares in UK drinks and food firm Allied Dome...


In [13]:
news_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2225 entries, 0 to 2224
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   category  2225 non-null   object
 1   title     2225 non-null   object
 2   content   2225 non-null   object
dtypes: object(3)
memory usage: 52.3+ KB


## Preprocessing First Step
Ordinal encoding : *category* column

In [14]:
# Ordinal-encode the category label: factorize() returns (codes, uniques) and
# only the integer codes are stored.
news_df['category_id'] = news_df.category.factorize()[0]

In [15]:
# One row per distinct (category, category_id) pair, ordered by id.
category_id_df = news_df[['category','category_id']].drop_duplicates().sort_values('category_id')

In [16]:
# Lookup tables in both directions, built from the two-column rows of
# category_id_df: label -> id and id -> label.
category_to_id = dict(category_id_df.values)
id_to_category = dict(category_id_df[['category_id','category']].values)

# Visualization

In [17]:
# Per-category article counts, rendered as a table with in-cell bars.
# The Series is named 'Count' explicitly before being turned into a frame:
# renaming a column called 'category' only works on pandas < 2.0, because
# from 2.0 on value_counts() names its result 'count' and that rename
# silently does nothing.
news_df['category'].value_counts().rename('Count').to_frame().style.bar(color='#137a63')

Unnamed: 0,Count
sport,511
business,510
politics,417
tech,401
entertainment,386


In [18]:
# Drop duplicate articles: rows sharing the same (category, title) pair count
# as duplicates. The frame is modified in place and its index is not reset.
news_df.drop_duplicates(subset=['category','title'],inplace=True)

In [19]:
news_df.shape

(2096, 4)

In [20]:
# Build the raw text for each article: title and content joined by one space.
news_df['text'] = news_df['title']+ ' ' + news_df['content']

In [21]:
news_df.head()

Unnamed: 0,category,title,content,category_id,text
0,business,Ad sales boost Time Warner profit,Quarterly profits at US media giant TimeWarne...,0,Ad sales boost Time Warner profit Quarterly p...
1,business,Dollar gains on Greenspan speech,The dollar has hit its highest level against ...,0,Dollar gains on Greenspan speech The dollar h...
2,business,Yukos unit buyer faces loan claim,The owners of embattled Russian oil giant Yuk...,0,Yukos unit buyer faces loan claim The owners ...
3,business,High fuel prices hit BA's profits,British Airways has blamed high fuel prices f...,0,High fuel prices hit BA's profits British Air...
4,business,Pernod takeover talk lifts Domecq,Shares in UK drinks and food firm Allied Dome...,0,Pernod takeover talk lifts Domecq Shares in U...


## Preprocessing Second Step
Approach for text preprocessing  
1. lower casing
2. removal of punctuations and numbers
3. remove white space
4. remove stop words
5. lemmatize words

In [22]:
# Data Cleaning
def clean_text(text):
    """Reduce raw text to lowercase ASCII words separated by single spaces.

    Every character outside ``a-z``/``A-Z`` (digits, punctuation, whitespace,
    non-ASCII letters) is replaced by a space, runs of whitespace are
    collapsed, and the result is lowercased.
    """
    letters_only = re.sub("[^a-zA-Z]", " ", text)
    tokens = letters_only.split()
    return ' '.join(tokens).lower()
news_df['clean_text'] = news_df['text'].apply(clean_text)

In [23]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [24]:
stop_words = set(stopwords.words('english'))


def remove_stopwords(text):
    """Return ``text`` without the tokens that appear in ``stop_words``.

    Tokens come from ``str.split()`` and the survivors are re-joined with
    single spaces.
    """
    return ' '.join(token for token in text.split() if token not in stop_words)


news_df['clean_text'] = news_df['clean_text'].apply(remove_stopwords)

In [25]:
# Extra, corpus-specific stop words applied after the NLTK English list.
# NOTE(review): clean_text() keeps only lowercase a-z, so entries containing
# apostrophes ("aren't"), digits ('1st') or capitals ('January') can never
# match a token. They are left untouched here so the current output does not
# change; lowercase the month names if they are meant to be filtered.
new_stopwords=['about','above','after','again','against','all','and','any','are',
 "aren't",'because','been','before','being','below','between','both',
 'but','can',"can't",'cannot','could',"couldn't",'did',"didn't",'does',"doesn't",'doing',
 "don't",'down','during','each','few','for','from','further','had',"hadn't",'has',"hasn't",'have',
 "haven't",'having',"he'd","he'll","he's",'her','here',"here's",'hers','herself','him','himself','his',
 'how',"how's","i'd","i'll","i'm","i've",'into','is',"isn't",'it',"it's",'its','itself',"let's",'more',
 # 'ours' and 'ourselves' were fused into one token by a missing comma.
 'most',"mustn't",'myself','nor','not','off','once','only','other','ought','our','ours','ourselves',
 'out','over','own','same',"shan't",'she',"she'd","she'll","she's",'should',"shouldn't",'some','such','than','that',"that's",
 'the','their','theirs','them','themselves',
 'then','there',"there's",'these','they',"they'd","they'll","they're",
 "they've",'this','those','through','too','under','until',
 'very','was',"wasn't","we'd","we'll","we're","we've",
 'were',"weren't",'what',"what's",'when',"when's",'where',"where's",'which','while','who',"who's",
 'whom','why',"why's",'will','with',"won't",'would',"wouldn't",
 'you',"you'd","you'll","you're","you've",'your','yours','yourself','yourselves',
 'one','two','three','four','five','six','seven','eight','nine','ten','hundred','thousand',
 '1st','2nd','3rd','4th','5th','6th','7th','8th','9th','10th',
 "ax'ax'ax'ax'ax'ax'ax'ax'ax'ax'ax'ax'ax'ax'ax",'get','also','use','like','anyone','know','need',
 'want','using','may','new',
 'even','good','bad','article','bus','work','think','help','please','thanks','writes','time','many','much','used','well',
 'say','lot','place','example','nice','first','really','thing','might','someone','look','fact','right','guns','believe',
  'way','batf','atf','January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December',
 'jim','miller','robert','smith','wall','actually','make','part','seems','tell','rate','must','see','going','never','keep','david','koresh','rights',
 'things','said','something','day','since','however','point','still','better','best','worst','anything','little','number','years','year','read','write',
 'true','false','take','made','every','far','back','long','find','come','without','high','low','says','mean','different','etc','enough','hard','sure',
 'real','around','got','least','seen','rather','last','second','done','put','possible','frank','whether','probably','wrong','else','another','away','either',
 'bos','nyi','pit','tor','buf','cal','det','pts','next','john','clinton','already','dod','leds','give','kind','ever','led','soon','men','getting','quite',
 'nhl','san','said','also','can','could','mr','told','say'
]

# function to remove more stopwords
def remove_morestopwords(text):
    """Return ``text`` without the tokens listed in ``new_stopwords``.

    Tokens come from ``str.split()`` and the survivors are re-joined with
    single spaces. The list is converted to a set once per call, so each
    token costs one hash lookup instead of a scan over the whole list, while
    later edits to ``new_stopwords`` are still honoured.
    """
    lookup = set(new_stopwords)
    return ' '.join(w for w in text.split() if w not in lookup)
news_df['clean_text'] = news_df['clean_text'].apply(remove_morestopwords)

In [26]:
# Word lemmatization
lemmatizer = WordNetLemmatizer()


def lemmatization(text):
    """Lemmatize each whitespace-separated token and drop one-letter results.

    Every token goes through ``lemmatizer.lemmatize`` with its default
    arguments; only lemmas of length two or more are kept, re-joined with
    single spaces.
    """
    kept = []
    for token in text.split():
        lemma = lemmatizer.lemmatize(token)
        if len(lemma) >= 2:
            kept.append(lemma)
    return ' '.join(kept)


news_df['clean_text'] = news_df['clean_text'].apply(lemmatization)

In [27]:
news_df.head()

Unnamed: 0,category,title,content,category_id,text,clean_text
0,business,Ad sales boost Time Warner profit,Quarterly profits at US media giant TimeWarne...,0,Ad sales boost Time Warner profit Quarterly p...,ad sale boost warner profit quarterly profit m...
1,business,Dollar gains on Greenspan speech,The dollar has hit its highest level against ...,0,Dollar gains on Greenspan speech The dollar h...,dollar gain greenspan speech dollar hit highes...
2,business,Yukos unit buyer faces loan claim,The owners of embattled Russian oil giant Yuk...,0,Yukos unit buyer faces loan claim The owners ...,yukos unit buyer face loan claim owner embattl...
3,business,High fuel prices hit BA's profits,British Airways has blamed high fuel prices f...,0,High fuel prices hit BA's profits British Air...,fuel price hit ba profit british airway blamed...
4,business,Pernod takeover talk lifts Domecq,Shares in UK drinks and food firm Allied Dome...,0,Pernod takeover talk lifts Domecq Shares in U...,pernod takeover talk lift domecq share uk drin...


In [28]:
news_df['category'].unique()

array(['business', 'entertainment', 'politics', 'sport', 'tech'],
      dtype=object)

In [29]:
news_df.shape

(2096, 6)

In [30]:
# Persist the cleaned frame (all columns, without the index) next to the raw data.
news_df.to_csv('./clean_bbc.csv',index=False)

In [31]:
# NumPy array holding one cleaned text string per article.
doc_list=news_df['clean_text'].to_numpy()

In [32]:
doc_list[1]

'dollar gain greenspan speech dollar hit highest level euro almost month federal reserve head trade deficit set stabilise alan greenspan highlighted government willingness curb spending rising household saving factor reduce late trading york dollar reached euro thursday market concern deficit hit greenback recent month friday federal reserve chairman greenspan speech london ahead meeting finance minister sent dollar higher earlier tumbled worse expected job data chairman taking sanguine view current account deficit taken sinche head currency strategy bank america york taking longer term view laying set condition current account deficit improve worry deficit concern china remain china currency remains pegged dollar currency sharp fall recent month therefore chinese export price highly competitive call shift beijing policy fallen deaf ear despite recent comment major chinese newspaper ripe loosening peg meeting thought unlikely produce meaningful movement chinese policy meantime federal 

In [33]:
doc_list.shape

(2096,)

In [34]:
# Split every cleaned document on whitespace: one token list per article.
docs = [document.split() for document in doc_list]

In [35]:
len(docs)

2096

In [36]:
def flatten(list):
    """Concatenate the items of every inner iterable into one flat list.

    Only one level of nesting is removed. The parameter keeps its original
    name (which shadows the ``list`` builtin inside this function) so that
    existing callers are unaffected.
    """
    return [item for inner in list for item in inner]

In [37]:
# Flatten the corpus into one array of tokens, then collect the distinct
# tokens (np.unique returns them sorted) together with their occurrence counts.
np_list_of_words = np.asarray(flatten(docs))
words, counts = np.unique(np_list_of_words, return_counts=True)
#print("No of unique words",len(words))
# Turn the array of unique tokens into a Python list; counts stays an array
# aligned with it by position.
words=list(words)

In [38]:
# Keep only tokens that occur more than `maxfreq` times in the whole corpus.
# (Despite its name, `maxfreq` acts as a minimum-count threshold.)
maxfreq=10
vocabulary = [word for word, count in zip(words, counts) if count > maxfreq]

# Membership is checked against a set: testing `w in vocabulary` on the list
# scans up to the full vocabulary for every token of every document.
vocabulary_lookup = set(vocabulary)

# Reduce each document to its in-vocabulary tokens (order and repeats kept)
# and record in `c` how many tokens survive overall.
newdocs = [[w for w in d if w in vocabulary_lookup] for d in docs]
c = sum(len(wlist) for wlist in newdocs)

In [39]:
print("Total words:",c)

Total words: 329599


In [40]:
# Map every token of every filtered document to its integer position in
# `vocabulary`, giving one flat list of ids in document order.
# The position table is built once so each lookup is O(1); calling
# vocabulary.index() per token rescans the vocabulary list every time.
word_to_id = {}
for idx, word in enumerate(vocabulary):
    # setdefault keeps the first position, matching list.index() semantics.
    word_to_id.setdefault(word, idx)

M=len(newdocs)
array = [word_to_id[w] for doc in newdocs for w in doc]

print("no of docs",len(newdocs))
print("vocab size",len(vocabulary))
print("total words",len(array))

no of docs 2096
vocab size 5264
total words 329599


In [41]:
# Output directory for the pickled artefacts, relative to the working
# directory. The trailing slash matters: file names are appended to this
# string by plain concatenation.
save_path='./bbc_2096docs/'

In [42]:

# Create the output directory if needed. exist_ok=True keeps the cell
# re-runnable: os.mkdir raises FileExistsError once the directory exists.
os.makedirs(save_path, exist_ok=True)

In [43]:
import pickle

# Write the three artefacts under save_path, one pickle file each:
#   "vocab"           -> vocabulary (kept tokens)
#   "images"          -> newdocs (per-document token lists)
#   "map_patch_to_id" -> array (flat list of vocabulary indices)
for file_name, payload in (
    ("vocab", vocabulary),
    ("images", newdocs),
    ("map_patch_to_id", array),
):
    with open(save_path + file_name, 'wb') as handle:
        pickle.dump(payload, handle)

In [44]:
# Probe words used to spot-check the vocabulary, written in groups of three.
check = [
    'actor', 'film', 'movie',
    'stock', 'market', 'money',
    'technology', 'digital', 'information',
    'player', 'game', 'sport',
    'politics', 'government', 'law',
]

In [45]:
# Print each probe word that made it into the vocabulary, with its index.
for ch in check:
    if ch not in vocabulary:
        continue
    print(ch, vocabulary.index(ch))

actor 52
film 1811
movie 3079
stock 4501
market 2898
money 3045
technology 4713
digital 1323
information 2395
player 3472
game 1967
sport 4433
politics 3499
government 2058
law 2690
