In [51]:
## importing required libraries
import spacy
import pandas as pd
import nltk
from collections import Counter
import numpy as np
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
nltk.download('universal_tagset')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

In [52]:
##loading the dataset
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
df=pd.read_csv("BBC-articles.csv")

In [53]:
df.head(5)

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


## Cleaning the text column by using
1.   nltk word tokenizer
2.   alphanumeric  check
3.   stop words removal
4.   lower case conversion
5.   lemmatization of words




In [54]:
nltkStopwords = nltk.corpus.stopwords.words('english')
lemmatizer = nltk.stem.WordNetLemmatizer()
def tokenizeCleanLemmatizeWords(data):
    nltkWords = nltk.tokenize.word_tokenize(data)
    nltkCleanWords = [w for w in nltkWords if w.isalnum() and len(w)>1]
    nltkCleanWords = [w.lower() for w in nltkCleanWords if w not in nltkStopwords]    
    return [lemmatizer.lemmatize(w) for w in nltkCleanWords]

In [55]:
df['clean_text'] = df['text'].apply(lambda x: tokenizeCleanLemmatizeWords(x))

In [56]:
df.head()

Unnamed: 0,category,text,clean_text
0,tech,tv future in the hands of viewers with home th...,"[tv, future, hand, viewer, home, theatre, syst..."
1,business,worldcom boss left books alone former worldc...,"[worldcom, bos, left, book, alone, former, wor..."
2,sport,tigers wary of farrell gamble leicester say ...,"[tiger, wary, farrell, gamble, leicester, say,..."
3,sport,yeading face newcastle in fa cup premiership s...,"[yeading, face, newcastle, fa, cup, premiershi..."
4,entertainment,ocean s twelve raids box office ocean s twelve...,"[ocean, twelve, raid, box, office, ocean, twel..."


## (1)    TF-IDF after normal cleaning of the text corpus (punctuation removal, stopword removal, etc.)


In [57]:
from gensim.corpora import Dictionary
dictionary_basic = Dictionary(df['clean_text'])
len(dictionary_basic.token2id)

25468

## (2)    TF-IDF with term frequency filter, to exclude the top 10% of the most frequent words and words that appear less than 5 times in the documents.


In [58]:
dictionary_freq_filter=Dictionary(df['clean_text'])
dictionary_freq_filter.filter_extremes(no_below=5, no_above=0.90)
len(dictionary_freq_filter)

8070

In [59]:
df.head()

Unnamed: 0,category,text,clean_text
0,tech,tv future in the hands of viewers with home th...,"[tv, future, hand, viewer, home, theatre, syst..."
1,business,worldcom boss left books alone former worldc...,"[worldcom, bos, left, book, alone, former, wor..."
2,sport,tigers wary of farrell gamble leicester say ...,"[tiger, wary, farrell, gamble, leicester, say,..."
3,sport,yeading face newcastle in fa cup premiership s...,"[yeading, face, newcastle, fa, cup, premiershi..."
4,entertainment,ocean s twelve raids box office ocean s twelve...,"[ocean, twelve, raid, box, office, ocean, twel..."


## Below function extracts topics and calculating the coherence score using LSI model for each record

In [11]:
from gensim.models import LsiModel, CoherenceModel
def getTopicByGenismLsi(dictionary_var,row):
  dtm = [dictionary_var.doc2bow(row)]
  lsi_model = LsiModel(corpus=dtm, id2word=dictionary_var, num_topics=1)
  result = dict(); 
  result['coherence'] = CoherenceModel(model=lsi_model, texts=df['clean_text'],dictionary=dictionary_var, coherence='c_v').get_coherence()
  result['words']   = lsi_model.print_topics(-1)
  return result

## Below function extracts topics and calculating the coherence score using LDA model for each record

In [12]:
from gensim.models import LdaModel, LdaMulticore
def getTopicByGenismLda(dictionary_var,row):
  dtm = [dictionary_var.doc2bow(row)]
  goodLdaModel= LdaMulticore(corpus=dtm, id2word=dictionary_var, passes = 5, workers = 8, iterations = 50, num_topics=1)
  goodcm = CoherenceModel(model=goodLdaModel, texts=df['clean_text'].tolist(), dictionary=dictionary_basic, coherence='c_v')
  print(goodcm)
  result = dict(); 
  result['coherence'] = goodcm.get_coherence()
  result['words']   = goodLdaModel.print_topics(-1)
  return result

In [13]:
df['spacy_lsi_words']=''
df['spacy_lda_words']=''
df['basic_lsi_words']=''
df['basic_lda_words']=''
df['freqfilter_lsi_words']=''
df['freqfilter_lda_words']=''
df['basic_lsi_coherance']=''
df['basic_lda_coherance']=''

In [None]:
##iterating the dataframe to get the topic and coherance score
for i,row in df.iterrows():
  print("iter for",i)
  result_lsi=getTopicByGenismLsi(dictionary_basic,row['clean_text'])
  df['basic_lsi_coherance'][i]=result_lsi.get('coherence')
  df['basic_lsi_words'][i]=result_lsi.get('words')  
  result_lda=getTopicByGenismLda(dictionary_basic,row['clean_text'])
  df['basic_lda_coherance'][i]=result_lda.get('coherence')
  df['basic_lda_words'][i]=result_lda.get('words')

## saving the output to csv to avoid reruns on google collab

In [None]:
df.to_csv("output_basic.csv")

In [105]:
len(df)

2225

##  TF-IDF limited to nouns, noun phrases, and named entity recognition only.

In [60]:
##loading the spacy
import pandas as pd
sp = spacy.load("en_core_web_sm")
def getSpacyExtractFeatures(row):
  spCorpus = sp(row)
  spCleanWords = [w.text.lower() for w in spCorpus
                  if not w.is_stop and not w.is_punct and len(w)>1]
  CleanSentence = ' '.join(spCleanWords)
  spCorpus = sp(CleanSentence)

  nounsNounPhrasesAndNER=[]
  for w in spCorpus:
    if w.pos_ == 'NOUN':
      nounsNounPhrasesAndNER.append(str(w))
  for chunk in spCorpus.noun_chunks:
    nounsNounPhrasesAndNER.append(str(chunk))
  for ent in spCorpus.ents:
    nounsNounPhrasesAndNER.append(str(ent))
  return  nounsNounPhrasesAndNER

In [61]:
df['spacy_features']=df['text'].apply(lambda x: getSpacyExtractFeatures(x))


## (3) Creating a dictionary with the features extracted from spacy. 

In [37]:
dictionary_spacy=Dictionary(df['spacy_features'])
print(len(dictionary_spacy))


89012


## Below function generate the topics using LSI and LDA models and returns a dictionary of results


In [18]:
from gensim.models import LsiModel, CoherenceModel
from gensim.models import LdaModel, LdaMulticore

def getTopicByGenismLsiLDa(dictionary_var,row):
    dtm = [dictionary_var.doc2bow(row)]
    lsi_model = LsiModel(corpus=dtm, id2word=dictionary_var, num_topics=1)
   # goodLdaModel = LdaModel(corpus=dtm, id2word=dictionary_var, iterations=50, num_topics=1)
    goodLdaModel= LdaMulticore(corpus=dtm, id2word=dictionary_var, passes = 5, workers = 8, iterations = 50, num_topics=1)
    result = dict(); 
    result['lsi_words'] = lsi_model.print_topics(-1)
    result['lda_words']   = goodLdaModel.print_topics(-1)
    return result

In [None]:
for i,row in df.iterrows():
  print("iter for",i)
  result=getTopicByGenismLsiLDa(dictionary_spacy,row['spacy_features'])
  df['spacy_lsi_words'][i]=result.get('lsi_words')
  df['spacy_lda_words'][i]=result.get('lda_words')

## creating a result dataframe 

In [116]:
for i,row in df.iterrows():
  print("iter for",i)
  result=getTopicByGenismLsiLDa(dictionary_freq_filter,row['clean_text'])
  df['freqfilter_lsi_words'][i]=result.get('lsi_words')
  df['freqfilter_lda_words'][i]=result.get('lda_words')


iter for 2214




iter for 2215




iter for 2216




iter for 2217




iter for 2218




iter for 2219




iter for 2220




iter for 2221




iter for 2222




iter for 2223




iter for 2224




In [117]:
df.head(3)

Unnamed: 0,category,text,clean_text,spacy_features,spacy_lsi_words,spacy_lda_words,basic_lsi_words,basic_lda_words,freqfilter_lsi_words,freqfilter_lda_words
0,tech,tv future in the hands of viewers with home th...,"[tv, future, hand, viewer, home, theatre, syst...","[tv, future, hands, viewers, home, theatre, sy...","[(0, 0.398*""people"" + 0.369*""tv"" + 0.341*""they...","[(0, 0.002*""people"" + 0.002*""tv"" + 0.002*""they...",,,"[(0, 0.438*""tv"" + 0.219*""want"" + 0.219*""people...","[(0, 0.002*""tv"" + 0.001*""u"" + 0.001*""want"" + 0..."
1,business,worldcom boss left books alone former worldc...,"[worldcom, bos, left, book, alone, former, wor...","[books, ebbers, fraud, accounting, decisions, ...","[(0, 0.349*""ebbers"" + 0.291*""accounting"" + 0.2...","[(0, 0.001*""ebbers"" + 0.001*""fraud"" + 0.001*""a...",,,"[(0, 0.528*""mr"" + 0.352*""worldcom"" + 0.308*""eb...","[(0, 0.002*""mr"" + 0.001*""worldcom"" + 0.001*""eb..."
2,sport,tigers wary of farrell gamble leicester say ...,"[tiger, wary, farrell, gamble, leicester, say,...","[tigers, leicester, bid, codes, process, way, ...","[(0, 0.390*""he"" + 0.312*""leicester"" + 0.234*""u...","[(0, 0.001*""he"" + 0.001*""leicester"" + 0.000*""l...",,,"[(0, 0.343*""farrell"" + 0.206*""union"" + 0.206*""...","[(0, 0.001*""farrell"" + 0.000*""rugby"" + 0.000*""..."



Basic lsi and Lda model were executed separately and saved to csv as the execution time is longer in google collab resulting in timeout

In [121]:
df_basic=pd.read_csv('output_basic.csv')
df_basic.head(3)

Unnamed: 0.1,Unnamed: 0,category,text,clean_text,tfidf_basic_lsi,tfidf_basic_lsi_coherance,tfidf_basic_lda,tfidf_basic_lda_coherance
0,0,tech,tv future in the hands of viewers with home th...,"['tv', 'future', 'hand', 'viewer', 'home', 'th...","[(0, '0.430*""tv"" + 0.215*""people"" + 0.215*""u"" ...",0.497097,"[(0, '0.001*""tv"" + 0.000*""want"" + 0.000*""peopl...",0.54332
1,1,business,worldcom boss left books alone former worldc...,"['worldcom', 'bos', 'left', 'book', 'alone', '...","[(0, '0.523*""mr"" + 0.348*""worldcom"" + 0.305*""e...",0.522474,"[(0, '0.001*""mr"" + 0.001*""worldcom"" + 0.000*""e...",0.600885
2,2,sport,tigers wary of farrell gamble leicester say ...,"['tiger', 'wary', 'farrell', 'gamble', 'leices...","[(0, '0.343*""farrell"" + 0.206*""leicester"" + 0....",0.330982,"[(0, '0.000*""farrell"" + 0.000*""gamble"" + 0.000...",0.286034


In [144]:
df.head(2)

Unnamed: 0,category,text,clean_text,spacy_features,spacy_lsi_words,spacy_lda_words,basic_lsi_words,basic_lda_words,freqfilter_lsi_words,freqfilter_lda_words
0,tech,tv future in the hands of viewers with home th...,"[tv, future, hand, viewer, home, theatre, syst...","[tv, future, hands, viewers, home, theatre, sy...","[(0, 0.398*""people"" + 0.369*""tv"" + 0.341*""they...","[(0, 0.002*""people"" + 0.002*""tv"" + 0.002*""they...",,,"[(0, 0.438*""tv"" + 0.219*""want"" + 0.219*""people...","[(0, 0.002*""tv"" + 0.001*""u"" + 0.001*""want"" + 0..."
1,business,worldcom boss left books alone former worldc...,"[worldcom, bos, left, book, alone, former, wor...","[books, ebbers, fraud, accounting, decisions, ...","[(0, 0.349*""ebbers"" + 0.291*""accounting"" + 0.2...","[(0, 0.001*""ebbers"" + 0.001*""fraud"" + 0.001*""a...",,,"[(0, 0.528*""mr"" + 0.352*""worldcom"" + 0.308*""eb...","[(0, 0.002*""mr"" + 0.001*""worldcom"" + 0.001*""eb..."


In [141]:
df_basic.columns

Index(['Unnamed: 0', 'category', 'text', 'clean_text', 'tfidf_basic_lsi',
       'tfidf_basic_lsi_coherance', 'tfidf_basic_lda',
       'tfidf_basic_lda_coherance'],
      dtype='object')

In [145]:
df['basic_lsi_words']=df_basic['tfidf_basic_lsi']
df['basic_lda_words']=df_basic['tfidf_basic_lda']

In [147]:
df.head(5)

Unnamed: 0,category,text,clean_text,spacy_features,spacy_lsi_words,spacy_lda_words,basic_lsi_words,basic_lda_words,freqfilter_lsi_words,freqfilter_lda_words
0,tech,tv future in the hands of viewers with home th...,"[tv, future, hand, viewer, home, theatre, syst...","[tv, future, hands, viewers, home, theatre, sy...","[(0, 0.398*""people"" + 0.369*""tv"" + 0.341*""they...","[(0, 0.002*""people"" + 0.002*""tv"" + 0.002*""they...","[(0, '0.430*""tv"" + 0.215*""people"" + 0.215*""u"" ...","[(0, '0.001*""tv"" + 0.000*""want"" + 0.000*""peopl...","[(0, 0.438*""tv"" + 0.219*""want"" + 0.219*""people...","[(0, 0.002*""tv"" + 0.001*""u"" + 0.001*""want"" + 0..."
1,business,worldcom boss left books alone former worldc...,"[worldcom, bos, left, book, alone, former, wor...","[books, ebbers, fraud, accounting, decisions, ...","[(0, 0.349*""ebbers"" + 0.291*""accounting"" + 0.2...","[(0, 0.001*""ebbers"" + 0.001*""fraud"" + 0.001*""a...","[(0, '0.523*""mr"" + 0.348*""worldcom"" + 0.305*""e...","[(0, '0.001*""mr"" + 0.001*""worldcom"" + 0.000*""e...","[(0, 0.528*""mr"" + 0.352*""worldcom"" + 0.308*""eb...","[(0, 0.002*""mr"" + 0.001*""worldcom"" + 0.001*""eb..."
2,sport,tigers wary of farrell gamble leicester say ...,"[tiger, wary, farrell, gamble, leicester, say,...","[tigers, leicester, bid, codes, process, way, ...","[(0, 0.390*""he"" + 0.312*""leicester"" + 0.234*""u...","[(0, 0.001*""he"" + 0.001*""leicester"" + 0.000*""l...","[(0, '0.343*""farrell"" + 0.206*""leicester"" + 0....","[(0, '0.000*""farrell"" + 0.000*""gamble"" + 0.000...","[(0, 0.343*""farrell"" + 0.206*""union"" + 0.206*""...","[(0, 0.001*""farrell"" + 0.000*""rugby"" + 0.000*""..."
3,sport,yeading face newcastle in fa cup premiership s...,"[yeading, face, newcastle, fa, cup, premiershi...","[face, newcastle, premiership, side, trip, lea...","[(0, 0.366*""january"" + 0.305*""v"" + 0.244*""side...","[(0, 0.001*""january"" + 0.001*""v"" + 0.001*""side...","[(0, '0.240*""side"" + 0.192*""face"" + 0.192*""cup...","[(0, '0.000*""side"" + 0.000*""cup"" + 0.000*""yead...","[(0, 0.258*""side"" + 0.207*""cup"" + 0.207*""face""...","[(0, 0.001*""side"" + 0.001*""united"" + 0.001*""cu..."
4,entertainment,ocean s twelve raids box office ocean s twelve...,"[ocean, twelve, raid, box, office, ocean, twel...","[raids, ocean, crime, caper, sequel, clooney, ...","[(0, 0.350*""sequel"" + 0.280*""weekend"" + 0.280*...","[(0, 0.001*""sequel"" + 0.001*""it"" + 0.001*""week...","[(0, '0.309*""ocean"" + 0.309*""sequel"" + 0.185*""...","[(0, '0.000*""sequel"" + 0.000*""ocean"" + 0.000*""...","[(0, 0.327*""sequel"" + 0.327*""ocean"" + 0.196*""s...","[(0, 0.001*""sequel"" + 0.001*""ocean"" + 0.000*""o..."


In [149]:
result_df=df[['category','text']]
result_df.head(2)

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...


In [150]:
df.columns

Index(['category', 'text', 'clean_text', 'spacy_features', 'spacy_lsi_words',
       'spacy_lda_words', 'basic_lsi_words', 'basic_lda_words',
       'freqfilter_lsi_words', 'freqfilter_lda_words'],
      dtype='object')

In [131]:
import re
def getTop5(rec):
  return re.findall(r'"([^"]*)"', str(rec))[0:5]

## extract the first 5 words from the output

In [151]:
result_df['basic_lsi_words'] = df['basic_lsi_words'].apply(lambda x: getTop5(x))
result_df['basic_lda_words'] = df['basic_lda_words'].apply(lambda x: getTop5(x))
result_df['freqfilter_lsi_words'] = df['freqfilter_lsi_words'].apply(lambda x: getTop5(x))
result_df['freqfilter_lda_words'] = df['freqfilter_lda_words'].apply(lambda x: getTop5(x))
result_df['spacy_lsi_words'] = df['spacy_lsi_words'].apply(lambda x: getTop5(x))
result_df['spacy_lda_words'] = df['spacy_lda_words'].apply(lambda x: getTop5(x))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using

In [152]:
result_df.head(5)

Unnamed: 0,category,text,basic_lsi_words,basic_lda_words,freqfilter_lsi_words,freqfilter_lda_words,spacy_lsi_words,spacy_lda_words
0,tech,tv future in the hands of viewers with home th...,"[tv, people, u, want, brand]","[tv, want, people, u, brand]","[tv, want, people, u, brand]","[tv, u, want, people, brand]","[people, tv, they, what, us]","[people, tv, they, what, it]"
1,business,worldcom boss left books alone former worldc...,"[mr, worldcom, ebbers, myers, accounting]","[mr, worldcom, ebbers, accounting, myers]","[mr, worldcom, ebbers, myers, accounting]","[mr, worldcom, ebbers, myers, accounting]","[ebbers, accounting, fraud, he, who]","[ebbers, fraud, accounting, who, he]"
2,sport,tigers wary of farrell gamble leicester say ...,"[farrell, leicester, gamble, well, rugby]","[farrell, gamble, union, well, league]","[farrell, union, gamble, league, rugby]","[farrell, rugby, league, leicester, well]","[he, leicester, union, tigers, league]","[he, leicester, league, tigers, union]"
3,sport,yeading face newcastle in fa cup premiership s...,"[side, face, cup, yeading, west]","[side, cup, yeading, west, united]","[side, cup, face, west, united]","[side, united, cup, west, face]","[january, v, side, weekend, who]","[january, v, side, weekend, who]"
4,entertainment,ocean s twelve raids box office ocean s twelve...,"[ocean, sequel, u, starring, office]","[sequel, ocean, box, office, twelve]","[sequel, ocean, starring, u, twelve]","[sequel, ocean, office, u, starring]","[sequel, weekend, it, twelve, ocean]","[sequel, it, weekend, twelve, the sequel]"


In [66]:
from gensim.models import LsiModel, CoherenceModel
from gensim.models import LdaModel, LdaMulticore

def getCoheranceByGenismLsiLDa(dictionary_var,records):
  dtm = [dictionary_var.doc2bow(doc) for doc in records]
  lsi_model = LsiModel(corpus=dtm, id2word=dictionary_var, num_topics=10)
  goodLdaModel= LdaMulticore(corpus=dtm, id2word=dictionary_var, passes = 5, workers = 8, iterations = 50, num_topics=10)
  result = dict(); 
  result['lsi_coher'] = CoherenceModel(model=lsi_model, texts=records,dictionary=dictionary_var, coherence='c_v').get_coherence()
  result['lda_coher']   = CoherenceModel(model=goodLdaModel, texts=records.tolist(), dictionary=dictionary_var, coherence='c_v').get_coherence()
  return result

In [67]:
print(getCoheranceByGenismLsiLDa(dictionary_basic,df['clean_text']))



{'lsi_coher': 0.3666159965570199, 'lda_coher': 0.3511177217331644}


In [68]:
print(getCoheranceByGenismLsiLDa(dictionary_freq_filter,df['clean_text']))



{'lsi_coher': 0.34793964174130715, 'lda_coher': 0.3917676420198509}


In [69]:
print(getCoheranceByGenismLsiLDa(dictionary_spacy,df['spacy_features']))



{'lsi_coher': 0.4013961961124031, 'lda_coher': 0.32745132537000143}


Relatively LSI models performed better than LDA models based on coherence value

Best model: Based on coherence value LSI model using TFIDF Vectorrization imited to nouns, noun phrases, and named entity recognition only performed better