In [2]:
import pandas as pd

In [16]:
bbc = pd.read_csv("bbc-text.csv")

In [18]:
bbc.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [97]:
bbc.shape

(2225, 2)

In [21]:
bbc['text'][0]

'tv future in the hands of viewers with home theatre systems  plasma high-definition tvs  and digital video recorders moving into the living room  the way people watch tv will be radically different in five years  time.  that is according to an expert panel which gathered at the annual consumer electronics show in las vegas to discuss how these new technologies will impact one of our favourite pastimes. with the us leading the trend  programmes and other content will be delivered to viewers via home networks  through cable  satellite  telecoms companies  and broadband service providers to front rooms and portable devices.  one of the most talked-about technologies of ces has been digital and personal video recorders (dvr and pvr). these set-top boxes  like the us s tivo and the uk s sky+ system  allow people to record  store  play  pause and forward wind tv programmes when they want.  essentially  the technology allows for much more personalised tv. they are also being built-in to high

In [23]:
bbc.head(10)

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...
5,politics,howard hits back at mongrel jibe michael howar...
6,politics,blair prepares to name poll date tony blair is...
7,sport,henman hopes ended in dubai third seed tim hen...
8,sport,wilkinson fit to face edinburgh england captai...
9,entertainment,last star wars not for children the sixth an...


In [25]:
len(bbc)

2225

# Preprocessing

In [31]:
from sklearn.feature_extraction.text import CountVectorizer

In [37]:
cv = CountVectorizer(max_df=0.95 , min_df=2 ,stop_words="english")

In [39]:
dtm = cv.fit_transform(bbc['text'])

In [41]:
dtm

<2225x17473 sparse matrix of type '<class 'numpy.int64'>'
	with 310493 stored elements in Compressed Sparse Row format>

# LDA

In [52]:
from sklearn.decomposition import LatentDirichletAllocation

In [54]:
LDA = LatentDirichletAllocation(n_components=7,random_state=42)

In [58]:
LDA.fit(dtm)

# Showing Store words

In [65]:
len(cv.get_feature_names_out())

17473

In [67]:
type(cv.get_feature_names_out())

numpy.ndarray

In [69]:
cv.get_feature_names_out()[5000]

'disappointment'

In [73]:
cv.get_feature_names_out()[2000]

'basingstoke'

In [75]:
cv.get_feature_names_out()[4000]

'contradict'

In [77]:
import random 

In [101]:
random_word_id = random.randint(0,2225)
print(cv.get_feature_names_out()[random_word_id])

bend


In [109]:
for i in range (10):
    random_word_id = random.randint(0, 5000)
    print(cv.get_feature_names_out()[random_word_id])

brewer
aviation
1956
capacity
constitutes
chronicles
43
astonishingly
arizona
blow


# Grab the Topics

In [112]:
len(LDA.components_)

7

In [114]:
LDA.components_.shape

(7, 17473)

In [118]:
len(LDA.components_[0])

17473

In [122]:
single_topic = LDA.components_[0]

In [126]:
single_topic.argsort()

array([10285,  9240,  5652, ...,  2177, 13725,  6422])

In [130]:
single_topic.argsort()[-20:]

array([ 6426,  9466, 11848,  8928,  1816, 14958,  2624,  4977, 17391,
        1813, 17251,  6920, 15909, 10773,  6928, 17292, 17387,  2177,
       13725,  6422])

In [132]:
top_word_indices = single_topic.argsort()[-10:]

In [149]:
for index in  top_word_indices:
    print(cv.get_feature_names_out()[index]) 

won
game
time
new
games
world
year
best
said
film


# Grab Highest Probability words per topic

In [170]:
from sklearn.decomposition import LatentDirichletAllocation

In [172]:
for index,topic in enumerate(LDA.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    print([cv.get_feature_names_out()[i] for i in topic.argsort()[-25:]])  
    print(("\n"))

THE TOP 15 WORDS FOR TOPIC #0
['win', 'actor', 'england', 'bbc', 'old', 'films', 'like', 'play', 'just', 'awards', 'star', 'british', 'director', 'years', 'award', 'won', 'game', 'time', 'new', 'games', 'world', 'year', 'best', 'said', 'film']


THE TOP 15 WORDS FOR TOPIC #1
['technology', 'say', 'digital', 'prime', 'howard', 'says', 'like', 'make', 'plans', 'bbc', 'public', 'told', 'brown', 'uk', 'minister', 'new', 'mobile', 'blair', 'party', 'election', 'government', 'labour', 'people', 'mr', 'said']


THE TOP 15 WORDS FOR TOPIC #2
['injury', 'football', 'liverpool', 'arsenal', 'number', 'team', 'final', 'year', 'just', 'set', 'players', 'league', 'second', 'good', 'open', 'cup', 'chelsea', 'match', 'united', 'play', 'time', 'win', 'club', 'game', 'said']


THE TOP 15 WORDS FOR TOPIC #3
['help', 'google', 'market', 'russian', 'web', 'time', 'countries', 'net', 'use', 'online', 'broadband', 'bank', 'government', '000', 'new', 'oil', 'million', 'mr', 'world', 'yukos', 'search', 'year',

In [173]:
dtm.shape

(2225, 17473)

In [174]:
len(bbc)

2225

In [182]:
topic_results = LDA.transform(dtm)

In [184]:
topic_results

array([[5.28263649e-02, 5.64273941e-01, 4.00944870e-04, ...,
        4.00516184e-04, 3.57611605e-01, 3.99840192e-04],
       [8.22462321e-04, 8.23288057e-04, 8.23361556e-04, ...,
        3.01824527e-02, 9.65702483e-01, 8.22752030e-04],
       [6.52803980e-01, 1.28295650e-03, 3.40792764e-01, ...,
        1.27996432e-03, 1.28074187e-03, 1.27987665e-03],
       ...,
       [8.20783669e-01, 8.96373419e-04, 8.95109050e-04, ...,
        1.05696298e-01, 8.97862715e-04, 6.99334421e-02],
       [4.11318900e-04, 9.59529431e-01, 4.10915845e-04, ...,
        4.11807132e-04, 4.10775577e-04, 4.10933419e-04],
       [7.06184864e-02, 1.75540368e-02, 9.06096248e-01, ...,
        1.43627069e-03, 1.43079717e-03, 1.43326878e-03]])

In [186]:
topic_results.shape

(2225, 7)

In [188]:
topic_results[0]

array([5.28263649e-02, 5.64273941e-01, 4.00944870e-04, 2.40867877e-02,
       4.00516184e-04, 3.57611605e-01, 3.99840192e-04])

In [190]:
topic_results[0].round(2)

array([0.05, 0.56, 0.  , 0.02, 0.  , 0.36, 0.  ])

In [192]:
topic_results[0].argmax()

1

In [194]:
bbc.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [196]:
topic_results.argmax(axis=1)

array([1, 5, 0, ..., 0, 1, 2])

In [198]:
bbc['Topic'] = topic_results.argmax(axis=1)

In [200]:
bbc

Unnamed: 0,category,text,Topic
0,tech,tv future in the hands of viewers with home th...,1
1,business,worldcom boss left books alone former worldc...,5
2,sport,tigers wary of farrell gamble leicester say ...,0
3,sport,yeading face newcastle in fa cup premiership s...,2
4,entertainment,ocean s twelve raids box office ocean s twelve...,0
...,...,...,...
2220,business,cars pull down us retail figures us retail sal...,4
2221,politics,kilroy unveils immigration policy ex-chatshow ...,1
2222,entertainment,rem announce new glasgow concert us band rem h...,0
2223,politics,how political squabbles snowball it s become c...,1
