# Topic modelling

## Import packages

In [126]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import LatentDirichletAllocation as LDA
from nltk.corpus import stopwords

import pandas as pd
import matplotlib.pyplot as plt

## Get (toy) dataset

In [127]:
corpus = [ "Rafael Nadal Joins Roger Federer in Missing U.S. Open",
          "Rafael Nadal Is Out of the Australian Open",
          "Biden Announces Virus Measures",
          "Biden's Virus Plans Meet Reality",
          "Where Biden's Virus Plan Stands"]

## Preprocess data

In [128]:
vectorizer = CountVectorizer(stop_words=stopwords.words('english'), lowercase=True)
dtm = vectorizer.fit_transform(corpus)

#print(dtm.todense())
#print(vectorizer.get_feature_names_out())

In [129]:
tfidf_transformer = TfidfTransformer()
tfidf = tfidf_transformer.fit_transform(dtm)
#print(tfidf.todense())

## Topic modelling

In [130]:
n_topics = 2

In [131]:
lda = LDA(n_components = n_topics)
lda_array = lda.fit_transform(tfidf)
lda_array

array([[0.14838002, 0.85161998],
       [0.17640475, 0.82359525],
       [0.81927251, 0.18072749],
       [0.83045459, 0.16954541],
       [0.81927261, 0.18072739]])

In [132]:
(pd.
 DataFrame(lda_array, columns=[f'Topic {i}' for i in range(1,n_topics+1)]).
 sort_values('Topic 1', ascending=False)
)

Unnamed: 0,Topic 1,Topic 2
3,0.830455,0.169545
4,0.819273,0.180727
2,0.819273,0.180727
1,0.176405,0.823595
0,0.14838,0.85162


In [149]:
features = vectorizer.get_feature_names_out()

for topic in range(n_topics):
    words = (pd.
             Series(lda.components_[topic], index=features).
             sort_values(ascending=False).
             iloc[:3].
             index
            )
    words = list(words)
    print(words)
             

['biden', 'virus', 'stands']
['rafael', 'nadal', 'open']


## Now lets try this with a real dataset

In [151]:
from sklearn.datasets import fetch_20newsgroups
X, _ = fetch_20newsgroups(subset='train', 
                          remove=('headers', 'footers', 'quotes'), 
                          return_X_y=True)

print(X[0])

I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.


In [152]:
print(len(X))

11314


In [153]:
vectorizer = CountVectorizer(stop_words=stopwords.words('english'), lowercase=True)
dtm = vectorizer.fit_transform(X)
print(dtm.shape)

(11314, 101487)


In [154]:
tfidf_transformer = TfidfTransformer()
tfidf = tfidf_transformer.fit_transform(dtm)
print(tfidf.shape)

(11314, 101487)


In [166]:
n_topics = 10

In [167]:
lda = LDA(n_components = n_topics)
lda_array = lda.fit_transform(tfidf)

(pd.
 DataFrame(lda_array, columns=[f'Topic {i}' for i in range(1,n_topics+1)]).
 sort_values('Topic 1', ascending=False)
)

Unnamed: 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,Topic 9,Topic 10
10777,0.734471,0.029499,0.029499,0.029539,0.029499,0.029499,0.029499,0.029500,0.029499,0.029499
2002,0.734471,0.029499,0.029499,0.029539,0.029499,0.029499,0.029499,0.029500,0.029499,0.029499
6337,0.717504,0.031383,0.031383,0.031408,0.031383,0.031383,0.031383,0.031408,0.031383,0.031383
8215,0.694910,0.033880,0.033880,0.033943,0.033880,0.033880,0.033880,0.033985,0.033880,0.033880
2626,0.668398,0.036841,0.036841,0.036870,0.036841,0.036841,0.036842,0.036844,0.036841,0.036841
...,...,...,...,...,...,...,...,...,...,...
6440,0.002848,0.002849,0.002848,0.089367,0.002848,0.002849,0.002849,0.058905,0.002849,0.831787
6261,0.001767,0.001766,0.001766,0.063442,0.001768,0.005978,0.001766,0.060108,0.001766,0.859872
10650,0.001627,0.001625,0.001625,0.074243,0.001625,0.011961,0.001627,0.068471,0.001626,0.835570
498,0.001596,0.001599,0.001595,0.077980,0.001599,0.011007,0.001596,0.061116,0.001596,0.840316


In [168]:
features = vectorizer.get_feature_names_out()

for topic in range(n_topics):
    words = (pd.
             Series(lda.components_[topic], index=features).
             sort_values(ascending=False).
             iloc[:5].
             index
            )
    words = list(words)
    print(words)

['hawks', 'duo', 'captain', 'riders', 'steam']
['dtmedin', 'catbyte', 'ingr', 'b30', '205']
['ites', 'winbench', 'headline', 'cobb', 'mumble']
['would', 'one', 'get', 'know', 'like']
['ax', 'ico', 'bronx', 'bobbe', 'queens']
['lib', 'islanders', 'feustel', 'joystick', 'mcguire']
['keller', 'ivy', 'kkeller', 'quakers', 'upenn']
['god', 'people', 'one', 'would', 'think']
['geb', 'cadre', 'shameful', 'n3jxp', 'chastity']
['ditto', 'pkp', 'adl', 'adcom', 'liner']
