In [1]:
from warnings import filterwarnings
# Install a blanket "ignore" filter: no category or message pattern is given, so every
# warning raised by the cells below is silenced.
# NOTE(review): this also hides library diagnostics (e.g. deprecation or convergence
# warnings); consider narrowing the filter to specific categories.
filterwarnings("ignore")

In [2]:
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

from machine_learning.utils.utils_nlp import remove_email, remove_newline_char, remove_single_quote
from machine_learning.utils.utils_io import read_lines_from_text_file

In [3]:
# Directory (relative to the notebook's working directory) that holds the stop-word list
# and the newsgroups JSON file loaded in the following cells.
data_nlp_dirpath = "../data/nlp"

In [4]:
# Load the English stop-word list; it is later passed to TfidfVectorizer(stop_words=...).
# NOTE(review): read_lines_from_text_file is a project helper — presumably it returns one
# string per line of the file; confirm that trailing newlines are stripped.
stopwords_filename = "stopwords_english.txt"
stopwords_filepath = os.path.join(data_nlp_dirpath, stopwords_filename)
stopwords = read_lines_from_text_file(stopwords_filepath)

### data

In [5]:
# The newsgroups dataset is NOT shipped with this package (it would add 22.2MB).
# Download it from
#   https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json
# and place it in `data_nlp_dirpath` before running this cell.

newgroups_filename = "newsgroups.json"
newgroups_filepath = os.path.join(data_nlp_dirpath, newgroups_filename)

# The file is absent on a fresh checkout, so fail early with download instructions
# instead of relying on whatever error the JSON reader produces for a missing path.
if not os.path.isfile(newgroups_filepath):
    raise FileNotFoundError(
        f"{newgroups_filepath} not found; download newsgroups.json from "
        "https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json "
        f"and save it in {data_nlp_dirpath}"
    )

# One row per post; the preview shows the columns content, target and target_names.
df = pd.read_json(newgroups_filepath)
df.head()

Unnamed: 0,content,target,target_names
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7,rec.autos
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4,comp.sys.mac.hardware
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4,comp.sys.mac.hardware
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,1,comp.graphics
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14,sci.space


### preprocessing

In [5]:
# Pull the raw post text (the "content" column) out of the frame as a plain Python list.
docs = df["content"].tolist()

In [6]:
# Run every document through the project helper `remove_email` (going by its name it drops
# e-mail addresses; the docs[0] output further down no longer shows the sender address).
docs = list(map(remove_email, docs))

In [7]:
# Run every document through the project helper `remove_newline_char` (going by its name it
# removes newline characters; the docs[0] output further down is a single line of text).
docs = list(map(remove_newline_char, docs))

In [8]:
# Run every document through the project helper `remove_single_quote` (going by its name it
# deletes apostrophes; e.g. "where's" appears as "wheres" in the docs[0] output further down).
docs = list(map(remove_single_quote, docs))

### vectorization

In [9]:
# Build the TF-IDF document-term matrix, dropping the custom stop words loaded earlier.
# NOTE(review): apostrophes were stripped from the documents before this step, so
# contractions become tokens such as "dont" (it shows up among topic 0's top terms in the
# output below). If the stop-word file lists contractions with apostrophes they will not
# be filtered here — verify against the stop-word file.
vectorizer = TfidfVectorizer(stop_words=stopwords)
X = vectorizer.fit_transform(docs)
X.shape  # (number of documents, number of vocabulary terms)

(11314, 105745)

### model

In [10]:
# Factorise the TF-IDF matrix with non-negative matrix factorisation into 20 components
# (the topics). random_state is pinned for reproducibility and max_iter=500 caps the
# number of solver iterations.
model = NMF(n_components=20, max_iter=500, random_state=0)
model.fit(X)

NMF(max_iter=500, n_components=20, random_state=0)

In [11]:
# Document-topic weights: one row per document, one column per NMF component.
# NOTE(review): transform() solves for W again with the fitted components held fixed;
# NMF.fit_transform(X) would return the W obtained during fitting in a single pass —
# consider using it if the training-time weights are what is wanted.
W = model.transform(X)
W.shape

(11314, 20)

In [12]:
# Topic-term weights: one row per NMF component, one column per vocabulary term.
H = model.components_
H.shape

(20, 105745)

In [13]:
# Vocabulary terms in the column order of X (and therefore of H); used below to label
# the columns of the topic-term matrix.
columns = vectorizer.get_feature_names_out()

In [14]:
# Top terms per topic: label H's columns with the vocabulary, then walk the topic rows and
# print the 10 terms carrying the largest weights in each. (NMF weights are non-negative
# loadings rather than probabilities, despite the wording of the printed heading.)
H_df = pd.DataFrame(H, columns=columns)
for topic, topic_words in H_df.iterrows():
    print(f"most probable words in topic {topic} are")
    print(topic_words.nlargest(10))
    print('\n')

most probable words in topic 0 are
car       1.154409
bike      0.650312
one       0.500093
like      0.489897
get       0.488471
dont      0.454059
good      0.453161
cars      0.446417
would     0.396829
writes    0.396726
Name: 0, dtype: float64


most probable words in topic 1 are
thanks         0.870254
please         0.863548
mail           0.758509
graphics       0.666341
help           0.612398
anyone         0.571892
know           0.463541
would          0.449389
information    0.445487
email          0.445433
Name: 1, dtype: float64


most probable words in topic 2 are
key           0.957004
clipper       0.731107
encryption    0.720083
chip          0.707932
escrow        0.432508
keys          0.410442
government    0.301689
algorithm     0.278587
crypto        0.267934
security      0.255549
Name: 2, dtype: float64


most probable words in topic 3 are
god           1.508262
jesus         0.760143
bible         0.487175
christians    0.430767
christian     0.403461
faith  

In [15]:
# Topic sizes: assign each document (row of W) to the component where its weight peaks,
# then count how many documents land on each topic.
# Note: idxmax returns the first maximal column, so a row of all zeros counts as topic 0.
W_df = pd.DataFrame(W)
dominant_topic = W_df.idxmax(axis="columns")
dominant_topic.value_counts()

7     1332
12    1138
0     1002
1      949
6      940
3      896
16     675
13     640
2      544
10     525
15     524
17     400
19     399
5      291
11     254
8      180
9      171
14     169
18     163
4      122
dtype: int64

### prediction

In [16]:
# Take the first preprocessed document as the example input for topic inference.
doc = docs[0]
doc

'From: (wheres my thing) Subject: WHAT car is this!? Nntp-Posting-Host: rac3.wam.umd.edu Organization: University of Maryland, College Park Lines: 15 I was wondering if anyone out there could enlighten me on this car I saw the other day. It was a 2-door sports car, looked to be from the late 60s/ early 70s. It was called a Bricklin. The doors were really small. In addition, the front bumper was separate from the rest of the body. This is all I know. If anyone can tellme a model name, engine specs, years of production, where this car is made, history, or whatever info you have on this funky looking car, please e-mail. Thanks, - IL ---- brought to you by your neighborhood Lerxst ---- '

In [17]:
# Vectorise the example with the already-fitted TF-IDF vocabulary (transform, not fit),
# select its single row, and project that row onto the fitted NMF components.
vectors = vectorizer.transform([doc])
vector = vectors[0]
w = model.transform(vector)  # topic weights for this one document

In [18]:
# w holds the weights of a single document, so the position of its largest entry is the
# index of the topic with the highest weight for the example.
topic = w.argmax()
topic

0

In [19]:
# Ten highest-weighted terms of the topic predicted for the example document.
H_df.iloc[topic].nlargest(10)

car       1.154409
bike      0.650312
one       0.500093
like      0.489897
get       0.488471
dont      0.454059
good      0.453161
cars      0.446417
would     0.396829
writes    0.396726
Name: 0, dtype: float64