In [None]:
#Here we will use sparse TF-IDF word features from
#the 20 Newsgroups corpus to show how we might classify
#these short documents into categories.
from sklearn.datasets import fetch_20newsgroups
# Load the training split (the loader's default) of the full corpus and
# display the names of all the newsgroups it contains.
data = fetch_20newsgroups(subset='train')
data.target_names

In [None]:
# Restrict the problem to four newsgroups.
categories = [
    'talk.religion.misc',
    'soc.religion.christian',
    'sci.space',
    'comp.graphics',
]

# Fetch the train and test splits (in that order) for just those newsgroups.
train, test = (
    fetch_20newsgroups(subset=split, categories=categories)
    for split in ('train', 'test')
)

In [None]:
# Print the raw text of one training document (index 8) to see what kind of
# input the classifier is fitted on.
print(train.data[8])

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
# Two-stage pipeline: raw strings -> TF-IDF vectors -> k-nearest-neighbours
# classifier, both with their default settings.
model = make_pipeline(
    TfidfVectorizer(),
    KNeighborsClassifier(),
)

In [None]:
# Fit the whole pipeline on the training posts, then label every test post.
# fit() hands back the fitted pipeline, so the two steps can be chained.
labels = model.fit(train.data, train.target).predict(test.data)

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
# Confusion matrix in scikit-learn's layout: rows are the true labels,
# columns are the predicted labels.
mat = confusion_matrix(test.target, labels)

# The matrix is transposed for plotting, so the horizontal axis carries the
# true category and the vertical axis the predicted one. Label both axes so
# the figure cannot be misread.
sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False,
            xticklabels=train.target_names, yticklabels=train.target_names)
plt.xlabel('true label')
plt.ylabel('predicted label');  # trailing ';' hides the text repr in the cell output


In [None]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 on the test split. Passing target_names
# labels each row with its newsgroup instead of a bare class index, and the
# text is stored under its own name so `mat` (the confusion matrix) is not
# overwritten with a string.
report = classification_report(test.target, labels,
                               target_names=train.target_names)
print(report)

In [None]:
#The very cool thing here is that we now have the tools
#to determine the category for any string, using the
#predict() method of this pipeline.
#Here's a quick utility function that will return the
#prediction for a single string:

In [None]:
def predict_category(s, train=train, model=model):
    """Return the category name the pipeline predicts for a single string.

    Parameters
    ----------
    s : str
        Text to classify.
    train : optional
        Object whose ``target_names`` sequence maps a predicted label index
        to its name. Defaults to the ``train`` value that existed when this
        function was defined.
    model : optional
        Object with a ``predict`` method that accepts a list of strings.
        Defaults to the ``model`` value that existed when this function was
        defined.

    Returns
    -------
    The entry of ``train.target_names`` at the predicted index.
    """
    # predict() works on a batch, so wrap the string in a one-item list and
    # take the first (only) prediction.
    label_index = model.predict([s])[0]
    return train.target_names[label_index]

In [None]:
# Space-themed headline; 'sci.space' is one of the four categories the model
# was trained on.
predict_category('ISRO launched a new satellite')

In [None]:
# Religious phrase; two of the four training categories are religion newsgroups.
predict_category('God is great')

In [None]:
# NOTE(review): presumably probes a phrase whose words are unlikely to occur
# in the training posts — confirm the intent of this example.
predict_category('Jai Ganga Maiya')

In [None]:
# Headline on a topic (organ transplants) that none of the four training
# categories covers; the function still returns one of those four names.
predict_category('Meet The Transplant Patients Armed With A New Lease Of Life')

In [None]:
# Edge case: empty string.
# NOTE(review): with no tokens to vectorize, the returned category is
# presumably arbitrary — confirm how the pipeline treats this input.
predict_category('')

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Toy corpus: four very short documents that share a few words.
corpus = [
    'IIT Roorkee.',
    'Best IIT.',
    'IIT Delhi.',
    'Capital Delhi',
]

# Fit the vectorizer on the corpus and encode the corpus in one step.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

# Terms that the columns of X correspond to.
tfidf_tokens = vectorizer.get_feature_names_out()
tfidf_tokens

In [None]:
import pandas as pd

# Show the TF-IDF matrix as a table: one row per document, one column per
# vocabulary term. Row labels are generated from the number of rows in X
# rather than hard-coded, so the cell keeps working if the corpus changes
# size (a fixed four-item index would raise on a length mismatch).
result = pd.DataFrame(
    data=X.toarray(),
    index=[f"Doc{i}" for i in range(1, X.shape[0] + 1)],
    columns=tfidf_tokens,
)

result