From: https://scikit-learn.org/0.19/datasets/twenty_newsgroups.html
Also, great explanation at: https://krakensystems.co/blog/2018/text-classification

In [8]:
import seaborn as sns
import numpy as np
from nltk.corpus import stopwords
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif, f_classif

In [2]:
from sklearn.datasets import fetch_20newsgroups

# Full, unfiltered training split; later cells read its target_names.
newsgroups_train_full = fetch_20newsgroups(subset="train")

In [3]:
from pprint import pprint

# Pretty-print the category labels of the full training split.
pprint([*newsgroups_train_full.target_names])

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']


In [5]:
def get_clean_data(categories):
    """Fetch the 20 Newsgroups splits for ``categories`` and TF-IDF the training text.

    Headers, footers and quotes are stripped from both splits through the
    ``remove`` argument of ``fetch_20newsgroups``. The vectorizer uses NLTK's
    English stop-word list and is fitted on the training documents only.

    Parameters
    ----------
    categories
        Category names forwarded to ``fetch_20newsgroups(categories=...)``.

    Returns
    -------
    tuple
        ``(newsgroups_train, newsgroups_test, vectorizer, vectors)`` where
        ``vectors`` is ``vectorizer.fit_transform(newsgroups_train.data)``.
        The test split is returned as fetched; it is not vectorized here.
    """
    # Local import: the vectorizer class is not imported by any other cell.
    from sklearn.feature_extraction.text import TfidfVectorizer

    # Same stripping for both splits, so define it once.
    strip_parts = ('headers', 'footers', 'quotes')
    newsgroups_train = fetch_20newsgroups(subset='train',
                                          remove=strip_parts,
                                          categories=categories)
    newsgroups_test = fetch_20newsgroups(subset='test',
                                         remove=strip_parts,
                                         categories=categories)

    vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'))
    vectors = vectorizer.fit_transform(newsgroups_train.data)
    return newsgroups_train, newsgroups_test, vectorizer, vectors


# Part A: Top 10 Words for Each Category

In [6]:
# Vectorise the cleaned training posts for every newsgroup category.
all_categories = [name for name in newsgroups_train_full.target_names]
(newsgroups_train,
 newsgroups_test,
 vectorizer,
 vectors) = get_clean_data(all_categories)

In [15]:
#Chi
# One-vs-rest chi-squared: score every term against "is this document in
# category i?" and print the 10 highest-scoring terms per category.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when the installed version provides it, otherwise fall back.
get_terms = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
feature_names = get_terms()  # identical for every category, so fetch it once
# NOTE(review): "cagetory" below is a typo for "category"; left unchanged so the
# printed label still matches the recorded output.
for i, category in enumerate(all_categories):
    chi2score = chi2(vectors, newsgroups_train.target == i)[0]
    wscores = zip(feature_names, chi2score)
    print("Top 10 words for cagetory", category)
    wchi2 = sorted(wscores, key=lambda x: x[1], reverse=True)
    # Slice instead of indexing 0..9 so a short vocabulary cannot raise IndexError.
    for entry in wchi2[:10]:
        print(entry)
    print()

Top 10 words for cagetory alt.atheism
('atheism', 176.85501897011324)
('atheists', 110.29131274035525)
('islam', 106.74709797200957)
('bobby', 86.2231549557556)
('bobbe', 75.84363337331591)
('deletion', 75.59327356853609)
('motto', 74.88318245770972)
('beauchaine', 72.95182426050978)
('ico', 72.37145228762935)
('queens', 69.72528712142856)

Top 10 words for cagetory comp.graphics
('graphics', 216.75816137820803)
('3d', 130.04641899425374)
('image', 121.52669323242205)
('tiff', 104.95242585266338)
('cview', 95.96490585102153)
('polygon', 89.27040476200257)
('pov', 83.1231676364404)
('animation', 68.88612414523175)
('images', 67.22508180251018)
('format', 63.72066406079456)

Top 10 words for cagetory comp.os.ms-windows.misc
('windows', 436.7932997208241)
('ax', 174.3722186934)
('cica', 136.95896741436033)
('ini', 107.74693057626072)
('file', 97.31400197576414)
('dos', 93.24816587167086)
('win3', 84.82684060963359)
('drivers', 73.97098736724139)
('fonts', 70.40082892304709)
('files', 68.3

In [None]:
#Mutual Information Gain
# One-vs-rest mutual information: score every term against "is this document
# in category i?" and print the 10 highest-scoring terms per category.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when the installed version provides it, otherwise fall back.
get_terms = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
feature_names = get_terms()  # identical for every category, so fetch it once
# NOTE(review): the recorded top terms are mostly generic words ('one', 'would', ...).
# scikit-learn documents discrete_features='auto' as True for sparse X — confirm that
# treating the TF-IDF weights as discrete values is what is intended here.
# NOTE(review): "cagetory" below is a typo for "category"; left unchanged so the
# printed label still matches the recorded output.
for i, category in enumerate(all_categories):
    mutual_info_score = mutual_info_classif(vectors, newsgroups_train.target == i)
    wscores = zip(feature_names, mutual_info_score)
    print("Top 10 words for cagetory", category)
    wmutual = sorted(wscores, key=lambda x: x[1], reverse=True)
    # Slice instead of indexing 0..9 so a short vocabulary cannot raise IndexError.
    for entry in wmutual[:10]:
        print(entry)
    print()

Top 10 words for cagetory alt.atheism
('one', 0.0580318295197702)
('would', 0.05043179541896044)
('people', 0.043403398558661044)
('think', 0.040205726417214906)
('like', 0.0378694529621343)
('know', 0.035189002998355715)
('say', 0.03340902622928285)
('could', 0.033241170640713816)
('well', 0.03157989681027375)
('even', 0.03089999131491033)

Top 10 words for cagetory comp.graphics
('would', 0.0521831507007433)
('one', 0.04422913093847532)
('know', 0.043769014478247374)
('like', 0.04206383871866058)
('thanks', 0.04023663455968871)
('graphics', 0.036166310467123826)
('anyone', 0.035785928979683565)
('use', 0.03425463465514558)
('get', 0.03344597808196246)
('also', 0.031012151131448614)

Top 10 words for cagetory comp.os.ms-windows.misc
('windows', 0.08247100680944328)
('one', 0.04652017696180564)
('use', 0.04342804269633692)
('would', 0.04181746748402719)
('know', 0.04143920659457999)
('like', 0.04132759587822103)
('get', 0.04085254925460213)
('thanks', 0.03611408350497898)
('using', 0.0

In [10]:
#Anova
# One-vs-rest ANOVA F-test: score every term against "is this document in
# category i?" and print the 10 highest-scoring terms per category.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when the installed version provides it, otherwise fall back.
get_terms = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
feature_names = get_terms()  # identical for every category, so fetch it once
# NOTE(review): "cagetory" below is a typo for "category"; left unchanged so the
# printed label still matches the recorded output.
for i, category in enumerate(all_categories):
    f_classif_score = f_classif(vectors, newsgroups_train.target == i)[0]
    wscores = zip(feature_names, f_classif_score)
    print("Top 10 words for cagetory", category)
    wf_classif = sorted(wscores, key=lambda x: x[1], reverse=True)
    # Slice instead of indexing 0..9 so a short vocabulary cannot raise IndexError.
    for entry in wf_classif[:10]:
        print(entry)
    print()

Top 10 words for cagetory alt.atheism
('atheism', 725.22785732133)
('atheists', 488.891000306181)
('islam', 446.0377966165041)
('bobby', 442.23748977891876)
('atheist', 414.1242356891276)
('religion', 401.9408447969025)
('bobbe', 373.647973004705)
('ico', 360.9607302266102)
('beauchaine', 351.3953300574928)
('sank', 347.4982862977032)

Top 10 words for cagetory comp.graphics
('graphics', 1334.8292632400237)
('3d', 697.1210636925315)
('image', 692.6646421934731)
('images', 458.27846541974526)
('format', 347.17089277266814)
('animation', 295.82886516579913)
('pov', 293.58642419065484)
('tiff', 280.31896406222205)
('files', 275.9335591912293)
('polygon', 275.7496478458294)

Top 10 words for cagetory comp.os.ms-windows.misc
('windows', 3087.9715351044424)
('cica', 679.4685636152511)
('file', 536.6323472061878)
('dos', 453.7328503770563)
('files', 371.1380945972807)
('ini', 354.8133963826362)
('drivers', 337.0742993200066)
('driver', 333.54455787143917)
('win3', 326.47938725063335)
('micros

# Part B: Top 200 Words Overall

In [18]:
#Chi
# Chi-squared score of every term against the multi-class target, then the
# 200 highest-scoring terms.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when the installed version provides it, otherwise fall back.
get_terms = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
chi2score = chi2(vectors, newsgroups_train.target)[0]
wscores = zip(get_terms(), chi2score)
print("Top 200 words for overall")
wchi2 = sorted(wscores, key=lambda x: x[1], reverse=True)
# Slice instead of indexing 0..199 so a short vocabulary cannot raise IndexError.
for entry in wchi2[:200]:
    print(entry)

Top 200 words for overall
('israel', 518.4165283851895)
('god', 514.1261540021537)
('bike', 503.8445171018337)
('windows', 477.0045102203724)
('encryption', 456.6711865551086)
('car', 430.54520354246716)
('key', 414.90044456287274)
('clipper', 404.970515098119)
('space', 399.2681834582319)
('gun', 368.5908330150317)
('hockey', 351.0024721592116)
('israeli', 349.02785109629747)
('jesus', 324.82964866117703)
('sale', 321.6188055339723)
('window', 312.6277497142652)
('armenian', 298.47061680035154)
('motif', 295.2201087060377)
('team', 286.4675614321363)
('mac', 283.8825387209641)
('apple', 281.05589195707364)
('armenians', 272.5423884762631)
('nsa', 265.0556123864018)
('scsi', 259.5937970682129)
('guns', 247.67855904133359)
('game', 247.02248076513206)
('turkish', 243.05825337359732)
('church', 240.84521514528353)
('arab', 235.95869412516413)
('keys', 235.84157447813587)
('nhl', 235.54273430947967)
('ide', 232.22143765538112)
('graphics', 231.9896720170852)
('msg', 229.39207415973797)
('

In [17]:
#Anova
# ANOVA F-score of every term against the multi-class target, then the
# 200 highest-scoring terms.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when the installed version provides it, otherwise fall back.
get_terms = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
f_classif_score = f_classif(vectors, newsgroups_train.target)[0]
wscores = zip(get_terms(), f_classif_score)
print("Top 200 words for overall")
wf_classif = sorted(wscores, key=lambda x: x[1], reverse=True)
# Slice instead of indexing 0..199 so a short vocabulary cannot raise IndexError.
for entry in wf_classif[:200]:
    print(entry)

Top 200 words for overall
('windows', 181.77251261558595)
('god', 160.16862347908142)
('sale', 152.29981407858475)
('bike', 146.86500725391124)
('car', 142.47189784706518)
('clipper', 134.83282639941007)
('space', 132.2293397016866)
('israel', 128.42792554250536)
('encryption', 125.86657541360135)
('key', 117.86894410905703)
('gun', 117.23311064910975)
('hockey', 110.87310313074491)
('israeli', 105.16614404430636)
('team', 104.20086841074655)
('mac', 88.48453011101707)
('apple', 88.23720537659942)
('jesus', 82.12828180769517)
('game', 81.60054834701296)
('offer', 79.23264733164638)
('armenians', 76.01726985923136)
('christ', 75.93715503109578)
('graphics', 75.69894645214738)
('nsa', 75.0561605940821)
('geb', 74.63482595829049)
('guns', 74.48632009231635)
('window', 73.49623613804721)
('keys', 73.26182579134907)
('christians', 71.94477089672785)
('chastity', 71.21151784302286)
('n3jxp', 71.21151784302286)
('dsl', 70.66358856784787)
('armenian', 70.4604345648466)
('secure', 69.4423052344

In [None]:
#Mutual Info Gain
# Mutual information of every term with the multi-class target, then the
# 200 highest-scoring terms.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when the installed version provides it, otherwise fall back.
get_terms = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
mutual_info_score = mutual_info_classif(vectors, newsgroups_train.target)
wscores = zip(get_terms(), mutual_info_score)
print("Top 200 words for overall")
# These statements were indented under no enclosing block, which made the whole
# cell fail with IndentationError; they belong at top level.
wmutual = sorted(wscores, key=lambda x: x[1], reverse=True)
# Slice instead of indexing 0..199 so a short vocabulary cannot raise IndexError.
for entry in wmutual[:200]:
    print(entry)

# Part C - K Means

In [25]:
#based on https://colab.research.google.com/drive/1ocp2SGbiMxUkhAvkRDw_IP9GRi0XgNoR?usp=sharing#scrollTo=FtwT01ORh5mz
from sklearn.cluster import KMeans, MiniBatchKMeans

# Cluster the TF-IDF vectors into 20 groups and print each centroid's 10
# highest-weighted terms.
# With n_init=1 a single random initialisation decides the result, so pin the
# seed to make the clusters (and the printed terms) reproducible across runs.
km = KMeans(n_clusters=20, init='k-means++', max_iter=100, n_init=1, random_state=42)

km.fit(vectors)
# Feature indices per cluster, ordered from largest to smallest centroid weight.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when the installed version provides it, otherwise fall back.
get_terms = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
terms = get_terms()
# Iterate over the model's own cluster count instead of repeating the literal 20.
for i in range(km.n_clusters):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind], end='')
    print()

Cluster 0: would one like know get people think use anyone could
Cluster 1: state motto deeds calverts femine maschii womanly fatti words parole
Cluster 2: incitement bureau investigation murder fbi gov federal internet love reading
Cluster 3: colorado davewood rex boulder wood cs david university edu geez
Cluster 4: jesus impact guards body testament stolen crucification record god written
Cluster 5: 6236 2436 puppies epstein evenings 706 techworks 895 70ns ea
Cluster 6: idling engine blipping gauging carburated fast straightened tach linkage exaggerated
Cluster 7: software portable claye tgs graphics hoops ithaca figaro hooks hart
Cluster 8: evidence arguable rigged garrett pathetic elections stole nixon liar election
Cluster 9: notes chur heathers lek life dylan lecture tons professor movie
Cluster 10: uninfluenced adj observable emotion distinguished prejudice phenomenon existance mental dictionary
Cluster 11: wright gene portable manufacturers modem know internal fax mac original


In [None]:
# Notebook display of the TF-IDF matrix returned by get_clean_data.
vectors

In [None]:
# Notebook display of the fitted TfidfVectorizer returned by get_clean_data.
vectorizer