In [67]:
import numpy as np
import pandas as pd

from scipy import sparse
from sklearn.decomposition import PCA
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE

In [68]:
twenty_train = fetch_20newsgroups(shuffle=True, random_state=42)

In [69]:
print(twenty_train.data[1])
print(twenty_train.target_names[1])

From: guykuo@carson.u.washington.edu (Guy Kuo)
Subject: SI Clock Poll - Final Call
Summary: Final call for SI clock reports
Keywords: SI,acceleration,clock,upgrade
Article-I.D.: shelley.1qvfo9INNc3s
Organization: University of Washington
Lines: 11
NNTP-Posting-Host: carson.u.washington.edu

A fair number of brave souls who upgraded their SI clock oscillator have
shared their experiences for this poll. Please send a brief message detailing
your experiences with the procedure. Top speed attained, CPU rated speed,
add on cards and adapters, heat sinks, hour of usage per day, floppy disk
functionality with 800 and 1.4 m floppies are especially requested.

I will be summarizing in the next two days, so please add to the network
knowledge base if you have done the clock upgrade and haven't answered this
poll. Thanks.

Guy Kuo <guykuo@u.washington.edu>

comp.graphics


In [70]:
for index, label in enumerate(twenty_train.target_names):
    print(index, label)

0 alt.atheism
1 comp.graphics
2 comp.os.ms-windows.misc
3 comp.sys.ibm.pc.hardware
4 comp.sys.mac.hardware
5 comp.windows.x
6 misc.forsale
7 rec.autos
8 rec.motorcycles
9 rec.sport.baseball
10 rec.sport.hockey
11 sci.crypt
12 sci.electronics
13 sci.med
14 sci.space
15 soc.religion.christian
16 talk.politics.guns
17 talk.politics.mideast
18 talk.politics.misc
19 talk.religion.misc


In [71]:
twenty_train.target

array([7, 4, 4, ..., 3, 1, 8])

In [72]:
count_vect = TfidfVectorizer(max_features=1000)
X_train_counts = count_vect.fit_transform(twenty_train.data)
new_X_train_counts = sparse.hstack((X_train_counts, twenty_train.target[:, None]))

In [73]:
data = new_X_train_counts.tocsr()

In [74]:
X = data[:, :-1].toarray()
Y = data[:, -1]

In [75]:
X.shape

(11314, 1000)

In [77]:
Y.shape

(11314, 1)

In [40]:
pca = PCA(n_components=30)
data_after_pca = pca.fit_transform(X)

In [41]:
tsne = TSNE(n_components=3)
data_after_tsne = tsne.fit_transform(data_after_pca)

In [42]:
data_after_tsne

array([[  6.217771  ,  -6.7369275 , -10.746488  ],
       [  8.411251  , -12.018542  ,   2.7920341 ],
       [ -7.006084  ,  11.076033  , -13.192918  ],
       ...,
       [  0.9320987 ,   9.026491  , -15.61674   ],
       [  5.934334  ,  -3.7138124 ,  -2.8639433 ],
       [  0.41098455, -25.034826  ,   1.7301081 ]], dtype=float32)

In [45]:
data_after_tsne.shape

(11314, 3)

In [51]:
df = pd.DataFrame(data_after_tsne, columns=['x','y','z'])

In [56]:
df['label'] = Y.toarray()

In [63]:
label_to_group = [5, 0, 0, 0, 0, 0, 3, 1, 1, 1, 1, 2, 2, 2, 2, 5, 4, 4, 4, 5]
def create_group_column(row):
    return label_to_group[int(row.label)]

In [64]:
df['group'] = df.apply(create_group_column, axis=1)

In [65]:
df

Unnamed: 0,x,y,z,label,group
0,6.217771,-6.736928,-10.746488,7.0,1
1,8.411251,-12.018542,2.792034,4.0,0
2,-7.006084,11.076033,-13.192918,4.0,0
3,-17.800188,7.637547,13.505413,1.0,0
4,0.702566,4.359663,-4.104277,14.0,2
5,4.796586,6.865203,3.421933,16.0,4
6,1.658136,-17.259863,1.769654,13.0,2
7,-9.736875,-36.209652,0.518749,3.0,0
8,0.898004,-15.206283,-3.053813,2.0,0
9,0.048843,-4.394516,-11.681096,4.0,0


In [66]:
df.to_csv('20newsgroups.csv', index=False)