# Dimension reduction, KMeans clustering: Wikipedia articles

## Import and preprocessing

In [5]:
import pandas as pd
# Read the vectors file, using its first column as the row index;
# each remaining column is one Wikipedia article.
df = pd.read_csv(filepath_or_buffer='wikipedia-vectors.csv', index_col=0)
df.head(n=5)

Unnamed: 0,HTTP 404,Alexa Internet,Internet Explorer,HTTP cookie,Google Search,Tumblr,Hypertext Transfer Protocol,Social search,Firefox,LinkedIn,...,Chad Kroeger,Nate Ruess,The Wanted,Stevie Nicks,Arctic Monkeys,Black Sabbath,Skrillex,Red Hot Chili Peppers,Sepsis,Adam Levine
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.008878,0.0,0.0,0.049502,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00611,0.0
2,0.0,0.029607,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.005646,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Article titles are the column labels of the loaded frame
titles = df.columns.tolist()

In [19]:
# Flip the frame so that each row is one article and each column one feature
articles = df.T
articles.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13115,13116,13117,13118,13119,13120,13121,13122,13123,13124
HTTP 404,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Alexa Internet,0.0,0.0,0.029607,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Internet Explorer,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003772,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.011594,0.0,0.0
HTTP cookie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Google Search,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006649,0.0


In [16]:
# Check the orientation after transposing: (number of rows, number of columns)
articles.shape

(60, 13125)

## Pipeline: TruncatedSVD, KMeans

In [20]:
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline

# Both steps are stochastic (randomized SVD solver, random k-means
# initialisation), so a fixed seed is passed to each one to keep the cluster
# assignments identical across Restart & Run All.

# Create a TruncatedSVD instance: svd
svd = TruncatedSVD(n_components=50, random_state=42)

# Create a KMeans instance: kmeans
kmeans = KMeans(n_clusters=6, random_state=42)

# Create a pipeline that reduces dimension first, then clusters: pipeline
pipeline = make_pipeline(svd, kmeans)

## Fit and prediction

In [21]:
# Fit the SVD + KMeans pipeline on the article vectors
pipeline.fit(articles)

# Predict the cluster label of every article: labels
labels = pipeline.predict(articles)

# Pair each cluster label with its article title: df2
df2 = pd.DataFrame(dict(label=labels, article=titles))

# Display df2 ordered by cluster label
print(df2.sort_values(by='label'))

                                          article  label
19  2007 United Nations Climate Change Conference      0
18  2010 United Nations Climate Change Conference      0
17  Greenhouse gas emissions by the United States      0
16                                        350.org      0
15                                 Kyoto Protocol      0
14                                 Climate change      0
13                               Connie Hedegaard      0
12                                   Nigel Lawson      0
11       Nationally Appropriate Mitigation Action      0
10                                 Global warming      0
58                                         Sepsis      1
59                                    Adam Levine      1
50                                   Chad Kroeger      1
51                                     Nate Ruess      1
52                                     The Wanted      1
53                                   Stevie Nicks      1
54                             

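We see that the predicted labels correspond to "topics" (the label numbers themselves are arbitrary and can change from run to run unless the random seed is fixed): 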
- label 5 --> Articles about Web media
- label 4 --> Articles about actors
- label 0 --> Climate related articles

## Dimension reduction using NMF

In [22]:
# Import NMF
from sklearn.decomposition import NMF

# Create an NMF instance with a fixed seed so the factorisation is reproducible: model
model = NMF(n_components=6, random_state=42)

# Fit the model to articles and get their NMF features in one pass: nmf_features
# (fit_transform avoids the second solve that fit() followed by transform()
# performs on the same data; `model` is still fitted afterwards)
nmf_features = model.fit_transform(articles)

# Print the NMF features (one row per article, one column per component)
print(nmf_features)

[[  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   4.40559253e-01]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   5.66725985e-01]
 [  3.82045893e-03   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   3.98731774e-01]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   3.81821251e-01]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   4.85620851e-01]
 [  1.29287532e-02   1.37892603e-02   7.76334600e-03   3.34405862e-02
    0.00000000e+00   3.34593801e-01]
 [  0.00000000e+00   0.00000000e+00   2.06746015e-02   0.00000000e+00
    6.04415019e-03   3.59137531e-01]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   4.91081644e-01]
 [  1.54270124e-02   1.42820144e-02   3.76640434e-03   2.37055623e-02
    2.62587806e-02   4.80877246e-01]
 [  1.11735861e-02   3.13684253e-02  

In [27]:
# Import pandas
import pandas as pd

# Wrap the NMF features in a DataFrame indexed by article title: df
# NOTE(review): this rebinds `df`, which earlier held the frame read from the
# CSV; re-running the `titles = list(df.columns)` cell after this one would no
# longer yield article titles.
df = pd.DataFrame(data=nmf_features, index=titles)

df.head(n=5)

Unnamed: 0,0,1,2,3,4,5
HTTP 404,0.0,0.0,0.0,0.0,0.0,0.440559
Alexa Internet,0.0,0.0,0.0,0.0,0.0,0.566726
Internet Explorer,0.00382,0.0,0.0,0.0,0.0,0.398732
HTTP cookie,0.0,0.0,0.0,0.0,0.0,0.381821
Google Search,0.0,0.0,0.0,0.0,0.0,0.485621


In [26]:
# Print the NMF feature rows for 'Anne Hathaway' and 'Denzel Washington',
# one after the other, to compare which component dominates for each actor
print(df.loc['Anne Hathaway'], df.loc['Denzel Washington'], sep='\n')

0    0.003845
1    0.000000
2    0.000000
3    0.575575
4    0.000000
5    0.000000
Name: Anne Hathaway, dtype: float64
0    0.000000
1    0.005601
2    0.000000
3    0.422280
4    0.000000
5    0.000000
Name: Denzel Washington, dtype: float64


In [None]:
# To be continued