# Dimension reduction, KMeans clustering: Wikipedia articles

## Import and preprocessing

In [87]:
import pandas as pd

df_temp = pd.read_csv('wikipedia-vectors.csv', index_col=0)

words = pd.read_csv('wikipedia-vocabulary-utf8.txt', header=None).values.flatten().tolist()

df = pd.DataFrame(df_temp.transpose().values, index=df_temp.transpose().index, columns=words)

df.head()

Unnamed: 0,aaron,abandon,abandoned,abandoning,abandonment,abbas,abbey,abbreviated,abbreviation,abc,...,zealand,zenith,zeppelin,zero,zeus,zimbabwe,zinc,zone,zones,zoo
HTTP 404,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Alexa Internet,0.0,0.0,0.029607,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Internet Explorer,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003772,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.011594,0.0,0.0
HTTP cookie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Google Search,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006649,0.0


In [96]:
from scipy.sparse import csr_matrix

articles = csr_matrix(df) # The words were stored as rows (13124) in the csv 
                                      # because of the limitation of columns a csv file can have.
titles = list(df.index)

In [98]:
articles.shape

(60, 13125)

## Pipeline: TruncatedSVD, KMeans

In [99]:
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline

# Create a TruncatedSVD instance: svd
svd = TruncatedSVD(n_components=50)

# Create a KMeans instance: kmeans
kmeans = KMeans(n_clusters=6)

# Create a pipeline: pipeline
pipeline = make_pipeline(svd,kmeans)

## Fit and prediction

In [100]:
# Fit the model
pipeline.fit(articles)

# Calculate the cluster labels: labels
labels = pipeline.predict(articles)

# Create a DataFrame aligning labels and titles: df
df2 = pd.DataFrame({'label': labels, 'article': titles})

# Display df sorted by cluster label
print(df2.sort_values('label'))

                                          article  label
59                                    Adam Levine      0
57                          Red Hot Chili Peppers      0
56                                       Skrillex      0
55                                  Black Sabbath      0
54                                 Arctic Monkeys      0
53                                   Stevie Nicks      0
52                                     The Wanted      0
51                                     Nate Ruess      0
50                                   Chad Kroeger      0
58                                         Sepsis      0
30                  France national football team      1
31                              Cristiano Ronaldo      1
32                                   Arsenal F.C.      1
33                                 Radamel Falcao      1
37                                       Football      1
35                Colombia national football team      1
36              2014 FIFA World

We see that the predicted labels correspond to "topics": 
- label 5 --> Articles about illnesses
- label 4 --> Articles about climate related articles
- label 1 --> Articles about football

## Dimension reduction using NMF

In [110]:
# Import NMF
from sklearn.decomposition import NMF

# Create an NMF instance: model
model = NMF(n_components=6)

# Fit the model to articles
model.fit(articles)

# Transform the articles: nmf_features
nmf_features = model.transform(articles)

# Print the NMF features
print(nmf_features.shape)
print(nmf_features)

(60, 6)
[[  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   4.40423830e-01]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   5.66551183e-01]
 [  3.82042152e-03   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   3.98609027e-01]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   3.81703888e-01]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   4.85470853e-01]
 [  1.29286228e-02   1.37889158e-02   7.76354041e-03   3.34417657e-02
    0.00000000e+00   3.34490345e-01]
 [  0.00000000e+00   0.00000000e+00   2.06749377e-02   0.00000000e+00
    6.04261533e-03   3.59027307e-01]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   4.90929922e-01]
 [  1.54268658e-02   1.42816626e-02   3.76647385e-03   2.37063083e-02
    2.62522448e-02   4.80729170e-01]
 [  1.11734709e-02   3.136764

In [103]:
# Import pandas
import pandas as pd

# Create a pandas DataFrame: df
df = pd.DataFrame(nmf_features, index=titles)

df.head()

Unnamed: 0,0,1,2,3,4,5
HTTP 404,0.0,0.0,0.0,0.0,0.0,0.440585
Alexa Internet,0.0,0.0,0.0,0.0,0.0,0.566758
Internet Explorer,0.003821,0.0,0.0,0.0,0.0,0.398755
HTTP cookie,0.0,0.0,0.0,0.0,0.0,0.381844
Google Search,0.0,0.0,0.0,0.0,0.0,0.485648


In [9]:
# Print the row for 'Anne Hathaway'
print(df.loc['Anne Hathaway'])

# Print the row for 'Denzel Washington'
print(df.loc['Denzel Washington'])

0    0.003845
1    0.000000
2    0.000000
3    0.575645
4    0.000000
5    0.000000
Name: Anne Hathaway, dtype: float64
0    0.000000
1    0.005601
2    0.000000
3    0.422332
4    0.000000
5    0.000000
Name: Denzel Washington, dtype: float64


We see the 3rd NMF feature is high for those two rows.  Let's see which topics are behind this NMF feature.

## Identifying the topics

In [108]:
words = pd.read_csv('wikipedia-vocabulary-utf8.txt', header=None).values.flatten().tolist()

# Create a DataFrame: components_df
components_df = pd.DataFrame(model.components_, columns=words)

components_df.shape

(6, 13125)

In [105]:
# Print the shape of the DataFrame
print(components_df.shape)

# Select row 3: component
component = components_df.iloc[3,:]

# Print result of nlargest
print(component.nlargest())

(6, 13125)
film       0.627961
award      0.253165
starred    0.245317
role       0.211479
actress    0.186423
Name: 3, dtype: float64
