# Dimension reduction, KMeans clustering: Wikipedia articles

## Import and preprocessing

In [1]:
import pandas as pd

df_temp = pd.read_csv('wikipedia-vectors.csv', index_col=0)

words = pd.read_csv('wikipedia-vocabulary-utf8.txt', header=None).values.flatten().tolist()

df = pd.DataFrame(df_temp.transpose().values, index=df_temp.transpose().index, columns=words)  
                                      # The words were stored as rows (13124) in the csv 
                                      # because of the limitation of columns a csv file can have, hence the transpose

df.head()

Unnamed: 0,aaron,abandon,abandoned,abandoning,abandonment,abbas,abbey,abbreviated,abbreviation,abc,...,zealand,zenith,zeppelin,zero,zeus,zimbabwe,zinc,zone,zones,zoo
HTTP 404,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Alexa Internet,0.0,0.0,0.029607,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Internet Explorer,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003772,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.011594,0.0,0.0
HTTP cookie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Google Search,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006649,0.0


In [2]:
from scipy.sparse import csr_matrix

articles = csr_matrix(df) 

titles = list(df.index)

In [3]:
articles.shape

(60, 13125)

## Pipeline: TruncatedSVD, KMeans

In [4]:
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline

# Create a TruncatedSVD instance: svd
svd = TruncatedSVD(n_components=50)

# Create a KMeans instance: kmeans
kmeans = KMeans(n_clusters=6)

# Create a pipeline: pipeline
pipeline = make_pipeline(svd,kmeans)

## Fit and prediction

In [5]:
# Fit the model
pipeline.fit(articles)

# Calculate the cluster labels: labels
labels = pipeline.predict(articles)

# Create a DataFrame aligning labels and titles: df
df2 = pd.DataFrame({'label': labels, 'article': titles})

# Display df sorted by cluster label
print(df2.sort_values('label'))

                                          article  label
29                               Jennifer Aniston      0
21                             Michael Fassbender      0
22                              Denzel Washington      0
23                           Catherine Zeta-Jones      0
24                                   Jessica Biel      0
20                                 Angelina Jolie      0
26                                     Mila Kunis      0
27                                 Dakota Fanning      0
28                                  Anne Hathaway      0
25                                  Russell Crowe      0
18  2010 United Nations Climate Change Conference      1
10                                 Global warming      1
11       Nationally Appropriate Mitigation Action      1
12                                   Nigel Lawson      1
13                               Connie Hedegaard      1
14                                 Climate change      1
15                             

We see that the predicted labels correspond to "topics": 
- label 5 --> Articles about illnesses
- label 4 --> Articles about climate related articles
- label 1 --> Articles about football

## Dimension reduction using NMF

In [6]:
# Import NMF
from sklearn.decomposition import NMF

# Create an NMF instance: model
model = NMF(n_components=6)

# Fit the model to articles
model.fit(articles)

# Transform the articles: nmf_features
nmf_features = model.transform(articles)

# Print the NMF features
print(nmf_features.shape)
print(nmf_features)

(60, 6)
[[  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   4.40529365e-01]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   5.66686813e-01]
 [  3.82108491e-03   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   3.98704441e-01]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   3.81795303e-01]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   4.85587227e-01]
 [  1.29307639e-02   1.37903748e-02   7.76331461e-03   3.34493314e-02
    0.00000000e+00   3.34570084e-01]
 [  0.00000000e+00   0.00000000e+00   2.06742394e-02   0.00000000e+00
    6.04567435e-03   3.59113349e-01]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   4.91047564e-01]
 [  1.54294757e-02   1.42832337e-02   3.76635176e-03   2.37115180e-02
    2.62655884e-02   4.80844210e-01]
 [  1.11753111e-02   3.137100

In [7]:
# Import pandas
import pandas as pd

# Create a pandas DataFrame: df
df = pd.DataFrame(nmf_features, index=titles)

df.head()

Unnamed: 0,0,1,2,3,4,5
HTTP 404,0.0,0.0,0.0,0.0,0.0,0.440529
Alexa Internet,0.0,0.0,0.0,0.0,0.0,0.566687
Internet Explorer,0.003821,0.0,0.0,0.0,0.0,0.398704
HTTP cookie,0.0,0.0,0.0,0.0,0.0,0.381795
Google Search,0.0,0.0,0.0,0.0,0.0,0.485587


In [8]:
# Print the row for 'Anne Hathaway'
print(df.loc['Anne Hathaway'])

# Print the row for 'Denzel Washington'
print(df.loc['Denzel Washington'])

0    0.003846
1    0.000000
2    0.000000
3    0.575719
4    0.000000
5    0.000000
Name: Anne Hathaway, dtype: float64
0    0.000000
1    0.005602
2    0.000000
3    0.422386
4    0.000000
5    0.000000
Name: Denzel Washington, dtype: float64


We see the 3rd NMF feature is high for those two rows.  Let's see which topics are behind this NMF feature.

## Identifying the topics

In [9]:
words = pd.read_csv('wikipedia-vocabulary-utf8.txt', header=None).values.flatten().tolist()

# Create a DataFrame: components_df
components_df = pd.DataFrame(model.components_, columns=words)

components_df.shape

(6, 13125)

In [10]:
# Print the shape of the DataFrame
print(components_df.shape)

# Select row 3: component
component = components_df.iloc[3,:]

# Print result of nlargest
print(component.nlargest())

(6, 13125)
film       0.627867
award      0.253128
starred    0.245281
role       0.211448
actress    0.186395
Name: 3, dtype: float64


## Identifying articles with similar topics (recommending system)

In [11]:
# Perform the necessary imports
import pandas as pd
from sklearn.preprocessing import normalize

# Normalize the NMF features: norm_features
norm_features = normalize(nmf_features)

# Create a DataFrame: df
df1 = pd.DataFrame(norm_features, index=titles)

# Select the row corresponding to 'Cristiano Ronaldo': article
article = df1.loc['Cristiano Ronaldo']

# Compute the dot products: similarities
similarities = df1.dot(article)

# Display those with the largest cosine similarity
print(similarities.nlargest())

Cristiano Ronaldo                1.000000
Franck Ribéry                    0.999972
Radamel Falcao                   0.999942
Zlatan Ibrahimović               0.999942
France national football team    0.999923
dtype: float64
