**Unsupervised learning**
___
- Unsupervised learning finds patterns in data
- Dimension = number of features
- k-means clustering
    - finds clusters of samples
    - number of clusters must be specified
    - implemented in sklearn ("scikit-learn")
    - new samples can be assigned to existing clusters
        - k-means remembers the mean of each cluster (the "centroids")
        - finds the nearest centroid to each new sample
___

In [None]:
#clustering 2D points

# Import pyplot
#import matplotlib.pyplot as plt

# Import KMeans
#from sklearn.cluster import KMeans

# Create a KMeans instance with 3 clusters: model
#model = KMeans(n_clusters = 3)

# Fit model to points
#model.fit(points)

# Determine the cluster labels of new_points: labels
#labels = model.predict(new_points)

# Print cluster labels of new_points
#print(labels)

#################################################
#<script.py> output:
#    [1 2 0 1 2 1 2 2 2 0 1 2 2 0 0 2 0 0 2 2 0 2 1 2 1 0 2 0 0 1 1 2 2 2 0 1 2
#     2 1 2 0 1 1 0 1 2 0 0 2 2 2 2 0 0 1 1 0 0 0 1 1 2 2 2 1 2 0 2 1 0 1 1 1 2
#     1 0 0 1 2 0 1 0 1 2 0 2 0 1 2 2 2 1 2 2 1 0 0 0 0 1 2 1 0 0 1 1 2 1 0 0 1
#     0 0 0 2 2 2 2 0 0 2 1 2 0 2 1 0 2 0 0 2 0 2 0 1 2 1 1 2 0 1 2 1 1 0 2 2 1
#     0 1 0 2 1 0 0 1 0 2 2 0 2 0 0 2 2 1 2 2 0 1 0 1 1 2 1 2 2 1 1 0 1 1 1 0 2
#     2 1 0 1 0 0 2 2 2 1 2 2 2 0 0 1 2 1 1 1 0 2 2 2 2 2 2 0 0 2 0 0 0 0 2 0 0
#     2 2 1 0 1 1 0 1 0 1 0 2 2 0 2 2 2 0 1 1 0 2 2 0 2 0 0 2 0 0 1 0 1 1 1 2 0
#     0 0 1 2 1 0 1 0 0 2 1 1 1 0 2 2 2 1 2 0 0 2 1 1 0 1 1 0 1 2 1 0 0 0 0 2 0
#     0 2 2 1]

# Assign the columns of new_points: xs and ys
#xs = new_points[:,0]
#ys = new_points[:,1]

# Make a scatter plot of xs and ys, using labels to define the colors
#plt.scatter(xs, ys, c=labels, alpha=0.5)

# Assign the cluster centers: centroids
#centroids = model.cluster_centers_

# Assign the columns of centroids: centroids_x, centroids_y
#centroids_x = centroids[:,0]
#centroids_y = centroids[:,1]

# Make a scatter plot of centroids_x and centroids_y
#plt.scatter(centroids_x, centroids_y, marker ='D', s=50)
#plt.show()

![_images/8.1.svg](_images/8.1.svg)

**Evaluating a clustering**
___
- compare against known clustering
    - using crosstabs
- inertia measures clustering quality
    - how spread out the clusters are (lower is better)
    - distance from each sample to centroid of its cluster
    - best choice is elbow in inertia plot
___

In [None]:
# How many clusters?

# Import pyplot
#import matplotlib.pyplot as plt

# Import KMeans
#from sklearn.cluster import KMeans

#ks = range(1, 6)
#inertias = []

#for k in ks:
    # Create a KMeans instance with k clusters: model
#    model = KMeans(n_clusters=k)

    # Fit model to samples
#    model.fit(samples)

    # Append the inertia to the list of inertias
#    inertias.append(model.inertia_)

# Plot ks vs inertias
#plt.plot(ks, inertias, '-o')
#plt.xlabel('number of clusters, k')
#plt.ylabel('inertia')
#plt.xticks(ks)
#plt.show()

![_images/8.2.svg](_images/8.2.svg)

In [1]:
# Evaluating clusters

#import libraries
from sklearn.cluster import KMeans
import pandas as pd

# Create a KMeans model with 3 clusters: model
#model = KMeans(n_clusters=3)

# Use fit_predict to fit model and obtain cluster labels: labels
#labels = model.fit_predict(samples)

# Create a DataFrame with labels and varieties as columns: df
#df = pd.DataFrame({'labels': labels, 'varieties': varieties})

# Create crosstab: ct
#ct = pd.crosstab(df['labels'], df['varieties'])

# Display ct
#print(ct)

#################################################
#<script.py> output:
#    varieties  Canadian wheat  Kama wheat  Rosa wheat
#    labels
#    0                       0           1          60
#    1                      68           9           0
#    2                       2          60          10

**Transforming features for better clusterings**
___
- when features have different variances, clustering will be inaccurate.
    - for K Means clustering, variance = influence
- StandardScaler from sklearn.preprocessing transforms each feature to have mean 0 and variance 1
    - StandardScaler has fit() and transform()
    - KMeans has fit() and predict()
- Normalizer rescales each sample (rather than each feature) independently of the other samples
___

In [None]:
#scaling data for clustering

# Perform the necessary imports
#from sklearn.pipeline import make_pipeline
#from sklearn.preprocessing import StandardScaler
#from sklearn.cluster import KMeans

#import pandas as pd

# Create scaler: scaler
#scaler = StandardScaler()

# Create KMeans instance: kmeans
#kmeans = KMeans(n_clusters=4)

# Create pipeline: pipeline
#pipeline = make_pipeline(scaler, kmeans)

# Fit the pipeline to samples
#pipeline.fit(samples)

# Calculate the cluster labels: labels
#labels = pipeline.predict(samples)

# Create a DataFrame with labels and species as columns: df
#df = pd.DataFrame({'labels' : labels, 'species' : species})

# Create crosstab: ct
#ct = pd.crosstab(df['labels'], df['species'])

# Display ct
#print(ct)

#################################################
#<script.py> output:
#    species  Bream  Pike  Roach  Smelt
#    labels
#    0            0     0      0     13
#    1           33     0      1      0
#    2            0    17      0      0
#    3            1     0     19      1

In [None]:
#Clustering stocks using KMeans and Normalizer

# Perform the necessary imports
#from sklearn.pipeline import make_pipeline
#from sklearn.preprocessing import Normalizer
#from sklearn.cluster import KMeans

#import pandas as pd

# Create a normalizer: normalizer
#normalizer = Normalizer()

# Create a KMeans model with 10 clusters: kmeans
#kmeans = KMeans(n_clusters=10)

# Make a pipeline chaining normalizer and kmeans: pipeline
#pipeline = make_pipeline(normalizer, kmeans)

# Fit pipeline to the daily price movements
#pipeline.fit(movements)

# Import pandas
#import pandas as pd

# Predict the cluster labels: labels
#labels = pipeline.predict(movements)

# Create a DataFrame aligning labels and companies: df
#df = pd.DataFrame({'labels': labels, 'companies': companies})

# Display df sorted by cluster label
#print(df.sort_values('labels'))

#################################################
#<script.py> output:
#        labels                           companies
#    59       0                               Yahoo
#    15       0                                Ford
#    35       0                            Navistar
#    26       1                      JPMorgan Chase
#    16       1                   General Electrics
#    58       1                               Xerox
#    11       1                               Cisco
#    18       1                       Goldman Sachs
#    20       1                          Home Depot
#    5        1                     Bank of America
#    3        1                    American express
#    55       1                         Wells Fargo
#    1        1                                 AIG
#    38       2                               Pepsi
#    40       2                      Procter Gamble
#    28       2                           Coca Cola
#    27       2                      Kimberly-Clark
#    9        2                   Colgate-Palmolive
#    54       3                            Walgreen
#    36       3                    Northrop Grumman
#    29       3                     Lookheed Martin
#    4        3                              Boeing
#    0        4                               Apple
#    47       4                            Symantec
#    33       4                           Microsoft
#    32       4                                  3M
#    31       4                           McDonalds
#    30       4                          MasterCard
#    50       4  Taiwan Semiconductor Manufacturing
#    14       4                                Dell
#    17       4                     Google/Alphabet
#    24       4                               Intel
#    23       4                                 IBM
#    2        4                              Amazon
#    51       4                   Texas instruments
#    43       4                                 SAP
#    45       5                                Sony
#    48       5                              Toyota
#    21       5                               Honda
#    22       5                                  HP
#    34       5                          Mitsubishi
#    7        5                               Canon
#    56       6                            Wal-Mart
#    57       7                               Exxon
#    44       7                        Schlumberger
#    8        7                         Caterpillar
#    10       7                      ConocoPhillips
#    12       7                             Chevron
#    13       7                   DuPont de Nemours
#    53       7                       Valero Energy
#    39       8                              Pfizer
#    41       8                       Philip Morris
#    25       8                   Johnson & Johnson
#    49       9                               Total
#    46       9                      Sanofi-Aventis
#    37       9                            Novartis
#    42       9                   Royal Dutch Shell
#    19       9                     GlaxoSmithKline
#    52       9                            Unilever
#    6        9            British American Tobacco

**Visualizing hierarchies**
___
- t-SNE
    - creates a 2D map of a dataset
- Hierarchical clustering
    - 2D array of scores
    - dendrogram
    - number of merge operations = number of samples - 1
    - agglomerative clustering
        - each row begins in a separate cluster, at each step the two closest clusters are merged
        - continues until all rows are in a single cluster
    - divisive clustering
        - opposite to agglomerative clustering
___

In [None]:
#Hierarchical clustering of grain data

# Perform the necessary imports
#from scipy.cluster.hierarchy import linkage, dendrogram
#import matplotlib.pyplot as plt

# Calculate the linkage: mergings
#mergings = linkage(samples, method='complete')

# Plot the dendrogram, using varieties as labels
#dendrogram(mergings,
#           labels=varieties,
#           leaf_rotation=90,
#           leaf_font_size=6,
#)
#plt.show()

![_images/8.3.svg](_images/8.3.svg)

In [None]:
#Hierarchical clustering of stock data with normalize()

# Perform the necessary imports
#from scipy.cluster.hierarchy import linkage, dendrogram
#import matplotlib.pyplot as plt
#from sklearn.preprocessing import normalize

# Normalize the movements: normalized_movements
#normalized_movements = normalize(movements)

# Calculate the linkage: mergings
#mergings = linkage(normalized_movements, method='complete')

# Plot the dendrogram
#dendrogram(mergings,
#            labels=companies,
#            leaf_rotation=90,
#            leaf_font_size=6
#)
#plt.show()

![_images/8.4.svg](_images/8.4.svg)

**Cluster labels in hierarchical clustering**
___
- cluster labels from intermediate stages can be recovered and crosstabulated
- y axis of a dendrogram indicates height
    - distance between merging clusters
    - cluster labels at a given height are extracted using fcluster() in scipy.cluster.hierarchy
- linkage
    - **complete** - distance between clusters is the distance between the furthest points of the clusters
    - **single** - distance between clusters is the distance between the closest points of the clusters
___

In [None]:
#single linkage, different dendrogram

# Perform the necessary imports
#import matplotlib.pyplot as plt
#from scipy.cluster.hierarchy import linkage, dendrogram

# Calculate the linkage: mergings
#mergings = linkage(samples, method='single')

# Plot the dendrogram
#dendrogram(mergings,
#            labels=country_names,
#            leaf_rotation=90,
#            leaf_font_size=6
#)
#plt.show()

![_images/8.5.svg](_images/8.5.svg)

In [None]:
#extracting cluster labels

# Perform the necessary imports
#import matplotlib.pyplot as plt
#from scipy.cluster.hierarchy import linkage, dendrogram
#import pandas as pd
#from scipy.cluster.hierarchy import fcluster

# Calculate the linkage: mergings
#mergings = linkage(samples, method='single')

# Use fcluster to extract labels: labels
#labels = fcluster(mergings, t=6, criterion='distance')

# Create a DataFrame with labels and varieties as columns: df
#df = pd.DataFrame({'labels': labels, 'varieties': varieties})

# Create crosstab: ct
#ct = pd.crosstab(df['labels'], df['varieties'])

# Display ct
#print(ct)

#################################################
#<script.py> output:
#    varieties  Canadian wheat  Kama wheat  Rosa wheat
#    labels
#    1                      14           3           0
#    2                       0           0          14
#    3                       0          11           0

**t-SNE for 2-dimensional maps**
___
"t-distributed stochastic neighbor embedding"
- maps samples to 2D or 3D space
- map approximately preserves nearness of samples
- great for inspecting datasets
- only has fit_transform() method
- t-SNE learning rate - values between 50 and 200
    - a bad learning rate shows up as all points bunched together in the plot
- axis values are not interpretable
___

In [None]:
#t-SNE visualization of grain dataset

#import matplotlib.pyplot as plt

# Import TSNE
#from sklearn.manifold import TSNE

# Create a TSNE instance: model
#model = TSNE(learning_rate=200)

# Apply fit_transform to samples: tsne_features
#tsne_features = model.fit_transform(samples)

# Select the 0th feature: xs
#xs = tsne_features[:,0]

# Select the 1st feature: ys
#ys = tsne_features[:,1]

# Scatter plot, coloring by variety_numbers
#plt.scatter(xs, ys, c=variety_numbers)
#plt.show()

![_images/8.6.svg](_images/8.6.svg)

In [None]:
#t-SNE map of the stock market

#import matplotlib.pyplot as plt

# Import TSNE
#from sklearn.manifold import TSNE

# Create a TSNE instance: model
#model = TSNE(learning_rate=50)

# Apply fit_transform to normalized_movements: tsne_features
#tsne_features = model.fit_transform(normalized_movements)

# Select the 0th feature: xs
#xs = tsne_features[:,0]

# Select the 1st feature: ys
#ys = tsne_features[:,1]

# Scatter plot
#plt.scatter(xs, ys, alpha=0.5)

# Annotate the points
#for x, y, company in zip(xs, ys, companies):
#    plt.annotate(company, (x, y), fontsize=5, alpha=0.75)
#plt.show()

![_images/8.7.svg](_images/8.7.svg)

**Visualizing the PCA transformation**
___
Dimension reduction - more efficient storage and computation
- remove less-informative "noise" features
- PCA = principal component analysis
    - rotates data samples to be aligned with axes
    - shifts data samples so they have mean 0
    - no information is lost
    - PCA follows the fit/transform pattern
    - PCA de-correlates features that are linearly correlated
- Principal components
    - are the directions of variance in the data; PCA aligns them with the coordinate axes
___

In [None]:
#correlated data in nature

# Perform the necessary imports
import matplotlib.pyplot as plt
from scipy.stats import pearsonr

# Assign the 0th column of grains: width
#width = grains[:,0]

# Assign the 1st column of grains: length
#length = grains[:,1]

# Scatter plot width vs length
#plt.scatter(width, length)
#plt.axis('equal')
#plt.show()

# Calculate the Pearson correlation
#correlation, pvalue = pearsonr(width, length)

# Display the correlation
#print(correlation)

#################################################
#<script.py> output:
#    0.8604149377143466

![_images/8.8.svg](_images/8.8.svg)

In [None]:
#Decorrelating the grain measurements with PCA

# Perform the necessary imports
import matplotlib.pyplot as plt
from scipy.stats import pearsonr

# Import PCA
from sklearn.decomposition import PCA

# Create PCA instance: model
#model = PCA()

# Apply the fit_transform method of model to grains: pca_features
#pca_features = model.fit_transform(grains)

# Assign 0th column of pca_features: xs
#xs = pca_features[:,0]

# Assign 1st column of pca_features: ys
#ys = pca_features[:,1]

# Scatter plot xs vs ys
#plt.scatter(xs, ys)
#plt.axis('equal')
#plt.show()

# Calculate the Pearson correlation of xs and ys
#correlation, pvalue = pearsonr(xs, ys)

# Display the correlation
#print(correlation)

#################################################
#<script.py> output:
#    2.5478751053409354e-17

![_images/8.9.svg](_images/8.9.svg)

**Intrinsic dimension**
___
Intrinsic dimension = number of features needed to approximate the dataset
- What is the most compact representation of the samples?
- Can be approximated with PCA
- Intrinsic dimension = number of PCA features with **significant variance**
___

In [None]:
#the first principal component - direction in which grain data varies the most

# Perform the necessary imports
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Make a scatter plot of the untransformed points
#plt.scatter(grains[:,0], grains[:,1])

# Create a PCA instance: model
#model = PCA()

# Fit model to points
#model.fit(grains)

# Get the mean of the grain samples: mean
#mean = model.mean_

# Get the first principal component: first_pc
#first_pc = model.components_[0,:]

# Plot first_pc as an arrow, starting at mean
#plt.arrow(mean[0], mean[1], first_pc[0], first_pc[1], color='red', width=0.01)

# Keep axes on same scale
# NOTE(review): the scatter and arrow calls above are commented out, so these
# two calls currently run on an empty figure; re-enable the plotting code above
# (which needs `grains`) to get the intended plot.
plt.axis('equal')
plt.show()

![_images/8.10.svg](_images/8.10.svg)

In [None]:
#Variance of the PCA features

# Perform the necessary imports
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import matplotlib.pyplot as plt

# Create scaler: scaler
#scaler = StandardScaler()

# Create a PCA instance: pca
#pca = PCA()

# Create pipeline: pipeline
#pipeline = make_pipeline(scaler, pca)

# Fit the pipeline to 'samples'
#pipeline.fit(samples)

# Plot the explained variances
#features = range(pca.n_components_)
#plt.bar(features, pca.explained_variance_)
#plt.xlabel('PCA feature')
#plt.ylabel('variance')
#plt.xticks(features)
#plt.show()

![_images/8.11.svg](_images/8.11.svg)

**Dimension reduction with PCA**
___
- specify how many features to keep (i.e. intrinsic dimensions)
- alternatives to PCA
    - word frequency arrays - row = document, column = dictionary word
    - scipy.sparse.csr_matrix used for sparse arrays, where most entries = 0
    - remembers only non-zero entries
    - TruncatedSVD used instead of PCA on sparse arrays (same library/behaviors)
___

In [None]:
#Dimension reduction of the fish measurements from 6 to 2

# Import PCA
# from sklearn.decomposition import PCA

# Create a PCA model with 2 components: pca
#pca = PCA(n_components=2)

# Fit the PCA instance to the scaled samples
#pca.fit(scaled_samples)

# Transform the scaled samples: pca_features
#pca_features = pca.transform(scaled_samples)

# Print the shape of pca_features
#print(pca_features.shape)

#################################################
#<script.py> output:
#    (85, 2)

In [2]:
#A tf-idf word-frequency array
documents = ['cats say meow', 'dogs say woof', 'dogs chase cats']

# Import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Create a TfidfVectorizer: tfidf
tfidf = TfidfVectorizer()

# Apply fit_transform to documents: csr_mat
# (sparse matrix: one row per document, one column per word)
csr_mat = tfidf.fit_transform(documents)

# Print result of toarray() method
print(csr_mat.toarray())

# Get the words: words
# get_feature_names() was removed in scikit-learn 1.2; its replacement
# get_feature_names_out() (scikit-learn >= 1.0) returns a NumPy array, so
# convert it to a plain list to keep the printed output unchanged.
words = tfidf.get_feature_names_out().tolist()

# Print words
print(words)

[[0.51785612 0.         0.         0.68091856 0.51785612 0.        ]
 [0.         0.         0.51785612 0.         0.51785612 0.68091856]
 [0.51785612 0.68091856 0.51785612 0.         0.         0.        ]]
['cats', 'chase', 'dogs', 'meow', 'say', 'woof']


In [None]:
#Clustering Wikipedia
#from https://blog.lateral.io/2015/06/the-unknown-perils-of-mining-wikipedia/

# Perform the necessary imports
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline
import pandas as pd

# Create a TruncatedSVD instance: svd
# (reduces the word-frequency array to 50 components before clustering)
svd = TruncatedSVD(n_components=50)

# Create a KMeans instance: kmeans
# NOTE(review): no random_state is passed, so the cluster numbering shown in
# the sample output below is presumably not reproducible run to run -- confirm.
kmeans = KMeans(n_clusters=6)

# Create a pipeline: pipeline
# (steps are applied in the order given: svd first, then kmeans)
pipeline = make_pipeline(svd, kmeans)

# Fit the pipeline to the word-frequency array articles
#pipeline.fit(articles)

# Calculate the cluster labels: labels
#labels = pipeline.predict(articles)

# Create a DataFrame aligning labels and titles: df
#df = pd.DataFrame({'label': labels, 'article': titles})

# Display df sorted by cluster label
#print(df.sort_values('label'))

#################################################
#<script.py> output:
#        label                                        article
#    59      0                                    Adam Levine
#    57      0                          Red Hot Chili Peppers
#    56      0                                       Skrillex
#    55      0                                  Black Sabbath
#    54      0                                 Arctic Monkeys
#    53      0                                   Stevie Nicks
#    52      0                                     The Wanted
#    51      0                                     Nate Ruess
#    50      0                                   Chad Kroeger
#    58      0                                         Sepsis
#    30      1                  France national football team
#    31      1                              Cristiano Ronaldo
#    32      1                                   Arsenal F.C.
#    33      1                                 Radamel Falcao
#    37      1                                       Football
#    35      1                Colombia national football team
#    36      1              2014 FIFA World Cup qualification
#    38      1                                         Neymar
#    39      1                                  Franck Ribéry
#    34      1                             Zlatan Ibrahimović
#    26      2                                     Mila Kunis
#    28      2                                  Anne Hathaway
#    27      2                                 Dakota Fanning
#    25      2                                  Russell Crowe
#    29      2                               Jennifer Aniston
#    23      2                           Catherine Zeta-Jones
#    22      2                              Denzel Washington
#    21      2                             Michael Fassbender
#    20      2                                 Angelina Jolie
#    24      2                                   Jessica Biel
#    10      3                                 Global warming
#    11      3       Nationally Appropriate Mitigation Action
#    13      3                               Connie Hedegaard
#    14      3                                 Climate change
#    12      3                                   Nigel Lawson
#    16      3                                        350.org
#    17      3  Greenhouse gas emissions by the United States
#    18      3  2010 United Nations Climate Change Conference
#    19      3  2007 United Nations Climate Change Conference
#    15      3                                 Kyoto Protocol
#    8       4                                        Firefox
#    1       4                                 Alexa Internet
#    2       4                              Internet Explorer
#    3       4                                    HTTP cookie
#    4       4                                  Google Search
#    5       4                                         Tumblr
#    6       4                    Hypertext Transfer Protocol
#    7       4                                  Social search
#    49      4                                       Lymphoma
#    42      4                                    Doxycycline
#    47      4                                          Fever
#    46      4                                     Prednisone
#    44      4                                           Gout
#    43      4                                       Leukemia
#    9       4                                       LinkedIn
#    48      4                                     Gabapentin
#    0       4                                       HTTP 404
#    45      5                                    Hepatitis C
#    41      5                                    Hepatitis B
#    40      5                                    Tonsillitis

**Non-negative matrix factorization (NMF)**
___
- NMF models are interpretable, unlike PCA
- **requires all sample features to be non-negative** (>= 0)
- NMF expresses images as combinations of patterns
- scikit-learn using fit()/transform() pattern
    - must always specify number of components
    - works with NumPy arrays and csr_matrix sparse arrays
- sample reconstruction
    - multiply components by feature values, and add up
    - can be expressed as product of matrices
___

In [None]:
#NMF applied to Wikipedia articles & NMF features

# Import NMF
#from sklearn.decomposition import NMF

# Create an NMF instance: model
#model = NMF(n_components=6)

# Fit the model to articles
#model.fit(articles)

# Transform the articles: nmf_features
#nmf_features = model.transform(articles)

# Print the NMF features
#print(nmf_features)

#################################################
#<script.py> output:
#    [[0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
#      0.00000000e+00 4.40623130e-01]
#     [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
#      0.00000000e+00 5.66807830e-01]
#     [3.82006369e-03 0.00000000e+00 0.00000000e+00 0.00000000e+00
#      0.00000000e+00 3.98789405e-01]
#     [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
#      0.00000000e+00 3.81876582e-01]
#    [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
#    ...

# Import pandas
import pandas as pd

# Create a pandas DataFrame: df
#df = pd.DataFrame(nmf_features, index=titles)

# Print the row for 'Anne Hathaway'
#print(df.loc['Anne Hathaway'])

# Print the row for 'Denzel Washington'
#print(df.loc['Denzel Washington'])

#################################################
#<script.py> output:
#    0    0.003845
#    1    0.000000
#    2    0.000000
#    3    0.575711
#    4    0.000000
#    5    0.000000
#    Name: Anne Hathaway, dtype: float64
#    0    0.000000
#    1    0.005601
#    2    0.000000
#    3    0.422380
#    4    0.000000
#    5    0.000000
#    Name: Denzel Washington, dtype: float64

#Notice that for both actors, the NMF feature 3 has by far the highest
#value. This means that both articles are reconstructed using mainly
#the 3rd NMF component. In the next video, you'll see why: NMF components
#represent topics (for instance, acting!).

**NMF learns interpretable parts**
___
- NMF components
    - represent topics
    - for images, are parts of images
        - Grayscale is 0(totally black) to 1(totally white)
        - each row is a flattened array for an image
            - use reshape() method to recover unflattened array
            - use matplotlib.pyplot imshow() method to recreate image
        - each column corresponds to a pixel
- NMF features
    - combine topics into documents
___

In [None]:
#NMF learns topics of documents

# Import pandas
import pandas as pd

# Create a DataFrame: components_df
#components_df = pd.DataFrame(model.components_, columns=words)

# Print the shape of the DataFrame
#print(components_df.shape)

# Select row 3: component
#component = components_df.iloc[3]

# Print result of nlargest
#print(component.nlargest())

#################################################
#(6, 13125)
#film       0.627877
#award      0.253131
#starred    0.245284
#role       0.211451
#actress    0.186398
#Name: 3, dtype: float64


In [None]:
#Explore the LED digits dataset

#Firstly, explore the image dataset and see how it is encoded as an array.
#You are given 100 images as a 2D array samples, where each row represents
#a single 13x8 image. The images in your dataset are pictures of a LED
#digital display.

# Import pyplot
from matplotlib import pyplot as plt

# Select the 0th row: digit
#digit = samples[0,:]

# Print digit
#print(digit)

# Reshape digit to a 13x8 array: bitmap
#bitmap = digit.reshape(13,8)

# Print bitmap
#print(bitmap)

# Use plt.imshow to display bitmap
#plt.imshow(bitmap, cmap='gray', interpolation='nearest')
#plt.colorbar()
#plt.show()

#################################################
#<script.py> output:
#    [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.
#     0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0.
#     0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0.
#     0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.
#     0. 0. 0. 0. 0. 0. 0. 0.]
#    [[0. 0. 0. 0. 0. 0. 0. 0.]
#     [0. 0. 1. 1. 1. 1. 0. 0.]
#     [0. 0. 0. 0. 0. 0. 1. 0.]
#     [0. 0. 0. 0. 0. 0. 1. 0.]
#     [0. 0. 0. 0. 0. 0. 1. 0.]
#     [0. 0. 0. 0. 0. 0. 1. 0.]
#     [0. 0. 0. 0. 0. 0. 0. 0.]
#     [0. 0. 0. 0. 0. 0. 1. 0.]
#     [0. 0. 0. 0. 0. 0. 1. 0.]
#     [0. 0. 0. 0. 0. 0. 1. 0.]
#     [0. 0. 0. 0. 0. 0. 1. 0.]
#     [0. 0. 0. 0. 0. 0. 0. 0.]
#     [0. 0. 0. 0. 0. 0. 0. 0.]]

![_images/8.12.svg](_images/8.12.svg)

In [None]:
#NMF learns the parts of images

# Import pyplot
from matplotlib import pyplot as plt

def show_as_image(sample, shape=(13, 8)):
    """Display a flattened grayscale image as a bitmap with a colorbar.

    Parameters
    ----------
    sample : array supporting ``reshape``
        Flattened pixel values of one image (e.g. one row of the samples
        array, or one model component).
    shape : tuple of int, optional
        ``(rows, columns)`` of the unflattened image. Defaults to
        ``(13, 8)``, the size of the LED digit images used in this notebook.
    """
    bitmap = sample.reshape(shape)
    # Open a new figure so successive calls do not draw over each other.
    plt.figure()
    plt.imshow(bitmap, cmap='gray', interpolation='nearest')
    plt.colorbar()
    plt.show()

# Import NMF
from sklearn.decomposition import NMF

# Create an NMF model with 7 components: model
model = NMF(n_components=7)

# Apply fit_transform to samples: features
#features = model.fit_transform(samples)

# Call show_as_image on each component
#for component in model.components_:
#    show_as_image(component)

# Assign the 0th row of features: digit_features
#digit_features = features[0,:]

# Print digit_features
#print(digit_features)

#################################################
#<script.py> output:
#    [4.76823559e-01 0.00000000e+00 0.00000000e+00 5.90605054e-01
#     4.81559442e-01 0.00000000e+00 7.37557191e-16]

![_images/8.13.svg](_images/8.13.svg)
![_images/8.14.svg](_images/8.14.svg)
![_images/8.15.svg](_images/8.15.svg)
![_images/8.16.svg](_images/8.16.svg)
![_images/8.17.svg](_images/8.17.svg)
![_images/8.18.svg](_images/8.18.svg)
![_images/8.19.svg](_images/8.19.svg)

Notice how NMF has expressed the digit as a sum of the components!

In [None]:
#PCA doesn't learn parts

# Import pyplot
from matplotlib import pyplot as plt

def show_as_image(sample, shape=(13, 8)):
    """Display a flattened grayscale image as a bitmap with a colorbar.

    Parameters
    ----------
    sample : array supporting ``reshape``
        Flattened pixel values of one image (e.g. one row of the samples
        array, or one model component).
    shape : tuple of int, optional
        ``(rows, columns)`` of the unflattened image. Defaults to
        ``(13, 8)``, the size of the LED digit images used in this notebook.
    """
    bitmap = sample.reshape(shape)
    # Open a new figure so successive calls do not draw over each other.
    plt.figure()
    plt.imshow(bitmap, cmap='gray', interpolation='nearest')
    plt.colorbar()
    plt.show()

# Import PCA
from sklearn.decomposition import PCA

# Create a PCA instance with 7 components: model
model = PCA(n_components=7)

# Apply fit_transform to samples: features
#features = model.fit_transform(samples)

# Call show_as_image on each component
#for component in model.components_:
#    show_as_image(component)

![_images/8.20.svg](_images/8.20.svg)
![_images/8.21.svg](_images/8.21.svg)
![_images/8.22.svg](_images/8.22.svg)
![_images/8.23.svg](_images/8.23.svg)
![_images/8.24.svg](_images/8.24.svg)
![_images/8.25.svg](_images/8.25.svg)
![_images/8.26.svg](_images/8.26.svg)

Notice that the components of PCA do not represent meaningful parts of images of LED digits!

**Building recommender systems using NMF**
___
Strategy
- Apply NMF to the word-frequency array
    - NMF feature values describe the topics
- Compare feature values
    - **cosine similarity** compares documents whose features are weak vs. strong
        - higher values indicate greater similarity between documents, regardless of how weakly or strongly their features are expressed
        - computed using the .dot() method of a pandas DataFrame object (on normalized features)
___

In [None]:
#Which articles are similar to 'Cristiano Ronaldo'?

# Perform the necessary imports
import pandas as pd
from sklearn.preprocessing import normalize

# Normalize the NMF features: norm_features
#norm_features = normalize(nmf_features)

# Create a DataFrame: df
#df = pd.DataFrame(norm_features, index=titles)

# Select the row corresponding to 'Cristiano Ronaldo': article
#article = df.loc['Cristiano Ronaldo']

# Compute the dot products: similarities
#similarities = df.dot(article)

# Display those with the largest cosine similarity
#print(similarities.nlargest())

#################################################
#<script.py> output:
#    Cristiano Ronaldo                1.000000
#    Franck Ribéry                    0.999972
#    Radamel Falcao                   0.999942
#    Zlatan Ibrahimović               0.999942
#    France national football team    0.999923
#    dtype: float64

In [None]:
#Recommend musical artists

# Perform the necessary imports
from sklearn.decomposition import NMF
from sklearn.preprocessing import Normalizer, MaxAbsScaler
from sklearn.pipeline import make_pipeline

# Create a MaxAbsScaler: scaler
# (rescales each feature by its maximum absolute value; see scikit-learn docs)
scaler = MaxAbsScaler()

# Create an NMF model: nmf
# (NMF requires all feature values to be non-negative)
nmf = NMF(n_components=20)

# Create a Normalizer: normalizer
# (rescales each sample, so the dot products taken below act as cosine similarities)
normalizer = Normalizer()

# Create a pipeline: pipeline
# (steps are applied in the order given: scaler, then nmf, then normalizer)
pipeline = make_pipeline(scaler, nmf, normalizer)

# Apply fit_transform to artists: norm_features
#norm_features = pipeline.fit_transform(artists)

# Import pandas
import pandas as pd

# Create a DataFrame: df
#df = pd.DataFrame(norm_features, index=artist_names)

# Select row of 'Bruce Springsteen': artist
#artist = df.loc['Bruce Springsteen']

# Compute cosine similarities: similarities
#similarities = df.dot(artist)

# Display those with highest cosine similarity
#print(similarities.nlargest())

#################################################
#<script.py> output:
#    Bruce Springsteen    1.000000
#    Neil Young           0.955896
#    Van Morrison         0.872452
#    Leonard Cohen        0.864763
#    Bob Dylan            0.859047
#    dtype: float64