#Text clustering -sample code
**Prepare datasets**
Download dataset movies_genre_60.csv from Google Drive


In [0]:
# Mount Google Drive into the Google Colab virtual machine
from google.colab import drive
drive.mount('/content/drive')
# Base folder (with trailing slash); later cells prepend it to file names
mydrive ="/content/drive/My Drive/Colab Notebooks/"

In [0]:
# Download the data set from Google Drive with gdown and store it
# as movies_genre_60.csv inside the mounted `mydrive` folder
import gdown
url = "https://drive.google.com/uc?id=1o0I46KhBjkW2tdxD-k9NwkhrtQbI6gNF"
gdown.download(url, mydrive+"movies_genre_60.csv", quiet=False)

In [0]:
# Load the downloaded CSV into a pandas DataFrame
import pandas as pd
import numpy as np

# Comma-separated file; the first row (header=0) holds the column names
doc_data = pd.read_csv(mydrive+'movies_genre_60.csv', sep=",", header=0 )
movie_titles = doc_data['Title'].tolist()  # convert column Title of the pandas DataFrame to a list
movie_synopses = doc_data['Synopsis'].tolist()
movie_genre = doc_data['Genre'].tolist()
# Bare expression as the last statement of the cell (a notebook displays its value)
doc_data

#Step 1:  Pre-Processing text

In [0]:
# Lemmatization, stop-word removal and POS-based feature selection with spaCy
import spacy
from functools import lru_cache


@lru_cache(maxsize=1)
def _load_spacy_model():
  """Load the English spaCy pipeline once and reuse it on every call."""
  disabled = ['parser', 'ner']  # not needed for tagging / lemmatization
  try:
    return spacy.load('en', disable=disabled)
  except OSError:
    # The 'en' shortcut was removed in spaCy 3; fall back to the package name
    return spacy.load('en_core_web_sm', disable=disabled)


def spacy_preprocess(text, lemma=True, pos=True,
                     pos_select=("VERB", "NOUN", "ADJ", "ADV", "PART")):
  """Return `text` reduced to its lower-cased, space-joined content tokens.

  Stop words (per `nlp.vocab[token.text].is_stop`) are always dropped.

  Args:
    text: The document to process.
    lemma: If truthy, emit each token's lemma; otherwise its surface text.
    pos: If truthy, keep only tokens whose coarse POS tag is in `pos_select`.
    pos_select: Collection of POS tags to keep when `pos` is truthy.

  Returns:
    A single string with the kept tokens separated by one space.
  """
  # The model is cached, so it is not re-read from disk for every document
  nlp = _load_spacy_model()
  doc = nlp(text)

  kept = []
  for token in doc:
    if nlp.vocab[token.text].is_stop:
      continue
    if pos and token.pos_ not in pos_select:
      continue
    word = token.lemma_ if lemma else token.text
    kept.append(word.lower())
  return " ".join(kept)

In [0]:
# Run the spaCy pre-processing over every synopsis, with a tqdm progress bar;
# only verbs, nouns and adjectives are kept
from tqdm import tqdm
movie_synopses_preprocess = [
    spacy_preprocess(synopsis, pos_select=["VERB", "NOUN", "ADJ"])
    for synopsis in tqdm(movie_synopses)
]

#Step 2: Build feature matrix (tf-idf)
**You need to change the data input below**

##### data without preprocess

In [0]:
# Change the data input (movie_synopses or movie_synopses_preprocess) here
data_input = movie_synopses  # data without preprocess
#data_input = movie_synopses_preprocess  # the data after preprocess text.

from sklearn.feature_extraction.text import TfidfVectorizer
# Keep terms that occur in at least 10% and at most 80% of the documents
vectorizer = TfidfVectorizer(min_df=0.1, max_df=0.8, max_features=None)
feature_matrix = vectorizer.fit_transform(data_input).astype(float)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old method on older releases
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()
print("number of feature:", len(feature_names))

#Step 3-1: Kmean Clustering

In [0]:
from sklearn.cluster import KMeans
#function for Kmean clustering
def k_means(feature_matrix, num_clusters=5):
    """Fit K-means on `feature_matrix`.

    Returns the fitted KMeans object together with its `labels_`
    (the cluster label assigned to each row).
    """
    model = KMeans(
        n_clusters=num_clusters,
        n_init=500,
        max_iter=10000,
        random_state=1,
    )
    model.fit(feature_matrix)
    return model, model.labels_

def get_cluster_data(clustering_obj, doc_data,
                     feature_names, num_clusters,
                     topn_features=10):
    """Summarise every cluster by its top features and member titles.

    Args:
        clustering_obj: Object exposing `cluster_centers_` (one row per cluster).
        doc_data: DataFrame with a 'Cluster' and a 'Title' column.
        feature_names: Sequence mapping a feature index to its name.
        num_clusters: Number of clusters to describe (labels 0..num_clusters-1).
        topn_features: How many highest-weighted features to report per cluster.

    Returns:
        Dict keyed by cluster label; each value holds 'cluster_num',
        'key_features' and 'docs'.
    """
    # Feature indices of each centroid, ordered from largest weight to smallest
    ranked_features = clustering_obj.cluster_centers_.argsort()[:, ::-1]

    def describe(label):
        top_indices = ranked_features[label, :topn_features]
        in_cluster = doc_data['Cluster'] == label
        return {
            'cluster_num': label,
            'key_features': [feature_names[idx] for idx in top_indices],
            'docs': doc_data[in_cluster]['Title'].values.tolist(),
        }

    return {label: describe(label) for label in range(num_clusters)}

def print_clusters(cluster_data):
    """Print the key features and member titles of every cluster.

    `cluster_data` maps a cluster label to a dict with the keys
    'key_features' and 'docs' (a list of strings).
    """
    thin_rule, thick_rule = '-' * 20, '=' * 40
    for label, info in cluster_data.items():
        print('Cluster {} details:'.format(label))
        print(thin_rule)
        print('Key features:', info['key_features'])
        print('docs in this cluster:')
        print(', '.join(info['docs']))
        print(thick_rule)

# Run K-means on the current feature matrix, print the clusters and time the run
import time
start=time.time()
# assume that we want to cluster with k = 3
num_clusters = 3
km_obj, km_clusters = k_means(feature_matrix=feature_matrix,
                           num_clusters=num_clusters)

# Store each document's cluster label; get_cluster_data reads this column
doc_data['Cluster'] = km_clusters

km_cluster_data =  get_cluster_data(clustering_obj=km_obj,
                                 doc_data = doc_data,
                                 feature_names=feature_names,
                                 num_clusters=num_clusters,
                                 topn_features=10)

print_clusters(km_cluster_data)  
end=time.time()
# Elapsed wall-clock seconds for clustering plus printing
print("used time: ",end-start)

#Step 3-2 Ward Hierarchical clustering

In [0]:
# Function to form and plot hierarchical clustering

def ward_hierarchical_clustering(feature_matrix):
    """Build a Ward linkage matrix from pairwise cosine distances.

    Args:
        feature_matrix: Document-by-feature matrix accepted by
            `cosine_similarity`.

    Returns:
        The linkage matrix produced by `scipy.cluster.hierarchy.ward`.
    """
    # Local import: the file itself only imports ward/dendrogram from SciPy
    from scipy.spatial.distance import squareform

    cosine_distance = 1 - cosine_similarity(feature_matrix)
    # Floating-point round-off can leave tiny negative entries or a
    # non-zero diagonal; clean both before condensing
    cosine_distance = np.clip(cosine_distance, 0.0, None)
    np.fill_diagonal(cosine_distance, 0.0)
    # ward() interprets a 2-D array as observation vectors, not as distances,
    # so pass the condensed (1-D) form to have the cosine distances used as-is
    condensed_distance = squareform(cosine_distance, checks=False)
    linkage_matrix = ward(condensed_distance)
    return linkage_matrix

def plot_hierarchical_clusters(linkage_matrix, doc_data, figure_size=(8,12)):
    """Draw a left-oriented dendrogram labelled with the 'Title' column.

    Args:
        linkage_matrix: Linkage matrix to plot.
        doc_data: DataFrame providing the leaf labels in its 'Title' column.
        figure_size: Figure size passed to `plt.subplots`.
    """
    # Open a new figure for the dendrogram
    plt.subplots(figsize=figure_size, dpi=150)
    dendrogram(
        linkage_matrix,
        labels=doc_data['Title'].values.tolist(),
        orientation="left",
    )
    # Hide the x-axis ticks and their labels
    plt.tick_params(axis='x', which='both',
                    bottom=False, top=False, labelbottom=False)
    plt.tight_layout()
    # plt.savefig(mydrive+'hierachical_full.png', dpi=600)

import matplotlib.pyplot as plt
from sklearn.manifold import MDS
from sklearn.metrics.pairwise import cosine_similarity
import random
from matplotlib.font_manager import FontProperties
from scipy.cluster.hierarchy import ward, dendrogram

# Build Ward's linkage matrix, plot the dendrogram and time both steps
import time
start = time.time()
linkage_matrix = ward_hierarchical_clustering(feature_matrix)
plot_hierarchical_clusters(linkage_matrix=linkage_matrix, doc_data=doc_data, figure_size=(6,8))
end=time.time()
print("used time: ",end-start)
# NOTE(review): the savefig call inside plot_hierarchical_clusters is commented
# out in this section, so this cell does not write "hierachical_full.png"

#Step 3-3 SOM Clustering

In [0]:
# Install the MiniSom package; the leading "!" is a notebook shell escape, not Python syntax
!pip install minisom

##### 8*8 neurons

In [0]:
# Step 1: train the self-organizing map (8 x 8 neurons)
# Dense copy of the feature matrix as a list of rows, used as SOM input
D = feature_matrix.todense().tolist()
from minisom import MiniSom
# Input vector length = number of tf-idf features
max_features=len(feature_names)
xmap_dim =8
ymap_dim =8


# Fixed random_seed so repeated runs use the same seed
som = MiniSom(x=xmap_dim, y= ymap_dim, input_len= max_features, random_seed=1)

import time
start = time.time()

# start training
som.pca_weights_init(D)
# Number of training iterations scales with the number of features
som.train_batch(data=D, num_iteration= max_features*200)
# end training
end = time.time()
print(end - start,'seconds')

In [0]:
# Write every movie title onto the SOM grid at its winning neuron
plt.figure(figsize=(20, 20))
titles = movie_titles
for title, vec in zip(titles, D):
    col, row = som.winner(vec)
    # Random vertical offset so titles sharing a cell do not sit on top of each other;
    # titles are cut to 20 characters
    plt.text(col, row + np.random.rand() * 0.9, title[0:20], color='black')

plt.xticks(range(xmap_dim))
plt.yticks(range(ymap_dim))
plt.grid()
plt.xlim([0, xmap_dim])
plt.ylim([0, ymap_dim])
plt.plot()
# NOTE(review): every SOM title plot in this file saves to the same
# 'som_titles.png', so a later cell overwrites this output
plt.savefig(mydrive+'som_titles.png', dpi=300)

In [0]:
# Movie titles and key features of every SOM cell
from collections import defaultdict

top_keywords = 10
weights = som.get_weights()

# Look up each movie's winning neuron once (instead of once per grid cell)
# and group the titles by cell coordinates
titles_by_cell = defaultdict(list)
for t, vec in zip(movie_titles, D):
    win_x, win_y = som.winner(vec)
    titles_by_cell[(int(win_x), int(win_y))].append(t)

for i in range(xmap_dim):
    for j in range(ymap_dim):
        # Indices of the `top_keywords` largest weights, in ascending weight order
        keywords_idx = np.argsort(weights[i, j, :])[-top_keywords:]
        keywords = ' '.join([feature_names[k] for k in keywords_idx])
        print('\n')
        print('Cell', i, '-', j, '-', 'keyword:', keywords)
        movies_t = titles_by_cell.get((i, j), [])
        print('Titles:', movies_t)

##### 10*10 neurons

In [0]:
# Step 1: train the self-organizing map (10 x 10 neurons)
# Dense copy of the feature matrix as a list of rows, used as SOM input
D = feature_matrix.todense().tolist()
from minisom import MiniSom
# Input vector length = number of tf-idf features
max_features=len(feature_names)
xmap_dim =10
ymap_dim =10


# Fixed random_seed so repeated runs use the same seed
som = MiniSom(x=xmap_dim, y= ymap_dim, input_len= max_features, random_seed=1)

import time
start = time.time()

# start training
som.pca_weights_init(D)
# Number of training iterations scales with the number of features
som.train_batch(data=D, num_iteration= max_features*200)
# end training
end = time.time()
print(end - start,'seconds')

In [0]:
# Write every movie title onto the SOM grid at its winning neuron
plt.figure(figsize=(20, 20))
titles = movie_titles
for title, vec in zip(titles, D):
    col, row = som.winner(vec)
    # Random vertical offset so titles sharing a cell do not sit on top of each other;
    # titles are cut to 20 characters
    plt.text(col, row + np.random.rand() * 0.9, title[0:20], color='black')

plt.xticks(range(xmap_dim))
plt.yticks(range(ymap_dim))
plt.grid()
plt.xlim([0, xmap_dim])
plt.ylim([0, ymap_dim])
plt.plot()
# NOTE(review): every SOM title plot in this file saves to the same
# 'som_titles.png', so this call overwrites any earlier output
plt.savefig(mydrive+'som_titles.png', dpi=300)

In [0]:
# Movie titles and key features of every SOM cell
from collections import defaultdict

top_keywords = 10
weights = som.get_weights()

# Look up each movie's winning neuron once (instead of once per grid cell)
# and group the titles by cell coordinates
titles_by_cell = defaultdict(list)
for t, vec in zip(movie_titles, D):
    win_x, win_y = som.winner(vec)
    titles_by_cell[(int(win_x), int(win_y))].append(t)

for i in range(xmap_dim):
    for j in range(ymap_dim):
        # Indices of the `top_keywords` largest weights, in ascending weight order
        keywords_idx = np.argsort(weights[i, j, :])[-top_keywords:]
        keywords = ' '.join([feature_names[k] for k in keywords_idx])
        print('\n')
        print('Cell', i, '-', j, '-', 'keyword:', keywords)
        movies_t = titles_by_cell.get((i, j), [])
        print('Titles:', movies_t)

##### data with preprocess

#Step 3-1: Kmean Clustering

In [0]:
# Change the data input (movie_synopses or movie_synopses_preprocess) here
# data_input = movie_synopses  # data without preprocess
data_input = movie_synopses_preprocess  # the data after preprocess text.

from sklearn.feature_extraction.text import TfidfVectorizer
# Keep terms that occur in at least 10% and at most 80% of the documents
vectorizer = TfidfVectorizer(min_df=0.1, max_df=0.8, max_features=None)
# tfidf_vect=TfidfVectorizer(stop_words='english', max_features=300)
feature_matrix = vectorizer.fit_transform(data_input).astype(float)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old method on older releases
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()
print("number of feature:", len(feature_names))

In [0]:
from sklearn.cluster import KMeans
#function for Kmean clustering
def k_means(feature_matrix, num_clusters=5):
    """Fit K-means on `feature_matrix`.

    Returns the fitted KMeans object together with its `labels_`
    (the cluster label assigned to each row).
    """
    model = KMeans(
        n_clusters=num_clusters,
        n_init=500,
        max_iter=10000,
        random_state=1,
    )
    model.fit(feature_matrix)
    return model, model.labels_

def get_cluster_data(clustering_obj, doc_data,
                     feature_names, num_clusters,
                     topn_features=10):
    """Summarise every cluster by its top features and member titles.

    Args:
        clustering_obj: Object exposing `cluster_centers_` (one row per cluster).
        doc_data: DataFrame with a 'Cluster' and a 'Title' column.
        feature_names: Sequence mapping a feature index to its name.
        num_clusters: Number of clusters to describe (labels 0..num_clusters-1).
        topn_features: How many highest-weighted features to report per cluster.

    Returns:
        Dict keyed by cluster label; each value holds 'cluster_num',
        'key_features' and 'docs'.
    """
    # Feature indices of each centroid, ordered from largest weight to smallest
    ranked_features = clustering_obj.cluster_centers_.argsort()[:, ::-1]

    def describe(label):
        top_indices = ranked_features[label, :topn_features]
        in_cluster = doc_data['Cluster'] == label
        return {
            'cluster_num': label,
            'key_features': [feature_names[idx] for idx in top_indices],
            'docs': doc_data[in_cluster]['Title'].values.tolist(),
        }

    return {label: describe(label) for label in range(num_clusters)}

def print_clusters(cluster_data):
    """Print the key features and member titles of every cluster.

    `cluster_data` maps a cluster label to a dict with the keys
    'key_features' and 'docs' (a list of strings).
    """
    thin_rule, thick_rule = '-' * 20, '=' * 40
    for label, info in cluster_data.items():
        print('Cluster {} details:'.format(label))
        print(thin_rule)
        print('Key features:', info['key_features'])
        print('docs in this cluster:')
        print(', '.join(info['docs']))
        print(thick_rule)

# Run K-means on the current feature matrix, print the clusters and time the run
import time
start=time.time()
# assume that we want to cluster with k = 3
num_clusters = 3
km_obj, km_clusters = k_means(feature_matrix=feature_matrix,
                           num_clusters=num_clusters)

# Store each document's cluster label; get_cluster_data reads this column
doc_data['Cluster'] = km_clusters

km_cluster_data =  get_cluster_data(clustering_obj=km_obj,
                                 doc_data = doc_data,
                                 feature_names=feature_names,
                                 num_clusters=num_clusters,
                                 topn_features=10)

print_clusters(km_cluster_data)  
end=time.time()
# Elapsed wall-clock seconds for clustering plus printing
print("used time: ", end-start)

#Step 3-2 Ward Hierarchical clustering

In [0]:
# Function to form and plot hierarchical clustering

def ward_hierarchical_clustering(feature_matrix):
    """Build a Ward linkage matrix from pairwise cosine distances.

    Args:
        feature_matrix: Document-by-feature matrix accepted by
            `cosine_similarity`.

    Returns:
        The linkage matrix produced by `scipy.cluster.hierarchy.ward`.
    """
    # Local import: the file itself only imports ward/dendrogram from SciPy
    from scipy.spatial.distance import squareform

    cosine_distance = 1 - cosine_similarity(feature_matrix)
    # Floating-point round-off can leave tiny negative entries or a
    # non-zero diagonal; clean both before condensing
    cosine_distance = np.clip(cosine_distance, 0.0, None)
    np.fill_diagonal(cosine_distance, 0.0)
    # ward() interprets a 2-D array as observation vectors, not as distances,
    # so pass the condensed (1-D) form to have the cosine distances used as-is
    condensed_distance = squareform(cosine_distance, checks=False)
    linkage_matrix = ward(condensed_distance)
    return linkage_matrix

def plot_hierarchical_clusters(linkage_matrix, doc_data, figure_size=(8,12)):
    """Draw a left-oriented dendrogram labelled with the 'Title' column and save it.

    Args:
        linkage_matrix: Linkage matrix to plot.
        doc_data: DataFrame providing the leaf labels in its 'Title' column.
        figure_size: Figure size passed to `plt.subplots`.

    The figure is written to 'hierachical_full.png' under the global `mydrive`.
    """
    # Open a new figure for the dendrogram
    plt.subplots(figsize=figure_size, dpi=150)
    dendrogram(
        linkage_matrix,
        labels=doc_data['Title'].values.tolist(),
        orientation="left",
    )
    # Hide the x-axis ticks and their labels
    plt.tick_params(axis='x', which='both',
                    bottom=False, top=False, labelbottom=False)
    plt.tight_layout()
    plt.savefig(mydrive+'hierachical_full.png', dpi=600)

import matplotlib.pyplot as plt
from sklearn.manifold import MDS
from sklearn.metrics.pairwise import cosine_similarity
import random
from matplotlib.font_manager import FontProperties
from scipy.cluster.hierarchy import ward, dendrogram

# Build Ward's linkage matrix, plot the dendrogram and time both steps
import time
start=time.time() 
linkage_matrix = ward_hierarchical_clustering(feature_matrix)
plot_hierarchical_clusters(linkage_matrix=linkage_matrix, doc_data=doc_data, figure_size=(6,8))
end=time.time()
print("used time: ", end-start)
# plot_hierarchical_clusters saves the figure as "hierachical_full.png" in the mydrive folder

#Step 3-3 SOM Clustering

In [0]:
# Install the MiniSom package; the leading "!" is a notebook shell escape, not Python syntax
!pip install minisom

##### 8*8 neurons

In [0]:
# Step 1: train the self-organizing map
# Dense copy of the feature matrix as a list of rows, used as SOM input
D = feature_matrix.todense().tolist()
from minisom import MiniSom
# Input vector length = number of tf-idf features
max_features=len(feature_names)
xmap_dim =8
ymap_dim =8
# 8 x 8 neurons

# Fixed random_seed so repeated runs use the same seed
som = MiniSom(x=xmap_dim, y= ymap_dim, input_len= max_features, random_seed=1)

import time
start = time.time()

# start training
som.pca_weights_init(D)
# Number of training iterations scales with the number of features
som.train_batch(data=D, num_iteration= max_features*200)
# end training
end = time.time()
print(end - start,'seconds')

In [0]:
# Write every movie title onto the SOM grid at its winning neuron
plt.figure(figsize=(20, 20))
titles = movie_titles
for title, vec in zip(titles, D):
    col, row = som.winner(vec)
    # Random vertical offset so titles sharing a cell do not sit on top of each other;
    # titles are cut to 20 characters
    plt.text(col, row + np.random.rand() * 0.9, title[0:20], color='black')

plt.xticks(range(xmap_dim))
plt.yticks(range(ymap_dim))
plt.grid()
plt.xlim([0, xmap_dim])
plt.ylim([0, ymap_dim])
plt.plot()
# NOTE(review): every SOM title plot in this file saves to the same
# 'som_titles.png', so this call overwrites any earlier output
plt.savefig(mydrive+'som_titles.png', dpi=300)

In [0]:
# Movie titles and key features of every SOM cell
from collections import defaultdict

top_keywords = 10
weights = som.get_weights()

# Look up each movie's winning neuron once (instead of once per grid cell)
# and group the titles by cell coordinates
titles_by_cell = defaultdict(list)
for t, vec in zip(movie_titles, D):
    win_x, win_y = som.winner(vec)
    titles_by_cell[(int(win_x), int(win_y))].append(t)

for i in range(xmap_dim):
    for j in range(ymap_dim):
        # Indices of the `top_keywords` largest weights, in ascending weight order
        keywords_idx = np.argsort(weights[i, j, :])[-top_keywords:]
        keywords = ' '.join([feature_names[k] for k in keywords_idx])
        print('\n')
        print('Cell', i, '-', j, '-', 'keyword:', keywords)
        movies_t = titles_by_cell.get((i, j), [])
        print('Titles:', movies_t)

##### 10*10 neurons

In [0]:
# Step 1: train the self-organizing map
# Dense copy of the feature matrix as a list of rows, used as SOM input
D = feature_matrix.todense().tolist()
from minisom import MiniSom
# Input vector length = number of tf-idf features
max_features=len(feature_names)
xmap_dim =10
ymap_dim =10
# 10 x 10 neurons

# Fixed random_seed so repeated runs use the same seed
som = MiniSom(x=xmap_dim, y= ymap_dim, input_len= max_features, random_seed=1)

import time
start = time.time()

# start training
som.pca_weights_init(D)
# Number of training iterations scales with the number of features
som.train_batch(data=D, num_iteration= max_features*200)
# end training
end = time.time()
print(end - start,'seconds')

In [0]:
# Write every movie title onto the SOM grid at its winning neuron
plt.figure(figsize=(20, 20))
titles = movie_titles
for title, vec in zip(titles, D):
    col, row = som.winner(vec)
    # Random vertical offset so titles sharing a cell do not sit on top of each other;
    # titles are cut to 20 characters
    plt.text(col, row + np.random.rand() * 0.9, title[0:20], color='black')

plt.xticks(range(xmap_dim))
plt.yticks(range(ymap_dim))
plt.grid()
plt.xlim([0, xmap_dim])
plt.ylim([0, ymap_dim])
plt.plot()
# NOTE(review): every SOM title plot in this file saves to the same
# 'som_titles.png', so this call overwrites any earlier output
plt.savefig(mydrive+'som_titles.png', dpi=300)

In [0]:
# Movie titles and key features of every SOM cell
from collections import defaultdict

top_keywords = 10
weights = som.get_weights()

# Look up each movie's winning neuron once (instead of once per grid cell)
# and group the titles by cell coordinates
titles_by_cell = defaultdict(list)
for t, vec in zip(movie_titles, D):
    win_x, win_y = som.winner(vec)
    titles_by_cell[(int(win_x), int(win_y))].append(t)

for i in range(xmap_dim):
    for j in range(ymap_dim):
        # Indices of the `top_keywords` largest weights, in ascending weight order
        keywords_idx = np.argsort(weights[i, j, :])[-top_keywords:]
        keywords = ' '.join([feature_names[k] for k in keywords_idx])
        print('\n')
        print('Cell', i, '-', j, '-', 'keyword:', keywords)
        movies_t = titles_by_cell.get((i, j), [])
        print('Titles:', movies_t)