# Supervised Clustering ML
Author: Shouqiang Ye   yeshouqiang@gmail.com

### Import necessary library

In [94]:
from flask import Flask, request, redirect, render_template, Response, send_file, send_from_directory
from flask_wtf import FlaskForm
from wtforms import SelectMultipleField
from flask_bootstrap import Bootstrap
from os.path import join, dirname, realpath
import pandas as pd
# from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer  # add CountVectorizer for NMF
from sklearn.cluster import KMeans
from datetime import datetime
import io, os
import numpy as np

### Import library for stopwords

In [95]:
from sklearn import decomposition
import matplotlib.pyplot as plt
import re
import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split

In [96]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/shouqiangye/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Set DF from csv file
Source 1: <a href="https://github.com/UNCWellington/AI-tools/blob/main/seed_sample_2000_input.csv?raw=true">seed_sample_2000_input.csv</a>  only 2000 rows  

Source 2: <a href="https://github.com/UNCWellington/AI-tools/blob/main/Sample-for-SC.csv?raw=true">Sample-for-SC.csv   </a> nearly 10,000 rows
Note: we shoud add ?raw=true at the end of github link address

In [97]:
# url = 'https://github.com/UNCWellington/AI-tools/blob/main/seed_sample_2000_input.csv?raw=true'
url = 'https://github.com/UNCWellington/AI-tools/blob/main/Sample-for-SC.csv?raw=true'
# df = pd.read_csv(url,index_col=0)
df = pd.read_csv(url, keep_default_na=False)

In [98]:
df.head()

Unnamed: 0,AN,Seed,TAB
0,1783,,"""Anywhere but here"": Querying spatial stigma a..."
1,4672,,"""Are We Safe Analysts?"" Cisgender Countertrans..."
2,4444,,"""As a Trans Person You Don't Live. You Merely ..."
3,22,,"""At Your Service"": Sexual Harassment of Female..."
4,2808,,"""Bareback"" pornography consumption and safe-se..."


In [99]:
df_filter_seeds, df_pivot, df_seed_capture = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

## Stop words, stemming, lemmatization

In [100]:
input_stopwords = "copyright, publication, abstract"
stopwords = input_stopwords.split(',')

In [101]:
stemmer = nltk.stem.SnowballStemmer('english')
nltk.download('stopwords')
stopwords = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shouqiangye/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [102]:
stopwords

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

## Data Processing

Next, scikit learn has a method that will extract all the word counts for us

In [103]:
documents = df['TAB'].values.astype("U")

In [104]:
algo_dict = {'Kmeans_': [10, 20, 30], 'NMF_': [10, 20, 30]}

use DF: df_filter_seeds  output 'Filtered for Seeds only' sheet
'Filtered for Seeds only': step 1: select AN column

In [105]:
df_filter_seeds = df.loc[df['Seed'] == '1', ['AN']]

In [107]:
df_filter_seeds

Unnamed: 0,AN,Kmeans_10,Kmeans_20,Kmeans_30,NMF_10,NMF_20,NMF_30
72,7695,1,3,28,9,12,12
159,7763,6,6,23,0,15,15
180,1657,0,14,3,9,9,29
198,5134,2,2,2,5,5,5
206,3739,1,1,15,2,2,2
216,1060,6,6,6,5,5,5
582,198,3,13,22,0,0,21
789,2704,1,1,20,0,2,24
871,8786,1,1,15,2,2,2
1020,8258,7,10,28,5,18,22


### output 'Supervised Clustering' sheet -- Kmeans_10, Kmeans_20,

In [124]:
    for algo in algo_dict:
        k_list = algo_dict[algo]
        # >>>>>>>>>>>> output 'Supervised Clustering' sheet -- Kmeans_10, Kmeans_20,
        vectorizer = TfidfVectorizer(stop_words=stopwords, ngram_range=(1, 3)) if algo == 'Kmeans_' \
            else CountVectorizer(stop_words=stopwords, ngram_range=(1, 3)) if algo == 'NMF_' else "Null"
        features = vectorizer.fit_transform(documents)

        for k in k_list:
            # Kmeans and NMF have different call function, so we add determination here
            if algo == 'Kmeans_':
                # max_iter= 1 could faster running time
                # model = KMeans(n_clusters=k, init='k-means++', n_init=1, verbose=0, random_state=3425)
                model = KMeans(n_clusters=k, init='k-means++', max_iter=1, n_init=1, verbose=0, random_state=3425)
                model.fit(features)
                df[algo + str(k)] = model.labels_
            elif algo == 'NMF_':
                # max_iter= 1 could faster running time
                # model = decomposition.NMF(n_components=k, init='nndsvda', verbose=0, random_state=3425)
                model = decomposition.NMF(n_components=k, max_iter=1, init='nndsvda', verbose=0, random_state=3425)
                W1 = model.fit_transform(features)
                H1 = model.components_
                df[algo + str(k)] = np.argmax(W1, axis=1)

            # 'Filtered for Seeds only':  step 2: concat seeds = 1 to sheet 'Filtered for Seeds only'
            # df_filter_seeds = df.loc[df['Seed'] == '1', ['Kmeans_10', 'Kmeans_20', 'Kmeans_30'], ['NMF_10', 'NMF_20', 'NMF_30']]
            df_kmean = df.loc[df['Seed'] == '1', [algo + str(k)]]
            df_filter_seeds = pd.concat([df_filter_seeds, df_kmean], axis=1)

            # >>>>>>>>>>>> output 'Pivot Tables' sheet --
            df_km = pd.DataFrame()
            for i in range(k):
                # DataFrame.shape, Return a tuple representing the dimensionality of the DataFrame.
                # 'Pivot Tables': step1: cal the sum of cluster no
                sum_cluster = df_kmean[df_kmean[algo + str(k)] == i].shape[0]
                data = {f'Cluster Number({algo[0]}-{str(k)})': i,
                        f'# of Seeds({algo[0]}-{str(k)})': sum_cluster}
                # df_km = df_km.append(data, ignore_index=True)
                df_km = pd.concat([df_km, pd.DataFrame([data])], axis=0, ignore_index=True)

            # 'Pivot Tables': step2: sort by # of Seeds
            df_km = df_km.sort_values(by=f'# of Seeds({algo[0]}-{str(k)})', ascending=False, ignore_index=True)
            df_pivot = pd.concat([df_pivot, df_km], axis=1)



### output 'Seed Capture' sheet

In [108]:
    # >>>>>>>>>>>> output 'Seed Capture' sheet
    # cal k_list from algo_dict dict, return list, like: [10, 20, 30, 10, 20, 30]
    algo_dict_list = list(algo_dict.values())
    k_list = []
    for i in range(len(algo_dict_list)):
        k_list += algo_dict_list[i]

    total_nums_seeds = df_filter_seeds.shape[0]
    df_unique = pd.DataFrame()
    cur_unique_seeds, cum_unique_seeds, cum_unique_percent, Continue_flag = 0, 0, 0, ''

### df_seed_capture

In [109]:
    threshold = 0.9
    # loop for vertical
    for i in range(max(k_list)):
        # cur_unique_seeds = 0
        row_data = {'Step': i + 1,
                    'Cluster (all models)': '',
                    'Seed Capture by model (unique seeds only)': cur_unique_seeds,
                    'Seed Capture Running total (unique seeds only)': cum_unique_seeds,
                    'Cumulative %': cum_unique_percent,
                    'Continue or Stop?': Continue_flag}
        # loop for horizontal
        for j in range(len(k_list)):
            # 'Seed Capture': step1: add Cluster (all models)
            clus_no = df_pivot.iloc[i, 2 * j].astype(int)
            if row_data.get('Cluster (all models)') == '':
                row_data['Cluster (all models)'] = clus_no.astype(str)
            else:
                row_data['Cluster (all models)'] += ',' + clus_no.astype(str)
                # row_data['Cluster (all models)'] += '' if row_data.get('Cluster (all models)') == '' else ',' + clus_no

            # 'Seed Capture': step2: Seed Capture by model (unique seeds only)
            df_AN = df_filter_seeds.loc[df_filter_seeds.iloc[:, 1 + j] == clus_no, ['AN']]
            # cal unique by drop duplicated AN rows
            df_unique = pd.concat([df_unique, df_AN]).drop_duplicates().reset_index(drop=True)

        # 'Seed Capture': step3: Seed Capture Running total (unique seeds only)
        cum_unique_seeds = df_unique.shape[0]
        row_data['Seed Capture Running total (unique seeds only)'] = cum_unique_seeds

        # 'Seed Capture': step4:cal Cumulative % and Continue Yes or No
        cum_unique_percent = cum_unique_seeds / total_nums_seeds

        Continue_flag = 'Continue' if cum_unique_percent <= float(threshold) else 'Stop'
        row_data['Continue or Stop?'] = Continue_flag
        # format Cumulative % with percent
        cum_unique_percent = "{0:.2f}%".format(cum_unique_percent * 100)
        row_data['Cumulative %'] = cum_unique_percent

        # 'Seed Capture': step5: finally append row
        df_seed_capture = pd.concat([df_seed_capture, pd.DataFrame([row_data])], axis=0, ignore_index=True)
        if Continue_flag == 'Stop':
            break

    # 'Seed Capture': step6: Seed Capture by model (unique seeds only) by diff function
    df_seed_capture['Seed Capture by model (unique seeds only)'] = \
        df_seed_capture['Seed Capture Running total (unique seeds only)'].diff()

In [110]:
df_seed_capture

Unnamed: 0,Step,Cluster (all models),Seed Capture by model (unique seeds only),Seed Capture Running total (unique seeds only),Cumulative %,Continue or Stop?
0,1,1115222,,24,61.54%,Continue
1,2,31028000,11.0,35,89.74%,Continue
2,3,7335924,4.0,39,100.00%,Stop


### output 'Supervised Clustering' - Ensemble_Score

In [111]:
    # define a dictionary to save the top cluster
    dict_top_cluster = {}
    # split top cluster, like 6,5,24  -> 3 columns: 6    5   24
    df_top_cluster = df_seed_capture['Cluster (all models)'].str.split(pat=',', n=-1, expand=True)
    i_index = 0
    df['Ensemble_Score'] = 0
    # algo_dict = {'Kmeans_': [10, 20, 30], 'NMF_': [10, 20, 30]}
    for algo in algo_dict:
        k_list = algo_dict[algo]
        for k in k_list:
            dict_top_cluster[algo + str(k)] = df_top_cluster[i_index].astype(int).tolist()
            i_index += 1
            # cal every model if it is included in the top cluster
            df['Ensemble_Score'] += \
                df[algo + str(k)].apply(lambda x: 1 if x in dict_top_cluster[algo + str(k)] else 0)

    # Ensemble_AnyOnePositive
    df['Ensemble_AnyOnePositive'] = np.where(
        df['Ensemble_Score'] != 0, 1, 0)

In [112]:
df

Unnamed: 0,AN,Seed,TAB,Kmeans_10,Kmeans_20,Kmeans_30,NMF_10,NMF_20,NMF_30,Ensemble_Score,Ensemble_AnyOnePositive
0,1783,,"""Anywhere but here"": Querying spatial stigma a...",8,6,8,4,4,4,0,0
1,4672,,"""Are We Safe Analysts?"" Cisgender Countertrans...",6,6,11,2,9,22,2,1
2,4444,,"""As a Trans Person You Don't Live. You Merely ...",6,6,5,2,2,2,3,1
3,22,,"""At Your Service"": Sexual Harassment of Female...",3,13,25,0,0,0,4,1
4,2808,,"""Bareback"" pornography consumption and safe-se...",5,5,2,0,12,12,1,1
...,...,...,...,...,...,...,...,...,...,...,...
10089,9737,,â€˜Our favourite drugâ€™: Prevalence of use an...,4,18,21,7,7,7,0,0
10090,7231,,â€˜Thatâ€™s not the kind of church we areâ€™: ...,7,7,28,7,7,7,2,1
10091,9145,,â€˜The ownâ€™ and â€˜the wiseâ€™ as social sup...,5,5,9,1,19,19,0,0
10092,9104,,â€˜Totally straightâ€™: Contested sexual ident...,7,13,28,5,18,18,3,1


In [125]:
df_filter_seeds

Unnamed: 0,AN,Kmeans_10,Kmeans_20,Kmeans_30,NMF_10,NMF_20,NMF_30,Kmeans_10.1,Kmeans_10.2,Kmeans_20.1,Kmeans_30.1,NMF_10.1,NMF_20.1,NMF_30.1,Kmeans_10.3,Kmeans_20.2,Kmeans_30.2,NMF_10.2,NMF_20.2,NMF_30.2
72,7695,1,3,28,9,12,12,1,1,3,28,9,12,12,1,3,28,9,12,12
159,7763,6,6,23,0,15,15,6,6,6,23,0,15,15,6,6,23,0,15,15
180,1657,0,14,3,9,9,29,0,0,14,3,9,9,29,0,14,3,9,9,29
198,5134,2,2,2,5,5,5,2,2,2,2,5,5,5,2,2,2,5,5,5
206,3739,1,1,15,2,2,2,1,1,1,15,2,2,2,1,1,15,2,2,2
216,1060,6,6,6,5,5,5,6,6,6,6,5,5,5,6,6,6,5,5,5
582,198,3,13,22,0,0,21,3,3,13,22,0,0,21,3,13,22,0,0,21
789,2704,1,1,20,0,2,24,1,1,1,20,0,2,24,1,1,20,0,2,24
871,8786,1,1,15,2,2,2,1,1,1,15,2,2,2,1,1,15,2,2,2
1020,8258,7,10,28,5,18,22,7,7,10,28,5,18,22,7,10,28,5,18,22


In [126]:
df_pivot

Unnamed: 0,Cluster Number(K-10),# of Seeds(K-10),Cluster Number(K-20),# of Seeds(K-20),Cluster Number(K-30),# of Seeds(K-30),Cluster Number(N-10),# of Seeds(N-10),Cluster Number(N-20),# of Seeds(N-20),...,Cluster Number(K-20).1,# of Seeds(K-20).1,Cluster Number(K-30).1,# of Seeds(K-30).1,Cluster Number(N-10).1,# of Seeds(N-10).1,Cluster Number(N-20).1,# of Seeds(N-20).1,Cluster Number(N-30),# of Seeds(N-30)
0,1.0,22.0,1.0,15.0,15,13,2.0,18.0,2.0,18.0,...,1.0,15.0,15,13,2.0,18.0,2.0,18.0,2,14
1,3.0,6.0,10.0,4.0,28,5,0.0,12.0,0.0,8.0,...,10.0,4.0,28,5,0.0,12.0,0.0,8.0,0,6
2,7.0,3.0,3.0,3.0,3,4,5.0,4.0,9.0,3.0,...,3.0,3.0,3,4,5.0,4.0,9.0,3.0,24,3
3,2.0,2.0,2.0,2.0,23,3,9.0,4.0,5.0,3.0,...,2.0,2.0,23,3,9.0,4.0,5.0,3.0,9,2
4,6.0,2.0,4.0,2.0,22,3,7.0,1.0,18.0,2.0,...,4.0,2.0,22,3,7.0,1.0,18.0,2.0,25,2
5,9.0,2.0,6.0,2.0,20,3,1.0,0.0,14.0,2.0,...,6.0,2.0,20,3,1.0,0.0,14.0,2.0,12,2
6,0.0,1.0,14.0,2.0,2,2,3.0,0.0,15.0,1.0,...,14.0,2.0,2,2,3.0,0.0,15.0,1.0,29,2
7,4.0,1.0,9.0,2.0,1,2,4.0,0.0,12.0,1.0,...,9.0,2.0,1,2,4.0,0.0,12.0,1.0,5,2
8,5.0,0.0,13.0,2.0,11,1,6.0,0.0,19.0,1.0,...,13.0,2.0,11,1,6.0,0.0,19.0,1.0,18,1
9,8.0,0.0,12.0,1.0,25,1,8.0,0.0,8.0,0.0,...,12.0,1.0,25,1,8.0,0.0,8.0,0.0,14,1


In [127]:
df_seed_capture

Unnamed: 0,Step,Cluster (all models),Seed Capture by model (unique seeds only),Seed Capture Running total (unique seeds only),Cumulative %,Continue or Stop?
0,1,1115222,,24,61.54%,Continue
1,2,31028000,11.0,35,89.74%,Continue
2,3,7335924,4.0,39,100.00%,Stop
