## Generate clusters, Gini scores, and box ("candle") plots from embeddings

In [1]:
# Show the kernel's working directory; the relative `cd task_2` in the next cell depends on it.
!pwd

/project/code


In [2]:
# Use the explicit %cd magic rather than relying on IPython's automagic,
# which stops working if a Python variable named `cd` is ever defined.
%cd task_2

/project/code/task_2


In [5]:
# -n: never overwrite files that already exist, so re-running this cell
# skips them instead of stopping at unzip's interactive "replace?" prompt.
!unzip -n DS_framework-main.zip

Archive:  DS_framework-main.zip
d266477ec9d8fca7d245fcfa6585413b065fda59
   creating: DS_framework-main/
 extracting: DS_framework-main/.gitignore  
  inflating: DS_framework-main/README.md  
  inflating: DS_framework-main/candle_graph_seperate.html  
  inflating: DS_framework-main/cluster_gini.py  
  inflating: DS_framework-main/main.py  
  inflating: DS_framework-main/newplot.png  
  inflating: DS_framework-main/paths.json  
  inflating: DS_framework-main/report.json  
  inflating: DS_framework-main/requirements.txt  
   creating: DS_framework-main/samples/
  inflating: DS_framework-main/samples/embeddings_new_vae_r18_train_v2.tsv  
  inflating: DS_framework-main/samples/embeddings_new_vae_r18_v2.tsv  
  inflating: DS_framework-main/samples/metadata_new_vae_r18_train_v2.tsv  
  inflating: DS_framework-main/samples/metadata_new_vae_r18_v2.tsv  


In [15]:
# %pip installs into the environment of the running kernel; the version is
# pinned to the one this notebook was last run with so results stay reproducible.
%pip install plotly==5.8.1

Collecting plotly
  Downloading plotly-5.8.1-py2.py3-none-any.whl (15.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.2/15.2 MB[0m [31m79.6 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hCollecting tenacity>=6.2.0
  Downloading tenacity-8.0.1-py3-none-any.whl (24 kB)
Installing collected packages: tenacity, plotly
Successfully installed plotly-5.8.1 tenacity-8.0.1
You should consider upgrading via the '/usr/bin/python3.8 -m pip install --upgrade pip' command.[0m[33m
[0m

In [1]:
import sklearn
import csv
from sklearn.cluster import KMeans
import numpy as np
import matplotlib.pyplot as plt
import copy
import pandas as pd
from math import dist
import plotly.express as px
import plotly
import json

## Gini Clustering

In [2]:
def gen_clusters_gini(embeddings, metadata, n_clusters, class_names,
                      report_path="report.json",
                      plot_path='candle_graph_seperate.html',
                      plot_title=None):
    """Cluster embeddings with K-means and report how pure each cluster is.

    The function
      1. fits ``KMeans(n_clusters, init='k-means++', random_state=0)`` on the
         embeddings,
      2. counts, per cluster, how many points carry each class label,
      3. names every cluster after its most frequent class,
      4. computes the Gini impurity of every cluster and the average,
      5. writes those results as JSON to ``report_path``,
      6. writes a Plotly box plot of point-to-centroid distances (grouped by
         cluster, coloured by class) to ``plot_path``.

    Parameters
    ----------
    embeddings : iterable of rows
        Each row is a sequence of values convertible to ``float``.
    metadata : iterable of rows
        One row per embedding, in the same order; the first field of each row
        is the class label. Surrounding ``[``, ``]`` and ``'`` characters are
        stripped from the label before it is counted.
    n_clusters : int
        Number of K-means clusters.
    class_names : list of str
        All labels that may occur in ``metadata``.
    report_path : str, optional
        Where the JSON report is written. Defaults to ``"report.json"``.
    plot_path : str, optional
        Where the HTML box plot is written. Defaults to
        ``'candle_graph_seperate.html'``.
    plot_title : str, optional
        Title of the box plot. When omitted, the original fixed title
        ("... clusters train+test-set") is used.

    Returns
    -------
    None
        Results are printed and written to ``report_path`` / ``plot_path``.

    Raises
    ------
    ValueError
        If ``embeddings`` and ``metadata`` contain a different number of rows.
    KeyError
        If a (stripped) label from ``metadata`` is not in ``class_names``.
    """
    X = [np.array(row).astype(float) for row in embeddings]
    L = [row[0] for row in metadata]
    if len(X) != len(L):
        # Fail before clustering rather than part-way through, after files
        # have already been written.
        raise ValueError(
            "embeddings and metadata must have the same number of rows "
            "(got " + str(len(X)) + " and " + str(len(L)) + ")"
        )

    print("Total number of images: "+ str(len(X)))

    kmeans = KMeans(n_clusters=n_clusters, init='k-means++', random_state=0)
    # fit_predict already yields the cluster index of every training point,
    # so a second predict() pass over the same data is not needed.
    assignments = [int(cluster) for cluster in kmeans.fit_predict(X)]
    centroids = kmeans.cluster_centers_

    # dict_map[c][label] = number of points with that label in cluster c.
    dict_map = [{name: 0 for name in class_names} for _ in range(n_clusters)]
    for cluster, raw_label in zip(assignments, L):
        label = str(raw_label).strip('[]\'')
        dict_map[cluster][label] += 1
    print(json.dumps(dict_map,indent=4,sort_keys=True))

    # Name each cluster after its majority class (first class wins a tie).
    # clstr_name adds a per-class running index so that two clusters sharing
    # a majority class still get distinct names on the plot's x axis.
    cluster_names = []
    clstr_name = []
    times_used = {}
    for counts in dict_map:
        majority = max(counts, key=counts.get)
        cluster_names.append(majority)
        clstr_name.append(majority + "_" + str(times_used.get(majority, 0)))
        times_used[majority] = times_used.get(majority, 0) + 1

    print("Cluster names: ", cluster_names)

    # One Gini value per *cluster*. The loop must run over the clusters, not
    # over class_names, or the result is wrong whenever the two counts differ.
    gini_index = [_gini_impurity(list(counts.values())) for counts in dict_map]

    print("Gini Index: ",gini_index)
    avg = sum(gini_index)/len(gini_index)
    print("Average gini index: ",avg)

    json_file = {
        "cluster_names":cluster_names,
        "gini_index": gini_index,
        "avg_gini_index": avg,
        "cluster_map": dict_map
        }
    with open(report_path, "w") as outfile:
        json.dump(json_file, outfile, indent = 4)

    # Per-point data for the box plot: display name of the assigned cluster
    # and Euclidean distance to that cluster's centroid.
    newCls = [clstr_name[cluster] for cluster in assignments]
    distances = [dist(point, centroids[cluster])
                 for point, cluster in zip(X, assignments)]

    df = pd.DataFrame({"Cluster" : newCls,
            "Distance" : distances,
            "Class" : L})

    if plot_title is None:
        plot_title = "K-means clustering with "+ str(n_clusters) +" clusters train+test-set"

    fig = px.box(df, x="Cluster",
                y="Distance",
                color="Class",
                labels={
                    "Cluster": "Gini Index: " + str(gini_index).replace('\'',''),
                    "Distance": "Distance",
                    "Class": "Class"
                },
                title=plot_title)
    fig.update_traces(quartilemethod="exclusive") # or "inclusive", or "linear" by default
    plotly.offline.plot(fig, filename=plot_path)


def _gini_impurity(counts):
    """Return the Gini impurity ``1 - sum(p_i ** 2)`` for a list of class counts.

    A cluster with no points has no class mix to measure; it is reported as
    0.0 instead of raising ``ZeroDivisionError``.
    """
    total = sum(counts)
    if total == 0:
        return 0.0
    return 1 - sum((count / total) ** 2 for count in counts)

## File Paths

In [3]:
# Directory holding the sample TSV files extracted from DS_framework-main.zip.
# Kept in one place so relocating the data needs a single edit.
SAMPLES_DIR = "/project/code/task_2/DS_framework-main/samples"

# Settings consumed by load_tsv and gen_clusters_gini.
config = {
    "embeddings_train_path" : SAMPLES_DIR + "/embeddings_new_vae_r18_train_v2.tsv",
    "metadata_train_path" : SAMPLES_DIR + "/metadata_new_vae_r18_train_v2.tsv",
    "embeddings_test_path" : SAMPLES_DIR + "/embeddings_new_vae_r18_v2.tsv",
    "metadata_test_path" : SAMPLES_DIR + "/metadata_new_vae_r18_v2.tsv",
    "n_clusters" : 5 ,
    "class_names" : ["car", "person", "truck", "traffic_light", "motor"]
}

## Clustering on Train Data

In [4]:
import argparse
import sys
import json
import os
import csv


def load_tsv(config, split="train"):
    """Read the embeddings and metadata TSV files for one data split.

    Parameters
    ----------
    config : dict
        Must contain ``"embeddings_<split>_path"``, ``"metadata_<split>_path"``,
        ``"n_clusters"`` and ``"class_names"``.
    split : str, optional
        Selects which pair of path keys is read. Defaults to ``"train"``,
        i.e. ``"embeddings_train_path"`` / ``"metadata_train_path"``.

    Returns
    -------
    tuple
        ``(embeddings, metadata, n_clusters, class_names)``. ``embeddings``
        and ``metadata`` are lists of rows, each row a list of the
        tab-separated string fields of one line.

    Raises
    ------
    KeyError
        If a required key is missing from ``config``.
    OSError
        If either file cannot be opened.
    """
    embeddings_path = config['embeddings_' + split + '_path']
    metadata_path = config['metadata_' + split + '_path']

    n_clusters = config['n_clusters']
    class_names = config['class_names']

    # Rows are read eagerly so both files are closed before returning; a lazy
    # csv.reader would keep the handles open and could only be iterated once.
    with open(embeddings_path, newline='') as embeddings_file:
        embeddings = list(csv.reader(embeddings_file, delimiter="\t"))
    with open(metadata_path, newline='') as metadata_file:
        metadata = list(csv.reader(metadata_file, delimiter="\t"))
    return embeddings,metadata,n_clusters,class_names

In [5]:
# Cluster the training split. gen_clusters_gini writes report.json and
# candle_graph_seperate.html into the working directory by default, so copy
# them elsewhere before the test-split run further down if both are needed.
embeddings,metadata,n_clusters,class_names = load_tsv(config)
gen_clusters_gini(embeddings,metadata,n_clusters,class_names)

Total number of images: 3152
[
    {
        "car": 35,
        "motor": 150,
        "person": 153,
        "traffic_light": 115,
        "truck": 149
    },
    {
        "car": 66,
        "motor": 40,
        "person": 27,
        "traffic_light": 89,
        "truck": 138
    },
    {
        "car": 148,
        "motor": 196,
        "person": 180,
        "traffic_light": 41,
        "truck": 275
    },
    {
        "car": 136,
        "motor": 131,
        "person": 125,
        "traffic_light": 34,
        "truck": 491
    },
    {
        "car": 54,
        "motor": 32,
        "person": 35,
        "traffic_light": 16,
        "truck": 296
    }
]
Cluster names:  ['person', 'truck', 'truck', 'truck', 'truck']
Gini Index:  [0.77218794494542, 0.7403549382716049, 0.7590334467120181, 0.6509420387233035, 0.5037735547152099]
Average gini index:  0.6852583846735112


## Clustering on Test Data

In [6]:
def load_tsv(config, split="test"):
    """Read the embeddings and metadata TSV files for one data split.

    This definition replaces any earlier ``load_tsv`` in the notebook; its
    default split is ``"test"``.

    Parameters
    ----------
    config : dict
        Must contain ``"embeddings_<split>_path"``, ``"metadata_<split>_path"``,
        ``"n_clusters"`` and ``"class_names"``.
    split : str, optional
        Selects which pair of path keys is read. Defaults to ``"test"``,
        i.e. ``"embeddings_test_path"`` / ``"metadata_test_path"``.

    Returns
    -------
    tuple
        ``(embeddings, metadata, n_clusters, class_names)``. ``embeddings``
        and ``metadata`` are lists of rows, each row a list of the
        tab-separated string fields of one line.

    Raises
    ------
    KeyError
        If a required key is missing from ``config``.
    OSError
        If either file cannot be opened.
    """
    embeddings_path = config['embeddings_' + split + '_path']
    metadata_path = config['metadata_' + split + '_path']

    n_clusters = config['n_clusters']
    class_names = config['class_names']

    # Rows are read eagerly so both files are closed before returning; a lazy
    # csv.reader would keep the handles open and could only be iterated once.
    with open(embeddings_path, newline='') as embeddings_file:
        embeddings = list(csv.reader(embeddings_file, delimiter="\t"))
    with open(metadata_path, newline='') as metadata_file:
        metadata = list(csv.reader(metadata_file, delimiter="\t"))
    return embeddings,metadata,n_clusters,class_names

In [7]:
# Cluster the test split. load_tsv here is the definition from the cell
# directly above, which reads the *_test_path entries of config.
# With its default output paths this run rewrites report.json and
# candle_graph_seperate.html, replacing the training-split results.
embeddings,metadata,n_clusters,class_names = load_tsv(config)
gen_clusters_gini(embeddings,metadata,n_clusters,class_names)

Total number of images: 512
[
    {
        "car": 40,
        "motor": 21,
        "person": 12,
        "traffic_light": 8,
        "truck": 43
    },
    {
        "car": 43,
        "motor": 66,
        "person": 55,
        "traffic_light": 13,
        "truck": 32
    },
    {
        "car": 4,
        "motor": 8,
        "person": 12,
        "traffic_light": 10,
        "truck": 11
    },
    {
        "car": 14,
        "motor": 8,
        "person": 8,
        "traffic_light": 33,
        "truck": 10
    },
    {
        "car": 0,
        "motor": 6,
        "person": 14,
        "traffic_light": 37,
        "truck": 4
    }
]
Cluster names:  ['truck', 'motor', 'person', 'traffic_light', 'traffic_light']
Gini Index:  [0.733480749219563, 0.7613836679563197, 0.7802469135802469, 0.7160818164758866, 0.5654393980112873]
Average gini index:  0.7113265090486607
