In [2]:
import pandas as pd
import numpy as np

In [15]:
from sklearn.preprocessing import LabelEncoder

## Data Transformation

In [5]:
df = pd.read_csv('364_interaction_energies_state_function.txt', sep='\t')
df.fillna('None', inplace=True)

In [50]:
display(df)

Unnamed: 0,index,PDBID,State,Function,1.21_intenergysum,1.21_inttype1,1.21_intenergy1,1.21_inttype2,1.21_intenergy2,1.22_intenergysum,...,7.67_intenergysum,7.67_inttype1,7.67_intenergy1,7.67_inttype2,7.67_intenergy2,7.68_intenergysum,7.68_inttype1,7.68_intenergy1,7.68_inttype2,7.68_intenergy2
0,1,7EW2,Active,Agonist,-999,,-999,,-999,-999,...,-999,,-999,,-999,-999,,-999,,-999
1,2,7EW3,Active,Agonist,-999,,-999,,-999,-999,...,-999,,-999,,-999,-999,,-999,,-999
2,3,7EW4,Active,Agonist,-999,,-999,,-999,-999,...,-999,,-999,,-999,-999,,-999,,-999
3,4,7EW1,Active,Agonist,-999,,-999,,-999,-999,...,-999,,-999,,-999,-999,,-999,,-999
4,5,7LD4,Active,Agonist,-999,,-999,,-999,-999,...,-999,,-999,,-999,-999,,-999,,-999
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
359,387,5C1M,Active,Agonist,-999,,-999,,-999,-999,...,-999,,-999,,-999,-999,,-999,,-999
360,388,4EJ4,Inactive,Antagonist,-999,,-999,,-999,-999,...,-999,,-999,,-999,-999,,-999,,-999
361,389,4DKL,Inactive,Antagonist,-999,,-999,,-999,-999,...,-999,,-999,,-999,-999,,-999,,-999
362,390,6Z10,Intermediate,Antagonist,-999,,-999,,-999,-999,...,-999,,-999,,-999,-999,,-999,,-999


In [51]:
df.to_csv('df.csv')

In [6]:
# get residue numbers to serve as rows for new dataframe
resnums = []
cols = [col for col in df.columns if 'sum' in col]

for col in cols:
    resnum = col[:4]
    resnums.append(resnum)
    
#print(resnums)

In [7]:
# get list of PDBID values, states, functions
PDBIDs = list(df['PDBID'])
states = list(df['State'])
functions = list(df['Function'])
#PDBIDs

In [8]:
# generate column names for new dataframe
colnames = []

for PDBID in PDBIDs:
    colnames.append(PDBID + '_state') # new 5/17
    colnames.append(PDBID + '_function') # new 5/17
    colnames.append(PDBID + '_intenergysum')
    colnames.append(PDBID + '_inttype1')
    colnames.append(PDBID + '_intenergy1')
    colnames.append(PDBID + '_inttype2')
    colnames.append(PDBID + '_intenergy2')

In [9]:
colnames

['7EW2_state',
 '7EW2_function',
 '7EW2_intenergysum',
 '7EW2_inttype1',
 '7EW2_intenergy1',
 '7EW2_inttype2',
 '7EW2_intenergy2',
 '7EW3_state',
 '7EW3_function',
 '7EW3_intenergysum',
 '7EW3_inttype1',
 '7EW3_intenergy1',
 '7EW3_inttype2',
 '7EW3_intenergy2',
 '7EW4_state',
 '7EW4_function',
 '7EW4_intenergysum',
 '7EW4_inttype1',
 '7EW4_intenergy1',
 '7EW4_inttype2',
 '7EW4_intenergy2',
 '7EW1_state',
 '7EW1_function',
 '7EW1_intenergysum',
 '7EW1_inttype1',
 '7EW1_intenergy1',
 '7EW1_inttype2',
 '7EW1_intenergy2',
 '7LD4_state',
 '7LD4_function',
 '7LD4_intenergysum',
 '7LD4_inttype1',
 '7LD4_intenergy1',
 '7LD4_inttype2',
 '7LD4_intenergy2',
 '7LD3_state',
 '7LD3_function',
 '7LD3_intenergysum',
 '7LD3_inttype1',
 '7LD3_intenergy1',
 '7LD3_inttype2',
 '7LD3_intenergy2',
 '7RM5_state',
 '7RM5_function',
 '7RM5_intenergysum',
 '7RM5_inttype1',
 '7RM5_intenergy1',
 '7RM5_inttype2',
 '7RM5_intenergy2',
 '7M8W_state',
 '7M8W_function',
 '7M8W_intenergysum',
 '7M8W_inttype1',
 '7M8W_int

In [19]:
# create empty dataframe to fill with values from df
ml_df = pd.DataFrame()
ml_df['residue'] = resnums

for colname in colnames:
    temp_df = pd.DataFrame(columns = [colname])
    pd.concat((ml_df, temp_df), axis = 1)
    ml_df[colname] = 'NA'

  ml_df[colname] = 'NA'


In [20]:
ml_df = ml_df.copy()
import pickle
ml_df.to_pickle('ml_df_empty.pkl')

In [10]:
ml_df = pd.read_pickle('ml_df_empty.pkl')
display(ml_df)

Unnamed: 0,residue,7EW2_state,7EW2_function,7EW2_intenergysum,7EW2_inttype1,7EW2_intenergy1,7EW2_inttype2,7EW2_intenergy2,7EW3_state,7EW3_function,...,6Z10_intenergy1,6Z10_inttype2,6Z10_intenergy2,6RNK_state,6RNK_function,6RNK_intenergysum,6RNK_inttype1,6RNK_intenergy1,6RNK_inttype2,6RNK_intenergy2
0,1.21,,,,,,,,,,...,,,,,,,,,,
1,1.22,,,,,,,,,,...,,,,,,,,,,
2,1.23,,,,,,,,,,...,,,,,,,,,,
3,1.24,,,,,,,,,,...,,,,,,,,,,
4,1.25,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
327,7.64,,,,,,,,,,...,,,,,,,,,,
328,7.65,,,,,,,,,,...,,,,,,,,,,
329,7.66,,,,,,,,,,...,,,,,,,,,,
330,7.67,,,,,,,,,,...,,,,,,,,,,


In [64]:
# for df_row in range(len(df)): # row represents df rows
#     # convert row to a flattened list, dropping the first 4 columns
#     row_values = df.loc[df_row, :].values.tolist()[4:]
    
#     row_idx = 0 # row index
#     col_idx = (df_row * 5) + 1 # start at col 1 for 1st entry, col 6 for 2nd entry, etc.
#     col_end_idx = (df_row * 5) + 5
    
#     for value in row_values:
#         if col_idx > col_end_idx:
#             row_idx += 1 # start on next row in ml_df
#             col_idx = (df_row * 5) + 1 # reset column numbering
#         ml_df.at[row_idx, ml_df.columns[col_idx]] = value # write new value to column col_idx of next row
#         col_idx += 1


In [11]:
for df_row in range(len(df)): # row represents df rows
    # get state and ligand function from the row, assign to separate variables
    state = df.loc[df_row, :].values.tolist()[2]
    function = df.loc[df_row, :].values.tolist()[3]
    
    # get state and function columns for each row
    
    # convert row to a flattened list, dropping the first 4 columns
    row_values = df.loc[df_row, :].values.tolist()[4:]
    # print(row_values, '\n')
    
    row_idx = 0 # row index
    col_idx = (df_row * 7) + 3 # start at col 1 for 1st entry, col 8 for 2nd entry, etc.
    col_end_idx = (df_row * 7) + 7
    
    for value in row_values:
        if col_idx > col_end_idx:
            row_idx += 1 # start on next row in ml_df
            col_idx = (df_row * 7) + 3 # reset column numbering
        ml_df.at[row_idx, ml_df.columns[col_idx]] = value # write new value to column col_idx of next row
        col_idx += 1

# fill in state/function columns for each PDBID
for PDBid in PDBIDs:
    state_col = PDBid + '_state'
    function_col = PDBid + '_function'
    
    ml_df.loc[:, state_col] = states[PDBIDs.index(PDBid)]
    ml_df.loc[:, function_col] = functions[PDBIDs.index(PDBid)]

In [12]:
display(ml_df)

Unnamed: 0,residue,7EW2_state,7EW2_function,7EW2_intenergysum,7EW2_inttype1,7EW2_intenergy1,7EW2_inttype2,7EW2_intenergy2,7EW3_state,7EW3_function,...,6Z10_intenergy1,6Z10_inttype2,6Z10_intenergy2,6RNK_state,6RNK_function,6RNK_intenergysum,6RNK_inttype1,6RNK_intenergy1,6RNK_inttype2,6RNK_intenergy2
0,1.21,Active,Agonist,-999,,-999,,-999,Active,Agonist,...,-999,,-999,Intermediate,Antagonist,-999,,-999,,-999
1,1.22,Active,Agonist,-999,,-999,,-999,Active,Agonist,...,-999,,-999,Intermediate,Antagonist,-999,,-999,,-999
2,1.23,Active,Agonist,-999,,-999,,-999,Active,Agonist,...,-999,,-999,Intermediate,Antagonist,-999,,-999,,-999
3,1.24,Active,Agonist,-999,,-999,,-999,Active,Agonist,...,-999,,-999,Intermediate,Antagonist,-999,,-999,,-999
4,1.25,Active,Agonist,-999,,-999,,-999,Active,Agonist,...,-999,,-999,Intermediate,Antagonist,-999,,-999,,-999
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
327,7.64,Active,Agonist,-999,,-999,,-999,Active,Agonist,...,-999,,-999,Intermediate,Antagonist,-999,,-999,,-999
328,7.65,Active,Agonist,-999,,-999,,-999,Active,Agonist,...,-999,,-999,Intermediate,Antagonist,-999,,-999,,-999
329,7.66,Active,Agonist,-999,,-999,,-999,Active,Agonist,...,-999,,-999,Intermediate,Antagonist,-999,,-999,,-999
330,7.67,Active,Agonist,-999,,-999,,-999,Active,Agonist,...,-999,,-999,Intermediate,Antagonist,-999,,-999,,-999


In [13]:
# get unique interaction types
cols = [col for col in df.columns if 'type' in col]
int_types = []

for col in cols:
    for int_type in list(df[col].unique()):
        if int_type not in int_types:
            int_types.append(int_type)
            
print(int_types)

['None', 'Hbond', 'Arene', 'Ionic', 'Covalent']


In [16]:
# encode interaction types as integers
# create instance of labelencoder
labelencoder = LabelEncoder()

# get columns with 'type' in their name
cols = [col for col in ml_df.columns if 'type' in col]

# loop though all columns and convert strings to categorical integer variables
for col in cols:
    ml_df[col] = labelencoder.fit_transform(ml_df[col])

    
# encode states as integers
# get columns with 'type' in their name
cols = [col for col in ml_df.columns if 'state' in col]

# loop though all columns and convert strings to categorical integer variables
for col in cols:
    ml_df[col] = labelencoder.fit_transform(ml_df[col])
    
# encode functions as integers
# get columns with 'type' in their name
cols = [col for col in ml_df.columns if 'function' in col]

# loop though all columns and convert strings to categorical integer variables
for col in cols:
    ml_df[col] = labelencoder.fit_transform(ml_df[col])

In [17]:
display(ml_df)

Unnamed: 0,residue,7EW2_state,7EW2_function,7EW2_intenergysum,7EW2_inttype1,7EW2_intenergy1,7EW2_inttype2,7EW2_intenergy2,7EW3_state,7EW3_function,...,6Z10_intenergy1,6Z10_inttype2,6Z10_intenergy2,6RNK_state,6RNK_function,6RNK_intenergysum,6RNK_inttype1,6RNK_intenergy1,6RNK_inttype2,6RNK_intenergy2
0,1.21,0,0,-999,3,-999,2,-999,0,0,...,-999,3,-999,0,0,-999,2,-999,3,-999
1,1.22,0,0,-999,3,-999,2,-999,0,0,...,-999,3,-999,0,0,-999,2,-999,3,-999
2,1.23,0,0,-999,3,-999,2,-999,0,0,...,-999,3,-999,0,0,-999,2,-999,3,-999
3,1.24,0,0,-999,3,-999,2,-999,0,0,...,-999,3,-999,0,0,-999,2,-999,3,-999
4,1.25,0,0,-999,3,-999,2,-999,0,0,...,-999,3,-999,0,0,-999,2,-999,3,-999
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
327,7.64,0,0,-999,3,-999,2,-999,0,0,...,-999,3,-999,0,0,-999,2,-999,3,-999
328,7.65,0,0,-999,3,-999,2,-999,0,0,...,-999,3,-999,0,0,-999,2,-999,3,-999
329,7.66,0,0,-999,3,-999,2,-999,0,0,...,-999,3,-999,0,0,-999,2,-999,3,-999
330,7.67,0,0,-999,3,-999,2,-999,0,0,...,-999,3,-999,0,0,-999,2,-999,3,-999


In [18]:
ml_df.to_csv('ml_df_test.csv')

---
# Machine Learning

In [20]:
from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

In [21]:
# X has features, y has residues
X = ml_df.drop(['residue'], axis = 1)
y = ml_df['residue']

In [22]:
# scale data
scaler = StandardScaler()
to_scale = [col for col in X.columns.values]
scaler.fit(X[to_scale])
X[to_scale] = scaler.transform(X[to_scale])

# predict z-scores on the test set
X[to_scale] = scaler.transform(X[to_scale]) 

In [23]:
# display scaled values
display(X)

Unnamed: 0,7EW2_state,7EW2_function,7EW2_intenergysum,7EW2_inttype1,7EW2_intenergy1,7EW2_inttype2,7EW2_intenergy2,7EW3_state,7EW3_function,7EW3_intenergysum,...,6Z10_intenergy1,6Z10_inttype2,6Z10_intenergy2,6RNK_state,6RNK_function,6RNK_intenergysum,6RNK_inttype1,6RNK_intenergy1,6RNK_inttype2,6RNK_intenergy2
0,0.0,0.0,9.077976,-13.612786,9.060944,-15.664495,12.892326,0.0,0.0,9.076506,...,7.386646,-10.959834,9.065415,0.0,0.0,5.701613,-8.298665,5.691266,-11.15604,8.089051
1,0.0,0.0,9.077976,-13.612786,9.060944,-15.664495,12.892326,0.0,0.0,9.076506,...,7.386646,-10.959834,9.065415,0.0,0.0,5.701613,-8.298665,5.691266,-11.15604,8.089051
2,0.0,0.0,9.077976,-13.612786,9.060944,-15.664495,12.892326,0.0,0.0,9.076506,...,7.386646,-10.959834,9.065415,0.0,0.0,5.701613,-8.298665,5.691266,-11.15604,8.089051
3,0.0,0.0,9.077976,-13.612786,9.060944,-15.664495,12.892326,0.0,0.0,9.076506,...,7.386646,-10.959834,9.065415,0.0,0.0,5.701613,-8.298665,5.691266,-11.15604,8.089051
4,0.0,0.0,9.077976,-13.612786,9.060944,-15.664495,12.892326,0.0,0.0,9.076506,...,7.386646,-10.959834,9.065415,0.0,0.0,5.701613,-8.298665,5.691266,-11.15604,8.089051
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
327,0.0,0.0,9.077976,-13.612786,9.060944,-15.664495,12.892326,0.0,0.0,9.076506,...,7.386646,-10.959834,9.065415,0.0,0.0,5.701613,-8.298665,5.691266,-11.15604,8.089051
328,0.0,0.0,9.077976,-13.612786,9.060944,-15.664495,12.892326,0.0,0.0,9.076506,...,7.386646,-10.959834,9.065415,0.0,0.0,5.701613,-8.298665,5.691266,-11.15604,8.089051
329,0.0,0.0,9.077976,-13.612786,9.060944,-15.664495,12.892326,0.0,0.0,9.076506,...,7.386646,-10.959834,9.065415,0.0,0.0,5.701613,-8.298665,5.691266,-11.15604,8.089051
330,0.0,0.0,9.077976,-13.612786,9.060944,-15.664495,12.892326,0.0,0.0,9.076506,...,7.386646,-10.959834,9.065415,0.0,0.0,5.701613,-8.298665,5.691266,-11.15604,8.089051


### Mean Shift Clustering

In [161]:
bandwidth = estimate_bandwidth(X, quantile=0.2, n_samples=332)

In [162]:
ms = MeanShift(bandwidth = bandwidth)
ms.fit(X)
labels = ms.labels_
cluster_centers = ms.cluster_centers_
 
labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)
 
# Print the number of clusters in the data
print("Clusters found: %d" % n_clusters_)

Clusters found: 65


In [163]:
ml_df['cluster_labels'] = labels

In [164]:
ml_df.to_csv('ml_df_meanshift_clustering_results.csv')

In [172]:
ml_df.loc[ml_df['cluster_labels'] == 9]

Unnamed: 0,residue,7EW2_intenergysum,7EW2_inttype1,7EW2_intenergy1,7EW2_inttype2,7EW2_intenergy2,7EW3_intenergysum,7EW3_inttype1,7EW3_intenergy1,7EW3_inttype2,...,6Z10_inttype1,6Z10_intenergy1,6Z10_inttype2,6Z10_intenergy2,6RNK_intenergysum,6RNK_inttype1,6RNK_intenergy1,6RNK_inttype2,6RNK_intenergy2,cluster_labels
87,3.29,-6.744811,2,-0.930633,1,-5.814178,-8.58787,0,-0.1,0,...,1,-0.5,1,-0.1,-0.7,1,-0.4,1,-0.2,9


### K Means Clustering

In [107]:
from sklearn.cluster import KMeans
rng = 1

In [108]:
clustering = KMeans(n_clusters=2, random_state=rng)
clustering.fit(X)
labels = clustering.labels_
ml_df['cluster_labels'] = labels

In [193]:
ml_df.to_csv('ml_df_kmeans_clustering_results.csv')

In [111]:
ml_df.loc[ml_df['cluster_labels'] == 0]

Unnamed: 0,residue,7EW2_state,7EW2_function,7EW2_intenergysum,7EW2_inttype1,7EW2_intenergy1,7EW2_inttype2,7EW2_intenergy2,7EW3_state,7EW3_function,...,6Z10_inttype2,6Z10_intenergy2,6RNK_state,6RNK_function,6RNK_intenergysum,6RNK_inttype1,6RNK_intenergy1,6RNK_inttype2,6RNK_intenergy2,cluster_labels
0,1.21,0,0,-999,3,-999,2,-999,0,0,...,3,-999,0,0,-999,2,-999,3,-999,0
1,1.22,0,0,-999,3,-999,2,-999,0,0,...,3,-999,0,0,-999,2,-999,3,-999,0
2,1.23,0,0,-999,3,-999,2,-999,0,0,...,3,-999,0,0,-999,2,-999,3,-999,0
3,1.24,0,0,-999,3,-999,2,-999,0,0,...,3,-999,0,0,-999,2,-999,3,-999,0
4,1.25,0,0,-999,3,-999,2,-999,0,0,...,3,-999,0,0,-999,2,-999,3,-999,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
327,7.64,0,0,-999,3,-999,2,-999,0,0,...,3,-999,0,0,-999,2,-999,3,-999,0
328,7.65,0,0,-999,3,-999,2,-999,0,0,...,3,-999,0,0,-999,2,-999,3,-999,0
329,7.66,0,0,-999,3,-999,2,-999,0,0,...,3,-999,0,0,-999,2,-999,3,-999,0
330,7.67,0,0,-999,3,-999,2,-999,0,0,...,3,-999,0,0,-999,2,-999,3,-999,0


In [206]:
ml_df = ml_df.drop(['cluster_labels'], axis = 1, inplace = True)

### Agglomerative Clustering

In [24]:
from sklearn.cluster import AgglomerativeClustering

cluster = AgglomerativeClustering(n_clusters=2, affinity='euclidean', linkage='ward')
labels = cluster.fit_predict(X)
ml_df['cluster_labels'] = labels

In [25]:
ml_df.to_csv('ml_df_agglomerative_clustering_results.csv')

In [26]:
ml_df.loc[ml_df['cluster_labels'] == 1]

Unnamed: 0,residue,7EW2_state,7EW2_function,7EW2_intenergysum,7EW2_inttype1,7EW2_intenergy1,7EW2_inttype2,7EW2_intenergy2,7EW3_state,7EW3_function,...,6Z10_inttype2,6Z10_intenergy2,6RNK_state,6RNK_function,6RNK_intenergysum,6RNK_inttype1,6RNK_intenergy1,6RNK_inttype2,6RNK_intenergy2,cluster_labels
90,3.32,0,0,-999.0,3,-999.0,2,-999.0,0,0,...,0,-0.3,0,0,-0.4,1,-0.1,0,-0.3,1
91,3.33,0,0,-0.1,0,-0.1,2,-999.0,0,0,...,3,-999.0,0,0,-999.0,2,-999.0,3,-999.0,1
266,6.51,0,0,-999.0,3,-999.0,2,-999.0,0,0,...,3,-999.0,0,0,-0.2,0,-0.2,3,-999.0,1
270,6.55,0,0,-999.0,3,-999.0,2,-999.0,0,0,...,3,-999.0,0,0,-999.0,2,-999.0,3,-999.0,1
302,7.39,0,0,-999.0,3,-999.0,2,-999.0,0,0,...,2,-4.114871,0,0,-12.189508,1,-4.0,2,-0.549586,1


### Gaussian Mixture Model

In [104]:
from sklearn.mixture import GaussianMixture
gmm = GaussianMixture(n_components=2)
labels = gmm.fit_predict(X)

In [105]:
ml_df['cluster_labels'] = labels

In [106]:
ml_df.loc[ml_df['cluster_labels'] == 0]

Unnamed: 0,residue,7EW2_state,7EW2_function,7EW2_intenergysum,7EW2_inttype1,7EW2_intenergy1,7EW2_inttype2,7EW2_intenergy2,7EW3_state,7EW3_function,...,6Z10_inttype2,6Z10_intenergy2,6RNK_state,6RNK_function,6RNK_intenergysum,6RNK_inttype1,6RNK_intenergy1,6RNK_inttype2,6RNK_intenergy2,cluster_labels
90,3.32,0,0,-999.0,3,-999.0,2,-999.0,0,0,...,0,-0.3,0,0,-0.4,1,-0.1,0,-0.3,0
91,3.33,0,0,-0.1,0,-0.1,2,-999.0,0,0,...,3,-999.0,0,0,-999.0,2,-999.0,3,-999.0,0
266,6.51,0,0,-999.0,3,-999.0,2,-999.0,0,0,...,3,-999.0,0,0,-0.2,0,-0.2,3,-999.0,0
270,6.55,0,0,-999.0,3,-999.0,2,-999.0,0,0,...,3,-999.0,0,0,-999.0,2,-999.0,3,-999.0,0
302,7.39,0,0,-999.0,3,-999.0,2,-999.0,0,0,...,2,-4.114871,0,0,-12.189508,1,-4.0,2,-0.549586,0


### PCA

In [46]:
from sklearn.decomposition import PCA

#plotly imports
import plotly as py
import plotly.express as px
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

plotX = X.copy()

#PCA with two principal components
pca_2d = PCA(n_components=2)

clusters = ml_df['cluster_labels']
PCs_2d = pd.DataFrame(pca_2d.fit_transform(plotX))
PCs_2d.columns = ["PC1_2d", "PC2_2d"]
plotX['clusters'] = clusters
plotX['PC1_2d'] = PCs_2d['PC1_2d']
plotX['PC2_2d'] = PCs_2d['PC2_2d']

uniq_clusters = plotX['clusters'].unique()
uniqs = uniq_clusters.tolist()
uniqs.sort()

cluster0 = plotX[plotX['clusters'] == 0]
cluster1 = plotX[plotX['clusters'] == 1]

#trace1 is for 'Cluster 0'
trace1 = go.Scatter(
                    x = cluster0["PC1_2d"],
                    y = cluster0["PC2_2d"],
                    mode = "markers",
                    name = "Cluster I",
                    marker = dict(color = 'rgba(228,26,28,0.8)', size = 10, line=dict(width=1,color='Black')),
                    text = None)

#trace2 is for 'Cluster 1'
trace2 = go.Scatter(
                    x = cluster1["PC1_2d"],
                    y = cluster1["PC2_2d"],
                    mode = "markers",
                    name = "Cluster II",
                    marker = dict(color = 'rgba(43, 89, 39, 0.98)', size = 10, line=dict(width=1,color='Black')),
                    text = None)

data = [trace1, trace2]

layout = dict(xaxis= dict(title= 'PC1',dtick= 200,zeroline= False),
              yaxis= dict(title= 'PC2',ticklen= 5,zeroline= False),
              font_family="Arial",
              font = dict(size=16),
              width=800,
              height=800
             )

fig = dict(data = data, layout = layout)

iplot(fig)

In [136]:
plotX.loc[plotX['clusters'] == 1]

Unnamed: 0,7EW2_state,7EW2_function,7EW2_intenergysum,7EW2_inttype1,7EW2_intenergy1,7EW2_inttype2,7EW2_intenergy2,7EW3_state,7EW3_function,7EW3_intenergysum,...,6RNK_state,6RNK_function,6RNK_intenergysum,6RNK_inttype1,6RNK_intenergy1,6RNK_inttype2,6RNK_intenergy2,clusters,PC1_2d,PC2_2d
90,0.0,0.0,9.077976,-13.612786,9.060944,-15.664495,12.892326,0.0,0.0,9.076506,...,0.0,0.0,5.736195,-29.626838,5.725737,-57.057206,8.156564,1,838.920181,-592.641776
91,0.0,0.0,9.162497,-80.672612,9.145153,-15.664495,12.892326,0.0,0.0,9.076506,...,0.0,0.0,5.701613,-8.298665,5.691266,-11.15604,8.089051,1,615.37457,-335.234895
266,0.0,0.0,9.077976,-13.612786,9.060944,-15.664495,12.892326,0.0,0.0,9.076506,...,0.0,0.0,5.736202,-50.955012,5.725733,-11.15604,8.089051,1,759.779048,772.894336
270,0.0,0.0,9.077976,-13.612786,9.060944,-15.664495,12.892326,0.0,0.0,9.076506,...,0.0,0.0,5.701613,-8.298665,5.691266,-11.15604,8.089051,1,364.548023,239.243019
302,0.0,0.0,9.077976,-13.612786,9.060944,-15.664495,12.892326,0.0,0.0,9.076506,...,0.0,0.0,5.735787,-29.626838,5.725602,-26.456429,8.156547,1,535.703877,-151.05583


In [120]:
PCs_2d

Unnamed: 0,PC1_2d,PC2_2d
0,-15.852944,-1.255547
1,-15.852944,-1.255547
2,-15.852944,-1.255547
3,-15.852944,-1.255547
4,-15.852944,-1.255547
...,...,...
327,-15.852944,-1.255547
328,-15.852944,-1.255547
329,-15.852944,-1.255547
330,-15.852944,-1.255547
