In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA

# Data pre processing

In [50]:
raw_data = pd.read_csv('./data/Final_Results_11_3_20.csv')
crystal_data = raw_data[raw_data['Crystallization Data?']=='Yes']
numeric_cdata = crystal_data.select_dtypes(include='number')

## Plot settings

In [95]:
import plotly.graph_objects as go
colors = [
    '#1f77b4',  # muted blue
    '#ff7f0e',  # safety orange
    '#2ca02c',  # cooked asparagus green
    '#d62728',  # brick red
    '#9467bd',  # muted purple
    '#8c564b',  # chestnut brown
    '#e377c2',  # raspberry yogurt pink
    '#7f7f7f',  # middle gray
    '#bcbd22',  # curry yellow-green
    '#17becf'   # blue-teal
]
solvents = list(crystal_data['Solvent'].unique())
amines = list(crystal_data['Amine'].unique())
molecule_type = {
    'NNDimethylethane12diammoniumiodide': 'branch',
    'NNDiethylpropane13diammoniumiodide': 'branch',
    'Benzenediaminedihydroiodide': '6-membered-ring',
    '2Pyrrolidin1ium1ylethylammoniumiodide': '5-membered-ring',
    'AcNH3I': 'branch',
    'CyclohexylmethylammoniumIodide': '6-membered-ring',
    'PhEtNH3I': '6-membered-ring',
    'Propane13diammoniumIodide': 'chain',
    'EtNH3I': 'chain',
    'MeNH3I': 'chain',
}
types = ['branch', '6-membered-ring', '5-membered-ring', 'chain']
sol_colors= {solvents[i]:colors[i] for i in range(len(solvents))}
amine_colors = {amines[i]:colors[i] for i in range(len(amines))}
type_colors = {types[i]:colors[i] for i in range(len(types))}


def PCA2D(loadings, features, data):
    #marker_text = [f"Solvent: {row['Solvent']} Amine: {row['Amine']}" for idx, row in crystal_data.iterrows()]
    #colors = [color_scheme[column[idx]] for idx, row in crystal_data.iterrows()]
    fig = go.Figure(data=data)
    
    for i, feature in enumerate(features):
        fig.add_shape(
            type='line',
            x0=0, y0=0,
            x1=loadings[i, 0],
            y1=loadings[i, 1]
        )
        fig.add_annotation(
            x=loadings[i, 0],
            y=loadings[i, 1],
            ax=0, ay=0,
            xanchor="center",
            yanchor="bottom",
            text=feature,
        )
    return fig

def PCA3D(loadings, features, data):
    fig = go.Figure(data=data)
    x_lines = []
    y_lines = []
    z_lines = []
    annotations = []
    for i, feature in enumerate(features):
        x_lines.append(0.0)
        x_lines.append(loadings[i, 0])
        
        y_lines.append(0.0)
        y_lines.append(loadings[i, 1])
        
        z_lines.append(0.0)
        z_lines.append(loadings[i, 2])
        
        a = dict(showarrow=False,
                    x=loadings[i, 0],
                    y=loadings[i, 1],
                    z=loadings[i, 2],
                    text=feature,
                    xanchor="left",
                    xshift=10,
                    opacity=0.7)
        annotations.append(a)
    fig.update_layout(scene=dict(annotations=annotations,))
                    
    
    trace2 = go.Scatter3d(
            x=x_lines,
            y=y_lines,
            z=z_lines,
            mode='lines',
            name='PCA components'
        )
    fig.add_trace(trace2)
    
    return fig

## 2 Principal Components

In [52]:
pca = PCA(n_components=2)
pca.fit(numeric_cdata)
features = list(numeric_cdata.columns)
pca_res = pca.transform(numeric_cdata)
loadings = pca.components_.T * np.sqrt(pca.explained_variance_)

In [66]:
crystal_data['PCA_X'] = pca_res[:,0]
crystal_data['PCA_Y'] = pca_res[:,1]

21
10




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



## Colored by solvent

In [72]:
traces = []
for solvent in solvents:
    solvent_df = crystal_data[crystal_data['Solvent']==solvent]
    marker_text = [f"Solvent: {row['Solvent']} Amine: {row['Amine']}" for idx, row in solvent_df.iterrows()]
    trace = go.Scatter(x=solvent_df['PCA_X'], y=solvent_df['PCA_Y'], mode='markers',text=marker_text, name=solvent)
    traces.append(trace)
PCA2D(loadings, features, traces)

## Colored by Amine

In [73]:
traces = []
for amine in amines:
    solvent_df = crystal_data[crystal_data['Amine']==amine]
    marker_text = [f"Solvent: {row['Solvent']} Amine: {row['Amine']}" for idx, row in solvent_df.iterrows()]
    trace = go.Scatter(x=solvent_df['PCA_X'], y=solvent_df['PCA_Y'], mode='markers',text=marker_text, name=amine)
    traces.append(trace)
PCA2D(loadings, features, traces)

In [74]:
traces = []
for mol_type in types:
    solvent_df = crystal_data[crystal_data['Molecule Type']==mol_type]
    marker_text = [f"Solvent: {row['Solvent']} Amine: {row['Amine']}" for idx, row in solvent_df.iterrows()]
    trace = go.Scatter(x=solvent_df['PCA_X'], y=solvent_df['PCA_Y'], mode='markers',text=marker_text, name=mol_type)
    traces.append(trace)
PCA2D(loadings, features, traces)

In [75]:
pca3d = PCA(n_components=3)
pca3d.fit(numeric_cdata)
features3d = list(numeric_cdata.columns)
pca_res3d = pca3d.transform(numeric_cdata)
loadings3d = pca3d.components_.T * np.sqrt(pca3d.explained_variance_)

In [77]:
crystal_data['PCA3D_X'] = pca_res3d[:,0]
crystal_data['PCA3D_Y'] = pca_res3d[:,1]
crystal_data['PCA3D_Z'] = pca_res3d[:,2]



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [96]:
traces = []
for solvent in solvents:
    solvent_df = crystal_data[crystal_data['Solvent']==solvent]
    marker_text = [f"Solvent: {row['Solvent']} Amine: {row['Amine']}" for idx, row in solvent_df.iterrows()]
    trace = go.Scatter3d(x=solvent_df['PCA3D_X'], y=solvent_df['PCA3D_Y'], z=solvent_df['PCA3D_Z'], mode='markers',text=marker_text, name=solvent)
    traces.append(trace)
PCA3D(loadings3d, features3d, traces)

In [98]:
traces = []
for amine in amines:
    amine_df = crystal_data[crystal_data['Amine']==amine]
    marker_text = [f"Solvent: {row['Solvent']} Amine: {row['Amine']}" for idx, row in solvent_df.iterrows()]
    trace = go.Scatter3d(x=amine_df['PCA3D_X'], y=amine_df['PCA3D_Y'], z=amine_df['PCA3D_Z'], mode='markers',text=marker_text, name=amine)
    traces.append(trace)
PCA3D(loadings3d, features3d, traces)

In [99]:
traces = []
for mol_type in types:
    amine_df = crystal_data[crystal_data['Molecule Type']==mol_type]
    marker_text = [f"Solvent: {row['Solvent']} Amine: {row['Amine']}" for idx, row in solvent_df.iterrows()]
    trace = go.Scatter3d(x=amine_df['PCA3D_X'], y=amine_df['PCA3D_Y'], z=amine_df['PCA3D_Z'], mode='markers',text=marker_text, name=mol_type)
    traces.append(trace)
PCA3D(loadings3d, features3d, traces)