In [1]:
import dash
import dash_core_components as dcc
import dash_html_components as html
import dash_table
import plotly.graph_objs as go
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import numpy as np
import csv
import re

The dash_core_components package is deprecated. Please replace
`import dash_core_components as dcc` with `from dash import dcc`
  import dash_core_components as dcc
The dash_html_components package is deprecated. Please replace
`import dash_html_components as html` with `from dash import html`
  import dash_html_components as html
The dash_table package is deprecated. Please replace
`import dash_table` with `from dash import dash_table`

Also, if you're using any of the table format helpers (e.g. Group), replace 
`from dash_table.Format import Group` with 
`from dash.dash_table.Format import Group`
  import dash_table


In [2]:
# Location of the raw tab-separated export, relative to this notebook.
tsv_file_path = r'../Data/CTDC_synthetic_20210825.tsv'

# The csv module requires files to be opened with newline='' so that it can do
# its own newline handling; otherwise line breaks embedded in quoted fields are
# split into separate rows.
with open(tsv_file_path, 'r', newline='') as tsv_handle:
    # Materialise every row (header included) as a list of string lists.
    tsv_data = list(csv.reader(tsv_handle, delimiter='\t'))

In [3]:
# Wrap the raw rows first so the header row can be peeled off by position.
df = pd.DataFrame(tsv_data)

# Row 0 holds the column names; the remaining rows are the records. Empty
# strings are treated as missing values, and records without a
# yearOfRegistration are discarded before the index is renumbered.
df = (
    pd.DataFrame(df.values[1:], columns=df.iloc[0])
    .reset_index(drop=True)
    .replace("", np.nan)
    .dropna(subset=['yearOfRegistration'])
    .reset_index(drop=True)
)

In [4]:
#processing all data
def avg_traffic_time(string):
    """Return the mean of every integer found in ``string``.

    Parameters
    ----------
    string : object
        Raw cell value. Only ``str`` values are parsed.

    Returns
    -------
    float or None
        The arithmetic mean of all runs of digits in ``string``; ``None`` when
        the value is not a string or contains no digits at all.
    """
    # Non-strings (NaN, None, numbers) are passed over as missing.
    if not isinstance(string, str):
        return None

    numbers = re.findall(r'\d+', string)
    if not numbers:
        # A label without any digits would otherwise divide by zero; treat it
        # as missing, exactly like a non-string value.
        return None
    return sum(map(int, numbers)) / len(numbers)

def ageBroad(string):
    """Turn an age-bracket label into a single number.

    Labels containing "+" yield the integer formed by everything before the
    final character (e.g. "48+" -> 48). Any other string is split on "--" and
    the mean of the integer parts is returned (e.g. "9--17" -> 13.0).
    Values that are not strings yield None.
    """
    if not isinstance(string, str) or pd.isnull(string):
        return None

    if "+" not in string:
        bounds = [int(part) for part in string.split("--")]
        return sum(bounds) / len(bounds)

    # Open-ended bracket: drop the trailing character and keep the number.
    return int(string[:-1])

# Design note: "gender" (Male/Female) and "majorityStatusAtExploit"
# (Adult/Minor) were considered for a 0.0/1.0 mapping but are intentionally
# left as text, since they are still easy to depict that way.

# Collapse the bracketed text columns into single numeric values.
df["traffickMonths"] = df["traffickMonths"].map(avg_traffic_time)
df["ageBroad"] = df["ageBroad"].map(ageBroad)

# Cast every remaining column to float, except the four that stay as text.
cols_to_convert = [
    col
    for col in df.columns
    if col not in ("citizenship", "CountryOfExploitation", "gender", "majorityStatusAtExploit")
]
df[cols_to_convert] = df[cols_to_convert].astype(float)

In [5]:
# Column names drive every column picker and the table header.
all_columns = df.columns.tolist()
# Selectable cluster counts: 2 through 20 inclusive.
cluster_options = [{"label": str(i), "value": i} for i in range(2, 21)]
# One option list shared by the X/Y/Z and clustering-column dropdowns.
column_options = [{"label": col, "value": col} for col in all_columns]
# Every control sits in an inline block so the pickers line up in a row.
control_style = {'display': 'inline-block', 'vertical-align': 'top', 'margin-right': '75px'}

app = dash.Dash(__name__)

app.layout = html.Div([
    html.Div([
        html.Label("Number of Clusters:"),
        dcc.Dropdown(
            id="input-k",
            options=cluster_options,
            value=2,
            clearable=True,
            multi=False,
            # A CSS length needs a unit; the bare '150' was ignored by the browser.
            style={'width': '150px'}
        )
    ], style=control_style),
    html.Div([
        html.Label("Columns for X:"),
        dcc.Dropdown(
            id="x-values",
            options=column_options,
            value=all_columns[0],  # Initial value based on first column
            clearable=True,
            style={'width': '200px'}
        ),
    ], style=control_style),
    html.Div([
        html.Label("Columns for Y:"),
        dcc.Dropdown(
            id="y-values",
            options=column_options,
            value=all_columns[1],  # Initial value based on second column
            clearable=True,
            style={'width': '200px'}
        ),
    ], style=control_style),
    html.Div([
        html.Label("Columns for Z:"),
        dcc.Dropdown(
            id="z-values",
            options=column_options,
            value=all_columns[2],  # Initial value based on third column
            clearable=True,
            style={'width': '200px'}
        ),
    ], style=control_style),
    html.Div([
        html.Label("Columns for Clustering:"),
        dcc.Dropdown(
            id="input-columns",
            options=column_options,
            value=all_columns[:5],  # Initial value based on the first five columns
            multi=True,
        ),
    ], style=control_style),
    html.Div([
        html.Label("Scale Data?"),
        dcc.Dropdown(
            id="scale-values",
            options=[{"label": "Yes", "value": "Yes"}, {"label": "No", "value": "No"}],
            value="Yes",  # Default to Yes
            clearable=True,
            style={'width': '80px'}
        ),
    ], style=control_style),
    # Interactive table; its filtered/sorted rows feed the clustering callback.
    dash_table.DataTable(
        id='datatable-interactivity',
        columns=[{"name": i, "id": i, "deletable": True, "selectable": True} for i in all_columns],
        data=df.to_dict('records'),
        editable=True,
        filter_action="native",
        sort_action="native",
        sort_mode="multi",
        column_selectable="single",
        row_selectable="multi",
        row_deletable=True,
        selected_columns=[],
        selected_rows=[],
        page_action="native",
        page_current=0,
        page_size=10,
    ),
    # Placeholder that the callback fills with the 3-D cluster plot.
    html.Div(id='datatable-interactivity-container')
])


def _empty_graph():
    """Return the blank placeholder graph used whenever a plot cannot be built."""
    return dcc.Graph(figure={})


@app.callback(
    dash.dependencies.Output('datatable-interactivity-container', "children"),
    [dash.dependencies.Input('datatable-interactivity', "derived_virtual_data"),
     dash.dependencies.Input('datatable-interactivity', "derived_virtual_selected_rows"),
     dash.dependencies.Input("input-k", "value"),
     dash.dependencies.Input("input-columns", "value"),
     dash.dependencies.Input("y-values", "value"),
     dash.dependencies.Input("x-values", "value"),
     dash.dependencies.Input("z-values", "value"),
     dash.dependencies.Input("scale-values", "value")])
def update_clusters(rows, derived_virtual_selected_rows, k, selected_columns, y, x, z, scale):
    """Cluster the table's current rows with k-means and plot them in 3-D.

    Parameters
    ----------
    rows : list of dict or None
        The table's ``derived_virtual_data`` (rows after filtering/sorting).
    derived_virtual_selected_rows : list or None
        The table's ``derived_virtual_selected_rows``; only checked for None.
    k : int or None
        Number of clusters; None when the dropdown has been cleared.
    selected_columns : list of str or None
        Columns picked in "Columns for Clustering".
    y, x, z : str or None
        Columns plotted on the Y, X and Z axes (note the Y-first order, which
        mirrors the order of the callback inputs).
    scale : str or None
        "Yes" to standardise the plotted columns before clustering.

    Returns
    -------
    dcc.Graph
        A 3-D scatter with one trace per cluster, or an empty graph when the
        inputs are incomplete or leave fewer usable rows than clusters.
    """
    # Any cleared dropdown (value None) or an empty column selection means
    # there is nothing sensible to draw yet.
    if (derived_virtual_selected_rows is None or rows is None or k is None
            or not selected_columns or None in (x, y, z)):
        return _empty_graph()

    table = pd.DataFrame(rows)

    # The axis columns are needed even when they were not picked for
    # clustering; dict.fromkeys de-duplicates while preserving order.
    needed_columns = list(dict.fromkeys([*selected_columns, x, y, z]))
    if any(col not in table.columns for col in needed_columns):
        # Happens when the table is empty or a column was deleted in the UI.
        return _empty_graph()

    # Rows with a missing value in any needed column are discarded. The copy
    # keeps the later column assignments from writing into a slice of `table`.
    features = table[needed_columns].dropna().copy()
    if len(features) < k:
        # k-means cannot form more clusters than there are samples.
        return _empty_graph()

    # Hover text shows the original (unencoded, unscaled) values of each point.
    hover_text = (
        "X: " + features[x].apply(str)
        + " Y: " + features[y].apply(str)
        + " Z: " + features[z].apply(str)
    ).to_numpy()

    # Label-encode every non-numeric column so it can be scaled and clustered.
    for col in needed_columns:
        if not pd.api.types.is_numeric_dtype(features[col]):
            features[col] = LabelEncoder().fit_transform(features[col])

    # NOTE(review): clustering is performed on the three plotted axes only, as
    # in the original implementation; the "Columns for Clustering" selection
    # only influences which rows survive the dropna above. Confirm whether
    # k-means was meant to run on all selected columns instead.
    points = features[[x, y, z]].to_numpy(dtype=float)

    # StandardScaler works column by column, so scaling just the plotted
    # columns gives the same values as scaling the whole frame first.
    if scale == "Yes":
        points = StandardScaler().fit_transform(points)

    # n_init=10 pins the number of restarts that init="random" has always used
    # and avoids the default-change warning in newer scikit-learn releases.
    kmeans = KMeans(n_clusters=k, random_state=56, init="random", n_init=10)
    labels = kmeans.fit_predict(points)

    colors = ['red', 'blue', 'green', 'orange', 'purple', 'cyan', 'magenta', 'yellow', 'brown', 'pink', 'gray',
              'black', 'lightblue', 'lime', 'olive', 'teal', 'navy', 'maroon', 'aqua', 'silver', 'indigo', 'gold',
              'coral', 'violet', 'turquoise', 'salmon', 'khaki', 'orchid', 'plum', 'slategray', 'peru', 'sienna',
              'rosybrown', 'thistle', 'chartreuse', 'crimson']

    # One 3-D scatter trace per cluster.
    fig_scatter = go.Figure()
    for i in range(k):
        in_cluster = labels == i
        fig_scatter.add_trace(go.Scatter3d(
            x=points[in_cluster, 0],
            y=points[in_cluster, 1],
            z=points[in_cluster, 2],
            # Restrict the hover text to this cluster's rows so each label
            # lines up with its own point.
            text=hover_text[in_cluster],
            mode='markers',
            marker=dict(
                size=5,
                color=colors[i % len(colors)],
                line=dict(width=0.5, color='black')
            ),
            name='Cluster ' + str(i + 1)
        ))
    fig_scatter.update_layout(
        title='Clusters by KMeans',
        scene=dict(
            xaxis=dict(title=x),
            yaxis=dict(title=y),
            zaxis=dict(title=z)
        ))
    return dcc.Graph(
        figure=fig_scatter,
        # 'vw' is the viewport-width unit; the previous '100wh' is not valid CSS.
        style={'height': '100vh', 'width': '100vw'}
    )

# Start the Dash server only when this module is the entry point (not on import).
# NOTE(review): debug=True is presumably meant for local development only —
# confirm before deploying anywhere shared.
if __name__ == '__main__':
    app.run_server(debug=True)