# Double Helix Dataset

This dataset is designed to demonstrate the power of non-linear methods and in particular the power of neural networks. The dataset has three classes: two helices and one bar. The bar runs through the center of the two helices. The helices are sampled at evenly spaced points along the x axis with Gaussian noise added to their y and z coordinates; the bar's x coordinates are drawn from a uniform distribution, with a small amount of Gaussian noise in y and z.

## Generating the Data

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline

import plotly.express as px
import plotly.express as px

ModuleNotFoundError: No module named 'plotly'

In [None]:
def make_helix(n_samples=1000, noise=.5, bar_noise=.1, random_state=None):
    """Generate two noisy helices and a bar running along their shared axis.

    Both helices are sampled at ``n_samples`` evenly spaced x positions on
    ``[0, 8*pi]``. The first follows ``(cos(x), sin(x))`` in the y/z plane
    with Gaussian noise subtracted; the second is the negation of the same
    curve with its own independent noise. The bar is a cloud of points whose
    x is uniform over the helix's x range and whose y/z are Gaussian around 0.

    Parameters
    ----------
    n_samples : int, default 1000
        Number of points generated for each of the three classes.
    noise : float, default 0.5
        Standard deviation of the Gaussian noise on the helices' y and z.
    bar_noise : float, default 0.1
        Standard deviation of the bar's y and z coordinates.
    random_state : int or None, default None
        Seed for a private ``numpy.random.RandomState``. ``None`` draws from
        NumPy's global random state, so ``np.random.seed`` still applies.

    Returns
    -------
    helix, helix2, bar : numpy.ndarray
        Three arrays of shape ``(n_samples, 3)`` with columns x, y, z.
    """
    # The legacy module-level API and RandomState expose the same methods, so
    # either can stand in as the generator.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Helix
    x = np.linspace(0, 8 * np.pi, n_samples)
    y = np.cos(x) - rng.normal(loc=0, scale=noise, size=n_samples)
    z = np.sin(x) - rng.normal(loc=0, scale=noise, size=n_samples)
    helix = np.column_stack((x, y, z))

    # Helix 2: same x positions, negated curve, independent noise draws.
    hy = np.negative(np.cos(x) - rng.normal(loc=0, scale=noise, size=n_samples))
    hz = np.negative(np.sin(x) - rng.normal(loc=0, scale=noise, size=n_samples))
    helix2 = np.column_stack((x, hy, hz))

    # Bar: bound x by the helix's x range only. Taking the max over the whole
    # (x, y, z) array would let a large y/z value stretch or shrink the bar.
    x_high = x.max() if n_samples else 0.0
    bx = rng.uniform(low=0, high=x_high, size=n_samples)
    by = rng.normal(loc=0, scale=bar_noise, size=n_samples)
    bz = rng.normal(loc=0, scale=bar_noise, size=n_samples)
    bar = np.column_stack((bx, by, bz))

    return helix, helix2, bar

In [None]:
# Draw 1000 points for each class: first helix, second helix, bar.
X, h, bar = make_helix(n_samples=1000)

## Visualizing the Data

In [None]:
# Wrap each point cloud in a DataFrame and tag it with its class id
# (1 = first helix, 2 = second helix, 0 = bar), then stack them row-wise.
helix = pd.DataFrame(X, columns=['x', 'y', 'z']).assign(target=1)
helix2 = pd.DataFrame(h, columns=['x', 'y', 'z']).assign(target=2)
background = pd.DataFrame(bar, columns=['x', 'y', 'z']).assign(target=0)

df = pd.concat([helix, helix2, background])

In [None]:
# 3-D scatter of all three classes, coloured and marked by class id.
fig = px.scatter_3d(data_frame=df, x='x', y='y', z='z', symbol='target', color='target')
fig.show()

In [None]:
# Side view: projection onto the x/y plane.
fig = px.scatter(data_frame=df, x='x', y='y', symbol='target', color='target')
fig.show()

In [None]:
# End-on view: projection onto the y/z plane, looking down the x axis.
fig = px.scatter(data_frame=df, x='y', y='z', symbol='target', color='target')
fig.show()

## Machine Learning on the Helix

Generally speaking, linear methods will fail on this dataset. Tree-based methods should work well, depending on how the data is split into training and testing sets. 

In [None]:
from sklearn.model_selection import train_test_split

# Random split of the coordinates vs. class labels. test_size=.8 holds out
# 80% of the rows for testing, so the models train on the remaining 20%.
X_train, X_test, y_train, y_test = train_test_split(
    df.loc[:, ['x', 'y', 'z']],
    df.loc[:, 'target'],
    test_size=.8,
)

### Logistic Regression


In [None]:
from sklearn.linear_model import LogisticRegressionCV

# Multinomial logistic regression with 5-fold cross-validation; fit() returns
# the estimator itself, so construction and fitting can be chained.
lr = LogisticRegressionCV(multi_class='multinomial', cv=5).fit(X_train, y_train)
lr.score(X_test, y_test)  # Author's note: basically a random guess

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# A very shallow tree: at most two levels of splits.
dtc = DecisionTreeClassifier(max_depth=2).fit(X_train, y_train)

dtc.score(X_test, y_test)  # Author's note: it's separating the helices from the bar

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# 10,000 depth-limited trees are slow to build one at a time; n_jobs=-1 lets
# scikit-learn fit and score them on all available CPU cores.
clf = RandomForestClassifier(n_estimators=10000, max_depth=10, n_jobs=-1)

clf.fit(X_train, y_train)
clf.score(X_test, y_test)  # Author's note: good results :)

## Neural Network
Fairly shallow / simple

In [None]:
import tensorflow.keras as keras
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential

import wandb
from wandb.keras import WandbCallback

# Start a Weights & Biases run for the Keras callback used during fit().
wandb.init(project="doublehelixbar")

# One-hot encode the integer class ids to match the 3-unit softmax output.
y_train_nn = keras.utils.to_categorical(y_train, num_classes=3)
y_test_nn = keras.utils.to_categorical(y_test, num_classes=3)

# Ten 75-unit ReLU layers (the first declares the 3 input features),
# followed by a 3-way softmax classifier.
model = Sequential(
    [Dense(75, activation='relu', input_dim=3)]
    + [Dense(75, activation='relu') for _ in range(9)]
    + [Dense(3, activation='softmax')]
)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(
    X_train,
    y_train_nn,
    epochs=2000,
    validation_data=(X_test, y_test_nn),
    shuffle=True,
    verbose=False,
    callbacks=[WandbCallback()],
)

model.evaluate(X_test, y_test_nn)

## Machine Learning As Time Series

Here we split the helix data along the x axis instead of using a random sample. This increases the difficulty of generalizing the model to unseen data. 

In [None]:
# Order every point by its position along the x axis.
df = df.sort_values('x')

In [None]:
# Replace the existing index with a fresh 0..n-1 range, discarding the old labels.
df = df.reset_index(drop=True)

In [None]:
# Positional hold-out: the first 80% of rows train and the last 20% test.
# With df sorted by x, the test set is the far end of the x axis.
# The cut is derived from len(df) instead of a hard-coded row number, so the
# 80/20 ratio survives a change in sample count (it is 2400 for 3000 rows).
split_at = len(df) * 4 // 5
train = df.iloc[:split_at]
test = df.iloc[split_at:]

features = ['x', 'y', 'z']
X_train, y_train = train[features], train['target']
X_test, y_test = test[features], test['target']

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegressionCV

# Same multinomial logistic regression with 5-fold cross-validation, now
# fitted on the current X_train / y_train.
lr = LogisticRegressionCV(multi_class='multinomial', cv=5).fit(X_train, y_train)
lr.score(X_test, y_test)  # Author's note: basically a random guess

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Depth-2 tree fitted on the current X_train / y_train.
dtc = DecisionTreeClassifier(max_depth=2).fit(X_train, y_train)

dtc.score(X_test, y_test)  # Author's note: it's separating the helices from the bar

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# 10,000 depth-limited trees are slow to build one at a time; n_jobs=-1 lets
# scikit-learn fit and score them on all available CPU cores.
clf = RandomForestClassifier(n_estimators=10000, max_depth=10, n_jobs=-1)

clf.fit(X_train, y_train)
clf.score(X_test, y_test)  # Author's note: the struggle is real

### Neural Network

In [None]:
# Fresh Weights & Biases run for this second training session.
wandb.init(project="doublehelixbar")

# One-hot encode the integer class ids to match the 3-unit softmax output.
y_train_nn = keras.utils.to_categorical(y_train, num_classes=3)
y_test_nn = keras.utils.to_categorical(y_test, num_classes=3)

# Ten 75-unit ReLU layers (the first declares the 3 input features),
# followed by a 3-way softmax classifier.
model = Sequential(
    [Dense(75, activation='relu', input_dim=3)]
    + [Dense(75, activation='relu') for _ in range(9)]
    + [Dense(3, activation='softmax')]
)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(
    X_train,
    y_train_nn,
    epochs=2000,
    validation_data=(X_test, y_test_nn),
    shuffle=True,
    verbose=False,
    callbacks=[WandbCallback()],
)

model.evaluate(X_test, y_test_nn)

## Saving the Data

In [None]:
# Write the rows as bare values: no column header line and no index column.
df.to_csv(path_or_buf='jcs_challenge.csv', index=False, header=False)