# predict chart type with a random forest classifier
The Plotly Community Feed has more than a million user-generated charts, many of which include data.  I want to use this data to train a model to predict the kinds of design decisions that a human data analyst would make when looking at a table of data:
- What kind of chart to draw?
    - line
    - bar
    - scatter
    - heatmap
- What goes on the x axis?
- What goes on the y axis? 
- Are any transforms necessary? 

Conveniently, the precedent project VizML has packaged up data from the Plotly Community Feed, along with a package of attributes they calculated from cleaned and de-duplicated data.  As a shortcut, I'll start with their processed attributes. Later, we can go back and collect new data from the Plotly Community Feed to add more information about new types of data -- particularly, about chart types good for visualizing genomic data, like a circos plot.  

In [None]:
# Print the kernel's working directory; the relative paths used later
# (e.g. '../data') are resolved against it.
!pwd

### Dependencies

In [15]:
import os
import sys
from os.path import join
from pprint import pprint

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
# Put the parent of the working directory on the import path (once), so
# modules living one level above the notebook can be imported.
base_path = os.path.abspath(os.pardir)
already_importable = base_path in sys.path
if not already_importable:
    sys.path.append(base_path)


### Load Data

In [9]:
# Location of the pre-computed feature table, relative to the working directory.
data_dir_name = '../data'
data_file_name = 'features_with_9_chart_types_888k.csv'

# Read the whole CSV into memory, then summarize its columns and dtypes.
csv_path = join(data_dir_name, data_file_name)
df = pd.read_csv(csv_path)
df.info()

In [42]:
# Feature matrix: skip the first two columns (the ids) and remove the
# chart-type target column. The drop is chained and non-inplace so a new
# frame is built instead of mutating a slice of `df` (which triggers
# SettingWithCopyWarning) and the cell stays safe to re-run.
features = df.iloc[:, 2:].drop(columns='labels')
# One-hot encode the chart-type labels: one indicator column per chart type.
labels = pd.get_dummies(df['labels'])
labels.head()

Unnamed: 0,bar,box,heatmap,histogram,line,pie,sankey,scatter,table
0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0,1,0
2,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,1,0


In [43]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# identical across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)

### Define and Train Preliminary Model

In [None]:
# Baseline random forest with hand-picked settings.
# - random_state pins the bootstrap/feature sampling so results are reproducible.
# - n_jobs=-1 builds the trees on all available cores.
# NOTE(review): y_train comes from the one-hot `labels` frame, so the forest
# is fit on a 2-D target -- confirm this multi-output setup is intended
# rather than fitting on a single label column.
model = RandomForestClassifier(
    n_estimators=500,
    bootstrap=True,
    max_features='sqrt',
    random_state=42,
    n_jobs=-1,
)
# Fit on training data
model.fit(X_train, y_train)

### Optimize Model Hyperparameters

In [None]:
# Create a grid of hyperparameters for the randomized search to sample from.

# Number of trees in the random forest (evenly spaced between 200 and 2000)
n_estimators = np.linspace(start=200, stop=2000, num=5, dtype=int).tolist()
# Number of features to consider at every split
max_features = ['sqrt', 'log2']
# Maximum number of levels in a tree; None means no depth limit
max_depth = [*np.linspace(10, 110, num=11, dtype=int).tolist(), None]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Assemble the search space, keyed by estimator parameter name
random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap,
}
# pprint is imported with the other dependencies at the top of the notebook
pprint(random_grid)
{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}

In [None]:
# Use the random grid to search for the best hyperparameters.
# The base model must be a classifier (chart type is a categorical target),
# and it is the estimator class already imported at the top of the notebook.
rf = RandomForestClassifier(random_state=42)
# Random search of parameters: 100 sampled combinations, each scored with
# 3-fold cross validation, using all available cores.
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=100,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
# Fit the random search on the training split produced by train_test_split
rf_random.fit(X_train, y_train)

### Evaluate Best Model

In [34]:
# Show the dtype of every feature column without truncation.
# option_context raises display.max_rows only inside the block and restores
# the previous value afterwards; display() is needed because an expression
# that is not the last line of a cell produces no output.
with pd.option_context('display.max_rows', 1000):
    display(features.dtypes)

In [32]:
# Show the dtype of each column in `features` (pandas elides the middle rows).
# NOTE(review): the saved output lists a `labels` column even though an
# earlier cell drops it from `features`; presumably the output predates that
# drop -- re-run after a kernel restart to confirm.
features.dtypes

exists-agg-num                          float64
exists-agg-has                             bool
exists-agg-only_one                        bool
exists-agg-all                             bool
exists-agg-percentage                   float64
                                         ...   
percent_shared_words-agg-avg_abs_dev    float64
percent_shared_words-agg-med_abs_dev    float64
percent_shared_words-agg-min            float64
percent_shared_words-agg-max            float64
labels                                   object
Length: 847, dtype: object

In [37]:
# Summarize `features`: row count, column count, dtype breakdown and memory use.
features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 888484 entries, 0 to 888483
Columns: 847 entries, exists-agg-num to labels
dtypes: bool(120), float64(726), object(1)
memory usage: 4.9+ GB


In [39]:
# Find the columns of `features` whose dtype is object (i.e. strings).
# The builtin `object` is used because the `np.object` alias was deprecated
# in NumPy 1.20 and removed in 1.24, where it raises AttributeError.
filteredColumns = features.dtypes[features.dtypes == object]
# Names of the object-typed columns
objcols = list(filteredColumns.index)
print(objcols)

['labels']


In [40]:
# Inspect the raw chart-type labels. Read them from `df`: the `labels` column
# is dropped from `features` when the feature matrix is built, so
# `features.labels` fails on a fresh Restart & Run All.
df['labels']

0         scatter
1         scatter
2             bar
3         scatter
4         scatter
           ...   
888479       line
888480        bar
888481    scatter
888482       line
888483        bar
Name: labels, Length: 888484, dtype: object