In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

CHEMICAL_FEATURES = [
    'mc_CRY-chi-0-all',
    'mc_CRY-chi-1-all',
    'mc_CRY-chi-2-all',
    'mc_CRY-chi-3-all',
    'mc_CRY-Z-0-all',
    'mc_CRY-Z-1-all',
    'mc_CRY-Z-2-all',
    'mc_CRY-Z-3-all',
    'mc_CRY-I-1-all',
    'mc_CRY-I-2-all',
    'mc_CRY-I-3-all',
    'mc_CRY-T-0-all',
    'mc_CRY-T-1-all',
    'mc_CRY-T-2-all',
    'mc_CRY-T-3-all',
    'mc_CRY-S-0-all',
    'mc_CRY-S-1-all',
    'mc_CRY-S-2-all',
    'mc_CRY-S-3-all',
    'D_mc_CRY-chi-1-all',
    'D_mc_CRY-chi-2-all',
    'D_mc_CRY-chi-3-all',
    'D_mc_CRY-Z-1-all',
    'D_mc_CRY-Z-2-all',
    'D_mc_CRY-Z-3-all',
    'D_mc_CRY-T-1-all',
    'D_mc_CRY-T-2-all',
    'D_mc_CRY-T-3-all',
    'D_mc_CRY-S-1-all',
    'D_mc_CRY-S-2-all',
    'D_mc_CRY-S-3-all',
    'func-chi-0-all',
    'func-chi-1-all',
    'func-chi-2-all',
    'func-chi-3-all',
    'func-Z-0-all',
    'func-Z-1-all',
    'func-Z-2-all',
    'func-Z-3-all',
    'func-I-1-all',
    'func-I-2-all',
    'func-I-3-all',
    'func-T-0-all',
    'func-T-1-all',
    'func-T-2-all',
    'func-T-3-all',
    'func-S-0-all',
    'func-S-1-all',
    'func-S-2-all',
    'func-S-3-all',
    'func-alpha-0-all',
    'func-alpha-1-all',
    'func-alpha-2-all',
    'func-alpha-3-all',
    'D_func-chi-1-all',
    'D_func-chi-2-all',
    'D_func-chi-3-all',
    'D_func-Z-1-all',
    'D_func-Z-2-all',
    'D_func-Z-3-all',
    'D_func-T-1-all',
    'D_func-T-2-all',
    'D_func-T-3-all',
    'D_func-S-2-all',
    'D_func-S-3-all',
    'D_func-alpha-1-all',
    'D_func-alpha-2-all',
    'D_func-alpha-3-all',
    'f-lig-chi-0',
    'f-lig-chi-1',
    'f-lig-chi-2',
    'f-lig-chi-3',
    'f-lig-Z-0',
    'f-lig-Z-1',
    'f-lig-Z-2',
    'f-lig-Z-3',
    'f-lig-I-0',
    'f-lig-I-1',
    'f-lig-I-2',
    'f-lig-I-3',
    'f-lig-T-0',
    'f-lig-T-1',
    'f-lig-T-2',
    'f-lig-T-3',
    'f-lig-S-0',
    'f-lig-S-1',
    'f-lig-S-2',
    'f-lig-S-3',
    'lc-chi-0-all',
    'lc-chi-1-all',
    'lc-chi-2-all',
    'lc-chi-3-all',
    'lc-Z-0-all',
    'lc-Z-1-all',
    'lc-Z-2-all',
    'lc-Z-3-all',
    'lc-I-2-all',
    'lc-I-3-all',
    'lc-T-0-all',
    'lc-T-1-all',
    'lc-T-2-all',
    'lc-T-3-all',
    'lc-S-3-all',
    'lc-alpha-0-all',
    'lc-alpha-1-all',
    'lc-alpha-2-all',
    'lc-alpha-3-all',
    'D_lc-chi-2-all',
    'D_lc-chi-3-all',
    'D_lc-Z-1-all',
    'D_lc-Z-2-all',
    'D_lc-Z-3-all',
    'D_lc-T-1-all',
    'D_lc-T-2-all',
    'D_lc-T-3-all',
    'D_lc-alpha-1-all',
    'D_lc-alpha-2-all',
    'D_lc-alpha-3-all',
    'tertiary_amide_sum',
    'ester_sum',
    'carbonyl_sum',
    'logP_sum',
    'MR_sum',
    'aromatic_rings_sum',
    'dbonds_sum',
    'abonds_sum',
    'tertiary_amide_mean',
    'ester_mean',
    'carbonyl_mean',
    'logP_mean',
    'MR_mean',
    'aromatic_rings_mean',
    'dbonds_mean',
    'abonds_mean',
    'sum-func-chi-0-all',
    'sum-func-chi-1-all',
    'sum-func-chi-2-all',
    'sum-func-chi-3-all',
    'sum-func-Z-0-all',
    'sum-func-Z-1-all',
    'sum-func-Z-2-all',
    'sum-func-Z-3-all',
    'sum-func-I-0-all',
    'sum-func-I-1-all',
    'sum-func-I-2-all',
    'sum-func-I-3-all',
    'sum-func-T-0-all',
    'sum-func-T-1-all',
    'sum-func-T-2-all',
    'sum-func-T-3-all',
    'sum-func-S-0-all',
    'sum-func-S-1-all',
    'sum-func-S-2-all',
    'sum-func-S-3-all',
    'sum-func-alpha-0-all',
    'sum-func-alpha-1-all',
    'sum-func-alpha-2-all',
    'sum-func-alpha-3-all',
    'sum-D_func-chi-1-all',
    'sum-D_func-chi-2-all',
    'sum-D_func-chi-3-all',
    'sum-D_func-Z-1-all',
    'sum-D_func-Z-2-all',
    'sum-D_func-Z-3-all',
    'sum-D_func-T-1-all',
    'sum-D_func-T-2-all',
    'sum-D_func-T-3-all',
    'sum-D_func-S-1-all',
    'sum-D_func-S-2-all',
    'sum-D_func-S-3-all',
    'sum-D_func-alpha-1-all',
    'sum-D_func-alpha-2-all',
    'sum-D_func-alpha-3-all',
    'sum-f-lig-chi-0',
    'sum-f-lig-chi-1',
    'sum-f-lig-chi-2',
    'sum-f-lig-chi-3',
    'sum-f-lig-Z-0',
    'sum-f-lig-Z-1',
    'sum-f-lig-Z-2',
    'sum-f-lig-Z-3',
    'sum-f-lig-I-0',
    'sum-f-lig-I-1',
    'sum-f-lig-I-2',
    'sum-f-lig-I-3',
    'sum-f-lig-T-0',
    'sum-f-lig-T-1',
    'sum-f-lig-T-2',
    'sum-f-lig-T-3',
    'sum-f-lig-S-0',
    'sum-f-lig-S-1',
    'sum-f-lig-S-2',
    'sum-f-lig-S-3',
    'sum-lc-chi-0-all',
    'sum-lc-chi-1-all',
    'sum-lc-chi-2-all',
    'sum-lc-chi-3-all',
    'sum-lc-Z-0-all',
    'sum-lc-Z-1-all',
    'sum-lc-Z-2-all',
    'sum-lc-Z-3-all',
    'sum-lc-I-0-all',
    'sum-lc-I-1-all',
    'sum-lc-I-2-all',
    'sum-lc-I-3-all',
    'sum-lc-T-0-all',
    'sum-lc-T-1-all',
    'sum-lc-T-2-all',
    'sum-lc-T-3-all',
    'sum-lc-S-0-all',
    'sum-lc-S-1-all',
    'sum-lc-S-2-all',
    'sum-lc-S-3-all',
    'sum-lc-alpha-0-all',
    'sum-lc-alpha-1-all',
    'sum-lc-alpha-2-all',
    'sum-lc-alpha-3-all',
    'sum-D_lc-chi-1-all',
    'sum-D_lc-chi-2-all',
    'sum-D_lc-chi-3-all',
    'sum-D_lc-Z-1-all',
    'sum-D_lc-Z-2-all',
    'sum-D_lc-Z-3-all',
    'sum-D_lc-T-1-all',
    'sum-D_lc-T-2-all',
    'sum-D_lc-T-3-all',
    'sum-D_lc-S-1-all',
    'sum-D_lc-S-2-all',
    'sum-D_lc-S-3-all',
    'sum-D_lc-alpha-1-all',
    'sum-D_lc-alpha-2-all',
    'sum-D_lc-alpha-3-all',
    'sum-mc_CRY-chi-0-all',
    'sum-mc_CRY-chi-1-all',
    'sum-mc_CRY-chi-2-all',
    'sum-mc_CRY-chi-3-all',
    'sum-mc_CRY-Z-0-all',
    'sum-mc_CRY-Z-1-all',
    'sum-mc_CRY-Z-2-all',
    'sum-mc_CRY-Z-3-all',
    'sum-mc_CRY-I-0-all',
    'sum-mc_CRY-I-1-all',
    'sum-mc_CRY-I-2-all',
    'sum-mc_CRY-I-3-all',
    'sum-mc_CRY-T-0-all',
    'sum-mc_CRY-T-1-all',
    'sum-mc_CRY-T-2-all',
    'sum-mc_CRY-T-3-all',
    'sum-mc_CRY-S-0-all',
    'sum-mc_CRY-S-1-all',
    'sum-mc_CRY-S-2-all',
    'sum-mc_CRY-S-3-all',
    'sum-D_mc_CRY-chi-1-all',
    'sum-D_mc_CRY-chi-2-all',
    'sum-D_mc_CRY-chi-3-all',
    'sum-D_mc_CRY-Z-1-all',
    'sum-D_mc_CRY-Z-2-all',
    'sum-D_mc_CRY-Z-3-all',
    'sum-D_mc_CRY-T-1-all',
    'sum-D_mc_CRY-T-2-all',
    'sum-D_mc_CRY-T-3-all',
    'sum-D_mc_CRY-S-1-all',
    'sum-D_mc_CRY-S-2-all',
    'sum-D_mc_CRY-S-3-all',
]

In [26]:
colors2keep = ['aubergine',
 'beige',
 'black',
 'black brown',
 'black green',
 'black purple',
 'black red',
 'blackish purple',
 'blue',
 'blue violet',
 'bluish violet',
 'bright yellow',
 'brown',
 'brown red',
 'brown yellow',
 'cherry red',
 'colorless',
 'cyan',
 'dark blue',
 'dark brown',
 'dark green',
 'dark orange',
 'dark pink',
 'dark purple',
 'dark purplish red',
 'dark red',
 'dark rose',
 'dark violet',
 'dark yellow',
 'deep blue',
 'deep blue black',
 'deep brown',
 'deep purple',
 'deep red',
 'deep yellow',
 'dull dark black',
 'faint pink',
 'gold',
 'golden yellow',
 'gray',
 'green black',
 'green brown',
 'green yellow',
 'green/brown',
 'greenish yellow',
 'grey',
 'intense purple',
 'intense violet',
 'light',
 'light blue',
 'light colorless',
 'light green',
 'light orange',
 'light pink',
 'light purple',
 'light red',
 'light yellow',
 'lilac',
 'magenta',
 'navy blue',
 'orange',
 'orange brown',
 'orange red',
 'orange yellow',
 'pale green',
 'pale purple',
 'pale red',
 'pale straw',
 'pale violet',
 'pale yellow',
 'pink',
 'pink purple',
 'pink red',
 'pink violet',
 'pink/purple',
 'purple',
 'purple black',
 'purple blue',
 'purple red',
 'red',
 'red black',
 'red brown',
 'red orange',
 'red purple',
 'red violet',
 'reddish',
 'reddish brown',
 'reddish purple',
 'rose',
 'ruby red',
 'scarlet',
 'sky blue',
 'slightly pink',
 'straw yellow',
 'translucent',
 'turquoise',
 'violet',
 'violet red',
 'wheat',
 'white',
 'whiteish colorless',
 'yellow',
 'yellow green',
 'yellow orange',
 'yellowish',
 'yellowish green',
 'yellowish white',
 'yellowish yellow']

In [27]:
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit

In [28]:
df = pd.read_csv('../data/color_feat_merged.csv')

In [29]:
df_tests = pd.read_csv('../../test_set.csv')

In [30]:
len(df)

6423

In [47]:
exlcuded = []
keept = []

THRESHOLD = 0.02

for i, row in df.iterrows(): 
    keep = True
#     if not row['color_cleaned_y'] in colors2keep: 
#         keep = False
    if keep:
        for j, test in df_tests.iterrows():
            if np.sum(np.abs(row[CHEMICAL_FEATURES].values-test[CHEMICAL_FEATURES].values)) < THRESHOLD:
                keep = False 
                break 
    if keep:
        keept.append(row)
    else:
        exlcuded.append(row)

In [48]:
df_rel = pd.DataFrame(keept)

In [49]:
df = df_rel.drop_duplicates(subset=CHEMICAL_FEATURES)

In [50]:
len(df)

5316

In [51]:
def bin_column(column): 
    binned = []
    for value in column: 
        if value < 85: 
            binned.append(0)
        elif 85 <= value < 170:
            binned.append(1)
        else: 
            binned.append(2)
    return np.array(binned).reshape(-1,1)

In [52]:
r_binned = bin_column(df['r'].values)
g_binned = bin_column(df['g'].values)
b_binned = bin_column(df['b'].values)

In [53]:
mlss = MultilabelStratifiedShuffleSplit(n_splits=1, train_size=0.95, test_size=0.05, 
                                        random_state=34567)



In [54]:
for train_idx, test_idx in mlss.split(df, np.hstack([r_binned, g_binned, b_binned])):
    pass

In [55]:
df_train = df.iloc[train_idx].sample(len(train_idx))

In [56]:
df_test = df.iloc[test_idx].sample(len(test_idx))

In [41]:
len(df_train)

5040

In [42]:
len(df_test)

276

In [57]:
df_train.to_csv('../data/development_set_all.csv', index=False)

In [58]:
df_test.to_csv('../data/holdout_set_all.csv', index=False)

In [45]:
df_test

Unnamed: 0.1,Unnamed: 0,mc_CRY-chi-0-all,mc_CRY-chi-1-all,mc_CRY-chi-2-all,mc_CRY-chi-3-all,mc_CRY-Z-0-all,mc_CRY-Z-1-all,mc_CRY-Z-2-all,mc_CRY-Z-3-all,mc_CRY-I-0-all,...,sum-D_func-alpha-2-all,sum-D_func-alpha-3-all,color_cleaned_x,refcode,color_string,delta_t_seconds,r,g,b,color_cleaned_y
24,24,1.44000,33.024000,19.800000,34.87200,4096.0,4096.000000,6400.000000,4352.0,1.0,...,2.533333,-55.666667,colorless,KOBPUN,colorless,23.274056,232.0,234.0,244.0,colorless
1877,1877,2.85610,29.068000,22.950200,63.74680,2304.0,1920.000000,5760.000000,4224.0,1.0,...,416.000000,940.800000,colorless,XATLIQ,colorless,23.274056,232.0,234.0,244.0,colorless
997,997,1.03745,16.223200,10.623750,18.39175,2173.0,1875.500000,1918.000000,1547.5,1.0,...,-36.800000,-20.800000,colorless,NOHKIG,colorless,23.274056,232.0,234.0,244.0,colorless
1240,1240,1.48840,29.377600,21.496400,46.01840,4356.0,3696.000000,4026.000000,5808.0,1.0,...,80.533333,22.666667,colorless,QADWUQ,colorless,23.274056,232.0,234.0,244.0,colorless
5926,5926,2.40250,31.000000,34.435833,127.73550,625.0,1162.500000,4420.833333,4025.0,1.0,...,-12.000000,11.400000,brown,AYUJIP,brown,17.611029,91.0,48.0,5.0,brown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3690,3690,2.85610,30.104533,25.857000,51.27460,2304.0,1984.000000,1728.000000,2880.0,1.0,...,0.000000,0.000000,red,ISACIP,red,13.378720,250.0,11.0,5.0,red
2892,2892,2.40250,26.453333,21.648333,54.28100,625.0,991.666667,1533.333333,1975.0,1.0,...,0.000000,0.000000,pink,DEPXIG,pink,15.281056,234.0,80.0,212.0,pink
6119,6119,1.29960,34.519200,15.789000,32.80920,3600.0,4110.000000,1830.000000,3300.0,1.0,...,0.000000,0.000000,light purple,COTVIS,light purple,16.017467,197.5,126.5,232.5,light purple
4623,4623,3.53440,34.291200,57.528000,118.21440,729.0,1134.000000,1944.000000,2592.0,1.0,...,-12.000000,-0.300000,orange,WOZGUP,orange,22.024030,246.5,132.0,4.0,orange


In [46]:
from colour.models import RGB_to_HSV

In [15]:
RGB_to_HSV(df_train[['r', 'g', 'b']])

  S = as_float_array(delta / maximum)
  delta_R = (((maximum - R) / 6) + (delta / 2)) / delta
  delta_G = (((maximum - G) / 6) + (delta / 2)) / delta
  delta_B = (((maximum - B) / 6) + (delta / 2)) / delta
  H[np.asarray(H < 0)] += 1
  H[np.asarray(H > 1)] -= 1


array([[  3.27345309e-01,   8.74345550e-01,   1.91000000e+02],
       [  3.27345309e-01,   8.74345550e-01,   1.91000000e+02],
       [  8.33333333e-02,   9.45054945e-01,   9.10000000e+01],
       ..., 
       [  6.55660377e-01,   9.46428571e-01,   2.24000000e+02],
       [  6.55660377e-01,   9.46428571e-01,   2.24000000e+02],
       [  9.99099099e-01,   9.78835979e-01,   1.89000000e+02]])