In [1]:
import pandas as pd
import numpy as np
import altair as alt
import ast

This notebook processes the raw labels CSV file into a clean CSV for model training.  
Only the N most common genes are selected, to reduce the difficulty of training.

In [2]:
# Load the raw extended labels table that the rest of the notebook filters down.
labels_extended = pd.read_csv(filepath_or_buffer='data/labels_extended.csv')

In [3]:
# Column-wise totals over every integer-typed column, one row per column.
# Assumes the integer columns are exactly the per-gene indicator columns
# (273 genes in total).
genes_count = (
    labels_extended
    .select_dtypes('int')
    .sum(axis='index')
    .to_frame(name='genes_count')
    .reset_index()  # former column labels (gene names) land in an 'index' column
)
genes_count

Unnamed: 0,index,genes_count
0,Red Axanthic,140
1,Jolliff Tiger,49
2,Orange Belly,28
3,Black Pastel,3980
4,Mosaic,14
...,...,...
268,Woma,188
269,Axanthic (Jolliff),8
270,Jedi,42
271,Sauce,7


In [4]:
# Number of most frequent genes to keep; used below to build `top_genes`.
n_genes_select = 30

In [5]:
# Preview the `n_genes_select` most frequent genes, highest count first.
genes_count.sort_values(by='genes_count', ascending=False).iloc[:n_genes_select]

Unnamed: 0,index,genes_count
123,Pastel,20384
261,Yellow Belly,14071
29,Enchi,11883
37,Clown,10533
147,Leopard,8510
120,Piebald,8093
160,Orange Dream,8030
181,Fire,7243
157,Mojave,5582
158,Pinstripe,4633


In [6]:
# Names of the `n_genes_select` most frequent genes, ordered by descending count.
top_genes = (
    genes_count
    .sort_values(by='genes_count', ascending=False)
    .iloc[:n_genes_select]['index']  # 'index' holds the gene names after reset_index()
    .tolist()
)
top_genes

['Pastel',
 'Yellow Belly',
 'Enchi',
 'Clown',
 'Leopard',
 'Piebald',
 'Orange Dream',
 'Fire',
 'Mojave',
 'Pinstripe',
 'Banana',
 'Normal',
 'Black Pastel',
 'Lesser',
 'Spotnose',
 'Cinnamon',
 'GHI',
 'Hypo',
 'Spider',
 'Super Pastel',
 'Desert Ghost',
 'Black Head',
 'Vanilla',
 'Red Stripe',
 'Asphalt',
 'Gravel',
 'Butter',
 'Calico',
 'Albino',
 'Chocolate']

In [7]:
# Per-row count of genes that belong to `top_genes`.
# The float32 cast keeps this helper column out of later `select_dtypes('int')`
# selections, so it is not itself summed as if it were a gene column.
labels_extended['selected_gene_count'] = (
    labels_extended
    .filter(items=top_genes)
    .sum(axis='columns')
    .astype('float32')
)

# Per-row count over all integer-typed columns (presumably every gene flag).
# Must run after the assignment above: that column is float32 and is therefore
# excluded from this integer-only selection.
labels_extended['total_gene_count'] = (
    labels_extended
    .select_dtypes('int')
    .sum(axis='columns')
    .astype('float32')
)

In [8]:
# Keep only the rows whose genes all belong to `top_genes`: a row qualifies when
# its total gene count equals its count among the selected genes. Then reduce
# the frame to the first 7 columns of `labels_extended` (presumably the listing
# metadata, `index` through `url`) followed by the top-gene columns.
selected_gene_df = (
    labels_extended
    .loc[lambda frame: frame['total_gene_count'] == frame['selected_gene_count']]
    .filter(items=[*labels_extended.columns[:7], *top_genes])
)
selected_gene_df

Unnamed: 0,index,genes,sex,origin,price,birth,url,Pastel,Yellow Belly,Enchi,...,Desert Ghost,Black Head,Vanilla,Red Stripe,Asphalt,Gravel,Butter,Calico,Albino,Chocolate
0,21000-0,"['Pastel', 'Desert Ghost']",male,Self Produced,500.0,15th June 2022,https://www.morphmarket.com/us/c/reptiles/pyth...,1,0,0,...,1,0,0,0,0,0,0,0,0,0
1,21000-1,"['Pastel', 'Desert Ghost']",male,Self Produced,500.0,15th June 2022,https://www.morphmarket.com/us/c/reptiles/pyth...,1,0,0,...,1,0,0,0,0,0,0,0,0,0
2,21000-2,"['Pastel', 'Desert Ghost']",male,Self Produced,500.0,15th June 2022,https://www.morphmarket.com/us/c/reptiles/pyth...,1,0,0,...,1,0,0,0,0,0,0,0,0,0
6,21002-0,"['Pastel', 'Leopard', 'Piebald']",male,Self Produced,450.0,30th October 2023,https://www.morphmarket.com/us/c/reptiles/pyth...,1,0,0,...,0,0,0,0,0,0,0,0,0,0
7,21003-0,"['Red Stripe', 'Yellow Belly']",female,Self Produced,300.0,2022,https://www.morphmarket.com/us/c/reptiles/pyth...,0,1,0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76941,15990-1,['Pastel'],male,Self Produced,150.0,2022,https://www.morphmarket.com/us/c/reptiles/pyth...,1,0,0,...,0,0,0,0,0,0,0,0,0,0
76942,15991-0,"['Leopard', 'Pastel', 'Orange Dream', 'Piebald']",male,Self Produced,600.0,,https://www.morphmarket.com/us/c/reptiles/pyth...,1,0,0,...,0,0,0,0,0,0,0,0,0,0
76943,15992-0,"['Leopard', 'Orange Dream', 'Clown']",male,Self Produced,750.0,16th September 2023,https://www.morphmarket.com/us/c/reptiles/pyth...,0,0,0,...,0,0,0,0,0,0,0,0,0,0
76944,15993-0,['Piebald'],female,Self Produced,249.0,2023,https://www.morphmarket.com/us/c/reptiles/pyth...,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Persist the filtered table for model training; the row index is not written.
selected_gene_df.to_csv(path_or_buf='data/selected_gene_df.csv', index=False)

In [10]:
# Display the filtered table again as a final check of what was exported.
selected_gene_df

Unnamed: 0,index,genes,sex,origin,price,birth,url,Pastel,Yellow Belly,Enchi,...,Desert Ghost,Black Head,Vanilla,Red Stripe,Asphalt,Gravel,Butter,Calico,Albino,Chocolate
0,21000-0,"['Pastel', 'Desert Ghost']",male,Self Produced,500.0,15th June 2022,https://www.morphmarket.com/us/c/reptiles/pyth...,1,0,0,...,1,0,0,0,0,0,0,0,0,0
1,21000-1,"['Pastel', 'Desert Ghost']",male,Self Produced,500.0,15th June 2022,https://www.morphmarket.com/us/c/reptiles/pyth...,1,0,0,...,1,0,0,0,0,0,0,0,0,0
2,21000-2,"['Pastel', 'Desert Ghost']",male,Self Produced,500.0,15th June 2022,https://www.morphmarket.com/us/c/reptiles/pyth...,1,0,0,...,1,0,0,0,0,0,0,0,0,0
6,21002-0,"['Pastel', 'Leopard', 'Piebald']",male,Self Produced,450.0,30th October 2023,https://www.morphmarket.com/us/c/reptiles/pyth...,1,0,0,...,0,0,0,0,0,0,0,0,0,0
7,21003-0,"['Red Stripe', 'Yellow Belly']",female,Self Produced,300.0,2022,https://www.morphmarket.com/us/c/reptiles/pyth...,0,1,0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76941,15990-1,['Pastel'],male,Self Produced,150.0,2022,https://www.morphmarket.com/us/c/reptiles/pyth...,1,0,0,...,0,0,0,0,0,0,0,0,0,0
76942,15991-0,"['Leopard', 'Pastel', 'Orange Dream', 'Piebald']",male,Self Produced,600.0,,https://www.morphmarket.com/us/c/reptiles/pyth...,1,0,0,...,0,0,0,0,0,0,0,0,0,0
76943,15992-0,"['Leopard', 'Orange Dream', 'Clown']",male,Self Produced,750.0,16th September 2023,https://www.morphmarket.com/us/c/reptiles/pyth...,0,0,0,...,0,0,0,0,0,0,0,0,0,0
76944,15993-0,['Piebald'],female,Self Produced,249.0,2023,https://www.morphmarket.com/us/c/reptiles/pyth...,0,0,0,...,0,0,0,0,0,0,0,0,0,0
