# Permalink
# Switch branches/tags
# Nothing to show
# Find file Copy path
# Fetching contributors…
# Cannot retrieve contributors at this time
# 445 lines (346 sloc) 12.3 KB
#
# Collective Knowledge (Unified modeling using python sklearn)
#
# See CK LICENSE.txt for licensing details
# See CK COPYRIGHT.txt for copyright details
#
# Developer: Grigori Fursin
#
# Module-level state; every value below is filled in by CK
cfg = dict()   # meta description of this module
work = dict()  # temporary data
ck = None      # initialized CK kernel
# Local settings
##############################################################################
# Initialize module
def init(i):
    """
    Module initialization hook; performs no work and always succeeds.

    Input:  {}

    Output: {
              return  - return code = 0, if successful
                                    > 0, if error
              (error) - error text if return > 0
            }
    """

    status = {'return': 0}
    return status
##############################################################################
# build model
def build(i):
    """
    Train a scikit-learn decision tree and write the model plus its artifacts.

    Input:  {
              model_name            - model name: 'dtc' (DecisionTreeClassifier)
                                      or 'dtr' (DecisionTreeRegressor)
              (model_file)          - base name for output files; if empty (or if
                                      'web' is 'yes') the pickled model is written
                                      to a generated tmp file
              (model_params)        - dict with model params
                                      ('max_depth' and 'max_leaf_nodes' are used)
              features_table        - features table (in experiment module format)
              features_keys         - features flat keys
              (features_desc)       - dict: feature key -> {'name': ...}
              characteristics_table - characteristics table (in experiment module format)
              characteristics_keys  - characteristics flat keys
              (keep_temp_files)     - if 'yes', keep temp files (not used by this function)
              (caption)             - add caption to graphs, if needed
              (out)                 - if 'con', print progress information
              (web)                 - if 'yes', force the pickled model into a tmp file
            }

    Output: {
              return       - return code = 0, if successful
                                         > 0, if error
              (error)      - error text if return > 0

              model_file   - output model file (pickled model object)
            }

    Side files are named <base>.model.dot, <base>.modelx.dot, <base>.model.pdf,
    <base>.model.png, <base>.model.ft.txt, <base>.model.inp.ft.json,
    <base>.model.inp.char.json and <base>.model.decision_tree.json, where <base>
    is model_file or, when model_file is empty, the generated tmp file name.
    """

    import tempfile
    import os
    import pickle
    import shutil

    o = i.get('out', '')

    mn = i['model_name']
    mp = i.get('model_params', {})

    cap = i.get('caption', '')

    mf = i.get('model_file', '')

    ftable = i['features_table']
    fkeys = i['features_keys']
    fdesc = i.get('features_desc', {})

    ctable = i['characteristics_table']
    ckeys = i['characteristics_keys']  # required key; value itself is not used yet

    lftable = len(ftable)
    lctable = len(ctable)

    # Prepare (temporary) out model file.
    # model_file is optional, so it must not be indexed directly: fall back to
    # the generated tmp name as the base for all side files when it is empty.
    use_tmp = (mf == '' or i.get('web', '') == 'yes')
    fn2 = mf
    if use_tmp:
        fd2, fn2 = tempfile.mkstemp(suffix='.tmp', prefix='ck-')
        os.close(fd2)
        os.remove(fn2)

    base = mf if mf != '' else fn2

    mf1 = base + '.model.obj'
    mf2 = base + '.model.dot'
    mf2x = base + '.modelx.dot'
    mf3 = base + '.model.pdf'
    mf3x = base + '.model.png'
    mf4 = base + '.model.ft.txt'
    mf5 = base + '.model.inp.ft.json'
    mf6 = base + '.model.inp.char.json'
    mf7 = base + '.model.decision_tree.json'

    if not use_tmp:
        fn2 = mf1

    # Enumerate features and substitute in file
    s = ''
    for fk, fx in enumerate(fkeys):
        uu1 = 'X[' + str(fk) + ']'
        uu2 = fdesc.get(fx, {}).get('name', '')
        s += uu1 + ' ' + fx + ' (' + uu2 + ')'
        s += '\n'

    if s != '':
        r = ck.save_text_file({'text_file': mf4, 'string': s})
        if r['return'] > 0: return r

        if o == 'con':
            ck.out('*******************************************************')
            ck.out('Feature key convertion:')
            ck.out('')
            ck.out(s)

    if lftable != lctable:
        return {'return':1, 'error':'length of feature table ('+str(lftable)+') is not the same as length of characteristics table ('+str(lctable)+')'}

    # Convert categorical features to floats
    r = convert_categories_to_floats({'table': ftable})
    if r['return'] > 0: return r

    fconv = r['conv']
    ftable1 = r['table']

    if len(ftable) > 0 and len(fconv) > 0 and o == 'con':
        import operator

        ck.out('')
        ck.out('Converting categories to floats:')
        ck.out('')

        fll = len(ftable[0])
        for fi in range(0, fll):
            sfi = str(fi)
            x = fconv.get(sfi, {})
            if len(x) > 0:
                ck.out(' Dimension: ' + sfi)

                # List categories ordered by the float assigned to them
                for y in sorted(x.items(), key=operator.itemgetter(1)):
                    yk = y[1]
                    yv = y[0]
                    ck.out(' ' + str(yk) + ' -> ' + str(yv))

        ck.out('')

    # Remove old files (mf1 is also the model output file in the non-tmp case)
    for old in (mf1, mf2, mf2x, mf3, mf3x, mf5, mf6, mf7):
        if os.path.isfile(old): os.remove(old)

    #############################################################
    if mn == 'dtc' or mn == 'dtr':
        # http://scikit-learn.org/stable/modules/tree.html
        # http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html

        from sklearn import tree

        pmd = mp.get('max_depth', None)
        pmln = mp.get('max_leaf_nodes', None)

        if mn == 'dtc':
            clf = tree.DecisionTreeClassifier(max_depth=pmd, max_leaf_nodes=pmln)
        else:
            clf = tree.DecisionTreeRegressor(max_depth=pmd, max_leaf_nodes=pmln)

        clf = clf.fit(ftable1, ctable)

        # Keep the exact inputs used for training next to the model
        r = ck.save_json_to_file({'json_file': mf5, 'dict': ftable1})
        if r['return'] > 0: return r

        r = ck.save_json_to_file({'json_file': mf6, 'dict': ctable})
        if r['return'] > 0: return r

        # Save as Graphviz dot
        # dot -Tpdf iris.dot -o iris.pdf.
        with open(mf2, 'w') as f:
            tree.export_graphviz(clf, out_file=f)

        # Keep an untouched copy: mf2 is rewritten below with feature names
        shutil.copyfile(mf2, mf2x)

        # Convert to decision tree
        r = ck.access({'action': 'convert_to_decision_tree',
                       'module_uoa': cfg['module_deps']['graph.dot'],
                       'input_file': mf2,
                       'caption': cap,
                       'output_file': mf7})
        if r['return'] > 0: return r

        # Substitute features with names
        for fk, fx in enumerate(fkeys):
            uu1 = 'X[' + str(fk) + ']'
            uu2 = fdesc.get(fx, {}).get('name', '')
            r = ck.substitute_str_in_file({'filename': mf2, 'string1': uu1, 'string2': 'if ' + uu1 + ' (' + uu2 + ')'})
            if r['return'] > 0: return r

        # NOTE(review): the commands below are built by plain concatenation and
        # run through the shell, so paths with spaces or shell metacharacters
        # will break them. The exit status of 'dot' is ignored (best effort).
        # NOTE(review): all other console output here is gated on out=='con';
        # the 'out' value tested below may be a typo -- confirm.

        # Save as pdf
        s = 'dot -Tpdf ' + mf2 + ' -o ' + mf3
        if o == 'out':
            ck.out('')
            ck.out('Executing command: ' + s)
            ck.out('')
        os.system(s)

        # Save as png
        s = 'dot -Tpng ' + mf2 + ' -o ' + mf3x
        if o == 'out':
            ck.out('')
            ck.out('Executing command: ' + s)
            ck.out('')
        os.system(s)

    else:
        return {'return':1, 'error':'model name '+mn+' is not found in module model.sklearn'}

    # Dump object
    with open(fn2, 'wb') as f:
        pickle.dump(clf, f, pickle.HIGHEST_PROTOCOL)

    return {'return': 0, 'model_file': fn2}
##############################################################################
# validate model
def validate(i):
    """
    Run a previously built decision tree model on a features table.

    Input:  {
              model_name        - model name: 'dtc' or 'dtr'
                                  (any other name returns an error)
              model_file        - base name of model files; the pickled model is
                                  read from <model_file>.model.obj
              features_table    - features table (in experiment module format)
              (keep_temp_files) - if 'yes', keep temp files (not used by this function)
            }

    Output: {
              return           - return code = 0, if successful
                                             > 0, if error
              (error)          - error text if return > 0

              prediction_table - experiment table with predictions
                                 (one single-element row per input row)
              label_table      - one single-element row per input row holding the
                                 matching label from
                                 <model_file>.model.decision_tree.json;
                                 empty if that file does not exist
            }
    """

    import os
    import pickle

    mn = i['model_name']
    mf = i['model_file']
    mf1 = mf + '.model.obj'
    mf7 = mf + '.model.decision_tree.json'

    ftable = i['features_table']

    # Convert categorical features to floats
    # NOTE(review): the category -> float mapping is rebuilt here from this table
    # alone (codes follow order of first appearance), so it is only guaranteed to
    # match the mapping used in build() if categories appear in the same order.
    r = convert_categories_to_floats({'table': ftable})
    if r['return'] > 0: return r

    ftable1 = r['table']

    lt = []

    # Load model object
    # NOTE: pickle can execute arbitrary code while loading; only use model
    # files from a trusted source.
    with open(mf1, 'rb') as f:
        clf = pickle.load(f)

    #############################################################
    if mn == 'dtc' or mn == 'dtr':
        pr = clf.predict(ftable1)

        # Check if CK decision tree file exists
        if os.path.isfile(mf7):
            r = ck.load_json_file({'json_file': mf7})
            if r['return'] > 0: return r
            labels = r['dict']

            for ft in ftable1:
                matched = None
                found = False

                for label in labels:
                    dd = labels[label]['decision']

                    # 'decision' is a flat list of (prefix, condition) pairs:
                    # an empty prefix requires the condition to hold, any other
                    # prefix requires it not to hold.
                    skip = False
                    for k in range(0, len(dd), 2):
                        x = dd[k]
                        y = dd[k + 1]

                        yc = y['comparison']
                        yf = int(y['feature'])
                        yv = float(y['value'])

                        if yc != '<=': return {'return':1, 'error':'not yet supported condition '+yc+' in decision tree'}

                        holds = ft[yf] <= yv
                        if x == '':
                            if not holds: skip = True
                        else:
                            if holds: skip = True

                        if skip: break

                    if not skip:
                        # First label whose whole decision path matches wins
                        found = True
                        matched = label
                        break

                if not found:
                    return {'return':1, 'error':'decision tree is incomplete'}

                lt.append(matched)

    else:
        return {'return':1, 'error':'model name '+mn+' is not found in module model.sklearn'}

    # Wrap every value into its own row (experiment table format)
    pr1 = [[q] for q in pr]
    lt1 = [[q] for q in lt]

    return {'return': 0, 'prediction_table': pr1, 'label_table': lt1}
##############################################################################
# Convert categorical values to floats
def convert_categories_to_floats(i):
    """
    Replace string cells of a table with per-column float codes.

    Input:  {
              table - table (list of rows); the width is taken from the first row
            }

    Output: {
              return  - return code = 0, if successful
                                    > 0, if error
              (error) - error text if return > 0

              table   - updated table (new list of new rows)
              conv    - conversion table: str(column index) -> {string: float code}
              conv1   - conversion numbers: str(column index) -> next unused code
            }

    Codes start at 0.0 in each column and grow by 1 in order of first
    appearance. Non-string cells are copied unchanged.
    """

    import sys

    # Exact-type match on purpose: Python 2 additionally treats 'unicode' as text
    if sys.version_info[0] < 3:
        text_types = (str, unicode)  # noqa: F821
    else:
        text_types = (str,)

    rows = i['table']

    width = 0  # length of feature vector
    if len(rows) > 0:
        width = len(rows[0])

    codes = {}
    next_code = {}
    converted = []

    if width > 0:
        for row in rows:
            new_row = []

            for col in range(width):
                key = str(col)
                cell = row[col]

                if type(cell) in text_types:
                    # First string seen in this column: start numbering at 0.0
                    if key not in codes:
                        codes[key] = {}
                        next_code[key] = 0.0

                    column_codes = codes[key]
                    if cell not in column_codes:
                        column_codes[cell] = next_code[key]
                        next_code[key] = next_code[key] + 1

                    cell = column_codes[cell]

                new_row.append(cell)

            converted.append(new_row)

    return {'return': 0, 'table': converted, 'conv': codes, 'conv1': next_code}