In [1]:
import numpy as np

In [2]:
# The following pipeline was taken from
#https://medium.com/towards-data-science/machine-learning-nlp-text-classification-using-scikit-learn-python-and-nltk-c52b92a7c73a
# a analogous, but more qualitative text is
#https://medium.com/moosend-engineering-data-science/how-to-build-a-machine-learning-industry-classifier-5d19156d692f
#
def read_columns(config,parameter):
    '''Read and parse column names from config'''
    sections = list(config.keys())
    sections.sort()
    return [ config[s].get(parameter) for s in sections ]


def read_columns_name(config):
    import re
    name_columns = read_columns(config,'columns')
    out = []
    for columns in name_columns:
        blk = []
        for i,column in enumerate(columns):
            clean_column = re.sub('[^0-9a-zA-Z\+\-\/\*\.]',' ',column).strip()
            blk.append(clean_column)
        out.append(blk)
    return out

def read_columns_description(config):
    import re
    import string
    desc_columns = read_columns(config,'descriptions')
    out = []
    for columns in desc_columns:
        blk = []
        for i,column in enumerate(columns):
            try:
                clean_column = re.sub('[^0-9a-zA-Z ]','',column).strip()
#                 clean_column = re.sub(string.punctuation,'',column).strip()
            except:
                print(i,column)
                clean_column = ''
            blk.append(clean_column)
        out.append(blk)
    return out

def read_columns_ucd(config):
    ucd_columns = read_columns(config,'ucds')
    out = []
    for columns in ucd_columns:
        blk = []
        for i,column in enumerate(columns):
            primary_ucd = column.split(';')[0]
            blk.append(primary_ucd)
#             blk.append(column)
        out.append(blk)
    return out


In [3]:
import json
with open('optical/CATALOGS.json','r') as f:
    config = json.load(f)

ucd_columns = read_columns_ucd(config)

In [4]:
target_ucd = [ u for ucds in ucd_columns for u in ucds ]
print(len(target_ucd))

d_ucd2id = { u:i for i,u in enumerate(set(target_ucd)) }
d_id2ucd = { d_ucd2id[u]:u for u in d_ucd2id }

# target = map(lambda u:d_ucd2id[u], target_label)
# from numpy import array
# target = array(list(target))
target_id = [ d_ucd2id[u] for u in target_ucd ]

1855


## Classifying the column names

In [5]:
name_columns = read_columns_name(config)
assert len(name_columns) == len(ucd_columns)

data = [ n for names in name_columns for n in names ]
np.array(data).shape

(1855,)

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Naive Bayes
from sklearn.naive_bayes import MultinomialNB

# SVM
from sklearn.linear_model import SGDClassifier

In [7]:
## Pipeline
# Naive Bayes
from sklearn.pipeline import Pipeline
text_clf_nb = Pipeline([('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                    ('clf', MultinomialNB()),
])
text_clf_nb = text_clf_nb.fit(data,target_id)

predict_nb = lambda w:d_id2ucd.get(text_clf_nb.predict([w])[0])
predict_nb('mag')

'stat.error'

In [8]:
# SVM
from sklearn.linear_model import SGDClassifier
text_clf_svm = Pipeline([('vect', CountVectorizer()),
                        ('tfidf', TfidfTransformer()),
                        ('clf-svm', SGDClassifier(loss='log',
                                                  penalty='l2', alpha=1e-3, 
                                                  max_iter=100, random_state=42)),
                        ])
_= text_clf_svm.fit(data,target_id)

predict_svm = lambda w:d_id2ucd.get(text_clf_svm.predict([w])[0])
predict_svm('mag')

'stat.error'

In [9]:
probs = text_clf_svm.predict_proba(['flux'])

probs = probs.flatten()

probs.shape

ind_sort = probs.argsort()[::-1]

ind_sort.shape

# probs[ind_sort]

list(map(d_id2ucd.get,ind_sort[:3]))

['stat.error', 'pos.angDistance', 'phot.mag']

In [10]:
text_clf_nb.predict_proba(['mag'])

array([[0.00032075, 0.00032075, 0.00347763, 0.00692214, 0.00064047,
        0.00032127, 0.00064047, 0.00032075, 0.00190907, 0.00127681,
        0.23282715, 0.00127681, 0.30452787, 0.00687847, 0.00284988,
        0.00128094, 0.00032075, 0.00032054, 0.00032075, 0.00032075,
        0.00032075, 0.00032075, 0.00032075, 0.00064047, 0.00032075,
        0.02215629, 0.00032075, 0.00064047, 0.00064047, 0.02474297,
        0.00064047, 0.00032075, 0.00284988, 0.00095915, 0.00032075,
        0.00095915, 0.0006415 , 0.00032075, 0.00064047, 0.00032075,
        0.00064047, 0.00032075, 0.00064254, 0.00064047, 0.00064254,
        0.00032075, 0.01802439, 0.00921977, 0.00032075, 0.00050776,
        0.00650775, 0.00032075, 0.00032075, 0.00032075, 0.00032075,
        0.00032075, 0.00064047, 0.01545382, 0.00190907, 0.01775954,
        0.00160117, 0.00095851, 0.00127681, 0.02017327, 0.00032127,
        0.00095915, 0.00032075, 0.00064047, 0.0009607 , 0.0019078 ,
        0.00687847, 0.00064047, 0.002848  , 0.00

In [11]:
# Save machine state
from sklearn.externals import joblib
joblib.dump(text_clf_svm, 'predict_svm.pkl')

import json
with open('targets_label-id.json','w') as fp:
    json.dump(d_id2ucd,fp)



In [12]:
# Load machine
clf = joblib.load('predict_svm.pkl')
with open('targets_label-id.json','r') as fp:
    target_map = { int(k):v for k,v in json.load(fp).items() }
predict = lambda w:target_map.get(clf.predict([w])[0])

In [13]:
predict('ra')

'POS_EQ_RA_MAIN'

## Classifying column descriptions

In [14]:
desc_columns = read_columns_description(config)
assert len(desc_columns) == len(ucd_columns)

data = [ d for desc in desc_columns for d in desc ]
np.array(data).shape

7 None
8 None


(1855,)

In [15]:
# from sklearn.feature_extraction.text import CountVectorizer
# count_vect = CountVectorizer()
# X_train_counts = count_vect.fit_transform(data)

# from sklearn.feature_extraction.text import TfidfTransformer
# tfidf_transformer = TfidfTransformer()
# X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# from sklearn.naive_bayes import MultinomialNB
# clf = MultinomialNB().fit(X_train_tfidf,target_id)

from sklearn.pipeline import Pipeline
text_clf_nb = Pipeline([('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                    ('clf', MultinomialNB()),
])
text_clf_nb = text_clf_nb.fit(data,target_id)


In [16]:
probs = text_clf_nb.predict_proba(['magnitude auto'])

probs = probs.flatten()
ind_sort = probs.argsort()[::-1]

predicted = [(d_id2ucd[i],probs[i]) for i in ind_sort[:3] ]
# list(map(d_id2ucd.get,ind_sort[:3]))

for u,p in predicted:
    print('{} : {:.3f}'.format(u,p))

phot.mag : 0.742
stat.error : 0.061
pos.angDistance : 0.011


In [17]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(data, target_id, test_size=.1)

text_clf_nb = text_clf_nb.fit(x_train,y_train)

In [18]:
predicted = [d_id2ucd[i] for i in text_clf_nb.predict(x_test)]
assert len(predicted)==len(y_test)

import pandas as pd
df_eval = pd.DataFrame([(predicted[i],d_id2ucd[y_test[i]]) for i in range(len(y_test))], columns=['predicted','truth'])
df_eval

Unnamed: 0,predicted,truth
0,phot.mag,src.var
1,POS_EQ_DEC_MAIN,POS_EQ_DEC_MAIN
2,stat.error,stat
3,phot.mag,stat.fit.goodness
4,stat.error,meta.code.error
5,phot.mag,phys.magAbs
6,pos.eq.ra,POS_EQ_RA_MAIN
7,POS_EQ_RA_MAIN,meta.ref.url
8,stat.error,stat.error
9,stat.error,phot.mag


In [19]:
sum(df_eval.predicted == df_eval.truth)/len(df_eval)

0.510752688172043