## Constructing the input dataset to Oracle AutoML

Imports

In [1]:
import pandas as pd
import numpy as np

In [6]:
DATA_PATH = "../data/"

Load the vectorized notebooks

In [7]:
vect_data = pd.read_csv(DATA_PATH + 'vect_data_final.csv', index_col=0)

In [8]:
vect_data.head()

Unnamed: 0,notebook_vector,tag
0,"[-0.34790626, 0.29865852, 0.30619365, 0.055174...",computer vision
1,"[-0.3819649, 0.31716973, 0.33679396, 0.0810977...",clustering
2,"[-0.33451593, 0.2831545, 0.29143128, 0.0433948...",computer vision
3,"[-0.27561176, 0.26736438, 0.2665188, 0.0765375...",nlp
4,"[-0.24364452, 0.28803557, 0.2663721, 0.1090390...",classification


In [7]:
vect_data.shape

(6260, 2)

Build the dataframe structure

In [8]:
def build_df():
    vect_dim = 768
    features = []
    for i in range(vect_dim):
        features.append('num_feature_' + str(i))

    features.append('target_category')
    final_df = pd.DataFrame(columns=features)
    return final_df

Make each vector value a single feature in the dataset

In [9]:
def construct_final_df(vect_data):
    final_df = build_df()
    for i in vect_data.index:
        # convert vector values to float
        vect_data_float = []
        vect_data_float.append([float(float_value) for float_value in vect_data.loc[i, 'notebook_vector'].strip('[]').split(',')])
        # append tag to each notebook
        vect_data_float[0].append(vect_data.loc[i, 'tag'])
        final_df.loc[len(final_df)] = vect_data_float[0]
        print(i)
    return final_df 

In [None]:
final_df = construct_final_df(vect_data)

In [None]:
final_df.to_csv(DATA_PATH + 'final_data.csv', index=False)

### Classification test (baseline)

In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, roc_auc_score, f1_score
import matplotlib.pyplot as plt
import plotly.figure_factory as ff

In [5]:
final_df = pd.read_csv(DATA_PATH + 'final_data.csv')
final_df.head()

Unnamed: 0,num_feature_0,num_feature_1,num_feature_2,num_feature_3,num_feature_4,num_feature_5,num_feature_6,num_feature_7,num_feature_8,num_feature_9,...,num_feature_759,num_feature_760,num_feature_761,num_feature_762,num_feature_763,num_feature_764,num_feature_765,num_feature_766,num_feature_767,target_category
0,-0.347906,0.298659,0.306194,0.055175,-0.430964,-0.466804,-0.108935,0.22536,0.385942,0.492067,...,-0.174081,-0.516293,0.40267,-0.094396,0.222461,0.574193,-0.606644,-0.46194,0.539914,computer vision
1,-0.381965,0.31717,0.336794,0.081098,-0.45472,-0.551013,-0.051526,0.256912,0.381023,0.474694,...,-0.182501,-0.52445,0.344222,-0.02771,0.259213,0.608695,-0.722355,-0.420306,0.58906,clustering
2,-0.334516,0.283154,0.291431,0.043395,-0.385153,-0.4945,-0.111106,0.227195,0.380586,0.548708,...,-0.169292,-0.530989,0.416996,-0.10479,0.218367,0.636691,-0.61231,-0.446481,0.508087,computer vision
3,-0.275612,0.267364,0.266519,0.076538,-0.282673,-0.329046,-0.045134,0.221697,0.293198,0.421586,...,-0.149041,-0.485052,0.493056,-0.112996,0.220296,0.61848,-0.594641,-0.469342,0.497985,nlp
4,-0.243645,0.288036,0.266372,0.109039,-0.166584,-0.294326,-0.053085,0.150146,0.304883,0.368951,...,-0.142109,-0.482496,0.465995,-0.077088,0.22602,0.523076,-0.427591,-0.406878,0.511194,classification


In [26]:
x = final_df.drop('target_category', axis = 1).copy()
y = final_df['target_category'].copy()

In [27]:
xtrain, xtest,  ytrain, ytest = train_test_split(x, y, test_size=0.2)

In [8]:
ytrain

4406     classification
6008    computer vision
6229    computer vision
3292         regression
3832         clustering
             ...       
2042    computer vision
5186         clustering
4954         regression
4767                nlp
623                 nlp
Name: target_category, Length: 5008, dtype: object

In [30]:
logr = LogisticRegression(max_iter=10000)

In [31]:
logr.fit(xtrain, ytrain)

In [32]:
yhat = logr.predict(xtest)

In [33]:
accuracy_score(yhat, ytest)

0.7859424920127795

In [63]:
conf_m = confusion_matrix(ytest, yhat).tolist()
conf_m

[[89, 19, 6, 16, 30, 1],
 [9, 163, 10, 15, 22, 1],
 [3, 3, 192, 28, 1, 1],
 [4, 5, 18, 303, 7, 1],
 [12, 20, 2, 6, 187, 2],
 [3, 2, 6, 12, 3, 50]]

In [71]:
# invert conf_m ida values
conf_m = conf_m[::-1]

a = list(y.unique())
b = list(y.unique()) # invert idx values of a

# change each element of conf_m to tbpe string for annotations
conf_m_text = [[str(b) for b in a] for a in conf_m]

# set up figure 
fig = ff.create_annotated_heatmap(conf_m, x=a, y=b, annotation_text=conf_m_text, colorscale='Viridis')

# add title
fig.update_layout(title_text='<b>Confusion matrix</b>',
                  #aaais = dict(title='a'),
                  #baais = dict(title='a')
                 )

# add custom aaais title
fig.add_annotation(dict(font=dict(color="black",size=14),
                        x=0.5,
                        y=-0.15,
                        showarrow=False,
                        text="Predicted value",
                        xref="paper",
                        yref="paper"))

# add custom baais title
fig.add_annotation(dict(font=dict(color="black",size=14),
                        x=-0.35,
                        y=0.5,
                        showarrow=False,
                        text="Real value",
                        textangle=-90,
                        xref="paper",
                        yref="paper"))

# adjust margins to make room for baais title
fig.update_layout(margin=dict(t=50, l=200))

# add colorbar
fig['data'][0]['showscale'] = True
fig.show()

Let's try balancing our dataset 

In [28]:
logr_bal = LogisticRegression(max_iter=10000, class_weight='balanced')

In [29]:
logr_bal.fit(xtrain, ytrain)

In [11]:
import pickle

In [12]:
filename = 'finalized_model.sav'
pickle.dump(logr_bal, open(filename, 'wb'))

In [13]:
xtest

Unnamed: 0,num_feature_0,num_feature_1,num_feature_2,num_feature_3,num_feature_4,num_feature_5,num_feature_6,num_feature_7,num_feature_8,num_feature_9,...,num_feature_758,num_feature_759,num_feature_760,num_feature_761,num_feature_762,num_feature_763,num_feature_764,num_feature_765,num_feature_766,num_feature_767
4953,-0.284166,0.265308,0.250501,0.059773,-0.293241,-0.195005,-0.039972,0.138321,0.248731,0.408354,...,0.406212,-0.092719,-0.519965,0.447469,-0.133863,0.274853,0.541272,-0.575624,-0.415648,0.463614
1742,-0.256528,0.250721,0.266806,0.183580,-0.048057,-0.372898,-0.097070,0.130213,0.262331,0.377111,...,0.346597,-0.147166,-0.478567,0.455503,-0.153839,0.233633,0.598709,-0.406390,-0.406957,0.534875
2899,-0.116456,0.215052,0.240432,0.156587,-0.146027,-0.245312,-0.058848,0.141029,0.260640,0.286272,...,0.359351,-0.088941,-0.451789,0.558014,-0.159376,0.185456,0.652369,-0.302985,-0.398837,0.464013
303,-0.360095,0.293063,0.321145,0.035082,-0.305559,-0.395745,-0.097220,0.165061,0.309050,0.465018,...,0.414574,-0.138237,-0.518623,0.344257,-0.041883,0.274553,0.493639,-0.641247,-0.413809,0.475289
407,-0.178747,0.200021,0.222105,0.106968,-0.036566,-0.343755,0.009203,0.114121,0.149440,0.319532,...,0.288844,-0.149671,-0.368676,0.551033,-0.133438,0.188182,0.675628,-0.373077,-0.375323,0.421074
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3511,-0.256259,0.288374,0.250342,-0.025846,-0.464666,-0.392683,-0.083739,0.191818,0.346889,0.427777,...,0.389025,-0.116514,-0.468868,0.442361,-0.080300,0.225603,0.516070,-0.476576,-0.439895,0.515938
2284,-0.233427,0.241343,0.226945,0.095533,-0.125177,-0.303776,-0.063909,0.171266,0.222379,0.392852,...,0.424303,-0.081237,-0.525032,0.469753,-0.103971,0.222740,0.486391,-0.528197,-0.415531,0.507881
899,-0.331804,0.349822,0.278608,0.070609,-0.259272,-0.484103,-0.087355,0.188330,0.371650,0.482033,...,0.381983,-0.158758,-0.487979,0.381095,-0.078864,0.268546,0.613251,-0.514993,-0.423582,0.547945
5834,-0.266324,0.277896,0.246268,0.049421,-0.252962,-0.399935,-0.072752,0.192567,0.317666,0.402015,...,0.369669,-0.141692,-0.469623,0.429152,-0.127803,0.226456,0.615502,-0.556100,-0.424030,0.500063


In [30]:
yhat_bal = logr_bal.predict(xtest)

In [16]:
yhat_bal

array(['regression', 'nlp', 'nlp', ..., 'computer vision',
       'computer vision', 'nlp'], dtype=object)

In [31]:
print('Accuracy score:',accuracy_score(ytest, yhat_bal))
print('F1 score:',f1_score(ytest, yhat_bal, average='weighted'))

Accuracy score: 0.7827476038338658
F1 score: 0.7839468112913339


In [38]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()

# fit the predictor and target
rfc.fit(xtrain, ytrain)

# predict
rfc_predict = rfc.predict(xtest)# check performance
# print('ROCAUC score:',roc_auc_score(ytest, rfc_predict))
print('Accuracy score:',accuracy_score(ytest, rfc_predict))
print('F1 score:',f1_score(ytest, rfc_predict, average='weighted'))

Accuracy score: 0.6357827476038339
F1 score: 0.61612061998664
