## Constructing the input dataset for Oracle AutoML

Imports

In [3]:
import pandas as pd
import numpy as np

In [4]:
# Folder holding the input and output CSV files. Keep the trailing separator:
# file names are appended to this string directly.
DATA_PATH = (
    "C:\\Users\\dmasrour\\Documents\\CodeDoc_Generation"
    "\\Scripts\\Classification_Task\\data\\"
)

Load the vectorized notebooks

In [5]:
# Read the vectorized notebooks; the first CSV column becomes the row index.
vect_data = pd.read_csv(f"{DATA_PATH}vect_data_final.csv", index_col=0)

In [6]:
# Preview the first rows of the loaded data.
vect_data.head()

Unnamed: 0,notebook_vector,tag
0,"[-0.34790626, 0.29865852, 0.30619365, 0.055174...",computer vision
1,"[-0.3819649, 0.31716973, 0.33679396, 0.0810977...",clustering
2,"[-0.33451593, 0.2831545, 0.29143128, 0.0433948...",computer vision
3,"[-0.27561176, 0.26736438, 0.2665188, 0.0765375...",nlp
4,"[-0.24364452, 0.28803557, 0.2663721, 0.1090390...",classification


In [7]:
# (rows, columns) of the loaded data.
vect_data.shape

(6260, 2)

Build the dataframe structure

In [8]:
def build_df(vect_dim=768):
    """Create an empty DataFrame with the column layout of the final dataset.

    Parameters
    ----------
    vect_dim : int, default 768
        Number of components in each notebook vector. One
        ``num_feature_<i>`` column is created per component.

    Returns
    -------
    pandas.DataFrame
        A frame with no rows whose columns are ``num_feature_0`` through
        ``num_feature_<vect_dim - 1>`` followed by ``target_category``.
    """
    features = ['num_feature_' + str(i) for i in range(vect_dim)]
    # The label column always comes last.
    features.append('target_category')
    return pd.DataFrame(columns=features)

Make each vector value a single feature in the dataset

In [9]:
def construct_final_df(vect_data):
    """Expand each stored notebook vector into one numeric column per component.

    Parameters
    ----------
    vect_data : pandas.DataFrame
        Must provide a 'notebook_vector' column holding each vector as a
        bracketed, comma-separated string (e.g. "[0.1, -0.2, 0.3]") and a
        'tag' column holding the label of the notebook.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``vect_data``, in the same order, indexed
        0..n-1. Columns follow the layout returned by ``build_df()``: the
        vector components as floats, then the tag in 'target_category'.

    Raises
    ------
    ValueError
        If a vector component cannot be parsed as a float, or a vector does
        not have as many components as ``build_df()`` has feature columns.
    """
    rows = []
    for raw_vector, tag in zip(vect_data['notebook_vector'], vect_data['tag']):
        # Parse "[v0, v1, ...]" into floats; float() tolerates the spaces
        # left around each component by the split.
        row = [float(value) for value in raw_vector.strip('[]').split(',')]
        row.append(tag)
        rows.append(row)

    # Build the frame once from all rows: appending to a DataFrame row by row
    # copies the data on every insertion and leaves object-dtype columns.
    return pd.DataFrame(rows, columns=build_df().columns)

In [None]:
# Build the full feature table from the loaded vectors.
final_df = construct_final_df(vect_data)

In [13]:
# Persist the feature table next to the input data, without the row index.
final_df.to_csv(f"{DATA_PATH}final_data.csv", index=False)

### Classification test (baseline)

In [49]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import plotly.figure_factory as ff

In [27]:
# Separate the label column from the feature columns; both are copies, so
# final_df itself is left untouched by later changes to x or y.
y = final_df['target_category'].copy()
x = final_df.drop(columns='target_category').copy()

In [28]:
# Hold out 20% of the rows for testing. A fixed random_state makes the split,
# and every metric computed from it, reproducible across kernel restarts.
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=42)

In [29]:
# Display the training labels.
ytrain

2734           computer vision
4534    reinforcement learning
3474           computer vision
218                        nlp
5407            classification
                 ...          
4626            classification
656                 regression
3629            classification
1208                regression
417                        nlp
Name: target_category, Length: 5008, dtype: object

In [30]:
# Baseline classifier. max_iter is set high, presumably to give the solver
# room to converge on this many features — TODO confirm.
logr = LogisticRegression(max_iter=10000)

In [31]:
# Fit the baseline on the training split.
logr.fit(xtrain, ytrain)

In [32]:
# Predict labels for the held-out rows.
yhat = logr.predict(xtest)

In [33]:
# Share of held-out rows whose predicted label equals the true label.
# Arguments are passed as (true, predicted), matching the confusion_matrix call.
accuracy_score(ytest, yhat)

0.7859424920127795

In [63]:
# Confusion matrix of the true labels (ytest) against the predictions (yhat),
# converted to nested Python lists.
conf_m = confusion_matrix(ytest, yhat).tolist()
conf_m

[[89, 19, 6, 16, 30, 1],
 [9, 163, 10, 15, 22, 1],
 [3, 3, 192, 28, 1, 1],
 [4, 5, 18, 303, 7, 1],
 [12, 20, 2, 6, 187, 2],
 [3, 2, 6, 12, 3, 50]]

In [71]:
# Class names in the order confusion_matrix uses when no `labels` argument is
# given: the sorted union of the labels present in ytest and yhat.
# (y.unique() yields labels in order of first appearance, which does not match
# the rows/columns of the matrix.)
class_labels = sorted(set(ytest) | set(yhat))

# Reverse the rows so the first class is drawn at the top of the heatmap, and
# reverse the y-axis labels with them. A new name is used instead of rebinding
# conf_m so that re-running this cell does not flip the matrix back.
conf_m_flipped = conf_m[::-1]
x_labels = class_labels
y_labels = class_labels[::-1]

# annotation text must be strings
conf_m_text = [[str(count) for count in row] for row in conf_m_flipped]

# set up figure
fig = ff.create_annotated_heatmap(
    conf_m_flipped,
    x=x_labels,
    y=y_labels,
    annotation_text=conf_m_text,
    colorscale='Viridis',
)

# add title
fig.update_layout(title_text='<b>Confusion matrix</b>')

# add custom x-axis title (columns of the matrix are the predictions)
fig.add_annotation(dict(font=dict(color="black",size=14),
                        x=0.5,
                        y=-0.15,
                        showarrow=False,
                        text="Predicted value",
                        xref="paper",
                        yref="paper"))

# add custom y-axis title (rows of the matrix are the true labels)
fig.add_annotation(dict(font=dict(color="black",size=14),
                        x=-0.35,
                        y=0.5,
                        showarrow=False,
                        text="Real value",
                        textangle=-90,
                        xref="paper",
                        yref="paper"))

# adjust margins to make room for the y-axis title
fig.update_layout(margin=dict(t=50, l=200))

# add colorbar
fig['data'][0]['showscale'] = True
fig.show()