**The purpose of this notebook is to create two separate datasets: one for classification by domain (NLP, computer vision, reinforcement learning) and one for classification by technique (classification, regression, clustering).**

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Relative directory used below for the input CSV and the two output CSVs.
# The trailing slash matters: file names are appended by plain string concatenation.
DATA_PATH = "../data/"

In [5]:
# Load every vectorised notebook; the first CSV column is used as the row index.
whole_df = pd.read_csv(
    filepath_or_buffer=DATA_PATH + 'vect_data_final.csv',
    index_col=0,
)

In [6]:
# Preview the first rows: one stringified vector and one tag per notebook.
whole_df.head()

Unnamed: 0,notebook_vector,tag
0,"[-0.34790626, 0.29865852, 0.30619365, 0.055174...",computer vision
1,"[-0.3819649, 0.31716973, 0.33679396, 0.0810977...",clustering
2,"[-0.33451593, 0.2831545, 0.29143128, 0.0433948...",computer vision
3,"[-0.27561176, 0.26736438, 0.2665188, 0.0765375...",nlp
4,"[-0.24364452, 0.28803557, 0.2663721, 0.1090390...",classification


In [7]:
# (rows, columns) of the full dataset.
whole_df.shape

(6260, 2)

In [8]:
# Distinct tag values present in the full dataset.
whole_df.tag.unique()

array(['computer vision', 'clustering', 'nlp', 'classification',
       'regression', 'reinforcement learning'], dtype=object)

In [14]:
# Keep only the notebooks whose tag names an application domain.
# .copy() detaches the subset from whole_df so it can be modified independently.
domain_df = whole_df.loc[
    whole_df['tag'].isin(['nlp', 'computer vision', 'reinforcement learning'])
].copy()
domain_df.head()

Unnamed: 0,notebook_vector,tag
0,"[-0.34790626, 0.29865852, 0.30619365, 0.055174...",computer vision
2,"[-0.33451593, 0.2831545, 0.29143128, 0.0433948...",computer vision
3,"[-0.27561176, 0.26736438, 0.2665188, 0.0765375...",nlp
7,"[-0.23577915, 0.25037774, 0.22657812, 0.149408...",nlp
12,"[-0.3744954, 0.2507138, 0.31223407, 0.02591716...",reinforcement learning


In [15]:
# Keep only the notebooks whose tag names a machine-learning technique.
# .copy() detaches the subset from whole_df so it can be modified independently.
technique_df = whole_df.loc[
    whole_df['tag'].isin(['regression', 'classification', 'clustering'])
].copy()
technique_df.head()

Unnamed: 0,notebook_vector,tag
1,"[-0.3819649, 0.31716973, 0.33679396, 0.0810977...",clustering
4,"[-0.24364452, 0.28803557, 0.2663721, 0.1090390...",classification
5,"[-0.27900544, 0.28826895, 0.27146173, 0.147275...",clustering
6,"[-0.42033634, 0.32177913, 0.3592496, 0.0234226...",classification
8,"[-0.23415434, 0.2189036, 0.23444305, 0.1150547...",regression


In [16]:
# Sanity check: each subset should contain only its own three tags.
domain_df.tag.unique(), technique_df.tag.unique()

(array(['computer vision', 'nlp', 'reinforcement learning'], dtype=object),
 array(['clustering', 'classification', 'regression'], dtype=object))

In [23]:
# (rows, columns) of each subset; the two row counts should add up to the full dataset's.
domain_df.shape, technique_df.shape

((3301, 2), (2959, 2))

In [18]:
def build_df(vect_dim=768):
    """Create an empty DataFrame with the column layout of the final datasets.

    The frame has one column per vector component, named ``num_feature_0`` to
    ``num_feature_<vect_dim - 1>``, followed by a single ``target_category``
    column for the label.

    Parameters
    ----------
    vect_dim : int, default 768
        Number of ``num_feature_*`` columns to create. The default keeps the
        layout this function produced before the parameter existed.

    Returns
    -------
    pandas.DataFrame
        A frame with ``vect_dim + 1`` columns and no rows.
    """
    features = ['num_feature_' + str(i) for i in range(vect_dim)]
    # The label always goes last so that a row is laid out as "vector values, then tag".
    features.append('target_category')
    return pd.DataFrame(columns=features)

In [25]:
def construct_final_df(vect_data):
    """Expand stringified notebook vectors into one numeric column per component.

    Each value in ``vect_data['notebook_vector']`` is a string such as
    ``"[-0.34, 0.29, ...]"``. It is split on commas, converted to floats and
    written to the ``num_feature_*`` columns defined by ``build_df()``; the
    row's ``tag`` goes into the last column.

    Parameters
    ----------
    vect_data : pandas.DataFrame
        Frame with a ``notebook_vector`` column (bracketed, comma-separated
        numbers stored as a string) and a ``tag`` column.

    Returns
    -------
    pandas.DataFrame
        One row per input row, in input order, with a fresh 0..n-1 index and
        the columns of ``build_df()``. Empty input gives an empty frame with
        those columns.

    Raises
    ------
    ValueError
        If a vector component cannot be parsed as a float, or a vector does
        not have exactly one value per feature column.
    """
    column_names = list(build_df().columns)
    n_features = len(column_names) - 1  # the last column holds the label

    rows = []
    # Iterate over the columns directly rather than via .loc[label, ...] so that
    # duplicate index labels in vect_data cannot turn a cell lookup into a Series.
    for row_label, raw_vector, tag in zip(
        vect_data.index, vect_data['notebook_vector'], vect_data['tag']
    ):
        values = [float(component) for component in raw_vector.strip('[]').split(',')]
        if len(values) != n_features:
            raise ValueError(
                'row {!r}: expected {} vector values, got {}'.format(
                    row_label, n_features, len(values)
                )
            )
        values.append(tag)
        rows.append(values)

    # Build the frame once from all rows; inserting rows one at a time with
    # .loc[len(df)] reallocates the frame on every insert.
    return pd.DataFrame(rows, columns=column_names)

In [None]:
# Expand the domain subset into one column per vector component plus the label column.
domain_final = construct_final_df(domain_df)

In [None]:
# Expand the technique subset into one column per vector component plus the label column.
technique_final = construct_final_df(technique_df)

In [29]:
# Write both datasets next to the input file. to_csv also writes the row index
# as the first column by default.
domain_final.to_csv(path_or_buf=DATA_PATH + 'domain_data.csv')
technique_final.to_csv(path_or_buf=DATA_PATH + 'technique_data.csv')