<a href="https://colab.research.google.com/github/eiffelwong1/feature-type-inference-using-AL/blob/main/colab_notebooks/utility/data_split_234.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import csv
import time
import pickle
import pandas as pd
import numpy as np
from functools import partial

import tensorflow as tf
from tensorflow import keras

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.base import BaseEstimator

import matplotlib.pyplot as plt
%matplotlib inline

import warnings

In [None]:
# Mount Google Drive under /content/drive; the CSV paths read and written later
# in this notebook all live below that mount point.
# NOTE(review): force_remount=True presumably re-mounts even when a mount
# already exists -- confirm against the Colab documentation.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Feature-group switches (1 = on, 0 = off).
# useSample1 / useSample2 are consulted inside FeatureExtraction.
# NOTE(review): useStats and useAttributeName are not read anywhere in the
# visible code -- confirm whether they are still needed.
useStats, useAttributeName = 1, 1
useSample1, useSample2 = 0, 0

# Feature-type name -> integer class id; ids follow the order of the tuple below.
dict_label = {
    label: class_id
    for class_id, label in enumerate((
        'numeric',
        'categorical',
        'datetime',
        'sentence',
        'url',
        'embedded-number',
        'list',
        'not-generalizable',
        'context-specific',
    ))
}

# Class names ordered by their integer id (position i holds the name of class i).
dict_label_list = sorted(dict_label, key=dict_label.get)
print(dict_label_list)

['numeric', 'categorical', 'datetime', 'sentence', 'url', 'embedded-number', 'list', 'not-generalizable', 'context-specific']


In [None]:
def ProcessStats(data, y):
    """Select the descriptive-statistics columns and zero-fill missing values.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing (at least) the 25 statistic columns listed below;
        a missing column raises ``KeyError``.
    y : pandas.DataFrame
        Label frame with a ``y_act`` column. It is modified in place:
        ``y_act`` is cast to float.

    Returns
    -------
    pandas.DataFrame
        The statistic columns only, with a fresh 0..n-1 index and NaNs
        replaced by 0. ``data`` itself is not modified.
    """
    stat_columns = [
        'total_vals', 'num_nans', '%_nans', 'num_of_dist_val', '%_dist_val',
        'mean', 'std_dev', 'min_val', 'max_val',
        'has_delimiters', 'has_url', 'has_email', 'has_date',
        'mean_word_count', 'std_dev_word_count',
        'mean_stopword_total', 'stdev_stopword_total',
        'mean_char_count', 'stdev_char_count',
        'mean_whitespace_count', 'stdev_whitespace_count',
        'mean_delim_count', 'stdev_delim_count',
        'is_list', 'is_long_sentence',
    ]
    stats = data[stat_columns].reset_index(drop=True).fillna(0)

    # Side effect on the caller's label frame: labels become floats.
    y.y_act = y.y_act.astype(float)

    return stats

# Character-level bigram count vectorizers (ngram_range=(2, 2), analyzer='char').
# FeatureExtraction fits vectorizerName on attribute names and vectorizerSample on
# the sample_1 values when called with a truthy flag, and only transforms otherwise;
# keeping them at module level lets the second call reuse what the first call fitted.
vectorizerName = CountVectorizer(ngram_range=(2, 2), analyzer='char')
vectorizerSample = CountVectorizer(ngram_range=(2, 2), analyzer='char')

def FeatureExtraction(data,data1,flag):
    """Append character-bigram count features to the descriptive-stats frame.

    Uses the module-level ``vectorizerName`` / ``vectorizerSample`` objects and
    the module-level ``useSample1`` / ``useSample2`` switches.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame with ``Attribute_name``, ``sample_1`` and ``sample_2`` columns;
        every value is converted with ``str()`` before vectorizing.
    data1 : pandas.DataFrame
        Frame the count features are concatenated to, column-wise.
        ``pd.concat`` aligns on the index, so it should carry a 0..n-1 index.
    flag : truthy / falsy
        Truthy: fit ``vectorizerName`` on the attribute names and
        ``vectorizerSample`` on ``sample_1`` before transforming.
        Falsy: only transform with the already-fitted vectorizers.

    Returns
    -------
    pandas.DataFrame
        ``data1`` followed by the attribute-name counts (integer column labels
        0..k-1). If ``useSample1`` / ``useSample2`` is truthy, the ``sample_1`` /
        ``sample_2`` counts follow, labelled ``sample1_<i>`` / ``sample2_<i>``.
    """
    names = [str(value) for value in data['Attribute_name'].values]
    first_samples = [str(value) for value in data['sample_1'].values]
    second_samples = [str(value) for value in data['sample_2'].values]

    if flag:
        name_counts = vectorizerName.fit_transform(names)
        sample1_counts = vectorizerSample.fit_transform(first_samples)
    else:
        name_counts = vectorizerName.transform(names)
        sample1_counts = vectorizerSample.transform(first_samples)
    # sample_2 is always encoded with the vocabulary fitted on sample_1.
    sample2_counts = vectorizerSample.transform(second_samples)

    parts = [data1, pd.DataFrame(name_counts.toarray())]

    # Previously the sample frames were assigned to the result and then
    # unconditionally overwritten, so these switches never had any effect.
    # Densify only what is requested, and prefix the labels so they cannot
    # collide with the integer labels of the attribute-name counts.
    if useSample1:
        parts.append(pd.DataFrame(sample1_counts.toarray()).add_prefix('sample1_'))
    if useSample2:
        parts.append(pd.DataFrame(sample2_counts.toarray()).add_prefix('sample2_'))

    return pd.concat(parts, axis=1, sort=False)

In [None]:
def get_data(sim_size = 0.9,
             train_path = '/content/drive/My Drive//Project_234/Original/data_train.csv',
             test_path = '/content/drive/My Drive//Project_234/Original/data_test.csv',
             random_state = 4):
    """Load the labelled CSVs, build features and split train into seed/simulation sets.

    Parameters
    ----------
    sim_size : float or int
        Passed to ``train_test_split`` as ``test_size``: the share of the
        training rows (a fraction between 0 and 1, not a percentage) that goes
        into the simulation set. The remainder is the initial training set.
    train_path, test_path : str
        CSV files to read. Each must contain a ``y_act`` column whose values
        are keys of ``dict_label``, plus the columns needed by
        ``ProcessStats`` and ``FeatureExtraction``.
    random_state : int
        Seed for the stratified shuffle split.

    Returns
    -------
    tuple
        ``(X_train_AL, y_train_AL, X_simulation_df, y_simulation_df,
        X_test_AL, y_test_AL)``. ``X_train_AL`` is a NumPy array; the other
        five are pandas DataFrames. ``X_simulation_df`` gets a fresh 0..n-1
        index while the label frames keep the row labels of the source CSV,
        so X and y rows correspond by position, not by index label.

    Raises
    ------
    KeyError
        If a ``y_act`` value is not a key of ``dict_label``.
    """
    xtrain = pd.read_csv(train_path)
    xtest = pd.read_csv(test_path)

    # Encode the string class names as integer ids.
    y_train = xtrain.loc[:, ['y_act']]
    y_test = xtest.loc[:, ['y_act']]
    y_train['y_act'] = [dict_label[name] for name in y_train['y_act']]
    y_test['y_act'] = [dict_label[name] for name in y_test['y_act']]

    # ProcessStats also casts the y_act column of the label frames to float.
    xtrain_stats = ProcessStats(xtrain, y_train)
    xtest_stats = ProcessStats(xtest, y_test)

    # Fit the vectorizers on the training frame (flag=1) and reuse them,
    # unchanged, for the test frame (flag=0).
    X_train = FeatureExtraction(xtrain, xtrain_stats, 1)
    X_test = FeatureExtraction(xtest, xtest_stats, 0)

    X_train_values = X_train.reset_index(drop=True).values

    # Stratified split of the training data into the initial labelled set and
    # the pool used to simulate the active-learning loop.
    X_train_AL, X_simulation_AL, y_train_AL, y_simulation_AL = train_test_split(
        X_train_values,
        y_train,
        test_size=sim_size,
        random_state=random_state,
        stratify=y_train,
        shuffle=True,
    )

    X_simulation_df = pd.DataFrame(X_simulation_AL)
    y_simulation_df = pd.DataFrame(y_simulation_AL)
    # Hand the untouched test data back as DataFrames.
    X_test_AL = pd.DataFrame(X_test)
    y_test_AL = pd.DataFrame(y_test)

    # Every feature set must have exactly one label row per sample.
    assert len(X_train_AL) == len(y_train_AL)
    assert len(X_simulation_df) == len(y_simulation_df)
    assert len(X_test_AL) == len(y_test_AL)
    return X_train_AL, y_train_AL, X_simulation_df, y_simulation_df, X_test_AL, y_test_AL

# Build the splits once with the default arguments of get_data; the cells below
# read X_train, y_train, X_sim, y_sim, X_test and y_test from notebook scope.
X_train, y_train, X_sim, y_sim, X_test, y_test = get_data()
print(f"Training set size: {len(X_train)}, Simulation set size: {len(X_sim)}, Test set size: {len(X_test)}")

Training set size: 793, Simulation set size: 7143, Test set size: 1985


In [None]:
# Check class balance: share of every label id in each split, as a percentage.
for v in dict_label.values():
  for split_name, split_y in (("Train", y_train), ("Test", y_test), ("Sim", y_sim)):
    share = (split_y['y_act'] == v).sum() / len(split_y)
    # Use the percent format spec so rounding happens after scaling to 0-100;
    # round(share, 3) * 100 printed float noise such as 23.400000000000002%.
    print(f"{split_name} {v}: {share:.1%}")

Train 0: 36.6%
Test 0: 35.6%
Sim 0: 36.6%
Train 1: 23.3%
Test 1: 23.0%
Sim 1: 23.400000000000002%
Train 2: 6.800000000000001%
Test 2: 7.1%
Sim 2: 6.800000000000001%
Train 3: 3.6999999999999997%
Test 3: 4.6%
Sim 3: 3.6999999999999997%
Train 4: 1.5%
Test 4: 1.6%
Sim 4: 1.4000000000000001%
Train 5: 5.8999999999999995%
Test 5: 5.0%
Sim 5: 5.8999999999999995%
Train 6: 2.4%
Test 6: 2.9000000000000004%
Sim 6: 2.4%
Train 7: 11.0%
Test 7: 10.8%
Sim 7: 11.0%
Train 8: 8.799999999999999%
Test 8: 9.3%
Sim 8: 8.9%


In [None]:
# Persist every split as CSV on the mounted drive, in the order listed.
# to_csv is called with its defaults. X_train (and y_train) are wrapped in
# pd.DataFrame first; the remaining objects are written as they are.
_split_files = (
    ("/content/drive/My Drive/Project_234/AL_Split/split_train_X.csv", pd.DataFrame(X_train)),
    ("/content/drive/My Drive/Project_234/AL_Split/split_train_y.csv", pd.DataFrame(y_train)),
    ("/content/drive/My Drive/Project_234/AL_Split/split_sim_X.csv", X_sim),
    ("/content/drive/My Drive/Project_234/AL_Split/split_sim_y.csv", y_sim),
    ("/content/drive/My Drive/Project_234/AL_Split/split_test_X.csv", X_test),
    ("/content/drive/My Drive/Project_234/AL_Split/split_test_y.csv", y_test),
)
for _csv_path, _split_frame in _split_files:
    _split_frame.to_csv(_csv_path)