<a href="https://colab.research.google.com/github/dinaldoap/jit-sdp-nn/blob/master/eda.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from jitsdp.data import FEATURES, make_stream
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
import seaborn as sns; sns.set()
import matplotlib.pyplot as plt
from joblib import Memory

In [None]:
# Seed NumPy's global random number generator.
np.random.seed(42)
# joblib cache stored under the 'data' directory; used through @memory.cache.
memory = Memory(location='data', verbose=0)
# Dataset name passed to make_stream and used in the logs/<dataset>/ paths.
dataset = 'brackets'

In [None]:
# Load the data for the selected dataset with jitsdp's make_stream and
# preview its first rows.
df_prequential = make_stream(dataset)
df_prequential.head()

In [None]:
# Standardize the FEATURES columns with StandardScaler and wrap the result
# back into a DataFrame that keeps the original index and column names.
X = pd.DataFrame(
    StandardScaler().fit_transform(df_prequential[FEATURES].values),
    index=df_prequential.index,
    columns=df_prequential[FEATURES].columns,
)
X.head()

In [None]:
@memory.cache
def reduce_tsne(X):
    """Project ``X`` onto two dimensions with t-SNE.

    Calls go through the joblib ``memory`` cache applied by the decorator.

    Args:
        X: feature matrix accepted by ``TSNE.fit_transform``.

    Returns:
        The two-component embedding produced by ``TSNE.fit_transform``.
    """
    projector = TSNE(n_components=2)
    embedding = projector.fit_transform(X)
    return embedding

def format_to_plot(X, freduce_dimension, df_aditional):
    """Build the frame consumed by the plotting helpers.

    Args:
        X: DataFrame of features; its index is reused for the 2-D projection.
        freduce_dimension: callable that receives ``X`` and returns an
            array-like with one row per row of ``X`` and exactly two columns.
        df_aditional: DataFrame whose columns that are not already present
            in ``X`` are appended to the result.

    Returns:
        DataFrame with the projection in columns ``x1`` and ``x2``, followed
        by the columns of ``X`` and then the extra columns of
        ``df_aditional`` in their original order. A column named ``index``
        is renamed to ``timestep``.
    """
    # Select the extra columns in their original order. A set difference
    # would make the output column order change from one run to the next.
    feature_columns = set(X.columns)
    aditional_columns = list(dict.fromkeys(
        col for col in df_aditional.columns if col not in feature_columns
    ))
    df_projection = pd.DataFrame(freduce_dimension(X), index=X.index, columns=['x1', 'x2'])
    df_formatted = pd.concat([df_projection, X, df_aditional[aditional_columns]], axis='columns')
    return df_formatted.rename(columns={'index': 'timestep'})

def remove_outliers(df):
    """Drop rows whose projection lies outside the 1%-99% quantile range.

    ``x1`` is trimmed first; the ``x2`` quantiles are then computed on the
    rows that survived the ``x1`` trim. The bounds used for each column are
    printed.

    Args:
        df: DataFrame with numeric ``x1`` and ``x2`` columns.

    Returns:
        A new DataFrame with only the kept rows; ``df`` is not modified.
    """
    trimmed = df.copy()
    for axis_name in ('x1', 'x2'):
        lower, upper = trimmed[axis_name].quantile([0.01, 0.99])
        print('({}, {})'.format(lower, upper))
        # Both bounds are inclusive.
        trimmed = trimmed[trimmed[axis_name].between(lower, upper)]
    return trimmed

In [None]:
def plot_drift(data):
    """Scatter the 2-D projection in consecutive timestep windows.

    The timestep axis is cut into ``n_bins`` windows of
    ``len(data) // n_bins`` steps. Each window is drawn on its own row and
    colored by timestep.

    NOTE(review): windows are sized from the row count, so they only span
    the whole stream when ``timestep`` runs from 0 to ``len(data) - 1``.

    Args:
        data: DataFrame with ``x1``, ``x2`` and ``timestep`` columns.
    """
    n_bins = 6
    # Size the windows from the frame being plotted; the previous code read
    # a global ``df`` that this function never receives.
    bin_size = len(data) // n_bins
    fig, axes = plt.subplots(nrows=n_bins, figsize=(18, 36), sharex=True, sharey=True, constrained_layout=True)
    for bin_index in range(n_bins):
        # Alternate between the palette and its reverse on successive rows.
        palette = 'coolwarm' if bin_index % 2 == 0 else 'coolwarm_r'
        start = bin_index * bin_size
        stop = (bin_index + 1) * bin_size
        # Half-open window so a boundary timestep is drawn in one row only.
        in_window = (data['timestep'] >= start) & (data['timestep'] < stop)
        bin_data = data[in_window]
        sns.scatterplot(x="x1", y="x2", hue='timestep', palette=palette, data=bin_data, ax=axes[bin_index])
    fig.show()

def normal(data):
    """Return the rows of ``data`` whose ``y`` column equals 0."""
    is_normal = data['y'].eq(0)
    return data.loc[is_normal]

def bug(data):
    """Return the rows of ``data`` whose ``y`` column equals 1."""
    is_bug = data['y'].eq(1)
    return data.loc[is_bug]

def plot_drifts(data):
    """Scatter the projection per timestep window, split by label.

    The timestep axis is cut into ``n_bins`` windows of
    ``len(data) // n_bins`` steps. Each window gets one row with two panels:
    rows with ``y == 0`` on the left and rows with ``y == 1`` on the right,
    both colored by timestep.

    NOTE(review): windows are sized from the row count, so they only span
    the whole stream when ``timestep`` runs from 0 to ``len(data) - 1``.

    Args:
        data: DataFrame with ``x1``, ``x2``, ``timestep`` and ``y`` columns.
    """
    n_bins = 6
    # Size the windows from the frame being plotted; the previous code read
    # a global ``df`` that this function never receives.
    bin_size = len(data) // n_bins
    fig, axes = plt.subplots(nrows=n_bins, ncols=2, figsize=(22, 36), sharex=True, sharey=True, constrained_layout=True)
    for bin_index in range(n_bins):
        # Alternate between the palette and its reverse on successive rows.
        palette = 'coolwarm' if bin_index % 2 == 0 else 'coolwarm_r'
        start = bin_index * bin_size
        stop = (bin_index + 1) * bin_size
        # Half-open window so a boundary timestep is drawn in one row only.
        in_window = (data['timestep'] >= start) & (data['timestep'] < stop)
        bin_data = data[in_window]
        sns.scatterplot(x="x1", y="x2", hue='timestep', palette=palette, data=normal(bin_data), ax=axes[bin_index, 0])
        sns.scatterplot(x="x1", y="x2", hue='timestep', palette=palette, data=bug(bin_data), ax=axes[bin_index, 1])
    fig.show()

def plot_by_label(data, title=None, filename=None, xlim=None, ylim=None):
    """Scatter ``x1`` against ``x2`` colored by the ``target`` column.

    Args:
        data: DataFrame with ``x1``, ``x2`` and ``target`` columns.
        title: optional title for the axes.
        filename: when given, the figure is saved to this path instead of
            being shown.
        xlim: optional ``(min, max)`` limits for the x axis.
        ylim: optional ``(min, max)`` limits for the y axis.
    """
    fig, ax = plt.subplots(figsize=(15, 15))
    flatui = ["#AAAAAA", "#0000FF", "#FF0000"]
    sns.scatterplot(x="x1", y="x2", hue='target', data=data, palette=flatui, ax=ax)
    ax.set(xlim=xlim, ylim=ylim)
    ax.set_title(label=title)
    if filename is None:
        plt.show()
    else:
        fig.savefig(filename)
        # Close the figure instead of only clearing it: this function is
        # called once per frame, and cleared figures stay open in pyplot.
        plt.close(fig)

def plot_drifts_by_label(data):
    """Scatter the projection per timestep window, colored by ``y``.

    The timestep axis is cut into ``n_bins`` windows of
    ``len(data) // n_bins`` steps and each window is drawn on its own row.

    NOTE(review): windows are sized from the row count, so they only span
    the whole stream when ``timestep`` runs from 0 to ``len(data) - 1``.

    Args:
        data: DataFrame with ``x1``, ``x2``, ``timestep`` and ``y`` columns.
    """
    n_bins = 6
    # Size the windows from the frame being plotted; the previous code read
    # a global ``df`` that this function never receives.
    bin_size = len(data) // n_bins
    fig, axes = plt.subplots(nrows=n_bins, ncols=1, figsize=(15, 30), sharex=True, sharey=True, constrained_layout=True)
    for bin_index in range(n_bins):
        start = bin_index * bin_size
        stop = (bin_index + 1) * bin_size
        # Half-open window so a boundary timestep is drawn in one row only.
        in_window = (data['timestep'] >= start) & (data['timestep'] < stop)
        bin_data = data[in_window]
        # Do not rebind the axes array to scatterplot's return value: that
        # made the lookup for the next row fail.
        sns.scatterplot(x="x1", y="x2", hue='y', data=bin_data, ax=axes[bin_index])
    fig.show()
    

In [None]:
# Reduce the standardized features to two dimensions with t-SNE, attach the
# extra columns of df_prequential, and preview the result.
df_pts = format_to_plot(X, reduce_tsne, df_prequential)
df_pts.head()


In [None]:
# Drop rows outside the 1%-99% quantile range of x1/x2, then report how many
# rows remain.
df_pts = remove_outliers(df_pts)
print(len(df_pts))
df_pts.head()

In [None]:
# IPython shell escape: delete the PNG files under logs/<dataset>/.
!rm logs/{dataset}/*.png

In [None]:
# Render one PNG per fold showing which points are usable for training,
# still waiting for their label, or in the next test fold.
margim = 1
# Fixed axis limits so every frame shares the same view.
x1lim = (df_pts['x1'].min() - margim, df_pts['x1'].max() + margim)
x2lim = (df_pts['x2'].min() - margim, df_pts['x2'].max() + margim)
seconds_by_day = 24 * 60 * 60
verification_latency = 90 * seconds_by_day  # seconds
fold_size = 50
# One placeholder row per label, concatenated first in every frame so all
# three labels are always present and appear in the same order (presumably
# to keep the label-to-color mapping stable between frames).
df_placeholder = df_pts[:1].copy()
placeholder_values = ['wait', 'train', 'test']
df_placeholder = pd.concat([df_placeholder] * len(placeholder_values))
df_placeholder['target'] = placeholder_values
for i in range(fold_size, len(df_pts)-fold_size, fold_size):
    df_train = df_pts[:i].copy()
    df_test = df_pts[i:i + fold_size].copy()
    train_timestamp = df_train['timestamp'].max()
    # A row counts as 'train' when its timestamp_fix is not later than the
    # newest training timestamp, or when it is at least verification_latency
    # older than that timestamp; every other row is 'wait'. Vectorized
    # equivalent of the former row-wise apply.
    usable_for_training = (
        (df_train['timestamp_fix'] <= train_timestamp)
        | (df_train['timestamp'] <= train_timestamp - verification_latency)
    )
    df_train['target'] = np.where(usable_for_training, 'train', 'wait')
    df_test['target'] = 'test'
    df_data = pd.concat([df_placeholder, df_train, df_test])
    plot_by_label(df_data, title='{} {:05d}'.format(dataset, i), filename='logs/{}/{:05d}.png'.format(dataset, i), xlim=x1lim, ylim=x2lim)

In [None]:
#plot_drifts_by_label(df_pts)

In [None]:
#plot_by_label(df_pts)

In [None]:
#df_cluster = df_pts.copy()
#df_cluster = df_cluster[df_cluster['x1'] >= 5]
#df_cluster = df_cluster[df_cluster['x1'] <= 25]
#df_cluster = df_cluster[df_cluster['x2'] >= 60]
#df_cluster = df_cluster[df_cluster['x2'] <= 78]
#df_cluster

In [None]:
# Disabled inspection of df_cluster (the cell that builds it is commented out).
# A stray `a` in front of the first line made this cell raise NameError.
#print(len(df_cluster))
#print(df_cluster['author_email'].unique())
#print(df_cluster['author_date'].max(), df_cluster['author_date'].min())
#print(df_cluster['fix'].unique())