In [my last Colaboratory notebook](https://colab.research.google.com/drive/1tm9mnmoF1mxPIiyJo1gp9xfCxBxJcrfu) I drew up some functions for wrangling structured datasets. An extension of this method could be to incorporate a function that evaluates columns in a dataframe to identify the presence of date or time series data. In this notebook we'll create this function to automate the identification of time series data and update our automunge(.) function to include this new category of data.




# 1) Import data pre-processing functions from last notebook

In [0]:
#imports
import numpy as np
import pandas as pd

In [0]:
#process_numerical_class(mdf_train, mdf_test, column)
#function to normalize data to mean of 0 and standard deviation of 1 from training distribution
#takes as argument pandas dataframes of training and test data (mdf_train), (mdf_test)\
#and the name of the column string ('column') 
#replaces missing or improperly formatted data with mean of remaining values
#replaces original specified column in dataframe
#returns transformed dataframe

#expect this approach works better when the numerical distribution is thin tailed
#if only have training but not test data handy, use same training data for both dataframe inputs

#imports
from pandas import Series
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

def process_numerical_class(mdf_train, mdf_test, column):
  """Standardize a numeric column using statistics of the training set.

  Cells that cannot be parsed as numbers become NaN and are then replaced
  with the training mean. Both frames are centered on the training mean and
  scaled by the training standard deviation, so the test set is transformed
  with exactly the same parameters as the training set.

  Args:
    mdf_train: pandas dataframe holding the training data.
    mdf_test: pandas dataframe holding the test data (pass the training
      frame again when no test data is at hand).
    column: name of the column to transform.

  Returns:
    The tuple (mdf_train, mdf_test). The column is overwritten in place in
    the frames that were passed in.
  """

  #convert all values to either numeric or NaN
  mdf_train[column] = pd.to_numeric(mdf_train[column], errors='coerce')
  mdf_test[column] = pd.to_numeric(mdf_test[column], errors='coerce')

  #the mean is taken from the training data only
  mean = mdf_train[column].mean()

  #plug missing cells with the training mean, then center both sets on it
  mdf_train[column] = mdf_train[column].fillna(mean) - mean
  mdf_test[column] = mdf_test[column].fillna(mean) - mean

  #standard deviation of the (plugged and centered) training data
  std = mdf_train[column].std()

  #a constant column has a std of 0 and a single-row column a std of NaN;
  #dividing by either would turn every cell into NaN or inf, so in that
  #case the centered values are returned unscaled
  if pd.notnull(std) and std != 0:
    mdf_train[column] = mdf_train[column] / std
    mdf_test[column] = mdf_test[column] / std

  return mdf_train, mdf_test
  

  
#process_binary_class(mdf, column, missing)
#converts binary classification values to 0 or 1
#takes as arguement a pandas dataframe (mdf), \
#the name of the column string ('column') \
#and the string classification to assign to missing data ('missing')
#replaces original specified column in dataframe
#returns transformed dataframe

#missing category must be identical to one of the two existing categories
#returns error message if more than two categories remain


def process_binary_class(mdf, column, missing):
  """Encode a two-category column as 0/1.

  Missing cells are first filled with the `missing` classification. If the
  column then holds more than two distinct values an error message is
  printed and the frame is returned with only the fill applied; otherwise
  the column is replaced by the output of sklearn's LabelBinarizer.

  Args:
    mdf: pandas dataframe containing the column.
    column: name of the column to transform.
    missing: classification assigned to missing cells.

  Returns:
    The same dataframe object, modified in place.
  """

  #plug the gaps before counting categories
  mdf[column] = mdf[column].fillna(missing)

  category_count = len(mdf[column].unique())

  if category_count <= 2:
    #at most two classifications remain, safe to binarize
    binarizer = preprocessing.LabelBinarizer()
    mdf[column] = binarizer.fit_transform(mdf[column])
  else:
    #too many classifications, leave the column as is and report it
    print('ERROR: number of categories in column for process_binary_class() call >2')

  return mdf

  
#process_text_class(mdf_train, mdf_test, column)
#preprocess column with text classifications
#takes as arguement two pandas dataframe containing training and test data respectively 
#(mdf_train, mdf_test), and the name of the column string ('column')

#note this trains both training and test data simultaneously due to unique treatment if any category
#missing from training set but not from test set to ensure consistent formatting 

#deletes the original column from master dataframe and
#replaces with onehot encodings
#with columns named after column_ + text classifications
#missing data replaced with category label '_missing' (its column is named column + '__missing')
#any categories missing from the training set removed from test set
#any category present in training but missing from test set given a column of zeros for consistent formatting
#ensures order of all new columns consistent between both sets
#returns two transformed dataframe (mdf_train, mdf_test)

#if only have training but not test data handy, use same training data for both dataframe inputs


def process_text_class(mdf_train, mdf_test, column):
  """One-hot encode a categorical column consistently for train and test.

  Missing cells are treated as their own category, '_missing'. Every
  category found in the training set becomes an indicator column named
  column + '_' + category, in sorted category order. The test set receives
  exactly the same indicator columns in the same order: categories that
  only occur in the test set are dropped, and categories that only occur
  in the training set become all-zero columns.

  The indicator columns are built on the index of the frame they belong
  to, so rows stay aligned even when a frame does not carry the default
  0..n-1 index.

  Args:
    mdf_train: pandas dataframe holding the training data.
    mdf_test: pandas dataframe holding the test data (pass the training
      frame again when no test data is at hand).
    column: name of the column to encode.

  Returns:
    The tuple (mdf_train, mdf_test) as new dataframes, with the indicator
    columns placed first and the original column removed.
  """

  #replace NA with a dummy category so missing cells get their own indicator
  train_values = mdf_train[column].fillna('_missing')
  test_values = mdf_test[column].fillna('_missing')

  #one indicator column per category, named column_category and sorted by
  #category; cast to float so the values are 0.0 / 1.0 in every pandas version
  df_train_cat = pd.get_dummies(train_values, prefix=column).astype(float)
  df_test_cat = pd.get_dummies(test_values, prefix=column).astype(float)

  #force the test indicators onto the training layout: adds all-zero columns
  #for categories unseen in test, drops categories unseen in train, and
  #puts the columns in the training order
  df_test_cat = df_test_cat.reindex(columns=df_train_cat.columns, fill_value=0.0)

  #indicator columns first, followed by the remaining columns of each set
  mdf_train = pd.concat([df_train_cat, mdf_train.drop(column, axis=1)], axis=1)
  mdf_test = pd.concat([df_test_cat, mdf_test.drop(column, axis=1)], axis=1)

  return mdf_train, mdf_test
  


# 2) Define process_time_class(.) function

In [0]:
#process_time_class(mdf_train, mdf_test, column)
#preprocess column with time classifications
#takes as arguement two pandas dataframe containing training and test data respectively 
#(mdf_train, mdf_test), and the name of the column string ('column')

#note this trains both training and test data simultaneously due to unique treatment if any category
#missing from training set but not from test set to ensure consistent formatting 

#deletes the original column from master dataframe and
#replaces with distinct columns for year, month, day, hour, minute, second
#each normalized to the mean and std, with missing values plugged with the mean
#with columns named after column_ + time category
#returns two transformed dataframe (mdf_train, mdf_test)

#if only have training but not test data handy, use same training data for both dataframe inputs

import datetime as dt

def process_time_class(mdf_train, mdf_test, column):
  """Split a date/time column into standardized component columns.

  The column is parsed with pd.to_datetime (unparseable cells become NaT)
  and replaced by six columns named column + '_year', '_month', '_day',
  '_hour', '_minute' and '_second'. Each component is plugged with the
  training mean where missing, centered on the training mean and divided
  by the training standard deviation; any NaN left after that (for example
  0/0 on a component that is constant in the training set) is set to 0.

  A year, month or day column that contains an infinite value in the
  training set is removed from both sets.

  Args:
    mdf_train: pandas dataframe holding the training data.
    mdf_test: pandas dataframe holding the test data (pass the training
      frame again when no test data is at hand).
    column: name of the column to transform.

  Returns:
    The tuple (mdf_train, mdf_test). The frames that were passed in are
    modified in place.
  """

  #apply pd.to_datetime to column, note that the errors = 'coerce' needed for messy data
  mdf_train[column] = pd.to_datetime(mdf_train[column], errors = 'coerce')
  mdf_test[column] = pd.to_datetime(mdf_test[column], errors = 'coerce')

  for unit in ('year', 'month', 'day', 'hour', 'minute', 'second'):
    newcolumn = column + '_' + unit

    #raw component values, NaN wherever the timestamp could not be parsed
    train_values = getattr(mdf_train[column].dt, unit)
    test_values = getattr(mdf_test[column].dt, unit)

    #mean and standard deviation come from the training data only
    mean = train_values.mean()
    std = train_values.std()

    #plug missing cells with the mean, normalize, then replace NaN with 0
    mdf_train[newcolumn] = ((train_values.fillna(mean) - mean) / std).fillna(0)
    mdf_test[newcolumn] = ((test_values.fillna(mean) - mean) / std).fillna(0)

  #this is to address an issue found when parsing columns with only time and
  #no date, which returned -inf values. Each date component is tested on
  #its own column, and the whole training column is inspected so the check
  #does not depend on the index labels or on which row comes first
  for unit in ('year', 'month', 'day'):
    newcolumn = column + '_' + unit
    if np.isinf(mdf_train[newcolumn]).any():
      del mdf_train[newcolumn]
      if newcolumn in mdf_test.columns:
        del mdf_test[newcolumn]

  #delete original column from training and test data
  del mdf_train[column]
  if column in mdf_test.columns:
    del mdf_test[column]

  return mdf_train, mdf_test

# 3) Define evalcategory(.) and automunge(.) functions

In [0]:
#evalcategory(df, column)
#Function that takes as input a dataframe and associated column id \
#evaluates the contents of cells and classifies the column into one of four categories
#category 1, 'binary', is for columns with only two categorys of text or integer
#category 2, 'number', is for columns with numerical integer or float values
#category 3, 'text', is for columns with multiple categories appropriate for one-hot
#category 4, 'date', is for columns with Timestamp data
#returns category id as a string

import collections
import datetime as dt

def evalcategory(df, column):
  """Classify a dataframe column as 'binary', 'number', 'text' or 'date'.

  The decision is based on the most common python type among the cells of
  the column and on the number of distinct non-missing values:

    - strings or integers with exactly two distinct values -> 'binary'
    - floats, or integers otherwise                        -> 'number'
    - anything else whose cells mostly parse as timestamps -> 'date'
    - everything remaining (and an empty column)            -> 'text'

  Args:
    df: pandas dataframe containing the column.
    column: name of the column to evaluate.

  Returns:
    The category id as a string.
  """

  values = df[column]

  #tally the type of every cell. Iterating the column itself (instead of
  #df.iterrows()) yields plain python scalars for numeric columns, so the
  #result does not depend on the dtypes of the other columns in the frame
  type_counts = collections.Counter(type(value) for value in values)

  #nothing to evaluate in an empty column, fall back to the default class
  if not type_counts:
    return 'text'

  common_type = type_counts.most_common(1)[0][0]
  distinct = values.nunique()

  #np.integer / np.floating are included so numpy scalar types are
  #recognized as numbers as well
  is_string = issubclass(common_type, str)
  is_integer = issubclass(common_type, (int, np.integer))
  is_float = issubclass(common_type, (float, np.floating))

  #two distinct string or integer values make a binary column
  if (is_string or is_integer) and distinct == 2:
    return 'binary'

  #remaining numeric columns are numbers; this is decided before the date
  #test because pd.to_datetime turns any integer into a Timestamp
  if is_float or is_integer:
    return 'number'

  #check for time series: parse each cell on its own and look at the most
  #common result type (NaT for cells that are not dates)
  date_counts = collections.Counter(
      type(pd.to_datetime(value, errors = 'coerce')) for value in values)
  if issubclass(pd.Timestamp, date_counts.most_common(1)[0][0]):
    return 'date'

  #default is text class
  return 'text'

#automunge(df_train, df_test, labels_column, valpercent=0.20)
#Function that when fed a train and test data set automates the process \
#of evaluating each column for determination and application of appropriate preprocessing.
#Takes as argument pandas dataframes of training and test data (df_train), (df_test)\
#the name of the column from train set containing labels, \
#a value identifying the labels column from train dataset, \
#and a value for percent of training data to be applied to a validation set.
#Based on an evaluation of columns selectively applies one of four preprocessing functions to each.
#Shuffles the data and splits the training set into train and validation sets.
#Returns following sets as numpy arrays: train, labels, validation, validationlabels, test

#Note that this approach assumes that the test data is available at time of training
#A different approach may be required if processing of test data is not simultaneous
#although one potential solution is to apply this function initially with a dummy\
#dataframe for test set and then when test data becomes available reapply \
#with original train set used for training the model along with the test set

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

def automunge(df_train, df_test, labels_column, valpercent=0.20):
  """Preprocess train and test sets and split off a validation set.

  Every column of the training set is classified with evalcategory() and
  transformed with the matching process_*_class() function; the test set
  is transformed alongside it. The labels column is taken out of the
  training set and processed the same way. All sets are then converted to
  numpy arrays and shuffled, and the training data and labels are split
  into training and validation parts.

  Args:
    df_train: pandas dataframe of training data, including the labels.
    df_test: pandas dataframe of test data with the same columns as
      df_train except for the labels column.
    labels_column: name of the column in df_train that holds the labels.
    valpercent: fraction of the training data assigned to validation.

  Returns:
    The tuple (train, labels, validation, validationlabels, test) of numpy
    arrays. If the columns of the two sets do not match, a message is
    printed and None is returned.
  """

  #work on copies so the dataframes of the caller are left untouched
  df_train = df_train.copy()
  df_test = df_test.copy()

  #extract labels from train set
  df_labels = pd.DataFrame(df_train[labels_column])
  del df_train[labels_column]


  #confirm consistency of train and test sets

  #check number of columns is consistent
  if df_train.shape[1] != df_test.shape[1]:
    print("error, different number of columns in train and test sets")
    return

  #check column headers are consistent (this works independent of order)
  if set(df_train) != set(df_test):
    print("error, different column labels in the train and test set")
    return

  #sort columns alphabetically to ensure same order in both sets; axis=1
  #sorts the column headers (axis=0 would reorder the rows instead, which
  #would break the row correspondence with the labels extracted above)
  df_train = df_train.sort_index(axis=1)
  df_test = df_test.sort_index(axis=1)


  #For each column, determine appropriate processing function
  #processing function will be based on evaluation of train set
  #(iterate over a snapshot of the headers, the frames change in the loop)
  for column in list(df_train):

    category = evalcategory(df_train, column)

    if category == 'binary':
      #use the majority field of this column as the missing plug value
      binary_missing_plug = df_train[column].value_counts().index.tolist()[0]
      df_train = process_binary_class(df_train, column, binary_missing_plug)
      df_test = process_binary_class(df_test, column, binary_missing_plug)

    elif category == 'number':
      df_train, df_test = process_numerical_class(df_train, df_test, column)

    elif category == 'text':
      df_train, df_test = process_text_class(df_train, df_test, column)

    elif category == 'date':
      df_train, df_test = process_time_class(df_train, df_test, column)


  #determine labels category and apply appropriate function
  labelscategory = evalcategory(df_labels, labels_column)

  #the preprocessing functions expect a train and a test frame; labels have
  #no test counterpart, so a copy of the labels stands in as the test frame
  labelsdummy = df_labels.copy()

  if labelscategory == 'binary':
    #majority label is the plug value for missing labels
    labels_missing_plug = df_labels[labels_column].value_counts().index.tolist()[0]
    df_labels = process_binary_class(df_labels, labels_column, labels_missing_plug)

  elif labelscategory == 'number':
    df_labels, labelsdummy = process_numerical_class(df_labels, labelsdummy, labels_column)

  #it occurs to me there might be an argument for preferring a single numerical \
  #classifier for labels to keep this to a single column, if so scikitlearn's \
  #LabelEncoder could be used here, will assume that onehot encoding is acceptable
  elif labelscategory == 'text':
    df_labels, labelsdummy = process_text_class(df_labels, labelsdummy, labels_column)


  #the data is processed, now a few more global training preps

  #convert all of our dataframes to numpy arrays (train, test, and labels)
  np_train = df_train.values
  np_test = df_test.values
  np_labels = df_labels.values


  #set randomness seed number
  answer = 42

  #shuffle the sets; train and labels have the same length and use the same
  #seed, so they receive the same permutation and stay aligned
  np_train = shuffle(np_train, random_state = answer)
  np_test = shuffle(np_test, random_state = answer)
  np_labels = shuffle(np_labels, random_state = answer)


  #split validation sets from training and labels
  train, validation, labels, validationlabels = train_test_split(
      np_train, np_labels, test_size=valpercent, random_state=answer)
  test = np_test

  return train, labels, validation, validationlabels, test




## 4) Test Functions

In [0]:
#sample train and test data for demonstration purposes

#train data set: one tuple per row, paired with the column keys to give a
#list of dictionaries
train = [
    dict(zip(('number', 'Y/N', 'shape', 'date', 'label'), cells))
    for cells in (
        (1, 'Y', 'circle', '2/12/18', 'cat'),
        (2, 'N', 'square', 'August 12, 2016', 'dog'),
        (None, 'Y', 'circle', None, 'cat'),
        (3.1, None, 'square', 'July 4, 2016', 'cat'),
        (-1, 'N', None, 'Jul 4, 2018', 'dog'),
        ('Q', 'N', 'oval', '2015', 'dog'),
    )
]

#pandas dataframe of the train data
df_train = pd.DataFrame(train)

#test data set, built the same way but without the label
test = [
    dict(zip(('number', 'Y/N', 'shape', 'date'), cells))
    for cells in (
        (2.1, 'N', 'square', '4/14/18'),
        (-1, 'N', None, 'August 12, 2016'),
        (1, 'Y', 'circle', 'July 4, 2018'),
        (None, 'Y', 'square', None),
        (3, None, 'circle', 'Aug 31, 2018'),
        (0, 'N', 'octogon', '2017'),
        ('Q', 'Y', 'square', 'Jan 1, 2019'),
    )
]

#pandas dataframe of the test data
df_test = pd.DataFrame(test)

In [18]:
df_train

Unnamed: 0,Y/N,date,label,number,shape
0,Y,2/12/18,cat,1,circle
1,N,"August 12, 2016",dog,2,square
2,Y,,cat,,circle
3,,"July 4, 2016",cat,3.1,square
4,N,"Jul 4, 2018",dog,-1,
5,N,2015,dog,Q,oval


In [0]:
#run automunge on the sample data, with 'label' as the labels column

(train, labels, validation, validationlabels, test) = automunge(
    df_train, df_test, labels_column = 'label')



In [20]:
train

array([[ 0.        ,  0.        ,  0.        ,  1.        ,  0.        ,
         1.35222288, -0.4472136 ,  0.6172134 , -0.5118745 ,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  1.        ,  0.        ,  0.        ,
         0.        , -1.19256959, -1.2344268 , -1.10249892,  0.        ,
         0.        ,  0.        ],
       [ 1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        -1.6856477 ,  1.04349839,  0.6172134 , -0.5118745 ,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  1.        ,  0.        ,  0.        ,  1.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ]])

In [21]:
labels

array([[0],
       [1],
       [1],
       [0]])

In [22]:
validation

array([[ 0.        ,  1.        ,  0.        ,  0.        ,  1.        ,
        -0.20375961,  1.04349839, -0.9258201 ,  1.06312396,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  1.        ,  0.        ,
         0.53718443, -0.4472136 ,  0.9258201 ,  1.06312396,  0.        ,
         0.        ,  0.        ]])

In [23]:
validationlabels

array([[0],
       [1]])

In [24]:
test

array([[ 0.        ,  0.        ,  0.        ,  1.        ,  0.        ,
         0.61127884,  1.04349839, -0.3086067 ,  1.45687358,  0.        ,
         0.        ,  0.        ],
       [ 1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        -1.6856477 , -0.4472136 ,  0.9258201 ,  1.06312396,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        -0.94470366,  0.2981424 , -1.2344268 , -1.10249892,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  1.        ,  0.        ,  0.        ,  1.        ,
        -0.20375961,  1.04349839,  0.6172134 , -0.5118745 ,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  1.        ,  0.        ,  0.        ,  0.        ,
         1.27812848,  1.04349839,  0.9258201 ,  4.80374531,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  1.        ,  1.        ,
         0.        ,  