# Feature Extraction

Import library and read local cpu_ffmpeg.csv

In [1]:

import pandas as pd
import numpy as np
import importlib
from scipy import stats
from skfeature.utility import *
from sklearn.feature_selection import VarianceThreshold, mutual_info_classif, SelectKBest, chi2, f_classif
from sklearn.preprocessing import MinMaxScaler

In [2]:

pwd = "C:/Users/42times28/Desktop/AI506/AMD/Assignment2/"
inputDir = "Input/"
processedDir = "Preprocessed/"
featuresDir = "Features/"
#filename = "cpu-data.csv"
filename = "df_latest_nj_8bench_mem.xlsx"

In [3]:
from IPython.display import display, HTML

CSS = """
.output {
    flex-direction: row;
}
"""

HTML('<style>{}</style>'.format(CSS))

In [4]:
def load_excel_to_dataframe(filename = ""):
    if filename == "":
        from tkinter import Tk
        from tkinter.filedialog import askopenfilename

        Tk().withdraw()
        filename = askopenfilename()
        
    df = pd.read_excel(filename)
    return df

In [5]:
def drop_string_datetime_na(dataframe):
    timestamp = dataframe['timestamp']
    dataframe = dataframe[list(dataframe.T[(dataframe.dtypes==np.float64) | (dataframe.dtypes==np.int64)].index) + ["host", "uuid"]]
    dataframe[dataframe.T[dataframe.dtypes==np.int64].index] = dataframe[dataframe.T[dataframe.dtypes==np.int64].index].astype(float)
    dataframe = dataframe.dropna(axis=1, how='all')
    nunique = dataframe.nunique()
    cols_to_drop = nunique[nunique == 1].index
    dataframe = dataframe.drop(list(cols_to_drop) + ["Unnamed: 0"], axis=1)
    return dataframe, timestamp

In [6]:

def removeCorrelatedFeatures(dataframe):
    sel = VarianceThreshold(threshold=0.01)
    sel.fit_transform(dataframe)
    quasi_constant = [col for col in dataframe.columns if col not in sel.get_feature_names_out()]
    train = dataframe[sel.get_feature_names_out()]
    corr_matrix = train.corr()
    corr_features = [feature for feature in corr_matrix.columns if (corr_matrix[feature].iloc[:corr_matrix.columns.get_loc(feature)] > 0.8).any()]
    dataframe = dataframe.drop(quasi_constant + corr_features, axis=1)
    features = dataframe.columns
    nparray = MinMaxScaler().fit_transform(dataframe)
    return pd.DataFrame(nparray, columns = features)


    
def selectFeaturesFunc(dataframe, y, function_name):
    module = importlib.import_module('sklearn.feature_selection')
    func = getattr(module, function_name)
    func_features = SelectKBest(func, k = len(dataframe.columns) // 4 + 2)
    X_kbest_features = func_features.fit(dataframe.to_numpy(), y)
    return X_kbest_features.get_feature_names_out(dataframe.columns).tolist()


In [7]:
def applyFeatureSelection(dataframe):
    #benchmarks = dataframe['host']
    #platforms = dataframe['uuid']
    #features = list(dataframe.columns.values)

    y = dataframe['host']
    dataframe = dataframe.drop(['host', 'uuid'], axis=1)
    uncorr_df = removeCorrelatedFeatures(dataframe)
    selectedFeatures = []
    
    selectedFeatures += selectFeaturesFunc(uncorr_df, y, "mutual_info_classif")
    selectedFeatures += selectFeaturesFunc(uncorr_df, y, "chi2")
    selectedFeatures += selectFeaturesFunc(uncorr_df, y, "f_classif")
    
    selectedFeatures = np.unique(selectedFeatures)

    dataframe = dataframe[selectedFeatures]
    nparray = MinMaxScaler().fit_transform(dataframe)
    dataframe = pd.DataFrame(nparray, columns = selectedFeatures)
    return dataframe, y
    

In [8]:
#df = load_excel_to_dataframe()
df = load_excel_to_dataframe(pwd + inputDir + filename)
df, timestamp = drop_string_datetime_na(df)
df, y = applyFeatureSelection(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe[dataframe.T[dataframe.dtypes==np.int64].index] = dataframe[dataframe.T[dataframe.dtypes==np.int64].index].astype(float)
  f = msb / msw


In [9]:
timestamp.info()

<class 'pandas.core.series.Series'>
RangeIndex: 47520 entries, 0 to 47519
Series name: timestamp
Non-Null Count  Dtype         
--------------  -----         
47520 non-null  datetime64[ns]
dtypes: datetime64[ns](1)
memory usage: 371.4 KB


In [10]:
df['timestamp'] = timestamp
df['id'] = y

In [11]:
#temp = list(y.unique())
#codes = dict(zip(temp,range(len(temp))))
#print(codes)
#df['id'] = y.map(codes)

In [12]:
df.info()
print(np.unique(df['Active(file)']))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47520 entries, 0 to 47519
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   Active(file)        47520 non-null  float64       
 1   Inactive(anon)      47520 non-null  float64       
 2   Inactive(file)      47520 non-null  float64       
 3   anonpages           47520 non-null  float64       
 4   buffers             47520 non-null  float64       
 5   committed_as        47520 non-null  float64       
 6   directmap4k         47520 non-null  float64       
 7   kernelstack         47520 non-null  float64       
 8   memory_utilization  47520 non-null  float64       
 9   swaptotal           47520 non-null  float64       
 10  unevictable         47520 non-null  float64       
 11  timestamp           47520 non-null  datetime64[ns]
 12  id                  47520 non-null  object        
dtypes: datetime64[ns](1), float64(11), object(1)
m

In [13]:
from tsfresh import extract_features
from tsfresh.feature_extraction import ComprehensiveFCParameters, MinimalFCParameters
from tsfresh.feature_extraction.settings import from_columns

def drop_nunique_na(dataframe):
    dataframe = dataframe.dropna(axis=1, how='all')
    nunique = dataframe.nunique()
    cols_to_drop = nunique[nunique == 1].index
    dataframe = dataframe.drop(list(cols_to_drop), axis=1)
    return dataframe

settings = ComprehensiveFCParameters()
output_matrix = pd.DataFrame(index=df.columns)
features = list(df.columns)

for feature in features:
    if feature == 'id' or feature == 'timestamp':
        continue
    print(feature)
    X = extract_features(df[[feature, 'id', 'timestamp']], column_id='id', column_sort='timestamp', n_jobs=16,
                         default_fc_parameters=settings)

    reduced_X = drop_nunique_na(X)
    uncorr_X = removeCorrelatedFeatures(reduced_X)
    output_matrix = pd.concat([output_matrix, uncorr_X], axis=1, sort=False)


Active(file)


Feature Extraction: 100%|████████████████████████████████████████████████████████████████| 8/8 [01:07<00:00,  8.50s/it]


Inactive(anon)


Feature Extraction: 100%|████████████████████████████████████████████████████████████████| 8/8 [01:24<00:00, 10.59s/it]


Inactive(file)


Feature Extraction: 100%|████████████████████████████████████████████████████████████████| 8/8 [01:28<00:00, 11.05s/it]


anonpages


Feature Extraction: 100%|████████████████████████████████████████████████████████████████| 8/8 [01:29<00:00, 11.21s/it]


buffers


Feature Extraction: 100%|████████████████████████████████████████████████████████████████| 8/8 [01:32<00:00, 11.57s/it]


committed_as


Feature Extraction: 100%|████████████████████████████████████████████████████████████████| 8/8 [01:41<00:00, 12.70s/it]


directmap4k


Feature Extraction: 100%|████████████████████████████████████████████████████████████████| 8/8 [01:48<00:00, 13.50s/it]


kernelstack


Feature Extraction: 100%|████████████████████████████████████████████████████████████████| 8/8 [02:03<00:00, 15.38s/it]


memory_utilization


Feature Extraction: 100%|████████████████████████████████████████████████████████████████| 8/8 [02:01<00:00, 15.20s/it]


swaptotal


Feature Extraction: 100%|████████████████████████████████████████████████████████████████| 8/8 [02:08<00:00, 16.01s/it]


unevictable


Feature Extraction: 100%|████████████████████████████████████████████████████████████████| 8/8 [01:25<00:00, 10.71s/it]


In [14]:
output_matrix

Unnamed: 0,Active(file)__has_duplicate_max,Active(file)__has_duplicate_min,Active(file)__sum_values,Active(file)__length,Active(file)__variation_coefficient,Active(file)__skewness,Active(file)__kurtosis,Active(file)__absolute_sum_of_changes,Active(file)__longest_strike_below_mean,Active(file)__longest_strike_above_mean,...,unevictable__length,unevictable__skewness,unevictable__kurtosis,unevictable__longest_strike_below_mean,unevictable__longest_strike_above_mean,unevictable__last_location_of_minimum,unevictable__benford_correlation,unevictable__large_standard_deviation__r_0.30000000000000004,"unevictable__fft_coefficient__attr_""angle""__coeff_14",unevictable__value_count__value_0
0,0.0,0.0,0.959124,0.0,0.097039,0.459274,0.060115,0.088697,0.089256,0.138288,...,0.0,0.0,1.0,0.00119,0.498608,1.0,0.078118,0.0,1.0,0.0
1,0.0,0.0,0.803208,0.0,0.137893,0.880596,0.013148,0.003879,0.728643,0.497716,...,0.0,1.0,0.003149,1.0,0.0,1.0,0.078118,1.0,0.602877,0.0
2,0.0,0.0,0.352327,0.0,0.493815,0.77904,0.007171,1.0,0.0,0.0,...,0.0,1.0,0.003149,0.0,0.0,1.0,,0.0,0.0,1.0
3,1.0,0.0,0.011098,1.0,1.0,0.841689,0.003272,0.00148,1.0,0.519037,...,1.0,0.999904,2.2e-05,0.859127,1.0,0.002233,0.0,0.0,0.499721,0.0
4,0.0,0.0,0.0,1.0,0.261139,1.0,0.08818,0.000106,0.889687,1.0,...,1.0,1.0,0.003149,0.0,0.0,1.0,0.078118,0.0,0.0,0.0
5,0.0,0.0,0.00331,0.0,0.038947,0.0,1.0,0.000185,0.153147,0.429485,...,0.0,0.995134,0.0,0.482738,0.604872,0.0,0.567532,0.0,0.494218,0.0
6,1.0,1.0,0.005884,0.0,0.183637,0.673174,0.0,3e-05,0.370184,0.954005,...,0.0,0.999388,3.2e-05,0.499008,0.585847,0.003374,1.0,0.0,0.501273,0.0
7,0.0,0.0,1.0,0.0,0.0,0.767505,0.003056,0.0,0.572386,0.696619,...,0.0,1.0,0.003149,1.0,0.0,1.0,0.078118,1.0,0.602877,0.0


In [None]:
#constructed_features = from_columns(uncorr_X)
#constructed_features = constructed_features[feature]

#reduced_X = extract_features(df[[feature, 'id', 'timestamp']], column_id='id', column_sort='timestamp', n_jobs=16,
#                         default_fc_parameters=constructed_features)