<a href="https://colab.research.google.com/github/claredavies/AutomaticLearning/blob/main/MachineLearningDataPreparation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import Libraries

In [44]:
from scipy.io import arff
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

RANDOM_SEED = 0

Function to read in arff

# Functions

In [2]:
def read_arff_as_df_scipy( filename ):
 # input : name of the file where the dataset is stored
 # output : a dataframe
 dataset = arff.loadarff(filename)
 df = pd.DataFrame( dataset[0] , columns = dataset[1])
 return df

In [3]:
def return_columns_missing_values(df):
  return df.columns[df.isnull().any()]

In [15]:
def missing_value_column_percentage(dataframe, column_name):
  percent_missing = dataframe.loc[:, column_name].isnull().sum() * 100 / len(dataframe)
  return percent_missing

In [28]:
def convert_object_datatypes_to_categorical(dataframe):
  for column_name in dataframe:
    if(dataframe[column_name].dtype == object):
      dataframe[column_name] = dataframe[column_name].astype('category')
  return dataframe

In [35]:
def convert_categorical_to_integer(dataframe):
  cat_columns = dataframe.select_dtypes(['category']).columns
  dataframe[cat_columns] = dataframe[cat_columns].apply(lambda x: x.cat.codes)
  return dataframe

# Read in Data

In [46]:
df = read_arff_as_df_scipy('dataset_55_hepatitis.arff')

# Data Exploration

In [47]:
print(df.info)

<bound method DataFrame.info of       AGE        SEX STEROID ANTIVIRALS FATIGUE MALAISE ANOREXIA LIVER_BIG  \
0    30.0    b'male'   b'no'      b'no'   b'no'   b'no'    b'no'     b'no'   
1    50.0  b'female'   b'no'      b'no'  b'yes'   b'no'    b'no'     b'no'   
2    78.0  b'female'  b'yes'      b'no'  b'yes'   b'no'    b'no'    b'yes'   
3    31.0  b'female'    b'?'     b'yes'   b'no'   b'no'    b'no'    b'yes'   
4    34.0  b'female'  b'yes'      b'no'   b'no'   b'no'    b'no'    b'yes'   
..    ...        ...     ...        ...     ...     ...      ...       ...   
150  46.0  b'female'  b'yes'      b'no'  b'yes'  b'yes'   b'yes'    b'yes'   
151  44.0  b'female'  b'yes'      b'no'  b'yes'   b'no'    b'no'    b'yes'   
152  61.0  b'female'   b'no'      b'no'  b'yes'  b'yes'    b'no'     b'no'   
153  53.0    b'male'   b'no'      b'no'  b'yes'   b'no'    b'no'    b'yes'   
154  43.0  b'female'  b'yes'      b'no'  b'yes'   b'no'    b'no'    b'yes'   

    LIVER_FIRM SPLEEN_PALPABLE 

In [48]:
print(df.head)

<bound method NDFrame.head of       AGE        SEX STEROID ANTIVIRALS FATIGUE MALAISE ANOREXIA LIVER_BIG  \
0    30.0    b'male'   b'no'      b'no'   b'no'   b'no'    b'no'     b'no'   
1    50.0  b'female'   b'no'      b'no'  b'yes'   b'no'    b'no'     b'no'   
2    78.0  b'female'  b'yes'      b'no'  b'yes'   b'no'    b'no'    b'yes'   
3    31.0  b'female'    b'?'     b'yes'   b'no'   b'no'    b'no'    b'yes'   
4    34.0  b'female'  b'yes'      b'no'   b'no'   b'no'    b'no'    b'yes'   
..    ...        ...     ...        ...     ...     ...      ...       ...   
150  46.0  b'female'  b'yes'      b'no'  b'yes'  b'yes'   b'yes'    b'yes'   
151  44.0  b'female'  b'yes'      b'no'  b'yes'   b'no'    b'no'    b'yes'   
152  61.0  b'female'   b'no'      b'no'  b'yes'  b'yes'    b'no'     b'no'   
153  53.0    b'male'   b'no'      b'no'  b'yes'   b'no'    b'no'    b'yes'   
154  43.0  b'female'  b'yes'      b'no'  b'yes'   b'no'    b'no'    b'yes'   

    LIVER_FIRM SPLEEN_PALPABLE SP

Get number of samples and features and store

In [49]:
no_samples = len(df.index)
no_features = len(df.columns)
print(no_samples)
print(no_features)

155
20


Check which columns have missing values

In [50]:
columns_missing_values = return_columns_missing_values(df)
print(columns_missing_values)

Index(['BILIRUBIN', 'ALK_PHOSPHATE', 'SGOT', 'ALBUMIN', 'PROTIME'], dtype='object')


# Data Cleaning

Drop features with more than 40% of missing values

In [51]:
for column_name in columns_missing_values:
  column_missing_values_percentage = missing_value_column_percentage(df, column_name)
  if(column_missing_values_percentage > 40):
    df = df.drop(column_name, axis=1)

In [52]:
# drop missing values
df1 = df.dropna()
# fill missing values with median
df2 = df.fillna(df.median())

  after removing the cwd from sys.path.


 Transform categorical features into integer-valued features, using progressive integer labels. You have to do it
for both df1 and df2.

In [53]:
df1 = convert_object_datatypes_to_categorical(df1)
df2 = convert_object_datatypes_to_categorical(df2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [54]:
df1.dtypes

AGE                 float64
SEX                category
STEROID            category
ANTIVIRALS         category
FATIGUE            category
MALAISE            category
ANOREXIA           category
LIVER_BIG          category
LIVER_FIRM         category
SPLEEN_PALPABLE    category
SPIDERS            category
ASCITES            category
VARICES            category
BILIRUBIN           float64
ALK_PHOSPHATE       float64
SGOT                float64
ALBUMIN             float64
HISTOLOGY          category
Class              category
dtype: object

In [55]:
df1 = convert_categorical_to_integer(df1)
df2 = convert_categorical_to_integer(df2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]



# Normalisation

In [56]:
y1 = df1['Class']
X1 = df1.drop("Class", axis=1)

y2 = df2['Class']
X2 = df2.drop("Class", axis=1)

standardizer = StandardScaler()
X1 = standardizer.fit_transform(X1)
X2 = standardizer.fit_transform(X2)


# Feature Selection

In [69]:
import six
import sys
sys.modules['sklearn.externals.six'] = six

import joblib
sys.modules['sklearn.externals.joblib'] = joblib

In [77]:
from sklearn.neighbors import KNeighborsClassifier
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

knn = KNeighborsClassifier(n_neighbors=2) # ml_algo used = knn

sfs1 = SFS(knn,
k_features=5,
forward=True, # if forward = True then SFS otherwise SBS
floating=False,
verbose=2,
scoring='accuracy'
)

#after applying sfs fit the data:
sfs1.fit(X1, y1)
print(sfs1.k_feature_names_)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed:    0.2s finished

[2022-10-17 14:40:42] Features: 1/5 -- score: 0.8666666666666666[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  17 out of  17 | elapsed:    0.2s finished

[2022-10-17 14:40:42] Features: 2/5 -- score: 0.8666666666666666[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  16 out of  16 | elapsed:    0.2s finished

[2022-10-17 14:40:42] Features: 3/5 -- score: 0.8666666666666666[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 

('1', '3', '4', '7', '11')


[Parallel(n_jobs=1)]: Done  14 out of  14 | elapsed:    0.1s finished

[2022-10-17 14:40:42] Features: 5/5 -- score: 0.8666666666666666

# PCA (feature reduction)

In [None]:
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.3, random_state=RANDOM_SEED)
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.3, random_state=RANDOM_SEED)