In [1]:
import pandas as pd
import os
import glob
import io
import requests
import ssl
from sklearn import preprocessing
from sklearn.utils import shuffle

In [2]:
# dir paths
CONVERTED_DATA = '../data/data_converted'
PROCESSED_DATA = '../data/processed'

# Converted datastest
path_converted = os.path.abspath(CONVERTED_DATA)
csv_files_converted = glob.glob(os.path.join(path_converted, '*.csv'))
print(path_converted)
print(csv_files_converted)

# processed data sets
path_processed = os.path.abspath(PROCESSED_DATA)
csv_files_processed = glob.glob(os.path.join(path_processed, '*.csv'))
print(path_processed)
print(csv_files_processed)

/home/rusty/fun/Master-Thesis/data/data_converted
['/home/rusty/fun/Master-Thesis/data/data_converted/borovecki.csv', '/home/rusty/fun/Master-Thesis/data/data_converted/subramanian.csv', '/home/rusty/fun/Master-Thesis/data/data_converted/burczynski.csv', '/home/rusty/fun/Master-Thesis/data/data_converted/sorlie.csv', '/home/rusty/fun/Master-Thesis/data/data_converted/khan.csv', '/home/rusty/fun/Master-Thesis/data/data_converted/singh.csv', '/home/rusty/fun/Master-Thesis/data/data_converted/tian.csv', '/home/rusty/fun/Master-Thesis/data/data_converted/chiaretti.csv', '/home/rusty/fun/Master-Thesis/data/data_converted/christensen.csv', '/home/rusty/fun/Master-Thesis/data/data_converted/chin.csv', '/home/rusty/fun/Master-Thesis/data/data_converted/su.csv', '/home/rusty/fun/Master-Thesis/data/data_converted/shipp.csv', '/home/rusty/fun/Master-Thesis/data/data_converted/nakayama.csv', '/home/rusty/fun/Master-Thesis/data/data_converted/alon.csv', '/home/rusty/fun/Master-Thesis/data/data_conv

In [3]:
df_conv = pd.read_csv(csv_files_converted[0])
df_conv.head()

Unnamed: 0,data,label
0,0.36808,1
1,0.726225,0
2,0.212942,0
3,0.427461,0
4,0.596491,1


In [4]:
df_pros = pd.read_csv(csv_files_processed[4])
df_pros.head()

Unnamed: 0,x.GENE1,x.GENE2,x.GENE3,x.GENE4,x.GENE5,x.GENE6,x.GENE7,x.GENE8,x.GENE9,x.GENE10,...,x.GENE2300,x.GENE2301,x.GENE2302,x.GENE2303,x.GENE2304,x.GENE2305,x.GENE2306,x.GENE2307,x.GENE2308,y
0,0.773344,-2.438405,-0.482562,-2.721135,-1.217058,0.827809,1.342604,0.057042,0.133569,0.565427,...,-0.027474,-1.660205,0.588231,-0.463624,-3.952845,-5.496768,-1.414282,-0.6476,-1.763172,EWS
1,-0.078178,-2.415754,0.412772,-2.825146,-0.626236,0.054488,1.429498,-0.120249,0.456792,0.159053,...,-0.246284,-0.836325,-0.571284,0.034788,-2.47813,-3.661264,-1.093923,-1.20932,-0.824395,EWS
2,-0.084469,-1.649739,-0.241308,-2.875286,-0.889405,-0.027474,1.1593,0.015676,0.191942,0.496585,...,0.024985,-1.059872,-0.403767,-0.678653,-2.939352,-2.73645,-1.965399,-0.805868,-1.139434,EWS
3,0.965614,-2.380547,0.625297,-1.741256,-0.845366,0.949687,1.093801,0.819736,-0.28462,0.994732,...,0.357115,-1.893128,0.255107,0.163309,-1.021929,-2.077843,-1.127629,0.331531,-2.179483,EWS
4,0.075664,-1.728785,0.852626,0.272695,-1.84137,0.327936,1.251219,0.77145,0.030917,0.278313,...,0.061753,-2.273998,-0.039365,0.368801,-2.566551,-1.675044,-1.08205,-0.965218,-1.836966,EWS


In [5]:
def prepare_dataset_for_modeling_plane(dataset_name,
                                 pred_type,
                                 data_directory=None,
                                 na_values='?',
                                 n_samples_max=None,
                                 random_state=999,
                                 drop_const_columns=True,
                                 scale_data=True):


    if pred_type not in ['c', 'r']:
        raise ValueError("Prediction type needs to be either 'c' for classification or 'r' for regression.")

    if data_directory:
        # read in from local directory
        df = pd.read_csv(data_directory + dataset_name, na_values=na_values, header=0)
        print(f'DATA: {df.head()}')
    else:
        # read in the data file from GitHub into a Pandas data frame
        if (not os.environ.get('PYTHONHTTPSVERIFY', '') and
                getattr(ssl, '_create_unverified_context', None)):
            ssl._create_default_https_context = ssl._create_unverified_context
        github_location = 'https://raw.githubusercontent.com/vaksakalli/datasets/master/'
        dataset_url = github_location + dataset_name.lower()
        df = pd.read_csv(io.StringIO(requests.get(dataset_url).content.decode('utf-8')), na_values=na_values, header=0)

    # drop missing values before (any) sampling
    df = df.dropna()

    n_observations = df.shape[0]  # no. of observations in the dataset
    n_samples = n_observations  # initialization - no. of observations after (any) sampling
    if n_samples_max and (n_samples_max < n_observations):
        # do not sample more rows than what is in the dataset
        n_samples = n_samples_max
    df = shuffle(df, n_samples=n_samples, random_state=random_state)

    if drop_const_columns:
        df = df.loc[:, df.nunique() > 1]
    df = df.drop_duplicates(ignore_index=True)

    y = df.iloc[:, -1].values
    x = df.iloc[:, :-1]

    categorical_cols = x.columns[x.dtypes == object].tolist()

    print(f'\nnumber of nominal categorical descriptive features detected: {len(categorical_cols)}\n')

    for col in categorical_cols:
        n = len(x[col].unique())
        if n == 2:
            x[col] = pd.get_dummies(x[col], drop_first=True)
    x = pd.get_dummies(x).values

    if scale_data:
        x = preprocessing.MinMaxScaler().fit_transform(x)
        if pred_type == 'r':
            y = preprocessing.MinMaxScaler().fit_transform(y.reshape(-1, 1)).flatten()

    if pred_type == 'c':
        y = preprocessing.LabelEncoder().fit_transform(y)

    return x, y

In [6]:
cc = pd.read_csv(csv_files_processed[4])
cc.head()

Unnamed: 0,x.GENE1,x.GENE2,x.GENE3,x.GENE4,x.GENE5,x.GENE6,x.GENE7,x.GENE8,x.GENE9,x.GENE10,...,x.GENE2300,x.GENE2301,x.GENE2302,x.GENE2303,x.GENE2304,x.GENE2305,x.GENE2306,x.GENE2307,x.GENE2308,y
0,0.773344,-2.438405,-0.482562,-2.721135,-1.217058,0.827809,1.342604,0.057042,0.133569,0.565427,...,-0.027474,-1.660205,0.588231,-0.463624,-3.952845,-5.496768,-1.414282,-0.6476,-1.763172,EWS
1,-0.078178,-2.415754,0.412772,-2.825146,-0.626236,0.054488,1.429498,-0.120249,0.456792,0.159053,...,-0.246284,-0.836325,-0.571284,0.034788,-2.47813,-3.661264,-1.093923,-1.20932,-0.824395,EWS
2,-0.084469,-1.649739,-0.241308,-2.875286,-0.889405,-0.027474,1.1593,0.015676,0.191942,0.496585,...,0.024985,-1.059872,-0.403767,-0.678653,-2.939352,-2.73645,-1.965399,-0.805868,-1.139434,EWS
3,0.965614,-2.380547,0.625297,-1.741256,-0.845366,0.949687,1.093801,0.819736,-0.28462,0.994732,...,0.357115,-1.893128,0.255107,0.163309,-1.021929,-2.077843,-1.127629,0.331531,-2.179483,EWS
4,0.075664,-1.728785,0.852626,0.272695,-1.84137,0.327936,1.251219,0.77145,0.030917,0.278313,...,0.061753,-2.273998,-0.039365,0.368801,-2.566551,-1.675044,-1.08205,-0.965218,-1.836966,EWS


In [7]:
# dir paths
CONVERTED_DATA = '../data/data_converted'
PROCESSED_DATA = '../data/processed'

# Converted datastest
path_converted = os.path.abspath(CONVERTED_DATA)
csv_files_converted = glob.glob(os.path.join(path_converted, '*.csv'))
print(path_converted)
print(csv_files_converted)

# processed data sets
path_processed = os.path.abspath(PROCESSED_DATA)
csv_files_processed = glob.glob(os.path.join(path_processed, '*.csv'))
print(path_processed)
print(csv_files_processed)

def prepare_dataset_for_modeling(dataset_name,
                                 pred_type,
                                 data_directory=None,
                                 na_values='?',
                                 n_samples_max=None,
                                 random_state=999,
                                 drop_const_columns=True,
                                 scale_data=True):

    print(f'DATASET NAME: {dataset_name}')
    print(f'directory path: {data_directory}')

    if pred_type not in ['c', 'r']:
        raise ValueError("Prediction type needs to be either 'c' for classification or 'r' for regression.")

    if data_directory:
        if not data_directory.endswith('/'):
            data_directory += '/'
        df = pd.read_csv(data_directory + dataset_name, na_values=na_values, header=0)
        print(f'DATA: {df.head()}')
    print(f'DF LEN BEFORE DROP NA: {len(df)}')
    df = df.dropna()
    print(f'DF AFTER DROP NA: {len(df)}')

    n_observations = df.shape[0]  # no. of observations in the dataset
    n_samples = n_observations  # initialization - no. of observations after (any) sampling
    if n_samples_max and (n_samples_max < n_observations):
        # do not sample more rows than what is in the dataset
        n_samples = n_samples_max
    df = shuffle(df, n_samples=n_samples, random_state=random_state)

    if drop_const_columns:
        df = df.loc[:, df.nunique() > 1]
    df = df.drop_duplicates(ignore_index=True)

    y = df.iloc[:, -1].values
    x = df.iloc[:, :-1]

    categorical_cols = x.columns[x.dtypes == object].tolist()

    print(f'\nnumber of nominal categorical descriptive features detected: {len(categorical_cols)}\n')

    for col in categorical_cols:
        n = len(x[col].unique())
        if n == 2:
            x[col] = pd.get_dummies(x[col], drop_first=True)
    x = pd.get_dummies(x).values

    if scale_data:
        x = preprocessing.MinMaxScaler().fit_transform(x)
        if pred_type == 'r':
            y = preprocessing.MinMaxScaler().fit_transform(y.reshape(-1, 1)).flatten()

    if pred_type == 'c':
        y = preprocessing.LabelEncoder().fit_transform(y)

    return x, y

dataset_name = os.path.basename(csv_files_processed[4])  # Extract the dataset name from the file path
data_directory = os.path.dirname(csv_files_processed[4]) 

x, y = prepare_dataset_for_modeling(dataset_name, pred_type='c', data_directory=data_directory)

print(f'X : {x}')
print(f'Y : {y}')
test_df_x = pd.DataFrame(x)
test_df_y = pd.DataFrame(y)

/home/rusty/fun/Master-Thesis/data/data_converted
['/home/rusty/fun/Master-Thesis/data/data_converted/borovecki.csv', '/home/rusty/fun/Master-Thesis/data/data_converted/subramanian.csv', '/home/rusty/fun/Master-Thesis/data/data_converted/burczynski.csv', '/home/rusty/fun/Master-Thesis/data/data_converted/sorlie.csv', '/home/rusty/fun/Master-Thesis/data/data_converted/khan.csv', '/home/rusty/fun/Master-Thesis/data/data_converted/singh.csv', '/home/rusty/fun/Master-Thesis/data/data_converted/tian.csv', '/home/rusty/fun/Master-Thesis/data/data_converted/chiaretti.csv', '/home/rusty/fun/Master-Thesis/data/data_converted/christensen.csv', '/home/rusty/fun/Master-Thesis/data/data_converted/chin.csv', '/home/rusty/fun/Master-Thesis/data/data_converted/su.csv', '/home/rusty/fun/Master-Thesis/data/data_converted/shipp.csv', '/home/rusty/fun/Master-Thesis/data/data_converted/nakayama.csv', '/home/rusty/fun/Master-Thesis/data/data_converted/alon.csv', '/home/rusty/fun/Master-Thesis/data/data_conv

In [8]:
test_df_x.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2298,2299,2300,2301,2302,2303,2304,2305,2306,2307
0,0.860841,0.796585,0.434166,0.072852,0.961813,0.488233,0.296081,0.670272,0.52432,0.875058,...,0.856483,0.62722,0.437818,0.630889,0.112992,0.667834,0.60779,0.630719,0.946783,0.57471
1,0.10714,0.190904,0.135591,0.900945,0.486763,0.304533,0.362354,0.372306,0.756917,0.723061,...,0.0,0.564739,0.664596,0.303845,0.028926,0.5731,0.910751,0.633459,0.728358,0.566102
2,0.861976,0.532713,0.022876,0.383588,0.748794,0.645816,0.180995,0.713352,0.291189,0.493789,...,0.864783,0.383464,0.439293,0.418139,0.769302,0.701675,0.688168,0.747119,0.707713,0.721908
3,0.727536,0.26334,0.696222,0.867443,0.364803,0.467848,0.303109,0.622896,1.0,0.685706,...,0.52284,0.169435,0.720511,0.533016,0.7848,0.855902,0.866603,0.492418,0.700157,0.389735
4,0.736457,0.612572,0.592135,0.075775,0.773278,0.644309,0.380658,0.75461,0.577157,0.757996,...,0.723895,0.57222,0.521391,0.920097,0.215299,0.643051,0.545423,0.420247,0.839062,0.464071


In [9]:
test_df_y.head()

Unnamed: 0,0
0,3
1,0
2,3
3,2
4,3


In [10]:
result = pd.concat([test_df_x, test_df_y.rename(columns={0: 'label'})], axis=1)
result.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2299,2300,2301,2302,2303,2304,2305,2306,2307,label
0,0.860841,0.796585,0.434166,0.072852,0.961813,0.488233,0.296081,0.670272,0.52432,0.875058,...,0.62722,0.437818,0.630889,0.112992,0.667834,0.60779,0.630719,0.946783,0.57471,3
1,0.10714,0.190904,0.135591,0.900945,0.486763,0.304533,0.362354,0.372306,0.756917,0.723061,...,0.564739,0.664596,0.303845,0.028926,0.5731,0.910751,0.633459,0.728358,0.566102,0
2,0.861976,0.532713,0.022876,0.383588,0.748794,0.645816,0.180995,0.713352,0.291189,0.493789,...,0.383464,0.439293,0.418139,0.769302,0.701675,0.688168,0.747119,0.707713,0.721908,3
3,0.727536,0.26334,0.696222,0.867443,0.364803,0.467848,0.303109,0.622896,1.0,0.685706,...,0.169435,0.720511,0.533016,0.7848,0.855902,0.866603,0.492418,0.700157,0.389735,2
4,0.736457,0.612572,0.592135,0.075775,0.773278,0.644309,0.380658,0.75461,0.577157,0.757996,...,0.57222,0.521391,0.920097,0.215299,0.643051,0.545423,0.420247,0.839062,0.464071,3


In [11]:
# test with irsi dataset
iris = pd.read_csv('iris.csv')

x_iris, y_iris = prepare_dataset_for_modeling_plane('iris.csv', pred_type='c')

print(f'X : {x_iris}')
print(f'Y : {y_iris}')
test_df_x_iris = pd.DataFrame(x_iris)
test_df_y_iris = pd.DataFrame(y_iris)

IndexError: single positional indexer is out-of-bounds

DATASET NAME: iris.csv
directory path: None


UnboundLocalError: local variable 'df' referenced before assignment

In [16]:
def prepare_dataset(dataset_name,
                                 pred_type,
                                 data_directory=None,
                                 na_values='?',
                                 n_samples_max=None,
                                 random_state=999,
                                 drop_const_columns=True,
                                 scale_data=True):

    print(f'DATASET NAME: {dataset_name}')
    print(f'directory path: {data_directory}')

    if pred_type not in ['c', 'r']:
        raise ValueError("Prediction type needs to be either 'c' for classification or 'r' for regression.")

    if data_directory:
        if not data_directory.endswith('/'):
            data_directory += '/'
        df = pd.read_csv(data_directory + dataset_name, na_values=na_values, header=0)
        print(f'DATA: {df.head()}')
    else:
        df = pd.read_csv(dataset_name)
    print(f'DF LEN BEFORE DROP NA: {len(df)}')
    df = df.dropna()
    print(f'DF AFTER DROP NA: {len(df)}')

    n_observations = df.shape[0]  # no. of observations in the dataset
    n_samples = n_observations  # initialization - no. of observations after (any) sampling
    if n_samples_max and (n_samples_max < n_observations):
        # do not sample more rows than what is in the dataset
        n_samples = n_samples_max
    df = shuffle(df, n_samples=n_samples, random_state=random_state)

    if drop_const_columns:
        df = df.loc[:, df.nunique() > 1]
    df = df.drop_duplicates(ignore_index=True)

    y = df.iloc[:, -1].values
    x = df.iloc[:, :-1]

    categorical_cols = x.columns[x.dtypes == object].tolist()

    print(f'\nnumber of nominal categorical descriptive features detected: {len(categorical_cols)}\n')

    for col in categorical_cols:
        n = len(x[col].unique())
        if n == 2:
            x[col] = pd.get_dummies(x[col], drop_first=True)
    x = pd.get_dummies(x).values

    if scale_data:
        x = preprocessing.MinMaxScaler().fit_transform(x)
        if pred_type == 'r':
            y = preprocessing.MinMaxScaler().fit_transform(y.reshape(-1, 1)).flatten()

    if pred_type == 'c':
        y = preprocessing.LabelEncoder().fit_transform(y)

    return x, y

In [17]:
x, y = prepare_dataset('iris.csv', pred_type='c')
print(f'X : {x}')
print(f'Y : {y}')

DATASET NAME: iris.csv
directory path: None
DF LEN BEFORE DROP NA: 150
DF AFTER DROP NA: 150

number of nominal categorical descriptive features detected: 0

X : [[0.44444444 0.41666667 0.54237288 0.58333333]
 [0.36111111 0.375      0.44067797 0.5       ]
 [0.33333333 0.16666667 0.47457627 0.41666667]
 [0.19444444 0.625      0.05084746 0.08333333]
 [0.33333333 0.625      0.05084746 0.04166667]
 [0.22222222 0.625      0.06779661 0.04166667]
 [0.22222222 0.75       0.15254237 0.125     ]
 [0.41666667 0.29166667 0.49152542 0.45833333]
 [0.52777778 0.33333333 0.6440678  0.70833333]
 [0.22222222 0.625      0.06779661 0.08333333]
 [0.19444444 0.58333333 0.10169492 0.125     ]
 [0.69444444 0.5        0.83050847 0.91666667]
 [0.25       0.58333333 0.06779661 0.04166667]
 [0.5        0.25       0.77966102 0.54166667]
 [0.19444444 0.         0.42372881 0.375     ]
 [0.61111111 0.41666667 0.81355932 0.875     ]
 [0.44444444 0.41666667 0.69491525 0.70833333]
 [0.58333333 0.29166667 0.72881356 0.75