In [1]:
import pandas as pd
import random as rd
from sklearn import preprocessing

In [2]:
# Column labels assigned to the raw listing when it is read, in file order.
columns = [
    'Name',
    'Data Types',
    'Default Task',
    'Attribute Types',
    '# Instances',
    '# Attributes',
    'Year',
    'Missing Values',
    'Area',
]

In [4]:
# Read the tab-separated listing. header=0 discards the file's own first row
# and `names=columns` supplies the labels used throughout the notebook.
data = pd.read_csv(
    'uci-datasets-pricing.txt',
    sep='\t',
    header=0,
    names=columns,
    engine='python',
)

# Keep an independent deep copy of the frame exactly as loaded and persist it.
# NOTE(review): this snapshot is written before any NA rows are dropped, even
# though the file name says "no-na" — confirm the intended contents/name.
original_data = data.copy(deep=True)
original_data.to_pickle('uci-original-no-na.pkl')

### Removing NA values

In [None]:
# Delete every row that has at least one NA value, then renumber the rows
# 0..n-1. reset_index(drop=True) renumbers without converting the frame to a
# single object array, so each column keeps the dtype it was parsed with.
data = data.dropna(axis=0, how='any').reset_index(drop=True)

# Preview the first rows only instead of rendering the whole frame.
data.head()

### Transform columns into dummy variables

In [None]:
def split_columns(data, column):
    """Return the distinct labels found in a comma-separated text column.

    Every cell of ``data[column]`` is split on ``','``. Each piece has its
    non-breaking spaces removed and its surrounding whitespace stripped.

    Args:
        data: DataFrame whose ``column`` holds comma-separated strings.
        column: Name of the column to inspect.

    Returns:
        list[str]: The unique cleaned labels in ascending order. Pieces that
        are empty after cleaning are skipped, and an empty column yields ``[]``.
    """
    labels = set()
    for cell in data[column].values:
        for piece in cell.split(','):
            label = piece.replace('\xa0', '').strip()
            if label:  # a trailing comma or blank piece is not a label
                labels.add(label)
    # Sort after de-duplicating so the result is identical on every run.
    return sorted(labels)

In [None]:
# Collect the distinct labels of each multi-valued column, one list per column.
types, tasks, areas, attr_types = [
    split_columns(data, source_column)
    for source_column in ('Data Types', 'Default Task', 'Area', 'Attribute Types')
]

In [None]:
def transforma_all_to_dummies(data, columns_dict):
    """Replace multi-label text columns with one 0/1 indicator column per label.

    For each ``column -> labels`` entry of ``columns_dict`` an ``int32`` column
    named after every label is added to ``data``. A row gets 1 when the label
    is one of the comma-separated entries of that row's cell and 0 otherwise.
    The source column is deleted once all of its labels have been processed.

    Args:
        data: DataFrame to transform; it is modified in place.
        columns_dict: Mapping of source column name to an iterable of labels.

    Returns:
        The same ``data`` object, with the indicator columns added and the
        source columns removed.
    """
    for column, values in columns_dict.items():
        # Tokenise every cell once per source column. Each piece is cleaned
        # (non-breaking spaces removed, whitespace stripped) so that labels
        # are compared as whole entries rather than as substrings.
        cell_labels = [
            {piece.replace('\xa0', '').strip() for piece in cell.split(',')}
            for cell in data[column].values
        ]

        for h in values:
            wanted = h.replace('\xa0', '').strip()
            flags = [int(wanted in labels) for labels in cell_labels]
            # Build the indicator on the frame's own index so the assignment
            # stays aligned whatever the index looks like.
            data[h] = pd.Series(flags, index=data.index, dtype='int32')

        del data[column]

    return data

In [None]:
# Source column -> labels to expand. The columns are processed in this order.
dummy_spec = {
    'Data Types': types,
    'Default Task': tasks,
    'Area': areas,
    'Attribute Types': attr_types,
}
data = transforma_all_to_dummies(data, columns_dict=dummy_spec)

### Converting Yes/No to Boolean values

In [None]:
# Integer codes for the 'Missing Values' flag. Series.map turns any value that
# is not a key of `mapping` into NaN.
# NOTE(review): 'Yes' is encoded as 0 and 'No' as 1 — confirm this polarity is
# the intended one.
mapping = dict(Yes=0, No=1)
missing_flags = data['Missing Values'].map(mapping)
data['Missing Values'] = missing_flags

### Calculating price

In [None]:
# Synthetic price per row:
#   capped size terms + year + random noise - penalty for missing values.
PRICE_SEED = 42  # fixed seed so a fresh "Run All" reproduces the same prices
price_rng = rd.Random(PRICE_SEED)  # private generator; global `random` state is untouched

# 'Missing Values' has already been converted from 'Yes'/'No' to the integer
# codes in `mapping`, so the penalty must be tested against the code for 'Yes'.
# Comparing with the literal 'Yes' never matches the encoded column.
has_missing_code = mapping['Yes']

prices = [min(2 * int(row['# Instances']), 3000)
          + min(2 * int(row['# Attributes']), 2000)
          + int(row['Year']) + price_rng.randint(100, 1000)
          - 1000 * int(row['Missing Values'] == has_missing_code)
          for idx, row in data.iterrows()]

# Align on the frame's own index so each price lands on its row.
data['Price'] = pd.Series(prices, index=data.index, dtype='int32')

### Scaling features

In [None]:
def normalize_data(data, columns):
    """Min-max scale the given columns of ``data`` to the [0, 1] range.

    Each column is scaled independently with its own ``MinMaxScaler`` and is
    overwritten with the scaled ``float64`` values.

    Args:
        data: DataFrame to scale; it is modified in place.
        columns: Iterable of column names to scale. Their values must be
            convertible to float.

    Returns:
        The same ``data`` object with the listed columns rescaled.
    """
    for col in columns:
        scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
        # scikit-learn estimators take a 2-D (n_samples, n_features) array, so
        # the column is presented as a single-feature matrix.
        feature = data[col].astype('float64').values.reshape(-1, 1)
        x_scaled = scaler.fit_transform(feature)
        # Flatten back to 1-D and reuse the frame's index so rows stay aligned.
        data[col] = pd.Series(x_scaled.ravel(), index=data.index, dtype='float64')

    return data

In [None]:
# Columns rescaled by normalize_data.
numeric_columns = ['# Instances', '# Attributes', 'Year', 'Price']
data = normalize_data(data, numeric_columns)

In [None]:
# Preview the first rows only instead of rendering the whole frame.
data.head()

In [None]:
# Persist the current state of `data` as a pickle file.
data.to_pickle(path='uci-preprocessed.pkl')