# Project # 1: Data Preprocessing
##### Alan Duong, Priyatham D, Chandu Kathi, Gio
##### CSC 177

## Import Statements, TensorFlow Helper Functions, and Data

In [1]:
# Import Statements
import pandas as pd
import numpy as np
import random
import collections.abc
from sklearn import preprocessing
import matplotlib.pyplot as plt
import shutil
import os

def encode_text_dummy(df, name):
    """One-hot encode column `name` of `df` in place.

    One indicator column is added per distinct value of the column
    (i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue), each named
    "<name>-<value>", and the original column is then dropped.

    Args:
        df: DataFrame to modify in place.
        name: Label of the text column to expand.
    """
    indicator_frame = pd.get_dummies(df[name])
    for category, indicator in indicator_frame.items():
        df["{}-{}".format(name, category)] = indicator
    df.drop(columns=name, inplace=True)


def encode_text_index(df, name):
    """Replace the text values in column `name` with integer codes, in place.

    The codes come from a freshly fitted sklearn LabelEncoder, so every
    distinct value maps to one integer (e.g. red, green, blue each get
    their own index).

    Args:
        df: DataFrame to modify in place.
        name: Label of the text column to encode.

    Returns:
        The fitted encoder's `classes_`, i.e. the original values in the
        order that corresponds to the integer codes.
    """
    encoder = preprocessing.LabelEncoder()
    codes = encoder.fit_transform(df[name])
    df[name] = codes
    return encoder.classes_


def encode_numeric_zscore(df, name, mean=None, sd=None):
    """Standardize column `name` of `df` in place as z-scores.

    Args:
        df: DataFrame to modify in place.
        name: Label of the numeric column to standardize.
        mean: Center to subtract; the column's own mean when None.
        sd: Scale to divide by; the column's own standard deviation
            (pandas `Series.std`) when None.
    """
    column = df[name]
    center = column.mean() if mean is None else mean
    spread = column.std() if sd is None else sd
    df[name] = (column - center) / spread


def missing_median(df, name):
    """Fill the missing values of column `name` with that column's median, in place.

    Args:
        df: DataFrame to modify in place.
        name: Label of the column to impute.
    """
    column = df[name]
    df[name] = column.fillna(column.median())


def missing_default(df, name, default_value):
    """Fill the missing values of column `name` with `default_value`, in place.

    Args:
        df: DataFrame to modify in place.
        name: Label of the column to impute.
        default_value: Replacement written wherever the column is missing.
    """
    column = df[name]
    df[name] = column.fillna(value=default_value)


def to_xy(df, target):
    """Convert a Pandas dataframe to the x,y inputs that TensorFlow needs.

    Args:
        df: Source DataFrame; every column other than `target` is a feature.
        target: Label of the column to predict.

    Returns:
        A tuple (x, y) of float32 arrays. x holds the feature columns.
        When the target dtype is int64 or int32 the task is treated as
        classification and y is the one-hot encoding of the target;
        otherwise it is treated as regression and y is the target column
        itself.
    """
    feature_columns = [column for column in df.columns if column != target]
    features = df[feature_columns].values.astype(np.float32)

    # `dtypes` may come back as a sequence; use its first entry in that case.
    target_dtype = df[target].dtypes
    if isinstance(target_dtype, collections.abc.Sequence):
        target_dtype = target_dtype[0]

    # TensorFlow likes 32 bits, so both branches cast to float32.
    if target_dtype not in (np.int64, np.int32):
        # Regression
        return features, df[target].values.astype(np.float32)

    # Classification
    one_hot = pd.get_dummies(df[target])
    return features, one_hot.values.astype(np.float32)

def hms_string(sec_elapsed):
    """Format a duration in seconds as "H:MM:SS.ss".

    Args:
        sec_elapsed: Elapsed time in seconds.

    Returns:
        The duration as a string: whole hours, two-digit minutes, and
        seconds with two decimal places.
    """
    seconds_per_hour = 60 * 60
    hours = int(sec_elapsed / seconds_per_hour)
    past_the_hour = sec_elapsed % seconds_per_hour
    minutes = int(past_the_hour / 60)
    seconds = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(hours, minutes, seconds)


def chart_regression(pred, y, sort=True):
    """Plot expected versus predicted regression outputs as two line series.

    Args:
        pred: Predicted values, one per sample.
        y: Expected values; flattened before plotting.
        sort: When True, order the samples by expected value first.
    """
    frame = pd.DataFrame({'pred': pred, 'y': y.flatten()})
    if sort:
        frame.sort_values(by=['y'], inplace=True)
    # Draw the expected series first, then the predictions over it.
    for column, legend_label in (('y', 'expected'), ('pred', 'prediction')):
        plt.plot(frame[column].tolist(), label=legend_label)
    plt.ylabel('output')
    plt.legend()
    plt.show()

def remove_outliers(df, name, sd):
    """Drop, in place, the rows whose `name` value is an outlier.

    A row is dropped when its value lies at least `sd` standard deviations
    away from the column mean, in either direction.

    Args:
        df: DataFrame to modify in place.
        name: Label of the numeric column to test.
        sd: Number of standard deviations used as the cut-off.
    """
    column = df[name]
    deviation = np.abs(column - column.mean())
    threshold = sd * column.std()
    outlier_rows = df.index[deviation >= threshold]
    df.drop(outlier_rows, axis=0, inplace=True)


def encode_numeric_range(df, name, normalized_low=-1, normalized_high=1,
                         data_low=None, data_high=None):
    """Rescale column `name` of `df` in place to [normalized_low, normalized_high].

    Args:
        df: DataFrame to modify in place.
        name: Label of the numeric column to rescale.
        normalized_low: Value that `data_low` maps to.
        normalized_high: Value that `data_high` maps to.
        data_low: Lower bound of the input data; the column minimum when None.
        data_high: Upper bound of the input data; the column maximum when None.
    """
    # Resolve each bound independently so a caller may supply only one of
    # them. Series.min/max skip missing values, unlike the builtins.
    if data_low is None:
        data_low = df[name].min()
    if data_high is None:
        data_high = df[name].max()

    data_span = data_high - data_low
    normalized_span = normalized_high - normalized_low
    df[name] = ((df[name] - data_low) / data_span) * normalized_span + normalized_low

In [3]:
# Caution: upload toy1.csv to the Google Colab session before running this cell
data = pd.read_csv('./toy1.csv')

# DataFrame.shape is (number of rows, number of columns)
instances, attributes = data.shape

print('Number of instances = %d' % (instances))
print('Number of attributes = %d' % (attributes))

Number of instances = 10000
Number of attributes = 6


In [4]:
# Preview the first rows of the loaded data as a quick sanity check
data.head()

Unnamed: 0,metal,shape,height,length,width,weight
0,silver,cylinder,6,5,5,1235.82
1,bronze,cylinder,2,6,6,525.34
2,bronze,sphere,2,2,2,38.91
3,silver,sphere,6,6,6,1186.39
4,tin,cylinder,10,6,6,2066.85


## The data is clean, so dirty it by randomly nulling 10 values in each numeric field

In [5]:
# For every column, report the type of its first value and its number of missing values
for col in data.columns:
    first_value_type = type(data[col][0])
    missing_count = data[col].isna().sum()
    print("\t%s, type=(%s) : %d" % (col, first_value_type, missing_count))

	metal, type=(<class 'str'>) : 0
	shape, type=(<class 'str'>) : 0
	height, type=(<class 'numpy.int64'>) : 0
	length, type=(<class 'numpy.int64'>) : 0
	width, type=(<class 'numpy.int64'>) : 0
	weight, type=(<class 'numpy.float64'>) : 0


In [6]:
# Data is very clean, so we have to dirty it a little
import random

# Null out 10 randomly chosen rows in each numeric column.
# random.sample draws without replacement from the valid row labels
# 0 .. instances-1, so every column ends up with exactly 10 missing values
# and no label falls outside the frame (random.randint's upper bound is
# inclusive, which could select the non-existent row `instances`).
for column in ('weight', 'height', 'length', 'width'):
    rows_to_null = random.sample(range(instances), 10)
    # np.nan is the supported spelling; the np.NaN alias was removed in NumPy 2.0
    data.loc[rows_to_null, column] = np.nan

In [7]:
# Check if the data is now dirty: each numeric column should report missing values
for col in data.columns:
    series = data[col]
    print("\t%s, type=(%s) : %d" % (col, type(series[0]), series.isna().sum()))

	metal, type=(<class 'str'>) : 0
	shape, type=(<class 'str'>) : 0
	height, type=(<class 'numpy.float64'>) : 10
	length, type=(<class 'numpy.float64'>) : 10
	width, type=(<class 'numpy.float64'>) : 10
	weight, type=(<class 'numpy.float64'>) : 10


## Dealing with outliers and missing values

In [None]:
# Standardize the three dimension columns to z-scores, in place
for dimension in ('height', 'length', 'width'):
    encode_numeric_zscore(data, dimension)

In [None]:
# One-hot encode the two categorical columns into a new frame; `data` itself is not modified
encoded_data = pd.get_dummies(data, columns=['shape', 'metal'])

In [None]:
# Fill the missing values of every numeric column with that column's median
for numeric_column in ('length', 'width', 'height', 'weight'):
    missing_median(encoded_data, numeric_column)

In [None]:
# Preview the first rows of the encoded frame
encoded_data.head()

## Test / Train Split


In [None]:
# Separate the features (x) from the target (y); 'weight' is the column to predict
x, y = to_xy(encoded_data, 'weight')

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the split reproducible
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42)