# Various methods to split the data into train and test datasets

In [1]:
import numpy as np
import pandas as pd
from sklearn import datasets

try:
    # train_test_split lives in model_selection since scikit-learn 0.18
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older scikit-learn releases only ship the legacy module
    from sklearn.cross_validation import train_test_split



# Read the data, clean data and separate features and response

In [2]:
# The built-in Iris dataset needs no preparation: its features and response are
# already available separately as iris.data and iris.target.
iris = datasets.load_iris()

# The Adult CSV is read with header=None, so its columns are labelled by position.
adult_data = pd.read_csv('./Adult_Data/adult_data.csv', header = None)

# 1. Rename the positional column labels (0, 1, 2, ...) to descriptive names.
adult_data.rename(columns=dict(enumerate(['age',
                                          'workclass',
                                          'fnlwgt',
                                          'education',
                                          'education-num',
                                          'marital-status',
                                          'occupation',
                                          'relationship',
                                          'race',
                                          'sex',
                                          'capital-gain',
                                          'capital-loss',
                                          'hours-per-week',
                                          'native-country',
                                          'salary'])), inplace = True)

# 2. Encode the response: >50K becomes 1 and <=50K becomes 0. The raw values carry
# surrounding whitespace, so they are stripped before the replacement is applied.
adult_data['salary'] = (
    adult_data['salary']
    .str.strip()
    .replace({">50K": 1, "<=50K": 0})
)

# 3. Separate response and features. The response is a single column (a vector);
# the features are every column except the last one.
adult_data_response = adult_data['salary']
number_cols = adult_data.shape[1]
adult_data_features = adult_data.iloc[:, :number_cols - 1]

# Random split based on percent of data
The input data is simply divided into training and test datasets based on a predefined percentage. In the code below, the
training set will have 80% of the data and the test set will have 20% of the data.

In [3]:
# Random 80/20 split of the Adult data; test_size=0.2 holds out 20% of the rows
# and the fixed random_state keeps the split repeatable between runs.
(
    adult_data_features_train,
    adult_data_features_test,
    adult_data_response_train,
    adult_data_response_test,
) = train_test_split(
    adult_data_features,
    adult_data_response,
    test_size=0.2,
    random_state=4,
)

# Same split settings applied to the Iris features and target.
(
    iris_features_train,
    iris_features_test,
    iris_response_train,
    iris_response_test,
) = train_test_split(
    iris.data,
    iris.target,
    test_size=0.2,
    random_state=4,
)

In [4]:
# Report the sizes of the Adult dataset and of its random train/test split so the
# split can be sanity-checked: train rows + test rows must equal the total rows.
print("ADULT DATASET")
print("*" * 100)
print("Total number of rows in Adult dataset = ", adult_data.shape[0])
print("Total number of columns in Adult dataset = ", adult_data.shape[1])
print("*" * 100)
print("Total number of rows in Features Train dataset = ", adult_data_features_train.shape[0])
print("Total number of columns in Features Train dataset = ", adult_data_features_train.shape[1])
print("*" * 100)
print("Total number of rows in Features Test dataset = ", adult_data_features_test.shape[0])
print("Total number of columns in Features Test dataset = ", adult_data_features_test.shape[1])
print("*" * 100)
print("Sum of rows Features Test and Train dataset = ", adult_data_features_train.shape[0] + adult_data_features_test.shape[0])
print("*" * 100)
print("Total number of rows in Target Train dataset = ", adult_data_response_train.shape[0])
print("*" * 100)
print("Total number of rows in Target Test dataset = ", adult_data_response_test.shape[0])
print("*" * 100)
# The sum must add the train rows to the test rows (not the train rows to themselves).
print("Sum of rows Target Test and Train dataset = ", adult_data_response_train.shape[0] + adult_data_response_test.shape[0])
print("*" * 100)
print("The Target Train and Test sets is a vector so there is no .shape[1] value")
print("*" * 100)

ADULT DATASET
****************************************************************************************************
Total number of rows in Adult dataset =  32561
Total number of columns in Adult dataset =  15
****************************************************************************************************
Total number of rows in Features Train dataset =  26048
Total number of columns in Features Train dataset =  14
****************************************************************************************************
Total number of rows in Features Test dataset =  6513
Total number of columns in Features Test dataset =  14
****************************************************************************************************
Sum of rows Features Test and Train dataset =  32561
****************************************************************************************************
Total number of rows in Target Train dataset =  26048
***********************************************************

In [5]:
# Report the sizes of the Iris dataset and of its random train/test split.
# `iris.data` is the feature matrix of the loaded dataset; the split results are
# used directly, so their shape is read with `.shape` rather than through the
# array's raw `.data` buffer.
print("IRIS DATASET")
print("*" * 100)
print("Total number of rows in Iris dataset = ", iris.data.shape[0])
print("Total number of columns in Iris dataset = ", iris.data.shape[1])
print("*" * 100)
print("Total number of rows in Features Train dataset = ", iris_features_train.shape[0])
print("Total number of columns in Features Train dataset = ", iris_features_train.shape[1])
print("*" * 100)
print("Total number of rows in Features Test dataset = ", iris_features_test.shape[0])
print("Total number of columns in Features Test dataset = ", iris_features_test.shape[1])
print("*" * 100)
print("Sum of rows Features Test and Train dataset = ", iris_features_train.shape[0] + iris_features_test.shape[0])
print("*" * 100)
print("Total number of rows in Target Train dataset = ", iris_response_train.shape[0])
print("*" * 100)
print("Total number of rows in Target Test dataset = ", iris_response_test.shape[0])
print("*" * 100)
print("Sum of rows Target Test and Train dataset = ", iris_response_train.shape[0] + iris_response_test.shape[0])
print("*" * 100)
print("The Target Train and Test sets is a vector so there is no .shape[1] value")
print("*" * 100)

IRIS DATASET
****************************************************************************************************
Total number of rows in Iris dataset =  150
Total number of columns in Iris dataset =  4
****************************************************************************************************
Total number of rows in Features Train dataset =  120
Total number of columns in Features Train dataset =  4
****************************************************************************************************
Total number of rows in Features Test dataset =  30
Total number of columns in Features Test dataset =  4
****************************************************************************************************
Sum of rows Features Test and Train dataset =  150
****************************************************************************************************
Total number of rows in Target Train dataset =  120
***************************************************************************

# Split based on distribution of Target variable

In [6]:
### Split the Adult dataset so that train and test keep the class proportions of the target variable.
### The Adult dataset is used because the Iris dataset has an equal number of rows for each category
### (50 each). With the Adult data the check below reports a skewed distribution, so the
### "if not skewed_distribution" branch is not the one that runs.
### !!! The target variable name ('salary') is hardcoded in this cell !!! ###

# Unique values of the Target variable in the input dataset.
unique_target_variables = np.unique(adult_data_response)
print("Unique varibales array =", unique_target_variables)
print("Datatype of Unique varibale array =", type(unique_target_variables))
number_unique_target_variables = unique_target_variables.shape[0]
print("Number of unique Target variables =", number_unique_target_variables)

# Find out whether the target variable distribution is skewed.
# 1. Count the rows belonging to each unique value of the target variable.
count_range = range(0, number_unique_target_variables)
row_array = [np.where(adult_data_response == unique_target_variables[count])[0].size
             for count in count_range]
# 2. Size of the largest and of the smallest class.
max_rows = np.max(row_array)
min_rows = np.min(row_array)
print("Max rows =", max_rows)
print("Min rows =", min_rows)
# 3. Number of rows every class would have if the target values were evenly distributed.
normal_distribution_rows = adult_data_response.shape[0] / number_unique_target_variables
print("Normal Distribution Rows =", normal_distribution_rows)
# 4. Percentage by which the largest / smallest class deviates from that even share. If both are
# within 10% the target variable is treated as evenly distributed, otherwise as skewed.
diff_normal_max = (max_rows - normal_distribution_rows) / normal_distribution_rows * 100
diff_normal_min = (normal_distribution_rows - min_rows) / normal_distribution_rows * 100
print("diff_normal_max =", diff_normal_max)
print("diff_normal_min =", diff_normal_min)

skewed_distribution = not (diff_normal_max <= 10 and diff_normal_min <= 10)
print("Is distribution skewed?", skewed_distribution)

if not skewed_distribution:
    # The distribution is not skewed: simply split the input dataset based on %.
    print("Exceuting Not Skewed Distribution")
    adult_data_features_train, adult_data_features_test, adult_data_response_train, adult_data_response_test = \
            train_test_split(adult_data_features, adult_data_response, test_size = 0.2, random_state = 4)
    # Publish the result under the final_* names as well, so that code reading final_* works
    # whichever branch was taken.
    final_train_features, final_test_features = adult_data_features_train, adult_data_features_test
    final_train_response, final_test_response = adult_data_response_train, adult_data_response_test
else:
    # The distribution is skewed: split every class on its own and join the pieces.
    print("Exceuting Skewed Distribution")
    # It is not advisable to create variable names dynamically in a loop, and the number of unique
    # target values is not known in advance. So the per-class pieces are collected in lists and
    # joined once at the end. This mirrors the R version, see
    # https://github.com/divijsharma/RScripts/blob/master/Automated%20-%20Split%20-%20Target%20Variable%20Distribution%20function.R
    train_feature_parts, test_feature_parts = [], []
    train_response_parts, test_response_parts = [], []
    for count in count_range:
        # 1. Take the rows of one unique target value. The target variable differs between
        # datasets, which is why its name is hardcoded here.
        temp_split = adult_data[adult_data.salary == unique_target_variables[count]]
        temp_split_features = temp_split.iloc[:, 0:number_cols-1] # Same feature columns as the raw data.
        temp_split_response = temp_split['salary'] # Same target column as the raw data.

        # 2. Split this class into train and test based on %. The training set gets 80% of the
        # data and the test set 20%; this is currently hardcoded.
        temp_split_features_train, temp_split_features_test, temp_split_response_train, temp_split_response_test = \
            train_test_split(temp_split_features, temp_split_response, test_size = 0.2, random_state = 4)

        train_feature_parts.append(temp_split_features_train)
        test_feature_parts.append(temp_split_features_test)
        train_response_parts.append(temp_split_response_train)
        test_response_parts.append(temp_split_response_test)

    # 3. Join the per-class pieces into the final train and test datasets that will be input to
    # the model. pd.concat is used because DataFrame.append / Series.append no longer exist in
    # pandas 2.x. Features form a dataframe, the response forms a vector (Series).
    final_train_features = pd.concat(train_feature_parts)
    final_test_features = pd.concat(test_feature_parts)
    final_train_response = pd.concat(train_response_parts)
    final_test_response = pd.concat(test_response_parts)

Unique varibales array = [0 1]
Datatype of Unique varibale array = <class 'numpy.ndarray'>
Number of unique Target variables = 2
Max rows = 24720
Min rows = 7841
Normal Distribution Rows = 16280.5
diff_normal_max = 51.8380885108
diff_normal_min = 51.8380885108
Is distribution skewed? True
Exceuting Skewed Distribution


In [7]:
# Row counts of the final train/test datasets produced by the target-based split;
# features and response must have matching row counts on each side.
print("Rows in final_train_features =", len(final_train_features))
print("Rows in final_test_features =", len(final_test_features))
print("Rows in final_train_response =", len(final_train_response))
print("Rows in final_test_response =", len(final_test_response))

Rows in final_train_features = 26048
Rows in final_test_features = 6513
Rows in final_train_response = 26048
Rows in final_test_response = 6513


### Function to split based on Target

In [None]:
def split_dataset_on_target(df, target_col_name, test_size_inp, random_state=4, tolerance_pct=10):
    """Split ``df`` into train/test features and response, respecting the target distribution.

    The rows per unique value of the target column are counted. When both the
    largest and the smallest class are within ``tolerance_pct`` percent of the
    even share (total rows / number of classes), the whole dataset is split with
    a single ``train_test_split`` call. Otherwise the distribution is treated as
    skewed: every class is split separately with the same ``test_size_inp`` and
    the per-class pieces are concatenated, so the train and test sets keep the
    class proportions of the input.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data holding the feature columns and the target column.
    target_col_name : str
        Name of the target column in ``df``.
    test_size_inp : float or int
        Passed unchanged as ``test_size`` to ``train_test_split``.
    random_state : int, optional
        Passed as ``random_state`` to ``train_test_split``. Defaults to 4, the
        value that was previously hard-coded.
    tolerance_pct : float, optional
        Maximum deviation (in percent of the even share) of the largest and
        smallest class for the target to count as evenly distributed.
        Defaults to 10, the value that was previously hard-coded.

    Returns
    -------
    tuple
        ``(features_train, features_test, response_train, response_test)``.
        In the skewed case the rows are grouped by class, in the order in which
        ``np.unique`` returns the target values.

    Raises
    ------
    ValueError
        If the target column contains no values.
    """
    response_df = df[target_col_name]
    features_df = df.drop(target_col_name, axis=1)

    # Unique target values together with the number of rows of each.
    unique_target_variables, class_counts = np.unique(response_df, return_counts=True)
    if class_counts.size == 0:
        raise ValueError("Cannot split: target column %r has no values" % (target_col_name,))

    # Rows every class would have if the target values were evenly distributed.
    even_share_rows = response_df.shape[0] / unique_target_variables.shape[0]
    # Percentage by which the largest / smallest class deviates from that even share.
    diff_max = (class_counts.max() - even_share_rows) / even_share_rows * 100
    diff_min = (even_share_rows - class_counts.min()) / even_share_rows * 100
    skewed_distribution = not (diff_max <= tolerance_pct and diff_min <= tolerance_pct)

    # Not skewed: simply split the input dataset based on %.
    if not skewed_distribution:
        features_train, features_test, response_train, response_test = \
            train_test_split(features_df, response_df, test_size = test_size_inp, random_state = random_state)
        return features_train, features_test, response_train, response_test

    # Skewed: split every class on its own. The number of classes is not known in
    # advance, so the per-class pieces are collected in lists and joined once at
    # the end. pd.concat is used because DataFrame.append / Series.append no
    # longer exist in pandas 2.x.
    features_train_parts, features_test_parts = [], []
    response_train_parts, response_test_parts = [], []
    for target_value in unique_target_variables:
        class_mask = response_df == target_value
        class_features_train, class_features_test, class_response_train, class_response_test = \
            train_test_split(features_df[class_mask], response_df[class_mask],
                             test_size = test_size_inp, random_state = random_state)
        features_train_parts.append(class_features_train)
        features_test_parts.append(class_features_test)
        response_train_parts.append(class_response_train)
        response_test_parts.append(class_response_test)

    # Features form a dataframe, the response forms a vector (Series).
    features_train = pd.concat(features_train_parts)
    features_test = pd.concat(features_test_parts)
    response_train = pd.concat(response_train_parts)
    response_test = pd.concat(response_test_parts)

    # Return all the datasets
    return features_train, features_test, response_train, response_test

### Example of how to call function

In [None]:
# Example call: split `app_train` on its 'TARGET' column with test_size 0.2.
# The function returns (features_train, features_test, response_train, response_test),
# so the *_1 names receive the training parts and the *_2 names the test parts.
(
    app_train_features_1,
    app_train_features_2,
    app_train_response_1,
    app_train_response_2,
) = split_dataset_on_target(app_train, 'TARGET', 0.2)