In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
import numpy as np
import pickle
import csv

In [15]:
class validation_set:
    """Holder for one train/held-out split: features and targets for each side."""

    def __init__(self, X_train, y_train, X_test, y_test):
        # Values are stored as given; nothing is copied or validated.
        self.X_train, self.y_train = X_train, y_train
        self.X_test, self.y_test = X_test, y_test

In [16]:
class test_set:
    """Holder for a test partition: its features and the matching targets."""

    def __init__(self, X_test, y_test):
        # Values are stored as given; nothing is copied or validated.
        self.X_test, self.y_test = X_test, y_test

In [17]:
class data_set:
    """Bundle pairing a `validation_set` argument with a `test_set` argument."""

    def __init__(self, validation_set, test_set):
        # Both arguments are kept by reference under attributes of the same name.
        self.validation_set, self.test_set = validation_set, test_set

In [18]:
def create_csv(name_file, data, col_names, list_opt = False):
    """Write a header row followed by `data` to a CSV file.

    Parameters
    ----------
    name_file : str
        Path of the file to create (opened in 'w' mode, so an existing file
        is overwritten).
    data : array-like
        Rows to write. Objects exposing ``tolist()`` (e.g. NumPy arrays) are
        converted with it; any other iterable is materialised with ``list``.
    col_names : sequence
        Header row written before the data rows.
    list_opt : bool, default False
        When True, every element of `data` is wrapped in its own one-item
        row, producing a single-column file.
    """
    # Accept plain Python sequences as well as arrays; the original required
    # a `.tolist()` method and failed with AttributeError otherwise.
    rows = data.tolist() if hasattr(data, 'tolist') else list(data)
    if list_opt:
        rows = [[value] for value in rows]

    # newline='' leaves line-ending handling to the csv module.
    with open(name_file, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(col_names)
        writer.writerows(rows)

In [19]:
def generate_train_test(file_name, target = 'RainTomorrow', test_size = 0.2, random_state = None):
    """Load a CSV file and split it into train/test features and targets.

    Parameters
    ----------
    file_name : str
        Path of the comma-separated file to read.
    target : str, default 'RainTomorrow'
        Name of the column used as the target; all other columns are features.
    test_size : float, default 0.2
        Forwarded to ``train_test_split`` as ``test_size``.
    random_state : int or None, default None
        Forwarded to ``train_test_split``. ``None`` keeps the split unseeded
        (the previous behaviour); pass an integer for a reproducible split.

    Returns
    -------
    list
        ``[X_train, y_train, X_test, y_test, X_columns, y_columns]`` where the
        first four entries are arrays and the last two are lists of column
        names for the feature and target data respectively.

    Raises
    ------
    KeyError
        If `target` is not a column of the file (raised by pandas).
    """
    df = pd.read_csv(file_name, sep = ',', engine = 'python')
    features = df.drop(target, axis = 1)
    X = features.values
    y = df[target].values

    # Take the header names from the frames that were actually split. Slicing
    # the full column list by position only lines up with X when the target
    # happens to be the last column of the file.
    X_columns = list(features.columns)
    y_columns = [target]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size = test_size, shuffle = True, random_state = random_state)
    return [X_train, y_train, X_test, y_test, X_columns, y_columns]

In [20]:
def generate_folds(data, k):
    """Split the training portion of `data` into `k` folds and export each one.

    `data` is indexed positionally: items 0-3 are X_train, y_train, X_test and
    y_test, items 4 and 5 are the feature and target header lists.

    For every fold a `validation_set` is built and four CSV files are written
    to the current directory, suffixed with ``<k>_<fold number>`` (fold
    numbers start at 1). Note that the ``data_test_*`` / ``target_test_*``
    files contain the fold's held-out slice of the training data, not the
    final test partition.

    Returns a `data_set` holding the list of folds together with a `test_set`
    built from the untouched X_test / y_test.
    """
    X_train, y_train, X_test, y_test, X_columns, y_columns = (data[i] for i in range(6))
    print('Cross Validation k =', k)

    folds = []
    splitter = KFold(n_splits = k)
    for fold_no, (fit_idx, held_idx) in enumerate(splitter.split(X_train), start = 1):
        fold = validation_set(X_train[fit_idx], y_train[fit_idx],
                              X_train[held_idx], y_train[held_idx])
        folds.append(fold)

        suffix = str(k) + "_" + str(fold_no) + ".csv"
        # (file prefix, values, header, write as single column)
        exports = (
            ("./data_validation_train_", fold.X_train, X_columns, False),
            ("./data_test_", fold.X_test, X_columns, False),
            ("./target_validation_train_", fold.y_train, y_columns, True),
            ("./target_test_", fold.y_test, y_columns, True),
        )
        for prefix, values, header, single_column in exports:
            create_csv(name_file = prefix + suffix, data = values,
                       col_names = header, list_opt = single_column)

    return data_set(folds, test_set(X_test, y_test))
# Read the source CSV and split it once; `data` is the positional list
# [X_train, y_train, X_test, y_test, X_columns, y_columns].
data = generate_train_test(file_name = './weatherAUS.csv')

In [21]:
# Fold counts to generate; one pickle (plus per-fold CSVs) is produced per value.
ks = [3,5,10]
for k in ks:
    new_data = generate_folds(data, k)

    # The context manager closes the file even if pickling raises.
    with open('./weatherAUS' + str(k) + '.pkl', 'wb') as dataset_file:
        pickle.dump(new_data, dataset_file)

# The hold-out partition does not depend on k: generate_folds stores data[2] and
# data[3] unchanged in its test_set. Export them straight from `data` instead of
# from the loop's last `new_data`, which would be undefined if `ks` were empty.
create_csv(name_file = "./data_test.csv", data = data[2], col_names = data[4])
create_csv(name_file = "./target_test.csv", data = data[3], col_names = data[5], list_opt = True)

Cross Validation k = 3
Cross Validation k = 5
Cross Validation k = 10
