In [131]:


import pandas as pd
import numpy as np
import os, os.path, pickle, sys
from datetime import datetime, date

from sklearn.model_selection import train_test_split

In [2]:
wheat_database = "/Users/hn/Documents/01_research_data/Other_people/Ehsan/wheat/"
data_dir = wheat_database + "data/"
separate_varieties_dir = data_dir + "varieties/"

reOrganized_dir = data_dir + "reOrganized/"
os.makedirs(reOrganized_dir, exist_ok=True)

In [3]:
# This is not what we want.
# We want full data series.
# filename = (reOrganized_dir + "all_stages_df22805_varietyAvgd.sav")
# all_stages_df22805_varietyAvgd = pd.read_pickle(filename)
# all_stages_df22805_varietyAvgd.keys()

# all_stages_df22805_varietyAvgd = all_stages_df22805_varietyAvgd["all_stages_data"]
# all_stages_df22805_varietyAvgd.head(2)

In [4]:
filename = (reOrganized_dir + "variables_dict.sav")
variables_dict = pd.read_pickle(filename)
variables_dict.keys()

variables_dict = variables_dict["variables_abb_dict"]
variables_dict

{'fdd': 'freezing_dd',
 'srad': 'shortwave_rad_Wm2',
 'prdtr': 'precip_div_dtr',
 'dtr': 'diurnal_temp',
 'dgdd': 'diurnal_gdd',
 'ravg': 'relative_humidity_avg',
 'hdd': 'high_dd',
 'vs': 'wind_speed_ms',
 'vpd': 'vpd_kPa'}

In [5]:
filename = (reOrganized_dir + "average_and_seperate_varieties_weekly.sav")
average_and_seperate_weekly = pd.read_pickle(filename)
average_and_seperate_weekly.keys()

dict_keys(['averaged_varieties_weekly', 'separate_varieties_weekly', 'separate_varieties_annual', 'averaged_varieties_annual', 'separate_varieties_4season', 'averaged_varieties_4season', 'dates', 'source_code', 'Author', 'Date'])

In [6]:
separate_varieties_weekly = average_and_seperate_weekly["separate_varieties_weekly"]
separate_varieties_weekly.head(2)

Unnamed: 0,location,year,variety,yield,1_tavg,2_tavg,3_tavg,4_tavg,5_tavg,6_tavg,...,16_vs,17_vs,18_vs,19_vs,20_vs,21_vs,22_vs,23_vs,24_vs,25_vs
0,Almira,2005,Alpowa,43.9,7.725714,14.655,11.125714,14.511429,14.647143,12.345,...,3.3,2.528571,2.55,,,,,,,
1,Almira,2005,Alturas,37.7,7.725714,14.655,11.125714,14.511429,14.647143,12.345,...,3.3,2.528571,2.55,,,,,,,


### Drop columns with name dtr.1 in them from dataframe.

In [7]:
print (separate_varieties_weekly.shape)
dt1_cols = [s for s in separate_varieties_weekly.columns if "dtr.1" in s]
separate_varieties_weekly.drop(columns=dt1_cols, inplace=True)
print (separate_varieties_weekly.shape)

(1931, 329)
(1931, 304)


In [8]:
## Get rid of location, year, variety and turn them into ID
separate_varieties_weekly["ID"] = separate_varieties_weekly["location"] + "_" +\
                                  separate_varieties_weekly["year"].astype(str) + "_" +\
                                  separate_varieties_weekly["variety"]


separate_varieties_weekly.drop(columns=["location", "year", "variety"], inplace=True)
separate_varieties_weekly.head(2)

Unnamed: 0,yield,1_tavg,2_tavg,3_tavg,4_tavg,5_tavg,6_tavg,7_tavg,8_tavg,9_tavg,...,17_vs,18_vs,19_vs,20_vs,21_vs,22_vs,23_vs,24_vs,25_vs,ID
0,43.9,7.725714,14.655,11.125714,14.511429,14.647143,12.345,18.738571,14.102143,14.849286,...,2.528571,2.55,,,,,,,,Almira_2005_Alpowa
1,37.7,7.725714,14.655,11.125714,14.511429,14.647143,12.345,18.738571,14.102143,14.849286,...,2.528571,2.55,,,,,,,,Almira_2005_Alturas


In [9]:
# Convert the yield to string to save the data in .TS format

separate_varieties_weekly["yield"] = separate_varieties_weekly["yield"].astype("str")

In [10]:
import re

col_names = list(separate_varieties_weekly.columns)
col_names[:4]

['yield', '1_tavg', '2_tavg', '3_tavg']

In [11]:
## detect columns that start with a digit
## so we can extract them and put them in a list as time series.
## re.match() only matches at the beginning of the string.
## re.search() looks for a match anywhere in the string:
pattern = r"^\d"
digital_columns = [s for s in col_names if re.match(pattern, s)]
digital_columns[:4]

['1_tavg', '2_tavg', '3_tavg', '4_tavg']

In [12]:
print (f"{len(digital_columns) = }")
print (f"{len(col_names) = }")

len(digital_columns) = 300
len(col_names) = 302


In [13]:
non_digital_columns = sorted([s for s in col_names if not(re.match(pattern, s))])
non_digital_columns[:4]

['ID', 'yield']

In [14]:
# count number of different variables
variables_list = [s.split("_")[1] for s in digital_columns]
variables_list = list(set(variables_list))
variables_list

['gdd',
 'ravg',
 'hdd',
 'fdd',
 'vpd',
 'tavg',
 'srad',
 'dtr',
 'prdtr',
 'vs',
 'precip',
 'dgdd']

In [15]:
rows = len(separate_varieties_weekly)
cols = len(non_digital_columns) + len(variables_list)
separate_vars_weekly_TS = pd.DataFrame(np.zeros((rows, cols)))
separate_vars_weekly_TS.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
columns = non_digital_columns + variables_list
separate_vars_weekly_TS.columns = columns
separate_vars_weekly_TS.head(2)

Unnamed: 0,ID,yield,gdd,ravg,hdd,fdd,vpd,tavg,srad,dtr,prdtr,vs,precip,dgdd
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Populate dataframe

In [17]:
separate_vars_weekly_TS[non_digital_columns] = separate_varieties_weekly[non_digital_columns]

separate_vars_weekly_TS_NaNZeros = separate_vars_weekly_TS.copy()
separate_vars_weekly_TS_NaNRand = separate_vars_weekly_TS.copy()
separate_vars_weekly_TS.head(2)

Unnamed: 0,ID,yield,gdd,ravg,hdd,fdd,vpd,tavg,srad,dtr,prdtr,vs,precip,dgdd
0,Almira_2005_Alpowa,43.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Almira_2005_Alturas,37.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
separate_varieties_weekly.head(2)

Unnamed: 0,yield,1_tavg,2_tavg,3_tavg,4_tavg,5_tavg,6_tavg,7_tavg,8_tavg,9_tavg,...,17_vs,18_vs,19_vs,20_vs,21_vs,22_vs,23_vs,24_vs,25_vs,ID
0,43.9,7.725714,14.655,11.125714,14.511429,14.647143,12.345,18.738571,14.102143,14.849286,...,2.528571,2.55,,,,,,,,Almira_2005_Alpowa
1,37.7,7.725714,14.655,11.125714,14.511429,14.647143,12.345,18.738571,14.102143,14.849286,...,2.528571,2.55,,,,,,,,Almira_2005_Alturas


In [127]:
# week_cnt = np.array(sorted(list(set([int(s.split("_")[0]) for s in digital_columns]))))
# max_number_of_weeks = week_cnt.max()
# max_number_of_weeks

In [20]:
## create a dictionary with keys that are name of variables
## and its values are list of weekly columns associated with that variable
variables_colums_grp = dict()
for key_ in variables_list:
    variables_colums_grp[key_] = [s for s in col_names if re.search(key_, s)]

In [21]:
## Populate the dataframe with list of time-series in each cell
for a_variable in variables_list:
    curr_columns = variables_colums_grp[a_variable]
    ts = separate_varieties_weekly[curr_columns].values.tolist()
    
    # convert each of them to a pd.Series
    # to be used lated for export .ts files
    ts = [pd.Series(s) for s in ts]
    
    # convert it to pandas series?
    separate_vars_weekly_TS[a_variable] = ts

#### Replace NaNs 
with zeros and random variables, possibly, to be used in mv-ts-transformers.

In [22]:
separate_varieties_weekly_NaNZeros = separate_varieties_weekly.copy()
separate_varieties_weekly_NaNRand = separate_varieties_weekly.copy()

In [23]:
separate_varieties_weekly_NaNZeros = separate_varieties_weekly_NaNZeros.fillna(0)

import random
random.seed(42)
np.random.seed(42)
for col in separate_varieties_weekly_NaNRand.columns:
    na_mask = separate_varieties_weekly_NaNRand[col].isna()
    # random integers between 0 and 9
    separate_varieties_weekly_NaNRand.loc[na_mask, col] = np.random.normal(size=na_mask.sum())

separate_varieties_weekly_NaNRand.head(2)

Unnamed: 0,yield,1_tavg,2_tavg,3_tavg,4_tavg,5_tavg,6_tavg,7_tavg,8_tavg,9_tavg,...,17_vs,18_vs,19_vs,20_vs,21_vs,22_vs,23_vs,24_vs,25_vs,ID
0,43.9,7.725714,14.655,11.125714,14.511429,14.647143,12.345,18.738571,14.102143,14.849286,...,2.528571,2.55,0.491343,-1.155124,0.058218,0.403769,1.16051,-0.186669,-0.231709,Almira_2005_Alpowa
1,37.7,7.725714,14.655,11.125714,14.511429,14.647143,12.345,18.738571,14.102143,14.849286,...,2.528571,2.55,0.21919,0.08282,-0.754932,-1.195912,-0.838097,-0.568007,-0.943152,Almira_2005_Alturas


In [24]:
## Populate the dataframe with list of time-series in each cell
for a_variable in variables_list:
    curr_columns = variables_colums_grp[a_variable]
    ts = separate_varieties_weekly_NaNZeros[curr_columns].values.tolist()
    
    # convert each of them to a pd.Series
    # to be used lated for export .ts files
    ts = [pd.Series(s) for s in ts]
    
    # convert it to pandas series?
    separate_vars_weekly_TS_NaNZeros[a_variable] = ts

In [25]:
## Populate the dataframe with list of time-series in each cell
for a_variable in variables_list:
    curr_columns = variables_colums_grp[a_variable]
    ts = separate_varieties_weekly_NaNRand[curr_columns].values.tolist()
    
    # convert each of them to a pd.Series. 
    # to be used lated for export .ts files
    ts = [pd.Series(s) for s in ts]
    
    # convert it to pandas series?
    separate_vars_weekly_TS_NaNRand[a_variable] = ts

In [26]:
separate_vars_weekly_TS_NaNRand.head(2)

Unnamed: 0,ID,yield,gdd,ravg,hdd,fdd,vpd,tavg,srad,dtr,prdtr,vs,precip,dgdd
0,Almira_2005_Alpowa,43.9,0 13.650000 1 12.905000 2 16.75...,0 56.278571 1 52.964286 2 54.84285...,0 0.000000 1 11.620000 2 0.05000...,0 28.900000 1 0.000000 2 9.44000...,0 0.651429 1 1.100000 2 0.745714 3...,0 7.725714 1 14.655000 2 11.12571...,0 210.757143 1 278.185714 2 239.11...,0 15.014286 1 15.761429 2 13.09428...,0 0.034115 1 0.049412 2 0.070524 3...,0 4.042857 1 3.871429 2 2.928571 3...,0 2.800000 1 5.900000 2 3.90000...,0 13.650000 1 12.905000 2 16.75500...
1,Almira_2005_Alturas,37.7,0 13.650000 1 12.905000 2 16.75...,0 56.278571 1 52.964286 2 54.84285...,0 0.000000 1 11.620000 2 0.05000...,0 28.900000 1 0.000000 2 9.44000...,0 0.651429 1 1.100000 2 0.745714 3...,0 7.725714 1 14.655000 2 11.12571...,0 210.757143 1 278.185714 2 239.11...,0 15.014286 1 15.761429 2 13.09428...,0 0.034115 1 0.049412 2 0.070524 3...,0 4.042857 1 3.871429 2 2.928571 3...,0 2.800000 1 5.900000 2 3.90000...,0 13.650000 1 12.905000 2 16.75500...


In [27]:
separate_vars_weekly_TS_NaNZeros.head(2)

Unnamed: 0,ID,yield,gdd,ravg,hdd,fdd,vpd,tavg,srad,dtr,prdtr,vs,precip,dgdd
0,Almira_2005_Alpowa,43.9,0 13.650 1 12.905 2 16.755 3 ...,0 56.278571 1 52.964286 2 54.84285...,0 0.00 1 11.62 2 0.05 3 4.7...,0 28.90 1 0.00 2 9.44 3 0.0...,0 0.651429 1 1.100000 2 0.745714 3...,0 7.725714 1 14.655000 2 11.12571...,0 210.757143 1 278.185714 2 239.11...,0 15.014286 1 15.761429 2 13.09428...,0 0.034115 1 0.049412 2 0.070524 3...,0 4.042857 1 3.871429 2 2.928571 3...,0 2.8 1 5.9 2 3.9 3 38.4 4 ...,0 13.650 1 12.905 2 16.755 3 1...
1,Almira_2005_Alturas,37.7,0 13.650 1 12.905 2 16.755 3 ...,0 56.278571 1 52.964286 2 54.84285...,0 0.00 1 11.62 2 0.05 3 4.7...,0 28.90 1 0.00 2 9.44 3 0.0...,0 0.651429 1 1.100000 2 0.745714 3...,0 7.725714 1 14.655000 2 11.12571...,0 210.757143 1 278.185714 2 239.11...,0 15.014286 1 15.761429 2 13.09428...,0 0.034115 1 0.049412 2 0.070524 3...,0 4.042857 1 3.871429 2 2.928571 3...,0 2.8 1 5.9 2 3.9 3 38.4 4 ...,0 13.650 1 12.905 2 16.755 3 1...


In [28]:
separate_vars_weekly_TS.head(2)

Unnamed: 0,ID,yield,gdd,ravg,hdd,fdd,vpd,tavg,srad,dtr,prdtr,vs,precip,dgdd
0,Almira_2005_Alpowa,43.9,0 13.650 1 12.905 2 16.755 3 ...,0 56.278571 1 52.964286 2 54.84285...,0 0.00 1 11.62 2 0.05 3 4.7...,0 28.90 1 0.00 2 9.44 3 0.0...,0 0.651429 1 1.100000 2 0.745714 3...,0 7.725714 1 14.655000 2 11.12571...,0 210.757143 1 278.185714 2 239.11...,0 15.014286 1 15.761429 2 13.09428...,0 0.034115 1 0.049412 2 0.070524 3...,0 4.042857 1 3.871429 2 2.928571 3...,0 2.8 1 5.9 2 3.9 3 38.4 4 ...,0 13.650 1 12.905 2 16.755 3 1...
1,Almira_2005_Alturas,37.7,0 13.650 1 12.905 2 16.755 3 ...,0 56.278571 1 52.964286 2 54.84285...,0 0.00 1 11.62 2 0.05 3 4.7...,0 28.90 1 0.00 2 9.44 3 0.0...,0 0.651429 1 1.100000 2 0.745714 3...,0 7.725714 1 14.655000 2 11.12571...,0 210.757143 1 278.185714 2 239.11...,0 15.014286 1 15.761429 2 13.09428...,0 0.034115 1 0.049412 2 0.070524 3...,0 4.042857 1 3.871429 2 2.928571 3...,0 2.8 1 5.9 2 3.9 3 38.4 4 ...,0 13.650 1 12.905 2 16.755 3 1...


In [29]:
separate_varieties_weekly.head(2)

Unnamed: 0,yield,1_tavg,2_tavg,3_tavg,4_tavg,5_tavg,6_tavg,7_tavg,8_tavg,9_tavg,...,17_vs,18_vs,19_vs,20_vs,21_vs,22_vs,23_vs,24_vs,25_vs,ID
0,43.9,7.725714,14.655,11.125714,14.511429,14.647143,12.345,18.738571,14.102143,14.849286,...,2.528571,2.55,,,,,,,,Almira_2005_Alpowa
1,37.7,7.725714,14.655,11.125714,14.511429,14.647143,12.345,18.738571,14.102143,14.849286,...,2.528571,2.55,,,,,,,,Almira_2005_Alturas


In [30]:
separate_vars_weekly_TS["vs"][0]

0     4.042857
1     3.871429
2     2.928571
3     2.871429
4     3.628571
5     3.685714
6     3.785714
7     3.628571
8     3.200000
9     3.571429
10    2.357143
11    3.671429
12    3.828571
13    3.000000
14    3.014286
15    3.300000
16    2.528571
17    2.550000
18         NaN
19         NaN
20         NaN
21         NaN
22         NaN
23         NaN
24         NaN
dtype: float64

In [31]:
separate_vars_weekly_TS_NaNZeros["vs"][0]

0     4.042857
1     3.871429
2     2.928571
3     2.871429
4     3.628571
5     3.685714
6     3.785714
7     3.628571
8     3.200000
9     3.571429
10    2.357143
11    3.671429
12    3.828571
13    3.000000
14    3.014286
15    3.300000
16    2.528571
17    2.550000
18    0.000000
19    0.000000
20    0.000000
21    0.000000
22    0.000000
23    0.000000
24    0.000000
dtype: float64

In [32]:
separate_vars_weekly_TS_NaNRand["vs"][0]

0     4.042857
1     3.871429
2     2.928571
3     2.871429
4     3.628571
5     3.685714
6     3.785714
7     3.628571
8     3.200000
9     3.571429
10    2.357143
11    3.671429
12    3.828571
13    3.000000
14    3.014286
15    3.300000
16    2.528571
17    2.550000
18    0.491343
19   -1.155124
20    0.058218
21    0.403769
22    1.160510
23   -0.186669
24   -0.231709
dtype: float64

### Split 80-20 for all three datasets

In [33]:
X_train, X_test, y_train, y_test = train_test_split(separate_vars_weekly_TS.drop(columns=["yield"], inplace=False),
                                                    separate_vars_weekly_TS["yield"], 
                                                    test_size=0.2, random_state=42)

In [34]:
train_idx = list(X_train.index)
test_idx = list(X_test.index)

In [35]:
separate_vars_weekly_TS_train = separate_vars_weekly_TS[separate_vars_weekly_TS.index.isin(train_idx)].copy()
separate_vars_weekly_TS_test = separate_vars_weekly_TS[separate_vars_weekly_TS.index.isin(test_idx)].copy()

print (f"{separate_vars_weekly_TS_train.shape = }")
print (f"{separate_vars_weekly_TS_test.shape = }")

separate_vars_weekly_TS_train.shape = (1544, 14)
separate_vars_weekly_TS_test.shape = (387, 14)


In [36]:
separate_vars_weekly_TS_NaNZeros_train = separate_vars_weekly_TS_NaNZeros[
                                            separate_vars_weekly_TS_NaNZeros.index.isin(train_idx)].copy()
separate_vars_weekly_TS_NaNZeros_test = separate_vars_weekly_TS_NaNZeros[
                                            separate_vars_weekly_TS_NaNZeros.index.isin(test_idx)].copy()

print (f"{separate_vars_weekly_TS_NaNZeros_train.shape = }")
print (f"{separate_vars_weekly_TS_NaNZeros_test.shape = }")

separate_vars_weekly_TS_NaNZeros_train.shape = (1544, 14)
separate_vars_weekly_TS_NaNZeros_test.shape = (387, 14)


In [37]:
separate_vars_weekly_TS_NaNRand_train = separate_vars_weekly_TS_NaNRand[
                                            separate_vars_weekly_TS_NaNRand.index.isin(train_idx)].copy()
separate_vars_weekly_TS_NaNRand_test = separate_vars_weekly_TS_NaNRand[
                                            separate_vars_weekly_TS_NaNRand.index.isin(test_idx)].copy()
print (f"{separate_vars_weekly_TS_NaNRand_train.shape = }")
print (f"{separate_vars_weekly_TS_NaNRand_test.shape = }")

separate_vars_weekly_TS_NaNRand_train.shape = (1544, 14)
separate_vars_weekly_TS_NaNRand_test.shape = (387, 14)


In [147]:
import sktime
from sktime.datasets import load_arrow_head
from sktime.datasets import write_dataframe_to_tsfile

In [39]:
# out_dir_ = reOrganized_dir + "wheat_regression_data_mvts/NaNRand/"
# os.makedirs(out_dir_, exist_ok=True)

# with open(out_dir_ + 'separate_vars_weekly_TS_NaNRand_TRAIN.ts', "w", encoding="utf-8") as f:
#     f.write(separate_vars_weekly_TS_NaNRand_train)

In [41]:
def ensure_series(cell):
    # If already a Series, return as-is
    if isinstance(cell, pd.Series):
        return cell
    # If list/array, wrap as Series
    elif isinstance(cell, (list, tuple)) or hasattr(cell, "__len__"):
        return pd.Series(cell)
    # Otherwise (scalar or NaN), wrap as 1-element Series
    else:
        return pd.Series([cell])

In [42]:
for col in X.columns:
    print (type(X[col]))

<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>


In [43]:
# from sktime.datatypes import check_is_scitype

# check_is_scitype(X, scitype="Panel", return_metadata=True)

In [48]:
for ii in list(separate_vars_weekly_TS_NaNZeros_train["yield"]):
    if type(ii)==None:
        print (ii)

In [52]:
def check_non_series_cells(df):
    for row_idx in df.index:
        for col_name in df.columns:
            value = df.at[row_idx, col_name]
            if not isinstance(value, pd.Series):
                print(f"Non-Series found at row {row_idx}, column '{col_name}': type={type(value)}")
                
X = separate_vars_weekly_TS_NaNZeros_train.drop(columns=["ID", "yield"])
check_non_series_cells(X)

In [162]:
out_dir_ = reOrganized_dir + "wheat_regression_data_mvts/separate_vars_weekly_TS_NaNZeros/"
os.makedirs(out_dir_, exist_ok=True)

X = separate_vars_weekly_TS_NaNZeros_train.drop(columns=["ID", "yield"]).copy()
X.reset_index(drop=True, inplace=True)
y = pd.Series(separate_vars_weekly_TS_NaNZeros_train["yield"])

write_dataframe_to_tsfile(data=X, 
                          class_label = y, 
                          class_value_list=y.tolist(),
                          path=out_dir_,
                          problem_name= "separate_vars_weekly_TS_NaNZeros_TRAIN")


del(X,y)

X = separate_vars_weekly_TS_NaNZeros_test.drop(columns=["ID", "yield"]).copy()
X.reset_index(drop=True, inplace=True)
y = pd.Series(separate_vars_weekly_TS_NaNZeros_test["yield"])

write_dataframe_to_tsfile(data=X, 
                          class_label = y, 
                          class_value_list=y.tolist(),
                          path=out_dir_,
                          problem_name= "separate_vars_weekly_TS_NaNZeros_TEST")

In [163]:
out_dir_ = reOrganized_dir + "wheat_regression_data_mvts/separate_vars_weekly_TS_NaNRand/"
os.makedirs(out_dir_, exist_ok=True)

X = separate_vars_weekly_TS_NaNRand_train.drop(columns=["ID", "yield"]).copy()
X.reset_index(drop=True, inplace=True)
y = pd.Series(separate_vars_weekly_TS_NaNRand_train["yield"])

write_dataframe_to_tsfile(data=X, 
                          class_label = y, 
                          class_value_list=y.tolist(),
                          path=out_dir_,
                          problem_name= "separate_vars_weekly_TS_NaNRand_TRAIN")


del(X,y)

X = separate_vars_weekly_TS_NaNRand_test.drop(columns=["ID", "yield"]).copy()
X.reset_index(drop=True, inplace=True)
y = pd.Series(separate_vars_weekly_TS_NaNRand_test["yield"])

write_dataframe_to_tsfile(data=X, 
                          class_label = y, 
                          class_value_list=y.tolist(),
                          path=out_dir_,
                          problem_name= "separate_vars_weekly_TS_NaNRand_TEST")

In [164]:
out_dir_ = reOrganized_dir + "wheat_regression_data_mvts/separate_vars_weekly_TS_wNaN/"
os.makedirs(out_dir_, exist_ok=True)

X = separate_vars_weekly_TS_train.drop(columns=["ID", "yield"]).copy()
X.reset_index(drop=True, inplace=True)
y = pd.Series(separate_vars_weekly_TS_train["yield"])

write_dataframe_to_tsfile(data=X, 
                          class_label = y, 
                          class_value_list=y.tolist(),
                          path=out_dir_,
                          problem_name= "separate_vars_weekly_TS_wNaN_TRAIN")


del(X,y)

X = separate_vars_weekly_TS_test.drop(columns=["ID", "yield"]).copy()
X.reset_index(drop=True, inplace=True)
y = pd.Series(separate_vars_weekly_TS_test["yield"])

write_dataframe_to_tsfile(data=X, 
                          class_label = y, 
                          class_value_list=y.tolist(),
                          path=out_dir_,
                          problem_name= "separate_vars_weekly_TS_wNaN_TEST")

In [170]:
def compare_dataframes_by_position(df1, df2):
    if df1.shape != df2.shape:
        print(f"DataFrames have different shapes: {df1.shape} vs {df2.shape}")
        return

    unequal = False
    for i in range(df1.shape[0]):  # Iterate rows by position
        for j in range(df1.shape[1]):  # Iterate columns by position
            val1 = df1.iat[i, j]
            val2 = df2.iat[i, j]
            if isinstance(val1, pd.Series) and isinstance(val2, pd.Series):
                if not val1.equals(val2):
                    print(f"Difference at row={i}, col={j}:")
                    print(f"  df1: {val1}")
                    print(f"  df2: {val2}")
                    unequal = True
            else:
                if val1 != val2:
                    print(f"Difference at row={i}, col={j}: df1={val1}, df2={val2}")
                    unequal = True

    if not unequal:
        print("The DataFrames are identical by content (column names ignored).")


In [189]:
# Check if it worked:
dir_ = reOrganized_dir + "wheat_regression_data_mvts/separate_vars_weekly_TS_NaNRand/"
f_name = "separate_vars_weekly_TS_NaNRand_TRAIN.ts"
A, A_labels = sktime.datasets.load_from_tsfile_to_dataframe(dir_ + f_name,
                                                            return_separate_X_and_y=True, 
                                                            replace_missing_vals_with='NaN')

df2 = separate_vars_weekly_TS_NaNRand_train.drop(columns=["ID", "yield"]).copy()
compare_dataframes_by_position(A, df2)

print ((list(separate_vars_weekly_TS_NaNRand_train["yield"]) == A_labels).sum() == len(A_labels))

print ("----------   Testset.   ----------")

f_name = "separate_vars_weekly_TS_NaNRand_TEST.ts"
A, A_labels = sktime.datasets.load_from_tsfile_to_dataframe(dir_ + f_name,
                                                            return_separate_X_and_y=True, 
                                                            replace_missing_vals_with='NaN')

df2 = separate_vars_weekly_TS_NaNRand_test.drop(columns=["ID", "yield"]).copy()
compare_dataframes_by_position(A, df2)

print ((list(separate_vars_weekly_TS_NaNRand_test["yield"]) == A_labels).sum() == len(A_labels))

The DataFrames are identical by content (column names ignored).
True
----------   Testset.   ----------
The DataFrames are identical by content (column names ignored).
True


In [190]:
# Check if it worked:
dir_ = reOrganized_dir + "wheat_regression_data_mvts/separate_vars_weekly_TS_NaNZeros/"
f_name = "separate_vars_weekly_TS_NaNZeros_TRAIN.ts"

A, A_labels = sktime.datasets.load_from_tsfile_to_dataframe(dir_ + f_name,
                                                            return_separate_X_and_y=True, 
                                                            replace_missing_vals_with='NaN')

df2 = separate_vars_weekly_TS_NaNZeros_train.drop(columns=["ID", "yield"]).copy()
compare_dataframes_by_position(A, df2)

print ((list(separate_vars_weekly_TS_NaNZeros_train["yield"]) == A_labels).sum() == len(A_labels))

print ("----------   Testset.   ----------")

f_name = "separate_vars_weekly_TS_NaNZeros_TEST.ts"
A, A_labels = sktime.datasets.load_from_tsfile_to_dataframe(dir_ + f_name,
                                                            return_separate_X_and_y=True, 
                                                            replace_missing_vals_with='NaN')

df2 = separate_vars_weekly_TS_NaNZeros_test.drop(columns=["ID", "yield"]).copy()
compare_dataframes_by_position(A, df2)

print ((list(separate_vars_weekly_TS_NaNZeros_test["yield"]) == A_labels).sum() == len(A_labels))


The DataFrames are identical by content (column names ignored).
True
----------   Testset.   ----------
The DataFrames are identical by content (column names ignored).
True


In [193]:
# Check if it worked:
dir_ = reOrganized_dir + "wheat_regression_data_mvts/separate_vars_weekly_TS_wNaN/"
f_name = "separate_vars_weekly_TS_wNaN_TRAIN.ts"

A, A_labels = sktime.datasets.load_from_tsfile_to_dataframe(dir_ + f_name,
                                                            return_separate_X_and_y=True, 
                                                            replace_missing_vals_with='NaN')

df2 = separate_vars_weekly_TS_train.drop(columns=["ID", "yield"]).copy()
compare_dataframes_by_position(A, df2)

print ((list(separate_vars_weekly_TS_train["yield"]) == A_labels).sum() == len(A_labels))

print ("----------   Testset.   ----------")
f_name = "separate_vars_weekly_TS_wNaN_TEST.ts"
A, A_labels = sktime.datasets.load_from_tsfile_to_dataframe(dir_ + f_name,
                                                            return_separate_X_and_y=True, 
                                                            replace_missing_vals_with='NaN')

df2 = separate_vars_weekly_TS_test.drop(columns=["ID", "yield"]).copy()
compare_dataframes_by_position(A, df2)

print ((list(separate_vars_weekly_TS_test["yield"]) == A_labels).sum() == len(A_labels))

The DataFrames are identical by content (column names ignored).
True
----------   Testset.   ----------
The DataFrames are identical by content (column names ignored).
True
