In [2]:
import os
import pandas as pd


In [4]:
# Relative location of the raw input files (resolved against the current working directory).
raw_dir_parts = ("..", "data", "01_raw")
data_path = os.path.join(*raw_dir_parts)

What are the files in the data folder?

In [15]:
# os.listdir returns entries in arbitrary order; sorting keeps this listing, and
# every later loop over it, reproducible across machines and file systems.
list_files = sorted(os.listdir(data_path))
list_files

['pr_maize_test.parquet',
 'pr_maize_train.parquet',
 'pr_wheat_test.parquet',
 'pr_wheat_train.parquet',
 'rsds_maize_test.parquet',
 'rsds_maize_train.parquet',
 'rsds_wheat_test.parquet',
 'rsds_wheat_train.parquet',
 'sample_submission.csv',
 'soil_co2_maize_test.parquet',
 'soil_co2_maize_train.parquet',
 'soil_co2_wheat_test.parquet',
 'soil_co2_wheat_train.parquet',
 'tasmax_maize_test.parquet',
 'tasmax_maize_train.parquet',
 'tasmax_wheat_test.parquet',
 'tasmax_wheat_train.parquet',
 'tasmin_maize_test.parquet',
 'tasmin_maize_train.parquet',
 'tasmin_wheat_test.parquet',
 'tasmin_wheat_train.parquet',
 'tas_maize_test.parquet',
 'tas_maize_train.parquet',
 'tas_wheat_test.parquet',
 'tas_wheat_train.parquet',
 'train_solutions_maize.parquet',
 'train_solutions_wheat.parquet']

Training data (1982-2020)
- For each crop (maize and wheat)
    - Five files containing daily climate variables, covering the period from 30 days before to 210 days after the planting date
        - rsds - daily short-wave radiation
        - pr - daily precipitation
        - tas - daily mean temperature
        - tasmax - daily maximum temperature
        - tasmin - daily minimum temperature
    - One “solutions” file containing the target variable (yield)
    - Soil
        - One file containing soil texture, real year, nitrogen fertilization rate and CO2 concentration

Test data (2021-2098)
- For each crop (maize and wheat)
    - Five files containing daily climate variables, covering the period from 30 days before to 210 days after the planting date
        - rsds - daily short-wave radiation
        - pr - daily precipitation
        - tas - daily mean temperature
        - tasmax - daily maximum temperature
        - tasmin - daily minimum temperature
    - Soil
        - One file containing soil texture, real year, nitrogen fertilization rate and CO2 concentration

sample_submission.csv - an example of how your submitted predictions file should look ==> No need to use this file

---

What does the yield file look like? What is its shape, and what are the statistics of the yields?

In [9]:
# Read the maize "solutions" parquet file from the raw-data folder.
maize_solutions_file = os.path.join(data_path, 'train_solutions_maize.parquet')
train_solutions_maize_df = pd.read_parquet(maize_solutions_file)

In [11]:
# Print the frame's name and dimensions, then show its summary statistics as the cell output.
print("train_solutions_maize_df", f"\nShape: {train_solutions_maize_df.shape}", sep="\n")
train_solutions_maize_df.describe()

train_solutions_maize_df

Shape: (349719, 1)


Unnamed: 0,yield
count,349719.0
mean,3.695721
std,2.747027
min,0.0
25%,1.654
50%,2.842
75%,4.978
max,23.496


In [12]:
# Read the wheat "solutions" parquet file from the raw-data folder.
wheat_solutions_file = os.path.join(data_path, 'train_solutions_wheat.parquet')
train_solutions_wheat_df = pd.read_parquet(wheat_solutions_file)

In [13]:
# Print the frame's name and dimensions, then show its summary statistics as the cell output.
print("train_solutions_wheat_df", f"\nShape: {train_solutions_wheat_df.shape}", sep="\n")
train_solutions_wheat_df.describe()

train_solutions_wheat_df

Shape: (278747, 1)


Unnamed: 0,yield
count,278747.0
mean,2.805196
std,1.903385
min,0.0
25%,1.54
50%,2.468
75%,3.594
max,13.935


==> This is the Y variable to predict; it needs to be joined with the other tables.

---

Do all the data files for each crop have the same number of lines?

In [21]:
def check_file_line_count(list_files, suffix, data_dir=None):
    """Return the row count of every parquet file whose name ends with ``suffix``.

    Parameters
    ----------
    list_files : iterable of str
        File names (not full paths) to filter.
    suffix : str
        Only names ending with this string are read.
    data_dir : str or path-like, optional
        Folder containing the files. When omitted, the notebook-level
        ``data_path`` is used, so existing two-argument calls keep working.

    Returns
    -------
    list of int
        One entry per successfully read file, in the order of ``list_files``.
        Files that cannot be read are reported with ``print`` and skipped, so
        the result can be shorter than the number of matching names.
    """
    line_counts = []
    for file in list_files:
        if not file.endswith(suffix):
            continue
        try:
            # Fall back to the notebook-level ``data_path`` only when needed,
            # so calls that pass ``data_dir`` do not depend on that global.
            folder = data_path if data_dir is None else data_dir
            df = pd.read_parquet(os.path.join(folder, file))
            line_counts.append(len(df))
        except Exception as e:
            # Best-effort: report the failure and keep scanning the remaining files.
            print(f"Error reading file {file}: {e}")
    return line_counts

# One suffix per (crop, split) group of files.
files_suffixes = [
    'maize_test.parquet',
    'maize_train.parquet',
    'wheat_test.parquet',
    'wheat_train.parquet',
]

# A group is consistent when its row counts collapse to a single distinct value.
for suffix in files_suffixes:
    line_counts = check_file_line_count(list_files, suffix)
    print(
        f"All files ending with {suffix} have the same number of lines."
        if len(set(line_counts)) == 1
        else f"Files ending with {suffix} do not all have the same number of lines."
    )


All files ending with maize_test.parquet have the same number of lines.
All files ending with maize_train.parquet have the same number of lines.
All files ending with wheat_test.parquet have the same number of lines.
All files ending with wheat_train.parquet have the same number of lines.


==> They can be joined on the 'id' column

---

Which columns differ between the train and test sets?

In [24]:
# Read the train and test versions of the wheat precipitation table.
train_path = os.path.join(data_path, 'pr_wheat_train.parquet')
test_path = os.path.join(data_path, 'pr_wheat_test.parquet')
train_df = pd.read_parquet(train_path)
test_df = pd.read_parquet(test_path)

# Column names that appear in exactly one of the two frames.
train_cols = set(train_df.columns)
test_cols = set(test_df.columns)
diff_cols = train_cols.symmetric_difference(test_cols)

print(f"Columns that are in train but not in test: {diff_cols.intersection(train_cols)}")
print(f"Columns that are in test but not in train: {diff_cols.intersection(test_cols)}")

# Drop the references to both frames; only the printed comparison is kept.
del train_df, test_df


Columns that are in train but not in test: set()
Columns that are in test but not in train: set()


---

For example, for wheat training, what do the 5 files of daily data look like?

In [34]:
# Print summary statistics for each wheat/train file, leaving out the soil_co2 file.
for file in list_files:
    if not file.endswith('wheat_train.parquet') or file.startswith('soil_co2'):
        continue
    df = pd.read_parquet(os.path.join(data_path, file))
    print(file)
    print(df.describe())
    del df

pr_wheat_train.parquet
                year            lon            lat              0  \
count  278747.000000  278747.000000  278747.000000  278747.000000   
mean      399.924627      18.953007      35.781651       0.000027   
std        11.301375      71.029895      25.641007       0.000073   
min       381.000000    -123.250000     -41.250000       0.000000   
25%       390.000000      -5.750000      34.750000       0.000000   
50%       400.000000      27.250000      43.750000       0.000000   
75%       410.000000      66.750000      51.250000       0.000017   
max       419.000000     152.750000      64.750000       0.002682   

                   1              2              3              4  \
count  278747.000000  278747.000000  278747.000000  278747.000000   
mean        0.000026       0.000026       0.000027       0.000027   
std         0.000070       0.000072       0.000072       0.000075   
min         0.000000       0.000000       0.000000       0.000000   
25%       

What does the file for CO2, soil, real year and fertilizers look like?

In [29]:
# Keep only the summary statistics of the wheat/train soil_co2 table; the frame
# itself is dropped once the summary has been computed.
soil_co2_file = os.path.join(data_path, 'soil_co2_wheat_train.parquet')
df = pd.read_parquet(soil_co2_file)
df_descr = df.describe()
del df
df_descr

Unnamed: 0,year,lon,lat,texture_class,real_year,co2,nitrogen
count,278747.0,278747.0,278747.0,278747.0,278747.0,278747.0,278747.0
mean,399.924627,18.953007,35.781651,7.926331,2000.925122,373.02978,83.143082
std,11.301375,71.029895,25.641007,2.432679,11.301383,21.295426,74.992844
min,381.0,-123.25,-41.25,1.0,1982.0,340.79,4.212
25%,390.0,-5.75,34.75,7.0,1991.0,355.02,5.351
50%,400.0,27.25,43.75,9.0,2001.0,370.47,78.544998
75%,410.0,66.75,51.25,9.0,2011.0,390.49,105.491997
max,419.0,152.75,64.75,13.0,2020.0,414.89,317.987
