In [1]:
# Import Libraries
from pathlib import Path

import pandas as pd

In [2]:
# Reading the crop yield CSV file with proper parsing for quotes and initial spaces

file_path = '../data/raw/Crop_Yield_Data.csv'  

df = pd.read_csv(file_path, skipinitialspace=True, quotechar='"')

In [3]:
# Stripping leading/trailing whitespaces from column names

df.columns = df.columns.str.strip()

print(df.columns)

Index(['Domain Code', 'Domain', 'Area Code (M49)', 'Area', 'Element Code',
       'Element', 'Item Code (CPC)', 'Item', 'Year Code', 'Year', 'Unit',
       'Value', 'Flag', 'Flag Description', 'Note'],
      dtype='object')


In [4]:
# Stripping leading/trailing whitespaces from the 'Element' column

df['Element'] = df['Element'].str.strip()

print(df['Element'].unique())

['Area harvested' 'Yield' 'Production']


In [5]:
# Filtering the DataFrame to only include rows where the Element is 'Yield'

yield_df = df[df['Element'] == 'Yield'].copy()

In [132]:
# Keeping only relevant columns: Year, Item, Value, and Unit

yield_df = yield_df[['Year', 'Item', 'Value', 'Unit']]

In [133]:
# Dropping rows where 'Value' is missing (NaN)

yield_df = yield_df.dropna(subset=['Value'])

In [134]:
# Checking the unique units present before any conversion

print("Unique units before conversion:", yield_df['Unit'].unique())

Unique units before conversion: ['kg/ha']


In [135]:
# Converting values from tonnes per hectare (t/ha) to kilograms per hectare (kg/ha)

yield_df.loc[yield_df['Unit'] == 't/ha', 'Value'] *= 1000

In [136]:
# Updating the unit label from 't/ha' to 'kg/ha' after conversion

yield_df.loc[yield_df['Unit'] == 't/ha', 'Unit'] = 'kg/ha'

In [137]:
# Filtering the DataFrame to include only rows with unit 'kg/ha'

yield_df = yield_df[yield_df['Unit'] == 'kg/ha']

In [138]:
# Renaming columns for clarity

yield_df.rename(columns={
    'Item': 'Crop',
    'Value': 'Yield_kg_per_ha'
}, inplace=True)

In [139]:
# Ensuring the 'Yield_kg_per_ha' column is numeric (not string)

yield_df['Yield_kg_per_ha'] = pd.to_numeric(yield_df['Yield_kg_per_ha'], errors='coerce')

print(yield_df.dtypes)

Year                 int64
Crop                object
Yield_kg_per_ha    float64
Unit                object
dtype: object


In [140]:
# Resetting the index of the DataFrame after filtering and transformations

yield_df.reset_index(drop=True, inplace=True)

In [141]:
# Removing the 'Unit' column from the DataFrame after filtering and transformations

yield_df.drop(columns='Unit', inplace=True)

In [142]:
# Exporting the processed yield data to a new CSV file

processed_file_path = '../data/processed/Processed_Crop_Yield.csv'

yield_df.to_csv(processed_file_path, index=False)

In [143]:
# Displaying the first 5 rows of the cleaned and processed crop yield data

print("Processed Crop Yield Data (first 5 rows):")
print(yield_df.head())

Processed Crop Yield Data (first 5 rows):
   Year                                               Crop  Yield_kg_per_ha
0  1961  Anise, badian, coriander, cumin, caraway, fenn...            500.0
1  1962  Anise, badian, coriander, cumin, caraway, fenn...            500.0
2  1963  Anise, badian, coriander, cumin, caraway, fenn...            500.0
3  1964  Anise, badian, coriander, cumin, caraway, fenn...            500.0
4  1965  Anise, badian, coriander, cumin, caraway, fenn...            500.0
