In [2]:
import pandas as pd
import os 

In [3]:
#####################################################################
#   Combine every workbook in ./yearly_new_pdsi into one dataframe  #
#####################################################################
# Each Excel file in `directory` is read, tagged with a 'location' column
# holding the file name without its extension, and all frames are stacked
# and written to a single workbook.

directory = './yearly_new_pdsi'

# os.listdir() returns entries in arbitrary order, so sort them to keep the
# row order of the combined workbook reproducible between runs.  Names that
# start with "~$" are skipped: Excel typically leaves such owner/lock files
# next to an open workbook, and they are not readable workbooks.
workbook_names = sorted(
    name for name in os.listdir(directory)
    if name.endswith((".xlsx", ".xls")) and not name.startswith("~$")
)
if not workbook_names:
    # pd.concat([]) would otherwise fail with "No objects to concatenate".
    raise ValueError(f"No Excel files found in {directory!r}")

# List to store dataframes
dataframes = []
for filename in workbook_names:
    filepath = os.path.join(directory, filename)
    df = pd.read_excel(filepath)

    # The file name (without extension) identifies the location
    location, extension = os.path.splitext(filename)
    df['location'] = location
    dataframes.append(df)

# Concatenate all the dataframes
combined_df = pd.concat(dataframes, ignore_index=True)
combined_df.to_excel('./combined_yearly_modified.xlsx', index=False)

In [2]:
###########################################################################################
#   Pivot the combined weekly table: the 'wap' values move from rows to columns           #
###########################################################################################
# NOTE(review): another cell in this notebook also writes
# './pivot_weekly_modified.xlsx', pivoting on 'week_after_start' with fewer
# variables; whichever cell runs last determines the file's content.

weekly_variables = ["precip" ,"Tavg","gdd","dtr","prdtr","dgdd","srad","ravg","vs", "vpd", "fdd", "hdd"]

df = pd.read_excel('./combined_weekly_modified.xlsx', sheet_name = 'Sheet1')

# One row per (location, year); one column per (variable, wap) pair.
# No aggfunc is passed, so pandas' default aggregation is applied to rows
# that share the same key.
grouped_df = pd.pivot_table(
    df,
    index=["location", "year"],
    columns="wap",
    values=weekly_variables,
)

# Collapse the two-level (variable, wap) header into flat "<wap>_<variable>" names
flat_names = []
for variable, wap in grouped_df.columns:
    flat_names.append(f"{wap}_{variable}")
grouped_df.columns = flat_names

grouped_df = grouped_df.reset_index()
grouped_df.to_excel('./pivot_weekly_modified.xlsx', index = False)

In [None]:
###########################################################################################
# Attach grain yield to the pivoted environmental variables, one workbook per variety,   #
# matching rows on year and location                                                      #
###########################################################################################
# Load the sheets
df_var = pd.read_excel('sws_spring_wheat.xlsx', sheet_name='Sheet3')

# Fail early with a readable message when the sheet lacks the expected
# headers, instead of a KeyError raised from inside the loop.
required_columns = ['variety', 'year', 'location', 'grain_yield']
missing_columns = [col for col in required_columns if col not in df_var.columns]
if missing_columns:
    raise KeyError(f"Sheet3 of sws_spring_wheat.xlsx is missing columns: {missing_columns}")

# The environmental table is the same for every variety, so read it once
# instead of on every loop iteration.
# NOTE(review): no cell visible in this notebook writes 'pivot_weekly.xlsx'
# (only 'pivot_weekly_modified.xlsx') - confirm this is the intended input.
df_we = pd.read_excel('pivot_weekly.xlsx')

# to_excel() does not create missing folders, so make sure the target exists.
variety_dir = './varieties_weekly'
os.makedirs(variety_dir, exist_ok=True)

varieties = df_var['variety'].unique()

for variety in varieties:
    df_alp = df_var[df_var['variety'] == variety]

    # Left merge + dropna keeps only the (year, location) rows for which this
    # variety has a grain_yield value.
    df_merged = df_we.merge(
        df_alp[['year', 'location', 'grain_yield']],
        on=['year', 'location'],
        how='left',
        suffixes=('', '_alp'),
    )
    df_merged = df_merged.dropna(subset=['grain_yield'])
    df_merged.to_excel(f'{variety_dir}/{variety}.xlsx', index = False)

KeyError: "['year', 'grain_yield'] not in index"

In [4]:
import pandas as pd
import os

# Trial records (Sheet3) and the distinct locations they cover
df_loc = pd.read_excel('sws_spring_wheat.xlsx', sheet_name='Sheet3')
locations = df_loc['location'].unique()

# The pivoted weekly table is shared by every location, so it is loaded once
df_we = pd.read_excel('pivot_weekly_modified.xlsx')

# Per-location workbooks and the combined workbook all go into this folder
output_dir = './locations_weekly'
os.makedirs(output_dir, exist_ok=True)

trial_columns = ['year', 'location', 'grain_yield','variety']
merge_keys = ['year', 'location']

# Collects the merged frame of every location for the final concatenation
data_frame = []
for loc in locations:
    # Trial rows belonging to this location only
    df_alp = df_loc[df_loc['location'] == loc]

    # Left-join the trial rows onto the weekly table, then discard the weekly
    # rows that received no grain_yield from the join
    df_merged = df_we.merge(
        df_alp[trial_columns],
        on=merge_keys,
        how='left',
        suffixes=('', '_alp'),
    ).dropna(subset=['grain_yield'])

    # One workbook per location, named after the location
    output_file = os.path.join(output_dir, f'{loc}.xlsx')
    df_merged.to_excel(output_file, index=False)
    data_frame.append(df_merged)
print(f"Files have been saved in the directory: {output_dir}")

# Stack all locations and store the result next to the per-location files
final_df = pd.concat(data_frame, ignore_index=True)
final_output_file = os.path.join(output_dir, 'merged_with_vars.xlsx')
final_df.to_excel(final_output_file, index=False)


Files have been saved in the directory: ./locations_weekly


In [8]:
import pandas as pd
import os

#######################################################################################################
# Build a dataframe holding the average over all rows (varieties) of each location-year, because     #
# otherwise a single environment would carry several grain yields                                     #
#######################################################################################################
df = pd.read_excel('sws_spring_wheat.xlsx')

# Average every numeric column within each (Location, Year) group, then turn
# the group keys back into ordinary columns.
location_year_groups = df.groupby(['Location','Year'])
df_mean = location_year_groups.mean(numeric_only=True)
df_mean = df_mean.reset_index()

# Distinct locations present in the averaged table
locations = df_mean['Location'].unique()

# The weekly table is common to all locations, so it is read a single time
df_we = pd.read_excel('pivot_weekly_modified.xlsx')

# Create the output folder if it is not there yet
output_dir = './locations_weekly'
os.makedirs(output_dir, exist_ok=True)
import os
import pandas as pd

# Create the output folder if it is not there yet
os.makedirs(output_dir, exist_ok=True)

# Collects the merged frame of every location
data_frame = []

for loc in locations:
    # Averaged rows of the current location
    df_alp = df_mean[df_mean['Location'] == loc]

    # The weekly table uses lower-case key names and the averaged table uses
    # capitalised ones, hence the separate left_on / right_on lists.
    df_merged = df_we.merge(
        df_alp[['Year', 'Location', 'Test_weight']],
        left_on=['year', 'location'],
        right_on=['Year', 'Location'],
        how='left',
        suffixes=('', '_alp'),
    )

    # Keep only the weekly rows that received a Test_weight value
    df_merged = df_merged.dropna(subset=['Test_weight'])
    data_frame.append(df_merged)

# Stack all locations into one frame and write it to a single workbook
final_df = pd.concat(data_frame, ignore_index=True)
final_df.to_excel('merged_varieties_tw.xlsx', index=False)


In [62]:
#####################################################################
#   This part will combine all the monthly data in one dataframe    #
#####################################################################
# Each Excel file in `directory` is read, tagged with a 'location' column
# holding the file name without its extension, and all frames are stacked
# and written to a single workbook.

directory = './monthly_agg_variables'

# os.listdir() returns entries in arbitrary order, so sort them to keep the
# row order of the combined workbook reproducible between runs.  Names that
# start with "~$" are skipped: Excel typically leaves such owner/lock files
# next to an open workbook, and they are not readable workbooks.
workbook_names = sorted(
    name for name in os.listdir(directory)
    if name.endswith((".xlsx", ".xls")) and not name.startswith("~$")
)
if not workbook_names:
    # pd.concat([]) would otherwise fail with "No objects to concatenate".
    raise ValueError(f"No Excel files found in {directory!r}")

# List to store dataframes
dataframes = []
for filename in workbook_names:
    filepath = os.path.join(directory, filename)
    df = pd.read_excel(filepath)

    # The file name (without extension) identifies the location
    location, extension = os.path.splitext(filename)
    df['location'] = location
    dataframes.append(df)

# Concatenate all the dataframes
combined_df = pd.concat(dataframes, ignore_index=True)
combined_df.to_excel('./combined_monthly.xlsx', index=False)

In [64]:
###########################################################################################
#   Pivot the combined monthly table: the 'month' values move from rows to columns        #
###########################################################################################

monthly_variables = ["precip", "gdd","dtr","srad","ravg","vs", "vpd"]

df = pd.read_excel('./combined_monthly.xlsx', sheet_name = 'Sheet1')

# One row per (location, year); one column per (variable, month) pair.
# No aggfunc is passed, so pandas' default aggregation is applied to rows
# that share the same key.
grouped_df = pd.pivot_table(
    df,
    index=["location", "year"],
    columns="month",
    values=monthly_variables,
)

# Collapse the two-level (variable, month) header into flat "<month>_<variable>" names
grouped_df.columns = [f"{month}_{variable}" for variable, month in grouped_df.columns]

grouped_df = grouped_df.reset_index()
grouped_df.to_excel('./pivot_monthly.xlsx', index = False)

In [66]:
###########################################################################################
# Attach grain yield to the pivoted monthly variables, one workbook per variety,         #
# matching rows on year and location                                                      #
###########################################################################################
# Load the sheets
df_var = pd.read_excel('sws_spring_wheat.xlsx', sheet_name='Sheet3')

# Fail early with a readable message when the sheet lacks the expected
# headers, instead of a KeyError raised from inside the loop.
required_columns = ['variety', 'year', 'location', 'grain_yield']
missing_columns = [col for col in required_columns if col not in df_var.columns]
if missing_columns:
    raise KeyError(f"Sheet3 of sws_spring_wheat.xlsx is missing columns: {missing_columns}")

# The monthly table is the same for every variety, so read it once instead
# of on every loop iteration.
df_we = pd.read_excel('pivot_monthly.xlsx')

# to_excel() does not create missing folders, so make sure the target exists.
variety_dir = './varieties_monthly'
os.makedirs(variety_dir, exist_ok=True)

varieties = df_var['variety'].unique()

for variety in varieties:
    df_alp = df_var[df_var['variety'] == variety]

    # Left merge + dropna keeps only the (year, location) rows for which this
    # variety has a grain_yield value.
    df_merged = df_we.merge(
        df_alp[['year', 'location', 'grain_yield']],
        on=['year', 'location'],
        how='left',
        suffixes=('', '_alp'),
    )
    df_merged = df_merged.dropna(subset=['grain_yield'])
    df_merged.to_excel(f'{variety_dir}/{variety}.xlsx', index=False)

In [77]:
#####################################################################
#   This part will combine all the yearly data in one dataframe     #
#####################################################################
# Each Excel file in `directory` is read, tagged with a 'location' column
# holding the file name without its extension, and all frames are stacked
# and written to a single workbook.

directory = './yearly_agg_variables'

# os.listdir() returns entries in arbitrary order, so sort them to keep the
# row order of the combined workbook reproducible between runs.  Names that
# start with "~$" are skipped: Excel typically leaves such owner/lock files
# next to an open workbook, and they are not readable workbooks.
workbook_names = sorted(
    name for name in os.listdir(directory)
    if name.endswith((".xlsx", ".xls")) and not name.startswith("~$")
)
if not workbook_names:
    # pd.concat([]) would otherwise fail with "No objects to concatenate".
    raise ValueError(f"No Excel files found in {directory!r}")

# List to store dataframes
dataframes = []
for filename in workbook_names:
    filepath = os.path.join(directory, filename)
    df = pd.read_excel(filepath)

    # The file name (without extension) identifies the location
    location, extension = os.path.splitext(filename)
    df['location'] = location
    dataframes.append(df)

# Concatenate all the dataframes
combined_df = pd.concat(dataframes, ignore_index=True)
combined_df.to_excel('./combined_yearly.xlsx', index=False)

In [86]:
###########################################################################################
# Attach grain yield to the combined yearly variables, one workbook per variety,         #
# matching rows on year and location                                                      #
###########################################################################################
# Load the sheets
df_var = pd.read_excel('sws_spring_wheat.xlsx', sheet_name='Sheet3')

# Fail early with a readable message when the sheet lacks the expected
# headers, instead of a KeyError raised from inside the loop.
required_columns = ['variety', 'year', 'location', 'grain_yield']
missing_columns = [col for col in required_columns if col not in df_var.columns]
if missing_columns:
    raise KeyError(f"Sheet3 of sws_spring_wheat.xlsx is missing columns: {missing_columns}")

# The yearly table is the same for every variety, so read it once instead
# of on every loop iteration.
df_we = pd.read_excel('combined_yearly.xlsx')

# to_excel() does not create missing folders, so make sure the target exists.
variety_dir = './varieties_yearly'
os.makedirs(variety_dir, exist_ok=True)

# Columns written to each variety workbook, in this order
output_columns = ['year', 'location', 'precip', 'gdd', 'dtr', 'srad', 'ravg', 'vs', 'vpd','grain_yield']

varieties = df_var['variety'].unique()

for variety in varieties:
    df_alp = df_var[df_var['variety'] == variety]

    # Left merge + dropna keeps only the (year, location) rows for which this
    # variety has a grain_yield value.
    df_merged = df_we.merge(
        df_alp[['year', 'location', 'grain_yield']],
        on=['year', 'location'],
        how='left',
        suffixes=('', '_alp'),
    )
    df_merged = df_merged.dropna(subset=['grain_yield'])
    df_merged = df_merged[output_columns]

    df_merged.to_excel(f'{variety_dir}/{variety}.xlsx', index=False)

In [None]:
##############################################################################
#   This part will combine all the weekly_modified data in one dataframe     #
##############################################################################
# Each Excel file in `directory` is read, tagged with a 'location' column
# holding the file name without its extension, and all frames are stacked
# and written to a single workbook.

directory = './weekly_agg_variables_modified'

# os.listdir() returns entries in arbitrary order, so sort them to keep the
# row order of the combined workbook reproducible between runs.  Names that
# start with "~$" are skipped: Excel typically leaves such owner/lock files
# next to an open workbook, and they are not readable workbooks.
workbook_names = sorted(
    name for name in os.listdir(directory)
    if name.endswith((".xlsx", ".xls")) and not name.startswith("~$")
)
if not workbook_names:
    # pd.concat([]) would otherwise fail with "No objects to concatenate".
    raise ValueError(f"No Excel files found in {directory!r}")

# List to store dataframes
dataframes = []
for filename in workbook_names:
    filepath = os.path.join(directory, filename)
    df = pd.read_excel(filepath)

    # The file name (without extension) identifies the location
    location, extension = os.path.splitext(filename)
    df['location'] = location
    dataframes.append(df)

# Concatenate all the dataframes
combined_df = pd.concat(dataframes, ignore_index=True)
combined_df.to_excel('./combined_weekly_modified.xlsx', index=False)

In [None]:
###########################################################################################
#   Pivot the combined weekly table: 'week_after_start' moves from rows to columns        #
###########################################################################################
# NOTE(review): another cell in this notebook also writes
# './pivot_weekly_modified.xlsx', pivoting on 'wap' with more variables;
# whichever cell runs last determines the file's content.

weekly_variables = ["precip", "gdd","dtr","srad","ravg","vs", "vpd"]

df = pd.read_excel('./combined_weekly_modified.xlsx', sheet_name = 'Sheet1')

# One row per (location, year); one column per (variable, week) pair.
# No aggfunc is passed, so pandas' default aggregation is applied to rows
# that share the same key.
grouped_df = pd.pivot_table(
    df,
    index=["location", "year"],
    columns="week_after_start",
    values=weekly_variables,
)

# Collapse the two-level (variable, week) header into flat "<week>_<variable>" names
flat_names = []
for variable, week in grouped_df.columns:
    flat_names.append(f"{week}_{variable}")
grouped_df.columns = flat_names

grouped_df = grouped_df.reset_index()
grouped_df.to_excel('./pivot_weekly_modified.xlsx', index = False)

In [7]:
###########################################################################################
# Attach grain yield to the pivoted weekly (modified) variables, one workbook per        #
# variety, matching rows on year and location                                             #
###########################################################################################
# Load the sheets
df_var = pd.read_excel('sws_spring_wheat.xlsx', sheet_name='Sheet3')

# Fail early with a readable message when the sheet lacks the expected
# headers, instead of a KeyError raised from inside the loop.
required_columns = ['variety', 'year', 'location', 'grain_yield']
missing_columns = [col for col in required_columns if col not in df_var.columns]
if missing_columns:
    raise KeyError(f"Sheet3 of sws_spring_wheat.xlsx is missing columns: {missing_columns}")

# The weekly table is the same for every variety, so read it once instead
# of on every loop iteration.
df_we = pd.read_excel('pivot_weekly_modified.xlsx')

# to_excel() does not create missing folders, so make sure the target exists.
variety_dir = './varieties_weekly_modified'
os.makedirs(variety_dir, exist_ok=True)

varieties = df_var['variety'].unique()

for variety in varieties:
    df_alp = df_var[df_var['variety'] == variety]

    # Left merge + dropna keeps only the (year, location) rows for which this
    # variety has a grain_yield value.
    df_merged = df_we.merge(
        df_alp[['year', 'location', 'grain_yield']],
        on=['year', 'location'],
        how='left',
        suffixes=('', '_alp'),
    )
    df_merged = df_merged.dropna(subset=['grain_yield'])
    df_merged.to_excel(f'{variety_dir}/{variety}.xlsx', index = False)

In [None]:
###########################################################################################
#                  Removing year 2004 rows from the Excel file                            #
###########################################################################################

import pandas as pd

# Read the sheet that still contains the 2004 records
df = pd.read_excel('sws_spring_wheat_with2004.xlsx', sheet_name='Sheet2')

# Keep every row whose 'year' differs from 2004.  Despite its name, df_2004
# holds the data *without* the 2004 rows.
not_2004 = df['year'].ne(2004)
df_2004 = df.loc[not_2004]

# Write the remaining rows to a new workbook, without the index column
df_2004.to_excel('filtered_data.xlsx', index=False)