In [16]:
import pandas as pd
from scipy.cluster.hierarchy import dendrogram, linkage
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

# perform pairwise correlation analysis
df_env = pd.read_csv('df_12705.csv')
df = df_env.drop(columns=['location', 'year', 'Grain_yield', 'Heading_date', 'Height','Protein'])


###########################
###   Encode soil type  ###
###########################
# Assuming 'soil_type' is the column in your DataFrame
label_encoder = LabelEncoder()
df['soil_type_encoded'] = label_encoder.fit_transform(df['soil_type'])

# Optional: Check the mapping
soil_type_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print("Label Encoding Mapping:", soil_type_mapping)
df = df.drop('soil_type', axis = 1)
df.rename(columns={"soil_type_encoded": "Soil_type"}, inplace=True)

# Impute missing values (column-wise)
imputer = SimpleImputer(strategy='mean')
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

# Perform pairwise correlation analysis
corr_matrix = df_imputed.corr()


Label Encoding Mapping: {'Athena-Spofford silt loams': 0, 'Bagdad silt loam': 1, 'Lickskillet silt loam': 2, 'Lickskillet-Schuelke-Rock outcrop complex': 3, 'Palouse silt loam': 4, 'Ritzville silt loam': 5, 'Roloff-Bakeoven-Rock outcrop complex': 6, 'Thatuna silt loam': 7}


In [None]:
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

# Load your dataset
df_env = pd.read_csv('data_f.csv')

# Drop non-numeric or unnecessary columns
df = df_env.drop(columns=['location', 'year', 'Heading_date', 'Height', 'Protein', 'Test_weight'])

# Impute missing values (if any)
df.fillna(df.mean(), inplace=True)

# Add a constant term for the VIF calculation
X = add_constant(df)

# Calculate VIF for each feature
vif_data = pd.DataFrame()
vif_data['Feature'] = X.columns
vif_data['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

# Drop the constant row for clarity
vif_data = vif_data[vif_data['Feature'] != 'const']

print(vif_data)

In [17]:
# Compute the correlation matrix
corr_matrix = df.corr().abs()  # Absolute correlation values

# Unstack the matrix to get pairs
corr_pairs = corr_matrix.unstack()

# Remove duplicate pairs and self-correlations
filtered_pairs = corr_pairs[(corr_pairs.index.get_level_values(0) != corr_pairs.index.get_level_values(1))]
unique_pairs = filtered_pairs.loc[filtered_pairs.index.map(frozenset).duplicated(keep='first') == False]

# Sort the pairs by correlation value
sorted_pairs = unique_pairs.sort_values(ascending=False)

# Filter for top correlations (adjust threshold as needed)
top_pairs = sorted_pairs[sorted_pairs > 0.8]  # Example threshold: > 0.8
top_pairs

Maturity_soil        Beyond Maturity_soil    0.999858
Grain fill_soil      Beyond Maturity_soil    0.998882
Flowering_soil       Maturity_soil           0.998743
Maturity_pdsi        Beyond Maturity_pdsi    0.998451
Grain fill_pdsi      Maturity_pdsi           0.998220
                                               ...   
Flowering_fdd        Flowering_#days         0.806278
Jointing_Tavg        Jointing_hdd            0.805135
Maturity_rmin        Maturity_vpd            0.804397
Grain fill_etr       Grain fill_ravg         0.801806
Beyond Maturity_etr  Beyond Maturity_ravg    0.800896
Length: 206, dtype: float64

In [18]:
# Compute the correlation matrix
correlation_matrix = df.corr()

# Identify pairs of highly correlated variables (correlation > 0.7)
high_corr_pairs = []
for i, col1 in enumerate(df.columns[:-1]):  # Exclude the target column
    for col2 in df.columns[i+1:-1]:
        if abs(correlation_matrix.loc[col1, col2]) > 0.8:
            high_corr_pairs.append((col1, col2))

# Decide which variable to remove based on correlation with target
removed_features = set()
for col1, col2 in high_corr_pairs:
    corr1 = abs(correlation_matrix.loc[col1, 'Test_weight'])
    corr2 = abs(correlation_matrix.loc[col2, 'Test_weight'])
    if corr1 < corr2:
        removed_features.add(col1)
    else:
        removed_features.add(col2)

# Filter the DataFrame to remove the highly correlated variables
filtered_df = df.drop(columns=list(removed_features))

filtered_df.to_csv('correlation_tw_12705.csv', index  = False)

removed_features

{'Beyond Maturity_#days',
 'Beyond Maturity_Tavg',
 'Beyond Maturity_dgdd',
 'Beyond Maturity_etr',
 'Beyond Maturity_pdsi',
 'Beyond Maturity_pet',
 'Beyond Maturity_prdtr',
 'Beyond Maturity_ravg',
 'Beyond Maturity_soil',
 'Beyond Maturity_srad',
 'Beyond Maturity_tmin',
 'Beyond Maturity_vpd',
 'Emergence_#days',
 'Emergence_Tavg',
 'Emergence_pdsi',
 'Emergence_prdtr',
 'Emergence_rmax',
 'Emergence_rmin',
 'Emergence_srad',
 'Emergence_tmax',
 'Emergence_tmin',
 'Emergence_vpd',
 'Flowering_#days',
 'Flowering_Tavg',
 'Flowering_fdd',
 'Flowering_hdd',
 'Flowering_pdsi',
 'Flowering_prdtr',
 'Flowering_precip',
 'Flowering_ravg',
 'Flowering_rmax',
 'Flowering_rmin',
 'Flowering_soil',
 'Flowering_tmax',
 'Flowering_tmin',
 'Flowering_vpd',
 'Grain fill_Tavg',
 'Grain fill_fdd',
 'Grain fill_hdd',
 'Grain fill_pdsi',
 'Grain fill_pet',
 'Grain fill_prdtr',
 'Grain fill_ravg',
 'Grain fill_rmin',
 'Grain fill_soil',
 'Grain fill_tmax',
 'Grain fill_vpd',
 'Heading_#days',
 'Headin

In [5]:
max_value = correlation_matrix['Longitude'].nlargest(10)
max_value

Longitude               1.000000
Grain fill_soil         0.795510
Beyond Maturity_soil    0.759357
preseason_soil          0.755518
Heading_soil            0.734869
Flowering_soil          0.718673
Emergence_soil          0.703227
Maturity_soil           0.700727
Tillering_soil          0.692405
Jointing_soil           0.684062
Name: Longitude, dtype: float64

In [9]:
import pandas as pd
import numpy as np
import plotly.express as px

# Generate a sample dataset with 200 features and 500 rows
data = df_imputed

# Calculate the correlation matrix
corr_matrix = data.corr()

# Keep only the upper triangle of the correlation matrix
mask = np.tril(np.ones_like(corr_matrix, dtype=bool))
corr_matrix = corr_matrix.where(mask)

# Interactive heatmap with Plotly
fig = px.imshow(
    corr_matrix,
    color_continuous_scale='rainbow',  # A valid Plotly colorscale
    title="Interactive Correlation Matrix Heatmap (Upper Triangle)",
    labels={'x': "Features", 'y': "Features", 'color': "Correlation"}
)
fig.update_layout(width=1000, height=1000)
fig.show()
fig.write_html("correlation_heatmap_hd.html")