In [40]:
# Importing the Packages

import geopandas as gpd
import pandas as pd

In [41]:
# Load the raw RaCA sample table from the data folder.
# A forward-slash relative path is used so the notebook runs on Windows, Linux and macOS alike
# (a backslash is only a path separator on Windows).

samples_df = pd.read_csv('data/RaCA_samples.csv')

In [42]:
# Candidate identifier columns of the samples table. They are inspected together below to
# decide which ones uniquely identify a row and which ones are useful as merge keys.

identifiers_samples = [
    'samp',
    'sample.id',
    'rcasiteid',
    'pedon_no',
    'upedonid',
    'upedon',
]

In [43]:
# Preview the first three rows of the candidate identifier columns.
samples_df.loc[:, identifiers_samples].head(3)

Unnamed: 0,samp,sample.id,rcasiteid,pedon_no,upedonid,upedon
0,C0101F011-1,1.0,C0101F01,1,C0101F01-1,C0101F011
1,C0101F011-2,1.0,C0101F01,1,C0101F01-1,C0101F011
2,C0101F011-3,2.0,C0101F01,1,C0101F01-1,C0101F011


* `samp` - Looks the most informative: it contains the RaCA site ID (C0101F01), followed by the pedon number (1) and the sample number (-1).
* `samp` - XXXXXXXXY-Z (where X is the RaCA site ID, Y is the pedon number at the site, and Z is the sample number)
* We can use `upedon` and `samp` and `rcasiteid` columns and remove the others
* `samp` - To get the sample IDs
* `rcasiteid` - To Merge with General location data
* `upedon` - Has the pedon number with the rcasiteid

In [44]:
# Drop the identifier columns that are not kept; `samp`, `rcasiteid` and `upedon` are retained.
# Reassigning the result (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second execution no longer raises KeyError for already-removed columns.
samples_df = samples_df.drop(columns=['sample.id', 'pedon_no', 'upedonid'], errors='ignore')

### Samples Data Set

In [45]:
# Columns kept for the data analysis: the three identifiers followed by the measurement columns.

samples_columns_needed = ['samp','rcasiteid','upedon','TOP','BOT','Bulkdensity','SOC_pred1','Texture','fragvolc','c_tot_ncs','n_tot_ncs','s_tot_ncs','caco3']
# .copy() makes the subset an independent frame, so later in-place edits (e.g. dropping rows)
# operate on a real DataFrame instead of a derived slice and cannot trigger SettingWithCopyWarning.
samples_df = samples_df[samples_columns_needed].copy()

In [46]:
# Preview the first five rows of the reduced samples table.
samples_df.head(5)

Unnamed: 0,samp,rcasiteid,upedon,TOP,BOT,Bulkdensity,SOC_pred1,Texture,fragvolc,c_tot_ncs,n_tot_ncs,s_tot_ncs,caco3
0,C0101F011-1,C0101F01,C0101F011,0,5,0.881002,49.674091,PM,0.0,48.09415,2.40134,0.187482,
1,C0101F011-2,C0101F01,C0101F011,5,22,0.881002,49.674091,PM,0.0,50.142574,1.62164,0.095915,
2,C0101F011-3,C0101F01,C0101F011,22,100,0.967665,55.53,PM,0.0,55.529768,1.177027,0.031538,
3,C0101F012-1,C0101F01,C0101F012,0,7,0.884497,43.37,PM,0.0,43.365442,2.165691,0.078548,
4,C0101F012-2,C0101F01,C0101F012,7,41,0.91051,44.86,PM,0.0,44.857623,2.438623,0.082074,


In [47]:
# Check whether every row carries a distinct sample id (`samp`):
# True when the number of distinct ids equals the number of rows.

samples_df['samp'].nunique() == len(samples_df)

False

In [48]:
# Count how often each sample id occurs; the most frequent ids are listed first.
samples_df.loc[:, 'samp'].value_counts()

C0408C011-1    2
C0101F011-1    1
C0310W051-1    1
C0310W051-3    1
C0310W051-4    1
              ..
C0208C044-1    1
C0208C044-2    1
C0208C044-3    1
C0208C044-4    1
C0508C012-1    1
Name: samp, Length: 38204, dtype: int64

In [49]:
# Show every row whose sample id occurs more than once, together with its row label.
# duplicated(keep=False) marks all occurrences of a repeated id, so the offending id does not
# have to be hard-coded, and leaving the frame as the cell's last expression gives the rich
# table display instead of the line-wrapped print() text.
samples_df[samples_df['samp'].duplicated(keep=False)]

              samp rcasiteid     upedon  TOP  BOT  Bulkdensity  SOC_pred1  \
30320  C0408C011-1  C0408C01  C0408C011    0    5      1.02724   3.774521   
30321  C0408C011-1  C0408C01  C0408C011    5   20      1.02724   3.774521   

      Texture  fragvolc  c_tot_ncs  n_tot_ncs  s_tot_ncs  caco3  
30320    None       0.0   2.808927   0.278951   0.004087    NaN  
30321    None       0.0   2.397246   0.222952   0.000000    NaN  


In [50]:
# Keep the first row of each sample id and discard any later repeat.
# drop_duplicates replaces the hard-coded row label 30321: it does not depend on the row order of
# the input file and can be re-run without raising KeyError once the row is gone.
# NOTE(review): the two rows sharing id 'C0408C011-1' have different TOP/BOT depths (0-5 and 5-20),
# so the second one may be a mislabelled sample rather than a true duplicate - confirm before dropping.
samples_df = samples_df.drop_duplicates(subset='samp', keep='first')

In [51]:
# Re-check that the sample ids are unique now that the repeated row has been removed.
samples_df['samp'].nunique() == len(samples_df)

True

In [52]:
# Write the cleaned samples table without the row index.
# A forward-slash path keeps the output location portable (a backslash would become part of the
# file name outside Windows); index=False is the documented way to omit the index column.
samples_df.to_csv('processed_data/sample_data_processed.csv', index=False)