# Programming for Data Analysis - Project 2

**Ciaran Moran**

***

**Standard imports**

In [144]:
# receiving some user wraning messages, so i found this to prevent them being displayed
# https://stackoverflow.com/questions/9134795/how-to-get-rid-of-specific-warning-messages-in-python-while-keeping-all-other-wa
import warnings
warnings.simplefilter("ignore", category=Warning)

# Imports
import matplotlib.pyplot as plt 
import random
import datetime
import pandas as pd 
import seaborn as sns
import numpy as np
import os

## Open the .csv files
#### We can skip the first X rows in the csv as they are not pure data columns
#### Initially received the error "UnicodeDecodeError: 'utf-8' codec can't decode byte 0xfc in position 33: invalid start byte"
####
#### Looking online I tried various suggestions from 
#### https://stackoverflow.com/questions/22216076/unicodedecodeerror-utf8-codec-cant-decode-byte-0xa5-in-position-0-invalid-s
####
#### The working solution appears to be encoding='unicode_escape'
####
#### The next issue was rows with all Nan values, which may cause issues later on.
#### For this I tried keep_default_na=False and also skip_blank_lines=True from 
#### https://stackoverflow.com/questions/39297878/how-to-skip-an-unknown-number-of-empty-lines-before-header-on-pandas-read-csv


### Data standardisation

##### File: 41586_2008_BFnature06949_MOESM31_ESM.csv

Here I attempt to standardise the data.

The initial issue is that we have 2 sets of data side by side.

So I extract the data for University of Berlin into a seperate dataframe.

Then I extract the data for LGGE in Grenoble into its own dataframe.

I then rename the column titles to match those of University of Berlin.

Then the dataframes are concatinated together into one dataframe.

The result is a .csv with the data listed in a more consistant order.


In [145]:
# Here we read in 41586_2008_BFnature06949_MOESM31_ESM.csv
#
df = pd.read_csv('data/41586_2008_BFnature06949_MOESM31_ESM.csv', \
                 skiprows=6, encoding='unicode_escape',  skip_blank_lines=True, keep_default_na=False)

####################
# University of Bern
####################
# Ref https://stackoverflow.com/questions/61553063/read-csv-file-by-column-number-in-pandas-python
#
moesm31_1 = df.iloc[0:247, 0:4] # This will copy columns 0 to 3, for rows 0-246 
#Now add in some constants to standardise the data frame
moesm31_1['station'] = 'moesm31'
moesm31_1['uni'] = 'University of Bern'

# Insert new column in position
moesm31_1.insert(2,'Gasage (AICC2012, yr BP)',' ') # As this column exists in the supplementary file

print('--------------------------')
print('moesm31_1 - head and tail')
print('--------------------------')
print(moesm31_1.head())
print(moesm31_1.tail())

save_filename = 'data/moesm31_1.csv'
if os.path.isfile(save_filename): os.remove(save_filename) # delete if exists
moesm31_1.to_csv(save_filename, index=False)

print(moesm31_1.tail())

###################
# LGGE in Grenoble
###################
moesm31_2 = df.iloc[0:47, 4:7] # This will give you all rows for columns 4 to 6

print(moesm31_2.head())
# https://stackoverflow.com/questions/11346283/renaming-column-names-in-pandas
moesm31_2.rename(columns={'Depth (m).1': 'Depth (m)', 'EDC3_gas_a (yr).1': 'EDC3_gas_a (yr)', \
                   'CO2 (ppmv).1': 'CO2 (ppmv)' }, inplace=True)
print(moesm31_2.head())

#Now add in some constants to standardise the data frame
moesm31_2['sigma (ppmv)'] = '' # this data isn't present
moesm31_2['station'] = 'moesm31'
moesm31_2['uni'] = 'LGGE in Grenoble'

# Insert new column in position
# https://discuss.codecademy.com/t/can-we-add-a-new-column-at-a-specific-position-in-a-pandas-dataframe/355842
moesm31_2.insert(2,'Gasage (AICC2012, yr BP)',' ') # As this column exists in the supplementary file
		
print('--------------------------')
print('moesm31_2 - head and tail')
print('--------------------------')
print(moesm31_2.head())
print(moesm31_2.tail())

save_filename = 'data/moesm31_2.csv'
if os.path.isfile(save_filename): os.remove(save_filename) # delete if exists
moesm31_2.to_csv(save_filename, index=False)




--------------------------
moesm31_1 - head and tail
--------------------------
  Depth (m) EDC3_gas_a (yr) Gasage (AICC2012, yr BP) CO2 (ppmv) sigma (ppmv)  \
0   3026.58          611269                               257.8          2.1   
1   3030.97          616164                               252.6          0.6   
2   3036.44          623109                               243.3          2.8   
3   3040.87          630183                               204.4          0.9   
4   3043.07          635364                               195.0          1.5   

   station                 uni  
0  moesm31  University of Bern  
1  moesm31  University of Bern  
2  moesm31  University of Bern  
3  moesm31  University of Bern  
4  moesm31  University of Bern  
    Depth (m) EDC3_gas_a (yr) Gasage (AICC2012, yr BP) CO2 (ppmv)  \
242   3187.87          794608                               199.4   
243   3188.23          795202                               195.2   
244   3188.98          796467     

### Now we concatinate both files into one standard format

In [146]:
# We can append (concat) both of the new .csv files  
# Ref: https://www.usepandas.com/csv/append-csv-files
moesm31_combined=pd.concat([moesm31_1, moesm31_2])
# Now add in the source file name, may come in handy later on
moesm31_combined['source file'] = '41586_2008_BFnature06949_MOESM31_ESM.csv'

# write out to csv, may not be necessary, but handy for checking data
save_filename = 'data/moesm31_combined.csv'
if os.path.isfile(save_filename): os.remove(save_filename) # delete if exists
moesm31_combined.to_csv(save_filename, index=False)

print(moesm31_combined.head)
print(moesm31_combined.tail)


<bound method NDFrame.head of    Depth (m) EDC3_gas_a (yr) Gasage (AICC2012, yr BP) CO2 (ppmv) sigma (ppmv)  \
0    3026.58          611269                               257.8          2.1   
1    3030.97          616164                               252.6          0.6   
2    3036.44          623109                               243.3          2.8   
3    3040.87          630183                               204.4          0.9   
4    3043.07          635364                               195.0          1.5   
..       ...             ...                      ...        ...          ...   
42   3183.68          789126                               239.3                
43   3185.88          791767                               217.3                
44   3186.98          793255                               224.2                
45   3188.08          794949                               206.7                
46   3190.28          798893                               193.6               

### File data/grl52461-sup-0003-supplementary.csv

#### Now we process the larger of the two files.
#### I can re-use some of the code I created for the smaller file.
#### Also the lessons learned and knowledge garnered for the smaller file will be invaluable.

***
##  Read in supplementary file grl52461-sup-0003-supplementary.csv


In [147]:
#
# Here we read in grl52461-sup-0003-supplementary.csv
#
df = pd.read_csv('data/grl52461-sup-0003-supplementary.csv', \
                 skiprows=6, encoding='unicode_escape',  skip_blank_lines=True, keep_default_na=False)



***
##  supplementary.csv - Dome C - University of Bern

In [148]:
#########################################
# supplementary.csv - University of Bern
#########################################
row_offset = -7

# Ref https://stackoverflow.com/questions/61553063/read-csv-file-by-column-number-in-pandas-python
#
print('Processing file grl52461-sup-0003-supplementary.csv...')

station = 'Dome C (0-22 kyr BP)P'
uni = 'University of Bern'
save_filename = 'data/suppl_' + station + '_' + uni + '.csv'
print('save_filename :', save_filename)

suppl_1 = df.iloc[0:183, 0:5].copy() # This will copy columns 0 to 3, for rows 0-246 
#Now add in some constants to standardise the data frame
suppl_1['station'] = station
suppl_1['uni'] = uni

if os.path.isfile(save_filename): os.remove(save_filename) # delete if exists
suppl_1.to_csv(save_filename, index=False)

print(suppl_1.head())
print(suppl_1.tail())



Processing file grl52461-sup-0003-supplementary.csv...
save_filename : data/suppl_Dome C (0-22 kyr BP)P_University of Bern.csv
  Depth (m) Gasage (EDC3, yr BP) Gasage (AICC2012, yr BP) CO2 (ppmv)  \
0    102.83               137.00                   350.11     280.40   
1    106.89               268.00                   486.69     274.90   
2    107.20               279.00                   501.20     277.90   
3    110.25               395.00                   539.65     279.10   
4    110.50               404.00                   539.89     281.90   

  sigma mean CO2 (ppmv)                station                 uni  
0                  1.80  Dome C (0-22 kyr BP)P  University of Bern  
1                  0.70  Dome C (0-22 kyr BP)P  University of Bern  
2                  0.70  Dome C (0-22 kyr BP)P  University of Bern  
3                  1.30  Dome C (0-22 kyr BP)P  University of Bern  
4                  1.10  Dome C (0-22 kyr BP)P  University of Bern  
    Depth (m) Gasage (EDC3

***
## supplementary.csv - Vostok - LGGE Grenoble

In [149]:

#########################################
# supplementary.csv  - Vostok - LGGE Grenoble
#########################################
#
# Ref https://stackoverflow.com/questions/61553063/read-csv-file-by-column-number-in-pandas-python
#
station = 'Vostok (0-440 kyr BP)'
uni = 'LGGE Grenoble'
save_filename = 'data/suppl_' + station + '_' + uni + '.csv'
print('save_filename :', save_filename)

print('Processing file grl52461-sup-0003-supplementary.csv...')
suppl_2 = df.iloc[0:372, 5:9].copy() 

#Depth (m).1	Gasage (EDC3, yr BP).1	Gasage (AICC2012, yr BP).1	CO2 (ppmv).1
# https://stackoverflow.com/questions/11346283/renaming-column-names-in-pandas
suppl_2.rename(columns={'Depth (m).1': 'Depth (m)', 'Gasage (EDC3, yr BP).1': 'Gasage (EDC3, yr BP)', \
                    'Gasage (AICC2012, yr BP).1': 'Gasage (AICC2012, yr BP)', \
                   'CO2 (ppmv).1': 'CO2 (ppmv)' }, inplace=True)


#Now add in some constants to standardise the data frame
suppl_2['sigma mean CO2 (ppmv)'] = '0' # add in missing column
suppl_2['station'] = station
suppl_2['uni'] = uni

if os.path.isfile(save_filename): os.remove(save_filename) # delete if exists
suppl_2.to_csv(save_filename, index=False)

print(suppl_2.head())
print(suppl_2.tail())



save_filename : data/suppl_Vostok (0-440 kyr BP)_LGGE Grenoble.csv
Processing file grl52461-sup-0003-supplementary.csv...
  Depth (m) Gasage (EDC3, yr BP) Gasage (AICC2012, yr BP) CO2 (ppmv)  \
0    149.10              2690.00                              284.70   
1    173.10              3897.00                  3661.93     272.70   
2    177.40              4124.00                  3746.63     268.10   
3    228.60              6735.00                  6449.18     262.20   
4    250.30              7873.00                  7567.35     254.50   

  sigma mean CO2 (ppmv)                station            uni  
0                     0  Vostok (0-440 kyr BP)  LGGE Grenoble  
1                     0  Vostok (0-440 kyr BP)  LGGE Grenoble  
2                     0  Vostok (0-440 kyr BP)  LGGE Grenoble  
3                     0  Vostok (0-440 kyr BP)  LGGE Grenoble  
4                     0  Vostok (0-440 kyr BP)  LGGE Grenoble  
    Depth (m) Gasage (EDC3, yr BP) Gasage (AICC2012, yr BP) C

***
## supplementary.csv - Taylor Dome - University of Bern

In [150]:
#########################################
# supplementary.csv - Taylor Dome - University of Bern
#########################################

# Ref https://stackoverflow.com/questions/61553063/read-csv-file-by-column-number-in-pandas-python
#
station = 'Taylor Dome (19-63 kyr BP)'
uni = 'University of Bern'
save_filename = 'data/suppl_' + station + '_' + uni + '.csv'
print('save_filename :', save_filename)

suppl_3 = df.iloc[0:66-row_offset, 9:13].copy() # This will copy columns from the main .csv file

# Depth (m).2	tentatively synchronized on EDC3 gasage (yr)	CO2 (ppmv).2	sigma mean CO2 (ppmv).1	station	uni
# https://stackoverflow.com/questions/11346283/renaming-column-names-in-pandas
suppl_3.rename(columns={'Depth (m).2': 'Depth (m)', \
                        'tentatively synchronized on EDC3 gasage (yr)': 'Gasage (EDC3, yr BP)', \
                                           'CO2 (ppmv).2': 'CO2 (ppmv)' , \
                                            'sigma mean CO2 (ppmv).1': 'sigma mean CO2 (ppmv)' }, \
                                            inplace=True) 


#Now add in some constants to standardise the data frame
suppl_3['station'] = station
suppl_3['uni'] = uni

# Insert new column in position
# https://discuss.codecademy.com/t/can-we-add-a-new-column-at-a-specific-position-in-a-pandas-dataframe/355842
suppl_3.insert(2,'Gasage (AICC2012, yr BP)',' ') # As this column exists in the supplementary file
		

if os.path.isfile(save_filename): os.remove(save_filename) # delete if exists
suppl_3.to_csv(save_filename, index=False)

print(suppl_3.head())
print(suppl_3.tail())



save_filename : data/suppl_Taylor Dome (19-63 kyr BP)_University of Bern.csv
  Depth (m) Gasage (EDC3, yr BP) Gasage (AICC2012, yr BP) CO2 (ppmv)  \
0    380.82             18906.00                              190.50   
1    382.42             23379.00                              189.10   
2    382.76             24011.00                              189.00   
3    383.54             25337.00                              187.70   
4    385.33             27507.00                              195.20   

  sigma mean CO2 (ppmv)                     station                 uni  
0                  1.00  Taylor Dome (19-63 kyr BP)  University of Bern  
1                  0.50  Taylor Dome (19-63 kyr BP)  University of Bern  
2                  1.40  Taylor Dome (19-63 kyr BP)  University of Bern  
3                  1.00  Taylor Dome (19-63 kyr BP)  University of Bern  
4                  0.50  Taylor Dome (19-63 kyr BP)  University of Bern  
   Depth (m) Gasage (EDC3, yr BP) Gasage (AICC

***
## supplementary.csv - Dome C (393-664 kyr BP) - University of Bern

In [151]:
#########################################
# supplementary.csv - Dome C (393-664 kyr BP) - University of Bern
#########################################

row_offset = 7

# Ref https://stackoverflow.com/questions/61553063/read-csv-file-by-column-number-in-pandas-python
#
station = 'Dome C (393-664 kyr BP)'
uni = 'University of Bern'
save_filename = 'data/suppl_' + station + '_' + uni + '.csv'
print('save_filename :', save_filename)

suppl_4 = df.iloc[0:329-row_offset, 13:18].copy() # This will copy columns from the main .csv file

# https://stackoverflow.com/questions/11346283/renaming-column-names-in-pandas
#Depth (m).3	Gasage (EDC3, yr BP).2	Gasage (AICC2012, yr BP)	Gasage (AICC2012, yr BP).2	CO2 (ppmv).3	
# sigma mean CO2 (ppmv).2	station	uni
suppl_4.rename(columns={'Depth (m).3': 'Depth (m)', \
                        'Gasage (AICC2012, yr BP).2': 'Gasage (AICC2012, yr BP)', \
                        'Gasage (EDC3, yr BP).2': 'Gasage (EDC3, yr BP)', \
                                           'CO2 (ppmv).3': 'CO2 (ppmv)' , \
                                            'sigma mean CO2 (ppmv).2': 'sigma mean CO2 (ppmv)' }, \
                                            inplace=True) 


#Now add in some constants to standardise the data frame
suppl_4['station'] = station
suppl_4['uni'] = uni

# Insert new column in position
# https://discuss.codecademy.com/t/can-we-add-a-new-column-at-a-specific-position-in-a-pandas-dataframe/355842
#suppl_4.insert(2,'Gasage (AICC2012, yr BP)',' ') # As this column exists in the supplementary file
		

if os.path.isfile(save_filename): os.remove(save_filename) # delete if exists
suppl_4.to_csv(save_filename, index=False)

print(suppl_4.head())
print(suppl_4.tail())



save_filename : data/suppl_Dome C (393-664 kyr BP)_University of Bern.csv
  Depth (m) Gasage (EDC3, yr BP) Gasage (AICC2012, yr BP) CO2 (ppmv)  \
0   2761.85            415717.00                415945.61     276.40   
1   2762.89            416193.00                416425.32     271.70   
2   2765.08            417191.00                417440.44     273.40   
3   2766.18            417698.00                417956.78     271.80   
4   2767.33            418245.00                418505.30     274.60   

  sigma mean CO2 (ppmv)                  station                 uni  
0                  1.60  Dome C (393-664 kyr BP)  University of Bern  
1                  1.20  Dome C (393-664 kyr BP)  University of Bern  
2                  1.50  Dome C (393-664 kyr BP)  University of Bern  
3                  1.70  Dome C (393-664 kyr BP)  University of Bern  
4                  1.80  Dome C (393-664 kyr BP)  University of Bern  
    Depth (m) Gasage (EDC3, yr BP) Gasage (AICC2012, yr BP) CO2 (pp

In [152]:
#########################################
# supplementary.csv - Dome C (393-664 kyr BP) - LGGE Grenoble
#########################################

row_offset = 7

# Ref https://stackoverflow.com/questions/61553063/read-csv-file-by-column-number-in-pandas-python
#
station = 'Dome C (393-664 kyr BP)'
uni = 'LGGE Grenoble'
save_filename = 'data/suppl_' + station + '_' + uni + '.csv'
print('save_filename :', save_filename)

suppl_5 = df.iloc[0:38-row_offset, 18:23].copy() # This will copy columns from the main .csv file

# https://stackoverflow.com/questions/11346283/renaming-column-names-in-pandas
#REQUIRED HEADINGS
#Depth (m)	Gasage (EDC3, yr BP)	Gasage (AICC2012, yr BP)	CO2 (ppmv)	sigma mean CO2 (ppmv)	station	uni


suppl_5.rename(columns={'Depth (m).4': 'Depth (m)', \
                        'Gasage (EDC3, yr BP).3': 'Gasage (EDC3, yr BP)', \
                        'Gasage (AICC2012, yr BP).3': 'Gasage (AICC2012, yr BP)', \
                        'CO2 (ppmv).4': 'CO2 (ppmv)' , \
                        'sigma mean CO2 (ppmv).3': 'sigma mean CO2 (ppmv)' }, \
                        inplace=True) 


#Now add in some constants to standardise the data frame
suppl_5['station'] = station
suppl_5['uni'] = uni

# Insert new column in position
# https://discuss.codecademy.com/t/can-we-add-a-new-column-at-a-specific-position-in-a-pandas-dataframe/355842
#suppl_4.insert(2,'Gasage (AICC2012, yr BP)',' ') # As this column exists in the supplementary file
		

if os.path.isfile(save_filename): os.remove(save_filename) # delete if exists
suppl_5.to_csv(save_filename, index=False)

print(suppl_5.head())
print(suppl_5.tail())



save_filename : data/suppl_Dome C (393-664 kyr BP)_LGGE Grenoble.csv
  Depth (m) Gasage (EDC3, yr BP) Gasage (AICC2012, yr BP) CO2 (ppmv)  \
0   2700.77            392544.00                392977.15     259.50   
1   2702.97            393579.00                394003.69     273.60   
2   2705.17            394560.00                394906.10     260.70   
3   2713.97            398086.00                398275.13     276.30   
4   2718.37            399722.00                399926.16     277.10   

  Depth (m).5                  station            uni  
0     3026.58  Dome C (393-664 kyr BP)  LGGE Grenoble  
1     3030.97  Dome C (393-664 kyr BP)  LGGE Grenoble  
2     3036.44  Dome C (393-664 kyr BP)  LGGE Grenoble  
3     3040.87  Dome C (393-664 kyr BP)  LGGE Grenoble  
4     3043.07  Dome C (393-664 kyr BP)  LGGE Grenoble  
   Depth (m) Gasage (EDC3, yr BP) Gasage (AICC2012, yr BP) CO2 (ppmv)  \
26   2775.57            422374.00                422889.84     274.30   
27   2777.77    

***
## supplementary.csv - Dome C (611-800 kyr BP) - University of Bern

In [153]:
#########################################
# supplementary.csv - Dome C (611-800 kyr BP) - University of Bern
#########################################

row_offset = 7

# Ref https://stackoverflow.com/questions/61553063/read-csv-file-by-column-number-in-pandas-python
#
station = 'Dome C (611-800 kyr BP)'
uni = 'University of Bern'
save_filename = 'data/suppl_' + station + '_' + uni + '.csv'
print('save_filename :', save_filename)

suppl_5 = df.iloc[0:258-row_offset, 22:27].copy() # This will copy columns from the main .csv file

# https://stackoverflow.com/questions/11346283/renaming-column-names-in-pandas
#Depth (m).3	Gasage (EDC3, yr BP).2	Gasage (AICC2012, yr BP)	Gasage (AICC2012, yr BP).2	CO2 (ppmv).3	
# sigma mean CO2 (ppmv).2	station	uni
suppl_5.rename(columns={'Depth (m).5': 'Depth (m)', \
                        'Gasage (EDC3, yr BP).4': 'Gasage (EDC3, yr BP)', \
                        'Gasage (AICC2012, yr BP).4': 'Gasage (AICC2012, yr BP)', \
                        'sigma mean CO2 (ppmv).3': 'sigma mean CO2 (ppmv)', \
                                           'CO2 (ppmv).5': 'CO2 (ppmv)' }, \
                                            inplace=True) 


#Now add in some constants to standardise the data frame
suppl_5['station'] = station
suppl_5['uni'] = uni

# Insert new column in position
# https://discuss.codecademy.com/t/can-we-add-a-new-column-at-a-specific-position-in-a-pandas-dataframe/355842
#suppl_4.insert(2,'Gasage (AICC2012, yr BP)',' ') # As this column exists in the supplementary file
		

if os.path.isfile(save_filename): os.remove(save_filename) # delete if exists
suppl_5.to_csv(save_filename, index=False)

print(suppl_5.head())
print(suppl_5.tail())



save_filename : data/suppl_Dome C (611-800 kyr BP)_University of Bern.csv
  Depth (m) Gasage (EDC3, yr BP) Gasage (AICC2012, yr BP) CO2 (ppmv)  \
0   3026.58            611269.00                611555.76     257.80   
1   3030.97            616164.00                616298.56     252.60   
2   3036.44            623109.00                622946.80     243.30   
3   3040.87            630183.00                629768.93     204.40   
4   3043.07            635364.00                634879.92     195.00   

  sigma mean CO2 (ppmv)                  station                 uni  
0                  2.10  Dome C (611-800 kyr BP)  University of Bern  
1                  0.60  Dome C (611-800 kyr BP)  University of Bern  
2                  2.80  Dome C (611-800 kyr BP)  University of Bern  
3                  0.90  Dome C (611-800 kyr BP)  University of Bern  
4                  1.50  Dome C (611-800 kyr BP)  University of Bern  
    Depth (m) Gasage (EDC3, yr BP) Gasage (AICC2012, yr BP) CO2 (pp

In [154]:
#########################################
# supplementary.csv - Dome C (611-800 kyr BP) - LGGE Grenoble
#########################################

row_offset = 7

# Ref https://stackoverflow.com/questions/61553063/read-csv-file-by-column-number-in-pandas-python
#
station = 'Dome C (611-800 kyr BP)'
uni = 'LGGE Grenoble'
save_filename = 'data/suppl_' + station + '_' + uni + '.csv'
print('save_filename :', save_filename)

suppl_6 = df.iloc[0:54-row_offset, 27:31].copy() # This will copy columns from the main .csv file

# https://stackoverflow.com/questions/11346283/renaming-column-names-in-pandas
#Depth (m).6	Gasage (EDC3, yr BP).5	Gasage (AICC2012, yr BP).5	CO2 (ppmv).6	

suppl_6.rename(columns={'Depth (m).6': 'Depth (m)', \
                        'Gasage (EDC3, yr BP).5': 'Gasage (EDC3, yr BP)', \
                        'Gasage (AICC2012, yr BP).5': 'Gasage (AICC2012, yr BP)', \
                                           'CO2 (ppmv).6': 'CO2 (ppmv)' }, \
                                            inplace=True) 


#Now add in some constants to standardise the data frame
suppl_6['station'] = station
suppl_6['uni'] = uni

# Insert new column in position
# https://discuss.codecademy.com/t/can-we-add-a-new-column-at-a-specific-position-in-a-pandas-dataframe/355842
#suppl_4.insert(2,'Gasage (AICC2012, yr BP)',' ') # As this column exists in the supplementary file
		

if os.path.isfile(save_filename): os.remove(save_filename) # delete if exists
suppl_6.to_csv(save_filename, index=False)

print(suppl_6.head())
print(suppl_6.tail())



save_filename : data/suppl_Dome C (611-800 kyr BP)_LGGE Grenoble.csv
  Depth (m) Gasage (EDC3, yr BP) Gasage (AICC2012, yr BP) CO2 (ppmv)  \
0   3061.71            667435.00                667256.80     178.50   
1   3063.98            670124.00                670241.01     189.00   
2   3085.78            688035.00                690566.62     234.00   
3   3086.88            688751.00                691327.29     235.40   
4   3087.98            689444.00                692134.74     241.00   

                   station            uni  
0  Dome C (611-800 kyr BP)  LGGE Grenoble  
1  Dome C (611-800 kyr BP)  LGGE Grenoble  
2  Dome C (611-800 kyr BP)  LGGE Grenoble  
3  Dome C (611-800 kyr BP)  LGGE Grenoble  
4  Dome C (611-800 kyr BP)  LGGE Grenoble  
   Depth (m) Gasage (EDC3, yr BP) Gasage (AICC2012, yr BP) CO2 (ppmv)  \
42   3183.68            789126.00                789716.55     239.30   
43   3185.88            791767.00                792311.24     217.30   
44   3186.98   

***
## supplementary.csv - Talos Dome (35-68 kyr BP) - University of Bern

In [155]:
#########################################
# supplementary.csv - Talos Dome (35-68 kyr BP) - University of Bern
#########################################

row_offset = 7

# Ref https://stackoverflow.com/questions/61553063/read-csv-file-by-column-number-in-pandas-python
#
station = 'Talos Dome (35-68 kyr BP)'
uni = 'University of Bern'
save_filename = 'data/suppl_' + station + '_' + uni + '.csv'
print('save_filename :', save_filename)

suppl_7 = df.iloc[0:123-row_offset, 31:37].copy() # This will copy columns from the main .csv file

# https://stackoverflow.com/questions/11346283/renaming-column-names-in-pandas
#Depth (m).3	Gasage (EDC3, yr BP).2	Gasage (AICC2012, yr BP)	Gasage (AICC2012, yr BP).2	CO2 (ppmv).3	
# sigma mean CO2 (ppmv).2	station	uni
suppl_7.rename(columns={'Depth (m).7': 'Depth (m)', \
                        'Gasage (EDC3, yr BP).6': 'Gasage (EDC3, yr BP)', \
                        'Gasage (AICC2012, yr BP).6': 'Gasage (AICC2012, yr BP)', \
                                           'CO2 (ppmv).7': 'CO2 (ppmv)', \
                                            'sigma mean CO2 (ppmv).4': 'sigma mean CO2 (ppmv)' \
                                            }, \
                                            inplace=True) 


#Now add in some constants to standardise the data frame
suppl_7['station'] = station
suppl_7['uni'] = uni

# Insert new column in position
# https://discuss.codecademy.com/t/can-we-add-a-new-column-at-a-specific-position-in-a-pandas-dataframe/355842
#suppl_4.insert(2,'Gasage (AICC2012, yr BP)',' ') # As this column exists in the supplementary file
		

if os.path.isfile(save_filename): os.remove(save_filename) # delete if exists
suppl_7.to_csv(save_filename, index=False)

print(suppl_7.head())
print(suppl_7.tail())



save_filename : data/suppl_Talos Dome (35-68 kyr BP)_University of Bern.csv
  Depth (m) Gasage (TALDICE-1a, yr BP) Gasage (EDML1 Sz4, yr BP)  \
0   1009.64                   34166.74                  34164.76   
1   1017.91                   35017.60                  34741.72   
2   1025.91                   35521.63                  35326.02   
3   1033.92                   36159.93                  36028.26   
4   1041.89                   36820.90                  36692.96   

  Gasage (AICC2012, yr BP) CO2 (ppmv) sigma mean CO2 (ppmv)  \
0                 34359.95     152.65                  4.47   
1                 35093.65     176.36                 12.66   
2                 35663.78     203.36                 10.37   
3                 36258.59     184.04                 16.93   
4                 36912.69     204.89                  6.68   

                     station                 uni  
0  Talos Dome (35-68 kyr BP)  University of Bern  
1  Talos Dome (35-68 kyr BP)  Univ

### Now we concatinate all the supplementary files into one standard format

In [156]:
# We can append (concat) all of the new .csv files  
# Ref: https://www.usepandas.com/csv/append-csv-files
suppl_combined=pd.concat([suppl_3,suppl_4,suppl_5,suppl_6,suppl_7])
# Now add in the source file name, may come in handy later on
suppl_combined['source file'] = 'grl52461-sup-0003-supplementary.csv'

# write out to csv, may not be necessary, but handy for checking data
save_filename = 'data/suppl_combined.csv'
if os.path.isfile(save_filename): os.remove(save_filename) # delete if exists
suppl_combined.to_csv(save_filename, index=False)
#print(suppl_combined.head)
#print(suppl_combined.tail)

### Now finally combine both file

In [157]:
all_combined=pd.concat([moesm31_combined,suppl_combined])
# write out to csv, may not be necessary, but handy for checking data
save_filename = 'data/all_combined.csv'
if os.path.isfile(save_filename): os.remove(save_filename) # delete if exists
all_combined.to_csv(save_filename, index=False)
print(all_combined.head)
print(all_combined.tail)

<bound method NDFrame.head of     Depth (m) EDC3_gas_a (yr) Gasage (AICC2012, yr BP) CO2 (ppmv)  \
0     3026.58          611269                               257.8   
1     3030.97          616164                               252.6   
2     3036.44          623109                               243.3   
3     3040.87          630183                               204.4   
4     3043.07          635364                               195.0   
..        ...             ...                      ...        ...   
111   1291.43             NaN                 66422.36     199.11   
112   1293.23             NaN                 67065.32     196.79   
113   1295.58             NaN                 67964.88     202.48   
114   1297.93             NaN                 68952.11     208.16   
115   1299.53             NaN                 69672.40     207.65   

    sigma (ppmv)                    station                 uni  \
0            2.1                    moesm31  University of Bern   
1      

***

## End