# Programming for Data Analysis - Project 2

**Ciaran Moran**

***

**Standard imports**

In [None]:
# Some user warning messages were being displayed, so this filter stops them from being shown.
# https://stackoverflow.com/questions/9134795/how-to-get-rid-of-specific-warning-messages-in-python-while-keeping-all-other-wa
# NOTE: Warning is the base class of every warning category, so this call
# silences all warnings, not only the ones that prompted it.
import warnings
warnings.simplefilter("ignore", category=Warning)

# Imports
import matplotlib.pyplot as plt 
import random
import datetime
import pandas as pd 
import seaborn as sns
import numpy as np
import os

## Open the .csv files
#### We can skip the first X rows in the csv as they are not pure data columns
#### Initially received the error "UnicodeDecodeError: 'utf-8' codec can't decode byte 0xfc in position 33: invalid start byte"
####
#### Looking online I tried various suggestions from 
#### https://stackoverflow.com/questions/22216076/unicodedecodeerror-utf8-codec-cant-decode-byte-0xa5-in-position-0-invalid-s
####
#### The working solution appears to be encoding='unicode_escape'
####
#### The next issue was rows with all NaN values, which may cause issues later on.
#### For this I tried keep_default_na=False and also skip_blank_lines=True from 
#### https://stackoverflow.com/questions/39297878/how-to-skip-an-unknown-number-of-empty-lines-before-header-on-pandas-read-csv


### Data standardisation

##### File: 41586_2008_BFnature06949_MOESM31_ESM.csv

Here I attempt to standardise the data.

The initial issue is that we have 2 sets of data side by side.

So I extract the data for University of Bern into a separate dataframe.

Then I extract the data for LGGE in Grenoble into its own dataframe.

I then rename the column titles to match those of University of Bern.

Then the dataframes are concatenated together into one dataframe.

The result is a .csv with the data listed in a more consistent order.


In [None]:
# Load 41586_2008_BFnature06949_MOESM31_ESM.csv into df.
# The first 6 lines of the file are skipped, and keep_default_na=False stops
# pandas from converting empty cells to NaN.
df = pd.read_csv(
    'data/41586_2008_BFnature06949_MOESM31_ESM.csv',
    skiprows=6,
    encoding='unicode_escape',
    skip_blank_lines=True,
    keep_default_na=False,
)

####################
# University of Bern
####################
# Ref https://stackoverflow.com/questions/61553063/read-csv-file-by-column-number-in-pandas-python
#
# Take columns 0 to 3 for rows 0-246. The explicit .copy() makes this an
# independent frame, so the columns added below are not written onto a slice
# of df (the supplementary-file cells further down do the same).
moesm31_1 = df.iloc[0:247, 0:4].copy()

# Add some constants to standardise the data frame
moesm31_1['station'] = 'moesm31'
moesm31_1['uni'] = 'University of Bern'

# Insert a placeholder column at position 2, as this column exists in the supplementary file
moesm31_1.insert(2, 'Gasage (AICC2012, yr BP)', ' ')

print('--------------------------')
print('moesm31_1 - head and tail')
print('--------------------------')
print(moesm31_1.head())
print(moesm31_1.tail())

# Write the frame out, deleting any file left over from an earlier run first
save_filename = 'data/moesm31_1.csv'
if os.path.isfile(save_filename):
    os.remove(save_filename)
moesm31_1.to_csv(save_filename, index=False)

###################
# LGGE in Grenoble
###################
# Take columns 4 to 6 for rows 0-46. The explicit .copy() makes this an
# independent frame, so the rename and the columns added below are not
# applied to a slice of df.
moesm31_2 = df.iloc[0:47, 4:7].copy()

print(moesm31_2.head())
# Drop the '.1' suffixes so the column titles match the University of Bern frame
# https://stackoverflow.com/questions/11346283/renaming-column-names-in-pandas
moesm31_2.rename(
    columns={
        'Depth (m).1': 'Depth (m)',
        'EDC3_gas_a (yr).1': 'EDC3_gas_a (yr)',
        'CO2 (ppmv).1': 'CO2 (ppmv)',
    },
    inplace=True,
)
print(moesm31_2.head())

# Add some constants to standardise the data frame
moesm31_2['sigma (ppmv)'] = ''  # this data isn't present
moesm31_2['station'] = 'moesm31'
moesm31_2['uni'] = 'LGGE in Grenoble'

# Insert a placeholder column at position 2, as this column exists in the supplementary file
# https://discuss.codecademy.com/t/can-we-add-a-new-column-at-a-specific-position-in-a-pandas-dataframe/355842
moesm31_2.insert(2, 'Gasage (AICC2012, yr BP)', ' ')

print('--------------------------')
print('moesm31_2 - head and tail')
print('--------------------------')
print(moesm31_2.head())
print(moesm31_2.tail())

# Write the frame out, deleting any file left over from an earlier run first
save_filename = 'data/moesm31_2.csv'
if os.path.isfile(save_filename):
    os.remove(save_filename)
moesm31_2.to_csv(save_filename, index=False)




### Now we concatenate both files into one standard format

In [None]:
# Concatenate the two standardised frames (moesm31_1 and moesm31_2) into one
# Ref: https://www.usepandas.com/csv/append-csv-files
moesm31_combined = pd.concat([moesm31_1, moesm31_2])
# Now add in the source file name, may come in handy later on
moesm31_combined['source file'] = '41586_2008_BFnature06949_MOESM31_ESM.csv'

# write out to csv, may not be necessary, but handy for checking data
save_filename = 'data/moesm31_combined.csv'
if os.path.isfile(save_filename):
    os.remove(save_filename)
moesm31_combined.to_csv(save_filename, index=False)

# head and tail are methods: they must be called, otherwise print() shows the
# bound method (and with it the whole frame) instead of the first/last rows
print(moesm31_combined.head())
print(moesm31_combined.tail())


### File data/grl52461-sup-0003-supplementary.csv

#### Now we process the larger of the two files.
#### I can re-use some of the code I created for the smaller file.
#### Also the lessons learned and knowledge garnered for the smaller file will be invaluable.

***
##  Read in supplementary file grl52461-sup-0003-supplementary.csv


In [None]:
# Load grl52461-sup-0003-supplementary.csv into df (replacing the previous df).
# The first 6 lines of the file are skipped, and keep_default_na=False stops
# pandas from converting empty cells to NaN.
df = pd.read_csv(
    'data/grl52461-sup-0003-supplementary.csv',
    skiprows=6,
    encoding='unicode_escape',
    skip_blank_lines=True,
    keep_default_na=False,
)



***
##  supplementary.csv - Dome C - University of Bern

In [None]:
#########################################
# supplementary.csv - University of Bern
#########################################
# Not used in this cell. The Taylor Dome cell further down reads this value
# in its row slice (0:66-row_offset), so it is kept here unchanged.
row_offset = -7

# Ref https://stackoverflow.com/questions/61553063/read-csv-file-by-column-number-in-pandas-python
#
print('Processing file grl52461-sup-0003-supplementary.csv...')

# The station label previously ended in a stray 'P' ('... kyr BP)P'), which
# ended up in both the 'station' column and the output file name.
station = 'Dome C (0-22 kyr BP)'
uni = 'University of Bern'
save_filename = 'data/suppl_' + station + '_' + uni + '.csv'
print('save_filename :', save_filename)

# Copy columns 0 to 4 for rows 0-182 into an independent frame
suppl_1 = df.iloc[0:183, 0:5].copy()

# Add some constants to standardise the data frame
suppl_1['station'] = station
suppl_1['uni'] = uni

# Write the frame out, deleting any file left over from an earlier run first
if os.path.isfile(save_filename):
    os.remove(save_filename)
suppl_1.to_csv(save_filename, index=False)

print(suppl_1.head())
print(suppl_1.tail())



***
## supplementary.csv - Vostok - LGGE Grenoble

In [None]:

#########################################
# supplementary.csv  - Vostok - LGGE Grenoble
#########################################
#
# Column selection by position:
# https://stackoverflow.com/questions/61553063/read-csv-file-by-column-number-in-pandas-python
#
station = 'Vostok (0-440 kyr BP)'
uni = 'LGGE Grenoble'
save_filename = f'data/suppl_{station}_{uni}.csv'
print('save_filename :', save_filename)

print('Processing file grl52461-sup-0003-supplementary.csv...')
# Independent copy of columns 5 to 8 for rows 0-371
suppl_2 = df.iloc[0:372, 5:9].copy()

# Source headings: Depth (m).1, Gasage (EDC3, yr BP).1, Gasage (AICC2012, yr BP).1, CO2 (ppmv).1
# Strip the '.1' suffixes so the headings match the standard ones
# https://stackoverflow.com/questions/11346283/renaming-column-names-in-pandas
suppl_2 = suppl_2.rename(
    columns={
        'Depth (m).1': 'Depth (m)',
        'Gasage (EDC3, yr BP).1': 'Gasage (EDC3, yr BP)',
        'Gasage (AICC2012, yr BP).1': 'Gasage (AICC2012, yr BP)',
        'CO2 (ppmv).1': 'CO2 (ppmv)',
    }
)

# Constant columns that standardise the data frame
suppl_2['sigma mean CO2 (ppmv)'] = '0'  # column missing from this data set
suppl_2['station'] = station
suppl_2['uni'] = uni

# Replace any file left over from an earlier run
if os.path.isfile(save_filename):
    os.remove(save_filename)
suppl_2.to_csv(save_filename, index=False)

print(suppl_2.head())
print(suppl_2.tail())



***
## supplementary.csv - Taylor Dome - University of Bern

In [None]:
#########################################
# supplementary.csv - Taylor Dome - University of Bern
#########################################

# Column selection by position:
# https://stackoverflow.com/questions/61553063/read-csv-file-by-column-number-in-pandas-python
#
station = 'Taylor Dome (19-63 kyr BP)'
uni = 'University of Bern'
save_filename = f'data/suppl_{station}_{uni}.csv'
print('save_filename :', save_filename)

# Independent copy of columns 9 to 12 from the main .csv data.
# NOTE(review): row_offset is not set in this cell. When the cells are run in
# order it is still the -7 assigned in the first supplementary cell, which
# makes this slice 0:73, whereas the later cells set row_offset = 7 before
# slicing - confirm which value is intended here.
suppl_3 = df.iloc[0:66-row_offset, 9:13].copy()

# Source headings: Depth (m).2, tentatively synchronized on EDC3 gasage (yr),
# CO2 (ppmv).2, sigma mean CO2 (ppmv).1
# Map them onto the standard headings
# https://stackoverflow.com/questions/11346283/renaming-column-names-in-pandas
suppl_3 = suppl_3.rename(
    columns={
        'Depth (m).2': 'Depth (m)',
        'tentatively synchronized on EDC3 gasage (yr)': 'Gasage (EDC3, yr BP)',
        'CO2 (ppmv).2': 'CO2 (ppmv)',
        'sigma mean CO2 (ppmv).1': 'sigma mean CO2 (ppmv)',
    }
)

# Constant columns that standardise the data frame
suppl_3['station'] = station
suppl_3['uni'] = uni

# Placeholder column at position 2, as this column exists in the supplementary file
# https://discuss.codecademy.com/t/can-we-add-a-new-column-at-a-specific-position-in-a-pandas-dataframe/355842
suppl_3.insert(2, 'Gasage (AICC2012, yr BP)', ' ')

# Replace any file left over from an earlier run
if os.path.isfile(save_filename):
    os.remove(save_filename)
suppl_3.to_csv(save_filename, index=False)

print(suppl_3.head())
print(suppl_3.tail())



***
## supplementary.csv - Dome C (393-664 kyr BP) - University of Bern

In [None]:
#########################################
# supplementary.csv - Dome C (393-664 kyr BP) - University of Bern
#########################################

# Subtracted from the row number used as the end of the slice below
row_offset = 7

# Ref https://stackoverflow.com/questions/61553063/read-csv-file-by-column-number-in-pandas-python
#
station = 'Dome C (393-664 kyr BP)'
uni = 'University of Bern'
save_filename = 'data/suppl_' + station + '_' + uni + '.csv'
print('save_filename :', save_filename)

# Copy columns 13 to 17 from the main .csv data into an independent frame
suppl_4 = df.iloc[0:329-row_offset, 13:18].copy()

# https://stackoverflow.com/questions/11346283/renaming-column-names-in-pandas
# Source headings: Depth (m).3, Gasage (EDC3, yr BP).2, Gasage (AICC2012, yr BP).2,
# CO2 (ppmv).3, sigma mean CO2 (ppmv).2
# 'Gasage (EDC3, yr BP).2' was previously missing from this mapping, so the
# EDC3 gas-age column kept its '.2' suffix and did not line up with the
# 'Gasage (EDC3, yr BP)' column of the other frames when concatenated.
suppl_4.rename(
    columns={
        'Depth (m).3': 'Depth (m)',
        'Gasage (EDC3, yr BP).2': 'Gasage (EDC3, yr BP)',
        'Gasage (AICC2012, yr BP).2': 'Gasage (AICC2012, yr BP)',
        'CO2 (ppmv).3': 'CO2 (ppmv)',
        'sigma mean CO2 (ppmv).2': 'sigma mean CO2 (ppmv)',
    },
    inplace=True,
)

# Add some constants to standardise the data frame
suppl_4['station'] = station
suppl_4['uni'] = uni

# Write the frame out, deleting any file left over from an earlier run first
if os.path.isfile(save_filename):
    os.remove(save_filename)
suppl_4.to_csv(save_filename, index=False)

print(suppl_4.head())
print(suppl_4.tail())



In [None]:
#########################################
# supplementary.csv - Dome C (393-664 kyr BP) - LGGE Grenoble
#########################################

# Subtracted from the row number used as the end of the slice below
row_offset = 7

# Column selection by position:
# https://stackoverflow.com/questions/61553063/read-csv-file-by-column-number-in-pandas-python
#
station = 'Dome C (393-664 kyr BP)'
uni = 'LGGE Grenoble'
save_filename = f'data/suppl_{station}_{uni}.csv'
print('save_filename :', save_filename)

# Independent copy of columns 18 to 22 from the main .csv data.
# NOTE(review): column 22 is also the first column of the 22:27 slice taken
# in the next cell - confirm the column boundaries against the CSV.
suppl_5 = df.iloc[0:38-row_offset, 18:23].copy()

# REQUIRED HEADINGS
# Depth (m), Gasage (EDC3, yr BP), Gasage (AICC2012, yr BP), CO2 (ppmv),
# sigma mean CO2 (ppmv), station, uni
# https://stackoverflow.com/questions/11346283/renaming-column-names-in-pandas
suppl_5 = suppl_5.rename(
    columns={
        'Depth (m).4': 'Depth (m)',
        'Gasage (EDC3, yr BP).3': 'Gasage (EDC3, yr BP)',
        'Gasage (AICC2012, yr BP).3': 'Gasage (AICC2012, yr BP)',
        'CO2 (ppmv).4': 'CO2 (ppmv)',
        'sigma mean CO2 (ppmv).3': 'sigma mean CO2 (ppmv)',
    }
)

# Constant columns that standardise the data frame
suppl_5['station'] = station
suppl_5['uni'] = uni

# Replace any file left over from an earlier run
if os.path.isfile(save_filename):
    os.remove(save_filename)
suppl_5.to_csv(save_filename, index=False)

print(suppl_5.head())
print(suppl_5.tail())



***
## supplementary.csv - Dome C (611-800 kyr BP) - University of Bern

In [None]:
#########################################
# supplementary.csv - Dome C (611-800 kyr BP) - University of Bern
#########################################

row_offset = 7

# Ref https://stackoverflow.com/questions/61553063/read-csv-file-by-column-number-in-pandas-python
#
station = 'Dome C (611-800 kyr BP)'
uni = 'University of Bern'
save_filename = 'data/suppl_' + station + '_' + uni + '.csv'
print('save_filename :', save_filename)

suppl_5 = df.iloc[0:258-row_offset, 22:27].copy() # This will copy columns from the main .csv file

# https://stackoverflow.com/questions/11346283/renaming-column-names-in-pandas
#Depth (m).3	Gasage (EDC3, yr BP).2	Gasage (AICC2012, yr BP)	Gasage (AICC2012, yr BP).2	CO2 (ppmv).3	
# sigma mean CO2 (ppmv).2	station	uni
suppl_5.rename(columns={'Depth (m).5': 'Depth (m)', \
                        'Gasage (EDC3, yr BP).4': 'Gasage (EDC3, yr BP)', \
                        'Gasage (AICC2012, yr BP).4': 'Gasage (AICC2012, yr BP)', \
                                           'CO2 (ppmv).5': 'CO2 (ppmv)' }, \
                                            inplace=True) 


#Now add in some constants to standardise the data frame
suppl_5['station'] = station
suppl_5['uni'] = uni

# Insert new column in position
# https://discuss.codecademy.com/t/can-we-add-a-new-column-at-a-specific-position-in-a-pandas-dataframe/355842
#suppl_4.insert(2,'Gasage (AICC2012, yr BP)',' ') # As this column exists in the supplementary file
		

if os.path.isfile(save_filename): os.remove(save_filename) # delete if exists
suppl_5.to_csv(save_filename, index=False)

print(suppl_5.head())
print(suppl_5.tail())



In [None]:
#########################################
# supplementary.csv - Dome C (611-800 kyr BP) - LGGE Grenoble
#########################################

row_offset = 7

# Ref https://stackoverflow.com/questions/61553063/read-csv-file-by-column-number-in-pandas-python
#
station = 'Dome C (611-800 kyr BP)'
uni = 'LGGE Grenoble'
save_filename = 'data/suppl_' + station + '_' + uni + '.csv'
print('save_filename :', save_filename)

suppl_6 = df.iloc[0:54-row_offset, 27:31].copy() # This will copy columns from the main .csv file

# https://stackoverflow.com/questions/11346283/renaming-column-names-in-pandas
#Depth (m).6	Gasage (EDC3, yr BP).5	Gasage (AICC2012, yr BP).5	CO2 (ppmv).6	

suppl_6.rename(columns={'Depth (m).6': 'Depth (m)', \
                        'Gasage (EDC3, yr BP).5': 'Gasage (EDC3, yr BP)', \
                        'Gasage (AICC2012, yr BP).5': 'Gasage (AICC2012, yr BP)', \
                                           'CO2 (ppmv).6': 'CO2 (ppmv)' }, \
                                            inplace=True) 


#Now add in some constants to standardise the data frame
suppl_6['station'] = station
suppl_6['uni'] = uni

# Insert new column in position
# https://discuss.codecademy.com/t/can-we-add-a-new-column-at-a-specific-position-in-a-pandas-dataframe/355842
#suppl_4.insert(2,'Gasage (AICC2012, yr BP)',' ') # As this column exists in the supplementary file
		

if os.path.isfile(save_filename): os.remove(save_filename) # delete if exists
suppl_6.to_csv(save_filename, index=False)

print(suppl_6.head())
print(suppl_6.tail())



### Now we concatinate all the supplementary files into one standard format

In [None]:
# We can append (concat) all of the new .csv files  
# Ref: https://www.usepandas.com/csv/append-csv-files
suppl_combined=pd.concat([suppl_3,suppl_4,suppl_5,suppl_6])
# Now add in the source file name, may come in handy later on
suppl_combined['source file'] = 'grl52461-sup-0003-supplementary.csv'

# write out to csv, may not be necessary, but handy for checking data
save_filename = 'data/suppl_combined.csv'
if os.path.isfile(save_filename): os.remove(save_filename) # delete if exists
suppl_combined.to_csv(save_filename, index=False)
print(suppl_combined.head)
print(suppl_combined.tail)

### Now finally combine both files

In [None]:
# Concatenate the standardised frames from both source files into one
all_combined = pd.concat([moesm31_combined, suppl_combined])

# write out to csv, may not be necessary, but handy for checking data
save_filename = 'data/all_combined.csv'
if os.path.isfile(save_filename):
    os.remove(save_filename)
all_combined.to_csv(save_filename, index=False)

# head and tail are methods: they must be called, otherwise print() shows the
# bound method (and with it the whole frame) instead of the first/last rows
print(all_combined.head())
print(all_combined.tail())

***

## End