# eODP expeditions

Creates a CSV file of eODP expedition information, extracted from the "Column Headers" sheet of the description-workbook summary spreadsheet.

In [1]:
import sys
sys.path.append('../scripts/')

import pandas as pd
import numpy as np

from normalize_data import convert_column_names, create_directory

## Import raw data

In [2]:
# Read the "Column Headers" sheet of the description-workbook summary
# spreadsheet into a DataFrame.
workbook_path = './raw_data/Summary of description workbooks_reformatted.xlsx'
df = pd.read_excel(workbook_path, sheet_name='Column Headers')

# Preview the first two rows.
df.head(2)

Unnamed: 0,Expedition,Site/Hole,workbook tab name,Sample,Expedition.1,Site,Hole,Core,Core-Section,Type,...,Principal lithology and prefix,Full lithology name,Principal lithology and suffix,Minor lithology Prefix,Minor lithology name,Minor lithology suffix,Minor lithology and prefix,Full minor lithology name,Minor lithology and suffix,misc
0,376,U1528A,volcaniclastic_sedimentary,Sample,,,,,,,...,,Lithology prefix+name+suffix [read only],,,,,,,,
1,375,U1520C,sediment,Sample,,,,,,,...,Lithology prefix+name [read only],Lithology prefix+name+suffix [read only],Lithology name+suffix [read only],,,,,,,


## Data wrangling

### Add data source 

In [3]:
# Tag every row with a free-text note describing where the data came from.
# NOTE(review): "OPD" may be a typo for "ODP" -- confirm before changing,
# since this text is part of the exported data.
source_note = "Excel file created by Leah LeVay that normalizes the vocabulary used in OPD workbooks."
df['data_source_notes'] = source_note

### Change column names

In [4]:
# Build the old-name -> new-name mapping used to rename the columns.
# `convert_column_names` supplies the entry for the workbook tab column;
# the 'Expedition' column is mapped to 'name' by hand.
columns_to_normalize = ['workbook tab name']
new_headers = convert_column_names(columns_to_normalize)
new_headers['Expedition'] = 'name'

In [5]:
# Apply the header mapping by rebinding `df` to the renamed frame instead of
# using `inplace=True`: in-place mutation gives no benefit here and prevents
# method chaining.
df = df.rename(columns=new_headers)

# Show the resulting column labels to confirm the rename.
df.columns

Index(['name', 'Site/Hole', 'workbook_tab_name', 'Sample', 'Expedition.1',
       'Site', 'Hole', 'Core', 'Core-Section', 'Type', 'Section', 'A/W',
       'Top [cm]', 'Bottom [cm]', 'Top Depth [m]', 'Bottom Depth [m]',
       'Lithology Prefix', 'Lithology principal name', 'Lithology suffix',
       'Principal lithology and prefix', 'Full lithology name',
       'Principal lithology and suffix', 'Minor lithology Prefix',
       'Minor lithology name', 'Minor lithology suffix',
       'Minor lithology and prefix', 'Full minor lithology name',
       'Minor lithology and suffix', 'misc', 'data_source_notes'],
      dtype='object')

## Create CSV for database import

### Select columns to export

In [6]:
# Keep only the columns that are exported to the expeditions CSV.
export_columns = ['name', 'workbook_tab_name', 'data_source_notes']
expeditions = df.loc[:, export_columns]

# Preview the first rows of the selection.
expeditions.head()

Unnamed: 0,name,workbook_tab_name,data_source_notes
0,376,volcaniclastic_sedimentary,Excel file created by Leah LeVay that normaliz...
1,375,sediment,Excel file created by Leah LeVay that normaliz...
2,374,general,Excel file created by Leah LeVay that normaliz...
3,372,sediment,Excel file created by Leah LeVay that normaliz...
4,371,sediment,Excel file created by Leah LeVay that normaliz...


### Check data

In [7]:
# Sanity check: selecting columns must not change the number of rows.
expeditions.shape[0] == df.shape[0]

True

In [8]:
# Per-column flag: True if the column contains at least one missing value.
expeditions.isnull().any(axis=0)

name                 False
workbook_tab_name    False
data_source_notes    False
dtype: bool

### Create CSV

In [9]:
# Directory that receives the exported expeditions CSV.
new_dir = "./cleaned_data/get_expeditions_from_crosswalk"

In [11]:
# Export switch. The export statements were previously commented out, so the
# default stays "off" and running the notebook writes nothing.
# NOTE(review): presumably disabled to avoid overwriting an existing export --
# confirm, then set to True to (re)generate the CSV.
EXPORT_CSV = False

if EXPORT_CSV:
    # Create the output directory, then write the selected columns without
    # the DataFrame index.
    create_directory(new_dir)
    expeditions.to_csv(f'{new_dir}/expeditions.csv', index=False)