# eODP expeditions

Creates a CSV file of eODP expedition information, taken from the "Column Headers" sheet of the description-workbook summary spreadsheet.

In [1]:
import sys
sys.path.append('../scripts/')

import pandas as pd
import numpy as np

from normalize_data import convert_column_names, create_directory

## Import raw data

In [2]:
# Load only the "Column Headers" sheet of the summary workbook.
# The path is relative to the notebook's working directory.
df = pd.read_excel(
    io='./raw_data/Summary of description workbooks_reformatted.xlsx',
    sheet_name="Column Headers",
)
df.head(2)

Unnamed: 0,Expedition,Site/Hole,workbook tab name,Sample,Expedition.1,Site,Hole,Core,Core-Section,Type,...,Principal lithology and prefix,Full lithology name,Principal lithology and suffix,Minor lithology Prefix,Minor lithology name,Minor lithology suffix,Minor lithology and prefix,Full minor lithology name,Minor lithology and suffix,misc
0,317,U1353A,lithology,Label ID,Exp,Site,Hole,Core,Core-Sect,Type,...,,Lithology name,,,,,,,,
1,318,,**these are a mess**,,,,,,,,...,,,,,,,,,,


In [3]:
# (number of rows, number of columns) of the loaded sheet
df.shape

(38, 29)

## Data wrangling

### Add data source 

In [4]:
# Provenance text stored on every row and exported with the data.
# NOTE(review): "OPD" below is possibly a typo for "ODP" — confirm before
# changing it, since the text is written verbatim to the output CSV.
source_note = "Excel file created by Leah LeVay that normalizes the vocabulary used in OPD workbooks."
df['data_source_notes'] = source_note

### Change column names

In [5]:
# Build the old-name -> new-name mapping that is later passed to df.rename().
# convert_column_names is a project helper imported from normalize_data;
# presumably it returns a dict keyed by the original header — TODO confirm.
new_headers = convert_column_names(['workbook tab name'])
# The expedition column gets an explicit name instead of a converted one.
new_headers['Expedition'] = 'name'

In [6]:
# Apply the header mapping. `df` is rebound to the frame returned by rename()
# rather than mutated with inplace=True, which pandas discourages and which
# offers no performance benefit.
df = df.rename(columns=new_headers)
# Display the resulting column labels to verify the renames took effect.
df.columns

Index(['name', 'Site/Hole', 'workbook_tab_name', 'Sample', 'Expedition.1',
       'Site', 'Hole', 'Core', 'Core-Section', 'Type', 'Section', 'A/W',
       'Top [cm]', 'Bottom [cm]', 'Top Depth [m]', 'Bottom Depth [m]',
       'Lithology Prefix', 'Lithology principal name', 'Lithology suffix',
       'Principal lithology and prefix', 'Full lithology name',
       'Principal lithology and suffix', 'Minor lithology Prefix',
       'Minor lithology name', 'Minor lithology suffix',
       'Minor lithology and prefix', 'Full minor lithology name',
       'Minor lithology and suffix', 'misc', 'data_source_notes'],
      dtype='object')

### Change index

Shift the index so that it starts at 1 instead of 0.

In [7]:
 df.index += 1

Set the index name

In [8]:
# Label the index "id", then preview the first rows.
df.index = df.index.rename('id')
df.head()

Unnamed: 0_level_0,name,Site/Hole,workbook_tab_name,Sample,Expedition.1,Site,Hole,Core,Core-Section,Type,...,Full lithology name,Principal lithology and suffix,Minor lithology Prefix,Minor lithology name,Minor lithology suffix,Minor lithology and prefix,Full minor lithology name,Minor lithology and suffix,misc,data_source_notes
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,317,U1353A,lithology,Label ID,Exp,Site,Hole,Core,Core-Sect,Type,...,Lithology name,,,,,,,,,Excel file created by Leah LeVay that normaliz...
2,318,,**these are a mess**,,,,,,,,...,,,,,,,,,,Excel file created by Leah LeVay that normaliz...
3,320,U1335B,lithology,Label ID,Exp,Site,Hole,Core,Core-Sect,Type,...,Lithology name,,,,,,,,,Excel file created by Leah LeVay that normaliz...
4,321,U1338B,**these are a mess**,,,,,,,,...,,,,,,,,,,Excel file created by Leah LeVay that normaliz...
5,323,U1343A,lithology & constituents,Label ID,Exp,Site,Hole,Core,Core-Sect,Type,...,COMPLETE NAME,,,,,,,,,Excel file created by Leah LeVay that normaliz...


## Create CSV for database import

### Select columns to export

In [9]:
# Keep only the fields that go into the exported CSV.
export_columns = ['name', 'workbook_tab_name', 'data_source_notes']
expeditions = df.loc[:, export_columns]

expeditions.head()

Unnamed: 0_level_0,name,workbook_tab_name,data_source_notes
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,317,lithology,Excel file created by Leah LeVay that normaliz...
2,318,**these are a mess**,Excel file created by Leah LeVay that normaliz...
3,320,lithology,Excel file created by Leah LeVay that normaliz...
4,321,**these are a mess**,Excel file created by Leah LeVay that normaliz...
5,323,lithology & constituents,Excel file created by Leah LeVay that normaliz...


### Check data

In [10]:
# Selecting columns must not change the number of rows.
expeditions.shape[0] == df.shape[0]

True

In [11]:
# Per-column check: True means the column contains at least one missing value.
expeditions.isna().any()

name                 False
workbook_tab_name     True
data_source_notes    False
dtype: bool

### Create CSV

In [12]:
# Directory (relative to the working directory) that receives the exported CSV.
new_dir = './cleaned_data/get_expeditions_from_crosswalk'

In [13]:
# create_directory is a project helper from normalize_data; presumably it
# creates the folder when it is missing — TODO confirm.
create_directory(new_dir)
# The index is written as well, since to_csv keeps it by default.
csv_path = f'{new_dir}/expeditions.csv'
expeditions.to_csv(csv_path)