In [23]:
# Import libraries
import pandas as pd
import numpy as np

In [24]:
# Import data
df =  pd.read_csv("everest.csv", low_memory=False)
df

Unnamed: 0,expid,membid,peakid,myear,mseason,fname,lname,sex,age,birthdate,...,totmembers,smtmembers,mdeaths,tothired,nohired,smthired,hdeaths,bcdate,pkname,heightm
0,EVER88401,15,EVER,1988,4,Maurits,Vreudge,M,0,- -,...,17,0,0,10,False,0,1,10/11/1988,Everest,8849
1,EVER88401,13,EVER,1988,4,Christa,Van Schaerdenburg,F,0,- -,...,17,0,0,10,False,0,1,10/11/1988,Everest,8849
2,EVER88401,14,EVER,1988,4,Rudy,Van Snick,M,0,- -,...,17,0,0,10,False,0,1,10/11/1988,Everest,8849
3,EVER88401,19,EVER,1988,4,Ang Rita,Sherpa,M,0,- -,...,17,0,0,10,False,0,1,10/11/1988,Everest,8849
4,EVER88401,20,EVER,1988,4,Lhakpa Dorje (Ang Lhakpa),Sherpa,M,0,- -,...,17,0,0,10,False,0,1,10/11/1988,Everest,8849
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21896,EVER20103,10,EVER,2020,1,Tai-Gang,Huang,M,0,- -,...,20,14,0,22,False,21,0,23/04/2020,Everest,8849
21897,EVER20103,11,EVER,2020,1,Wei,Li,M,0,- -,...,20,14,0,22,False,21,0,23/04/2020,Everest,8849
21898,EVER20103,9,EVER,2020,1,Hong-Zhi,Hu,M,0,- -,...,20,14,0,22,False,21,0,23/04/2020,Everest,8849
21899,EVER20103,12,EVER,2020,1,Xiao-Lin,Li,M,0,- -,...,20,14,0,22,False,21,0,23/04/2020,Everest,8849


In [25]:
df.columns

Index(['expid', 'membid', 'peakid', 'myear', 'mseason', 'fname', 'lname',
       'sex', 'age', 'birthdate', 'yob', 'calcage', 'citizen', 'status',
       'residence', 'occupation', 'leader', 'deputy', 'bconly', 'nottobc',
       'support', 'disabled', 'hired', 'sherpa', 'tibetan', 'msuccess',
       'mclaimed', 'mdisputed', 'msolo', 'mtraverse', 'mski', 'mparapente',
       'mspeed', 'mhighpt', 'mperhighpt', 'msmtdate1', 'msmtdate2',
       'msmtdate3', 'msmttime1', 'msmttime2', 'msmttime3', 'mroute1',
       'mroute2', 'mroute3', 'mascent1', 'mascent2', 'mascent3', 'mo2used',
       'mo2none', 'mo2climb', 'mo2descent', 'mo2sleep', 'mo2medical',
       'mo2note', 'death', 'deathdate', 'deathtime', 'deathtype', 'deathhgtm',
       'deathclass', 'msmtbid', 'msmtterm', 'hcn', 'mchksum', 'host', 'comrte',
       'stdrte', 'route1', 'route2', 'route3', 'route4', 'nation', 'leaders',
       'sponsor', 'termreason', 'totmembers', 'smtmembers', 'mdeaths',
       'tothired', 'nohired', 'smthire

### Definitions

<b>expid</b> - Expedition id 

<b>membid</b> - Member id

<b>*NOTE:*</b> - Individua record id is compound key of expedition id + member id

-----

<b>bconly</b> - BC / Advanced BC only (Y/N) if yes, The member did not climb above base camp (or advanced
base camp in cases where the path from base camp does not require technical climbing skills) 
  
<b>nottobc</b> - Not to base camp (Y/N) if yes, The member did not reach base camp

<b>hired</b> - The person was hired by the expedition

<b>msolo</b> - Solo (Y/N)

<b>sherpa</b> - Sherpa (Y/N) 

<b>tibetan</b> - Tibetan (Y/N)

<b>mhighpt</b> - Expedition high-point reached (Y/N)

<b>mperhighpt</b> - Personal high-point (m)

<b>msmdate1</b> - 1st summit / high-point date

<b>msmdate2</b> - 2nd summit date

<b>msmdate3</b> - 3rd summit date

<b>mroute1</b> - 1st ascent route

<b>mroute2</b> - 2nd ascent route

<b>mroute3</b> - 3rd ascent route

<b>mo2used</b> - Oxygen used (Y/N)

<b>mo2none</b> - Oxygen not used (Y/N)

<b>deathhgtm</b> - Death Height (m)

<b>msmtbid</b> - Summit Bid:
```
0 – Unspecified
1 – No summit bid
2 – Aborted below high camp
3 – Aborted at high camp
4 – Aborted above high camp
5 – Successful summit bid
```
<b>nohired</b> - No hired personnel used (above BC)



In [26]:
# Select the required columns 
filtered_df = df[['expid', 'membid', 'myear', 'fname', 'lname', 'sex', 'calcage', 'citizen', 'occupation', 'sherpa', 
                  'tibetan', 'msolo', 'msuccess', 'mhighpt', 'mperhighpt', 'msmtdate1', 'msmtdate2', 'msmtdate3', 
                  'msmttime1', 'msmttime2', 'msmttime3', 'bconly', 'nottobc', 'route1', 'route2', 'route3', 
                  'route4', 'mo2used', 'mo2none', 'death', 'deathdate', 'deathhgtm', 'msmtbid', 'nohired', 'hired']]

In [27]:
filtered_df

Unnamed: 0,expid,membid,myear,fname,lname,sex,calcage,citizen,occupation,sherpa,...,route3,route4,mo2used,mo2none,death,deathdate,deathhgtm,msmtbid,nohired,hired
0,EVER88401,15,1988,Maurits,Vreudge,M,49,Belgium,Physician,False,...,,,False,True,False,- -,0,1,False,False
1,EVER88401,13,1988,Christa,Van Schaerdenburg,F,30,Netherlands,Physician,False,...,,,False,True,False,- -,0,1,False,False
2,EVER88401,14,1988,Rudy,Van Snick,M,32,Belgium,"Furniture maker, cabinet maker & artisan",False,...,,,True,False,False,- -,0,4,False,False
3,EVER88401,19,1988,Ang Rita,Sherpa,M,40,Nepal,High-altitude porter,True,...,,,False,True,False,- -,0,4,False,True
4,EVER88401,20,1988,Lhakpa Dorje (Ang Lhakpa),Sherpa,M,29,Nepal,,True,...,,,True,False,True,23/12/1988,8700,4,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21896,EVER20103,10,2020,Tai-Gang,Huang,M,47,China,,False,...,,,True,False,False,- -,0,1,False,False
21897,EVER20103,11,2020,Wei,Li,M,0,China,,False,...,,,True,False,False,- -,0,5,False,False
21898,EVER20103,9,2020,Hong-Zhi,Hu,M,57,China,,False,...,,,True,False,False,- -,0,5,False,False
21899,EVER20103,12,2020,Xiao-Lin,Li,M,35,China,,False,...,,,True,False,False,- -,0,5,False,False


In [28]:
# Find the number of null items in route column
len(df[df['route1'].isnull()==True])

200

In [29]:
# Remove the 200 items with missing route
df = df[df['route1'].isnull() != True]

In [93]:
x = df[df['route1'] == 'N Col-N Face'] 
len(x)

# ) or (df['route'] == 'S Col-N Face')]

479

In [76]:
filtered_df = filtered_df[filtered_df['route1'].isnull() != True]

In [86]:
# Simplify the route names to either northern or souther approach
for index, row in filtered_df.iterrows():
    if ('Col-N' in row['route1']) or ('N Col' in row['route1']) or ('N Face' in row['route1']) or \
       ('from N' in row['route1']) :
        filtered_df.loc[index,'new_route'] = 'North'
    if ('Col-S' in row['route1']) or ('S Col' in row['route1']) or ('S Face' in row['route1']) or \
       ('from S' in row['route1']) or ('SW Face' in row['route1']):
        filtered_df.loc[index,'new_route'] = 'South'
    if ('Lho' in row['route1']) or ('Lho' in row['route1']) or ('S Pillar' in row['route1']):
        filtered_df.loc[index,'new_route'] = 'Other'

In [79]:
# drop list:
'Lho La-W Ridge (to 7800m)'
 'Lho La-W Ridge (not to Hornbein Couloir)', 'W Cwm-W Ridge'



filtered_df.route1.unique()

array(['S Col-SE Ridge', 'SW Face (Bonington rte)', 'N Col-N Face',
       'N Face (Japanese Couloir)', 'N Col-N Ridge-N Face',
       'NE Ridge (to 7030m)', 'N Face (Great Couloir)',
       'N Face (Hornbein Couloir)', 'N Col-NE Ridge',
       'W Ridge-N Arete-N Face (to 7420m)', 'Lho La-W Ridge',
       'W Ridge-N Face (Hornbein Couloir)', 'Lho La-W Ridge (to 7200m)',
       'N Col (to 7000m)', 'N Face (Japanese & Hornbein Couloirs)',
       'NE Ridge', 'W Ridge from N (7170m)',
       'SW Face (Bonington 1972 rte) (to 8250m)',
       'S Pillar (Polish route to C1)', 'N Col-N Face (Great Couloir)',
       'N Col-N Face (Great Couloir)-W Ridge',
       'N Face (Great Couloir)-W Ridge', 'S Pillar (USA 1981 rte)',
       'W Ridge from N', 'Eastern most buttress of E Face', 'E Face',
       'NE Ridge-N Face', 'Face between N and NE Ridges',
       'S Pillar-SE Ridge', 'Lho La-W Ridge from S', 'SW Face',
       'S Pillar (to 7000m)',
       'Khumbutse-W Ridge-N Face (Hornbein Couloir)',
 

In [87]:
filtered_df['new_route'].isna().sum()


693

In [70]:
# Export the file to csv
filtered_df.to_csv('clean_data.csv', index=False)

### Columns with missing values (issues for deep learning)   
  
fname (14)

lname (60)

Occupation (733) 

calcage = 0 (813)
 

In [76]:
from sqlalchemy import create_engine

import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import inspect

from sqlalchemy import Column, Float, Integer, String
from sqlalchemy.ext.declarative import declarative_base
Base = declarative_base()

In [9]:
# Establish database connection
# engine = create_engine('postgres///postgres@localhost:5432/everest_db')
test = df.copy()



# Establish connection to the youth_crime database
# connection_string = "postgres:postgres@localhost:5432/youth_crime_db"
engine = create_engine(f'postgresql://postgres:Garden12@localhost:5432/everest_db')
conn = engine.connect()

NameError: name 'create_engine' is not defined