### EDA

#### 240726_Rockhead_Seoul(False_coordinate_sys).csv

#### columns info

- 앞으로 추가될 columns
    - 지질 시대
    - DEM 경사
    - 지형 정보(강, 산 ...)


|column name|Description|test input|
|---|---|---|
|borehole_code|시추공 코드|X|
|x|위도|O|
|y|경도|O|
|Elevation|표고|O|
|depth_start|rockhead 시작 심도|output|
|depth_end|rockhead 종료 심도|X|
|Ground name|지반명- WR: 풍화암, SR: 연암, MR: 보통암, HR: 경암|X|
|Density|1km^2당 시추공 밀도|O|

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Data load: parse the raw Seoul rockhead borehole CSV from the data folder.
raw_csv_path = 'data/240726_Rockhead_Seoul(False_coordinate_sys).csv'
csv = pd.read_csv(raw_csv_path)
# df_row is the "raw" frame the rest of the notebook inspects and filters.
df_row = pd.DataFrame(data=csv)

#### Data check
- info
- distribution

In [None]:
# Print the column names, dtypes and non-null counts of the raw table.
df_row.info()

In [None]:
# List the distinct values present in the 'Ground name' column.
df_row['Ground name'].unique()

In [None]:
# Summary statistics from describe(), transposed so that each described
# column becomes one row.
df_row.describe().T

In [None]:
# Flag erroneous rows: those whose depth_start is greater than depth_end.
df_row['depth_fault'] = df_row['depth_start'].gt(df_row['depth_end'])
# Count flagged (True) versus valid (False) rows.
df_row.loc[:, 'depth_fault'].value_counts()

In [None]:
# Keep only the rows that passed the depth sanity check, then discard the
# helper flag column.
# The non-inplace drop() returns a new, independent frame. The previous
# in-place drop on a filtered selection of df_row triggers pandas'
# SettingWithCopyWarning and leaves df tied to df_row.
df = df_row[~df_row['depth_fault']].drop(columns=['depth_fault'])
df.info()

#### Data Visualization

In [None]:
import utils.eda_utils as eda_utils 

In [None]:
# Work on a copy so that visualization-only columns added below
# (e.g. log_depth_start) do not end up in df.
df_v = df.copy()

In [None]:
# x / y relation plot titled 'x, y, depth_start'; the last argument is
# presumably the column used for coloring -- confirm in utils.eda_utils.
eda_utils.scatter_relation(df_v, 'x', 'y', 'x, y, depth_start','depth_start')

In [None]:
# Same x / y relation plot, this time passing 'Ground name' as the last
# argument (presumably the coloring column -- confirm in utils.eda_utils).
eda_utils.scatter_relation(df_v, 'x', 'y', 'x, y, Ground name','Ground name')

In [None]:
# Same x / y relation plot, this time passing 'Density' as the last
# argument (presumably the coloring column -- confirm in utils.eda_utils).
eda_utils.scatter_relation(df_v, 'x', 'y', 'x, y, Density','Density')

In [None]:
# Correlation matrix of df_v. borehole_code is passed as drop_column,
# presumably so the identifier is excluded -- confirm in utils.eda_utils.
eda_utils.correlation_matrix(df_v, 'correlation of rockhead_seoul', drop_column='borehole_code')

In [None]:
# Pair plot of df_v with borehole_code passed as drop_column.
# NOTE(review): the title string is the same one used for the correlation
# matrix above -- confirm that this is intended.
eda_utils.pairplot(df_v, 'correlation of rockhead_seoul', drop_column='borehole_code')

In [None]:
# Box plot built from the 'Ground name' and 'depth_start' columns
# (presumably depth_start grouped by ground type -- confirm in utils.eda_utils).
eda_utils.box_plot(df_v, 'Ground name', 'depth_start', 'Ground name & depth_start')

In [None]:
# Density distribution
eda_utils.distribution_histogram(df_v, 'Density', 'Density distribution')

In [None]:
# depth_start distribution, first on the raw scale and then on a natural-log
# scale. 'Ground name' is passed as the fourth argument (presumably the
# grouping column -- confirm in utils.eda_utils).
eda_utils.distribution_histogram(df_v, 'depth_start', 'depth_start distribution', 'Ground name')

# Take the log directly on the column so the result keeps df_v's own index.
# Wrapping the raw values in a fresh pd.Series gives them a 0..n-1 index, and
# column assignment aligns by index label; once rows have been filtered out of
# df_v that shifts values onto the wrong rows and leaves NaN for the rest.
# NOTE: rows with depth_start == 0 become -inf under the log.
df_v['log_depth_start'] = np.log(df_v['depth_start'])
eda_utils.distribution_histogram(df_v, 'log_depth_start', 'depth_start log scale distribution', 'Ground name')

#### Data Cleansing
- delete: depth end, borehole_code
- replace: ground name
- normalization: x, y, elevation, density

In [None]:
# replace

def df_row_cleaning(df):
    """Return a copy of *df* whose 'Density' column is z-score normalized.

    The column is centered on its mean and divided by its standard deviation
    (pandas' default ``Series.std``). All other columns are passed through
    unchanged.

    The input frame is not modified: the work is done on a copy, so the
    caller's data stays intact and no SettingWithCopyWarning is raised when
    *df* is a filtered selection of another frame.

    Args:
        df: DataFrame containing a numeric 'Density' column.

    Returns:
        A new DataFrame with the normalized 'Density' column.

    Raises:
        KeyError: if *df* has no 'Density' column.
    """
    cleaned = df.copy()

    # normalize
    density = cleaned['Density']
    cleaned['Density'] = (density - density.mean()) / density.std()

    return cleaned

In [None]:
# Apply df_row_cleaning (Density normalization) and rebind df to its result.
df = df_row_cleaning(df)

In [None]:
# Preview the first rows after cleaning.
df.head()

In [None]:
# Persist the density-normalized table; index=False leaves the row index out.
df.to_csv('data/240907_Rockhead_Seoul(False_coordinate_sys, density-norm).csv', index=False)

In [None]:
# Build df_o: a copy of df with the old row index kept as an 'index' column
# and every row tagged with the Density interval ('den_bin') it falls into.
df_o = df.reset_index().copy()

# Split the observed Density range into 30 equal-width intervals.
den_min, den_max = df['Density'].min(), df['Density'].max()
den_gap = (den_max - den_min) / 30

# Bin id = floor((Density - min) / width). Rows at the maximum can land in an
# extra bin 30, which is why the sampling loops iterate over 31 bins.
df_o['den_bin'] = df_o['Density'].sub(den_min).floordiv(den_gap)

print(df_o.info())

In [None]:
# Non-null counts per column for each density bin.
df_o.groupby('den_bin').count()

In [None]:
# Stratified test split: draw 20% of the rows of every density bin (0..30).
# Each bin is sampled with its own random_state=1 call, exactly as before, so
# the selected rows and their order are unchanged.
# The per-bin samples are concatenated once. This replaces the repeated
# pd.concat onto an empty, object-dtype frame, which copied the growing result
# on every iteration and relied on deprecated dtype handling of empty frames.
df_org_test = pd.concat(
    [
        df_o[df_o['den_bin'] == i].sample(frac=0.2, random_state=1)
        for i in range(0, 31)
    ]
)

print(df_org_test.info())

In [None]:
# Anti-join: remove from df_o every row that was drawn into the test split.
# The outer merge marks each row's origin in '_merge'; only rows found in
# df_o alone ("left_only") are kept, and the marker column is dropped again.
df_o = (
    df_o.merge(df_org_test, how='outer', indicator=True)
    .loc[lambda merged: merged['_merge'] == "left_only"]
    .drop(columns=['_merge'])
)

print(df_o.info())

In [None]:
# Stratified validation split: draw 10% of the rows that remain in df_o for
# every density bin (0..30). Each bin is sampled with its own random_state=1
# call, exactly as before, so the selected rows and their order are unchanged.
# The per-bin samples are concatenated once. This replaces the repeated
# pd.concat onto an empty, object-dtype frame, which copied the growing result
# on every iteration and relied on deprecated dtype handling of empty frames.
df_org_val = pd.concat(
    [
        df_o[df_o['den_bin'] == i].sample(frac=0.1, random_state=1)
        for i in range(0, 31)
    ]
)

print(df_org_val.info())

In [None]:
# Training split = the rows of df_o that were not drawn into the validation
# split. Same anti-join pattern: outer merge with an origin indicator, keep
# the "left_only" rows, then drop the indicator column.
df_org_train = (
    df_o.merge(df_org_val, how='outer', indicator=True)
    .loc[lambda merged: merged['_merge'] == "left_only"]
    .drop(columns=['_merge'])
)

print(df_org_train.info())

In [None]:
# Strip the bookkeeping columns (saved row index and density bin id) from
# all three splits before they are exported.
df_org_train = df_org_train.drop(columns=['index', 'den_bin'])
df_org_val = df_org_val.drop(columns=['index', 'den_bin'])
df_org_test = df_org_test.drop(columns=['index', 'den_bin'])

In [None]:
# Write the train / validation / test splits to CSV without the row index.
# NOTE(review): 'aumentation' in the file names looks like a typo for
# 'augmentation'; it is left as is because other code may read these paths.
df_org_train.to_csv('data/240907_Rockhead_Seoul_train(false, before_aumentation).csv', index=False)
df_org_val.to_csv('data/240907_Rockhead_Seoul_val(false, before_aumentation).csv', index=False)
df_org_test.to_csv('data/240907_Rockhead_Seoul_test(false, before_aumentation).csv', index=False)