<a href="https://colab.research.google.com/github/cskipper07/Data-Science/blob/main/1_Intraobserver_error_cranmet_copy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Intraobserver error**
This file compares two rounds of analysis (i.e., round1 and round2) collected by a single observer to assess intraobserver error. Some outputs have been removed to protect PII and/or the raw data.

### Import libraries

In [None]:
import pandas as pd
import scipy
import numpy as np
import seaborn as sns
# import pingouin as pt
import os

In [None]:
!pip install --upgrade openpyxl

Collecting openpyxl
  Downloading openpyxl-3.0.9-py2.py3-none-any.whl (242 kB)
[K     |████████████████████████████████| 242 kB 5.4 MB/s 
Installing collected packages: openpyxl
  Attempting uninstall: openpyxl
    Found existing installation: openpyxl 2.5.9
    Uninstalling openpyxl-2.5.9:
      Successfully uninstalled openpyxl-2.5.9
Successfully installed openpyxl-3.0.9


# --- Craniometrics ---

### Import data

### **Input data files:**
*   *USCran_intraobserver_round1.xlsx* and *USCran_intraobserver_round2.xlsx*
*   These files were created from the original Microsoft Access database and include all variables for the two rounds of data collection intended for intraobserver error analysis.
*   *USCran_original.xlsx*
*   *craniometric trait codes and names.xlsx*


In [None]:
cranmet_US = pd.read_excel('USCran_original.xlsx')

In [None]:
cranmet_US.head()

In [None]:
round1 = pd.read_excel('USCran_intraobserver_round1.xlsx')
round1.head(10)

In [None]:
round2 = pd.read_excel('USCran_intraobserver_round2.xlsx')
round2.head(10)

#### Explore data

In [None]:
round1_colnames = list(round1.columns)
for c in round1_colnames:
    print(c)

SkelID
Collection
 GOL
 NOL
 BNL
 BBH
 XCB
 XFB
 WFB
 ZYB
 AUB
 ASB
 BPL
 NPH
 NLH
 JUB
 NLB
 MAB
 MAL
 MDH
 OBH
 OBB
 DKB
 NDS
 WNB
 SIS
 ZMB
 SSS
 FMB
 NAS
 EKB
 DKS
 IML
 XML
 MLS
 WMH
 GLS
 STB
 FRC
 FRS
 FRF
 PAC
 PAS
 PAF
 OCC
 OCS
 OCF
 FOL
 FOB
 NAR
 SSR
 PRR
 DKR
 ZOR
 FMR
 EKR
 ZMR
 AVR
 BRR
 VRR
 LAR
 OSR
 BAR
 MOW
 UFBR
 UFHT
 NAA
 PRA
 BAA
 NBA
 BBA
 BRA
 SSA
 NFA
 DKA
 NDA
 SIA
 FRA
 PAA
 OCA
 RFA
 RPA
 ROA
 BSA
 SBA
 SLA
 TBA


##### Drops columns

In [None]:
# drop 'Collection' and 'SkelID' columns in round1

round1 = round1.drop(columns=['Collection'], axis=1)
round1 = round1.drop(columns=['SkelID'], axis=1)
round1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 85 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0    GOL    3 non-null      float64
 1    NOL    3 non-null      float64
 2    BNL    4 non-null      int64  
 3    BBH    3 non-null      float64
 4    XCB    4 non-null      int64  
 5    XFB    3 non-null      float64
 6    WFB    4 non-null      int64  
 7    ZYB    4 non-null      int64  
 8    AUB    4 non-null      int64  
 9    ASB    3 non-null      float64
 10   BPL    4 non-null      int64  
 11   NPH    4 non-null      int64  
 12   NLH    4 non-null      int64  
 13   JUB    4 non-null      int64  
 14   NLB    3 non-null      float64
 15   MAB    2 non-null      float64
 16   MAL    3 non-null      float64
 17   MDH    4 non-null      int64  
 18   OBH    4 non-null      int64  
 19   OBB    4 non-null      int64  
 20   DKB    4 non-null      int64  
 21   NDS    4 non-null      int64  
 22   WNB  

In [None]:
# remove random space before the measurement variable names

for c in round1.columns:
  print(c.strip())
  round1
  round1.rename(columns={c:c.strip()}, inplace=True)

GOL
NOL
BNL
BBH
XCB
XFB
WFB
ZYB
AUB
ASB
BPL
NPH
NLH
JUB
NLB
MAB
MAL
MDH
OBH
OBB
DKB
NDS
WNB
SIS
ZMB
SSS
FMB
NAS
EKB
DKS
IML
XML
MLS
WMH
GLS
STB
FRC
FRS
FRF
PAC
PAS
PAF
OCC
OCS
OCF
FOL
FOB
NAR
SSR
PRR
DKR
ZOR
FMR
EKR
ZMR
AVR
BRR
VRR
LAR
OSR
BAR
MOW
UFBR
UFHT
NAA
PRA
BAA
NBA
BBA
BRA
SSA
NFA
DKA
NDA
SIA
FRA
PAA
OCA
RFA
RPA
ROA
BSA
SBA
SLA
TBA


In [None]:
# drop unknown columns in round1

round1 = round1.drop(columns=['RFA'], axis=1)
round1 = round1.drop(columns=['RPA'], axis=1)
round1 = round1.drop(columns=['ROA'], axis=1)
round1 = round1.drop(columns=['BSA'], axis=1)
round1 = round1.drop(columns=['SBA'], axis=1)
round1 = round1.drop(columns=['SLA'], axis=1)
round1 = round1.drop(columns=['TBA'], axis=1)
round1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 78 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   GOL     3 non-null      float64
 1   NOL     3 non-null      float64
 2   BNL     4 non-null      int64  
 3   BBH     3 non-null      float64
 4   XCB     4 non-null      int64  
 5   XFB     3 non-null      float64
 6   WFB     4 non-null      int64  
 7   ZYB     4 non-null      int64  
 8   AUB     4 non-null      int64  
 9   ASB     3 non-null      float64
 10  BPL     4 non-null      int64  
 11  NPH     4 non-null      int64  
 12  NLH     4 non-null      int64  
 13  JUB     4 non-null      int64  
 14  NLB     3 non-null      float64
 15  MAB     2 non-null      float64
 16  MAL     3 non-null      float64
 17  MDH     4 non-null      int64  
 18  OBH     4 non-null      int64  
 19  OBB     4 non-null      int64  
 20  DKB     4 non-null      int64  
 21  NDS     4 non-null      int64  
 22  WNB   

In [None]:
# drop 'Collection' and 'SkelID' column in round2

round2 = round2.drop(columns=['Collection'])
round2 = round2.drop(columns=['SkelID'])
round2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 85 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0    GOL    3 non-null      float64
 1    NOL    3 non-null      float64
 2    BNL    4 non-null      int64  
 3    BBH    3 non-null      float64
 4    XCB    4 non-null      int64  
 5    XFB    3 non-null      float64
 6    WFB    4 non-null      int64  
 7    ZYB    4 non-null      int64  
 8    AUB    4 non-null      int64  
 9    ASB    3 non-null      float64
 10   BPL    4 non-null      int64  
 11   NPH    4 non-null      int64  
 12   NLH    4 non-null      int64  
 13   JUB    4 non-null      int64  
 14   NLB    3 non-null      float64
 15   MAB    2 non-null      float64
 16   MAL    3 non-null      float64
 17   MDH    4 non-null      int64  
 18   OBH    4 non-null      int64  
 19   OBB    4 non-null      int64  
 20   DKB    4 non-null      int64  
 21   NDS    4 non-null      int64  
 22   WNB  

In [None]:
# remove random space before the measurement variable names

for c in round2.columns:
  print(c.strip())
  round2
  round2.rename(columns={c:c.strip()}, inplace=True)

GOL
NOL
BNL
BBH
XCB
XFB
WFB
ZYB
AUB
ASB
BPL
NPH
NLH
JUB
NLB
MAB
MAL
MDH
OBH
OBB
DKB
NDS
WNB
SIS
ZMB
SSS
FMB
NAS
EKB
DKS
IML
XML
MLS
WMH
GLS
STB
FRC
FRS
FRF
PAC
PAS
PAF
OCC
OCS
OCF
FOL
FOB
NAR
SSR
PRR
DKR
ZOR
FMR
EKR
ZMR
AVR
BRR
VRR
LAR
OSR
BAR
MOW
UFBR
UFHT
NAA
PRA
BAA
NBA
BBA
BRA
SSA
NFA
DKA
NDA
SIA
FRA
PAA
OCA
RFA
RPA
ROA
BSA
SBA
SLA
TBA


In [None]:
# drop unknown columns in round2

round2 = round2.drop(columns=['RFA'], axis=1)
round2 = round2.drop(columns=['RPA'], axis=1)
round2 = round2.drop(columns=['ROA'], axis=1)
round2 = round2.drop(columns=['BSA'], axis=1)
round2 = round2.drop(columns=['SBA'], axis=1)
round2 = round2.drop(columns=['SLA'], axis=1)
round2 = round2.drop(columns=['TBA'], axis=1)
round2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 78 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   GOL     3 non-null      float64
 1   NOL     3 non-null      float64
 2   BNL     4 non-null      int64  
 3   BBH     3 non-null      float64
 4   XCB     4 non-null      int64  
 5   XFB     3 non-null      float64
 6   WFB     4 non-null      int64  
 7   ZYB     4 non-null      int64  
 8   AUB     4 non-null      int64  
 9   ASB     3 non-null      float64
 10  BPL     4 non-null      int64  
 11  NPH     4 non-null      int64  
 12  NLH     4 non-null      int64  
 13  JUB     4 non-null      int64  
 14  NLB     3 non-null      float64
 15  MAB     2 non-null      float64
 16  MAL     3 non-null      float64
 17  MDH     4 non-null      int64  
 18  OBH     4 non-null      int64  
 19  OBB     4 non-null      int64  
 20  DKB     4 non-null      int64  
 21  NDS     4 non-null      int64  
 22  WNB   

In [None]:
round2.head()

In [None]:
round1.head()

---

## Remove NAs with For Loop
#### 1) Check dataframe1 for NaNs. For any NaNs in df1, check against corresponding cells in dataframe2. If df2 also has NaN in that cell, set both cells (in df1 and df2) to 0.
#### 2) Find all NaNs in dataframe 2 and substitute with corresponding values from df1. (No need to check df1 for NaNs since Step 1 already removed all NaNs from df1).

In [None]:
def remove_nan(round1, round2):
    column1 = list(round1.columns)
    column2 = list(round2.columns)
    for i in range(len(column1)):
        na_indx1 = list(np.where(round1[column1[i]].isna()))
        na_indx2 = list(np.where(round2[column2[i]].isna()))
        print(na_indx1)
        for j in na_indx1[0]:
            if not (np.isnan(round2[column2[i]][j])):# != 'nan' :
                round1[column1[i]][j] = round2[column2[i]][j]
                #print(type(round2[column[i]][j]))
                #input(round1[column[i]][j])
                #print("I am here")
            else:
                round2[column2[i]][j] = 0
                round1[column1[i]][j] = 0
                #input(round1[column][j])
        for j in na_indx2[0]:
                round2[column2[i]][j] = round1[column1[i]][j]
    return round1, round2
        #indx1 = set()
        #print(type(na_indx1))
        #indx1 = set(na_indx1)
        #indx2 = set((na_indx2))
        #print(round1[column[i]][na_indx1[0][1]])
        #print(na_indx1[0])
        #print(round1[column[i]].isnull())
        #print(round1[column[i]], round2[column[i]])

In [None]:
remove_nan(round1, round2)

[array([0])]
[array([0])]
[array([], dtype=int64)]
[array([0])]
[array([], dtype=int64)]
[array([0])]
[array([], dtype=int64)]
[array([], dtype=int64)]
[array([], dtype=int64)]
[array([0])]
[array([], dtype=int64)]
[array([], dtype=int64)]
[array([], dtype=int64)]
[array([], dtype=int64)]
[array([2])]
[array([2, 3])]
[array([2])]
[array([], dtype=int64)]
[array([], dtype=int64)]
[array([], dtype=int64)]
[array([], dtype=int64)]
[array([], dtype=int64)]
[array([], dtype=int64)]
[array([], dtype=int64)]
[array([], dtype=int64)]
[array([], dtype=int64)]
[array([], dtype=int64)]
[array([], dtype=int64)]
[array([], dtype=int64)]
[array([], dtype=int64)]
[array([], dtype=int64)]
[array([], dtype=int64)]
[array([], dtype=int64)]
[array([], dtype=int64)]
[array([], dtype=int64)]
[array([0])]
[array([0])]
[array([0])]
[array([0])]
[array([0])]
[array([0])]
[array([0])]
[array([0])]
[array([0])]
[array([0])]
[array([], dtype=int64)]
[array([], dtype=int64)]
[array([], dtype=int64)]
[array([], dt

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


(     GOL    NOL  BNL    BBH  XCB    XFB  ...  DKA  NDA  SIA    FRA    PAA    OCA
 0    0.0    0.0  108    0.0  156    0.0  ...  147   93  106    0.0    0.0    0.0
 1  180.0  175.0   98  141.0  148  122.0  ...  148  101   99  131.0  130.0  120.0
 2  171.0  168.0   96  136.0  152  122.0  ...  141   97  128  124.0  126.0  129.0
 3  183.0  180.0   96  131.0  139  117.0  ...  158   90   91  127.0  144.0  112.0
 
 [4 rows x 78 columns],
      GOL    NOL  BNL    BBH  XCB    XFB  ...  DKA  NDA  SIA    FRA    PAA    OCA
 0    0.0    0.0  108    0.0  156    0.0  ...  147   93  108    0.0    0.0    0.0
 1  179.0  175.0   98  141.0  148  122.0  ...  146  101  113  129.0  130.0  122.0
 2  170.0  168.0   96  136.0  152  122.0  ...  141   97  120  124.0  128.0  127.0
 3  184.0  180.0   95  131.0  139  117.0  ...  158   90   95  126.0  142.0  112.0
 
 [4 rows x 78 columns])

In [None]:
r1 = pd.DataFrame()
r2 = pd.DataFrame()

In [None]:
#print(round2[column[1]][j])
r1, r2 = remove_nan(round1, round2)    # prints the location (indices) of all NaNs in the r1 and r2 DFs

[array([], dtype=int64)]
[array([], dtype=int64)]
[array([], dtype=int64)]
[array([], dtype=int64)]
[array([], dtype=int64)]
[array([], dtype=int64)]
[array([], dtype=int64)]
[array([], dtype=int64)]
[array([], dtype=int64)]
[array([], dtype=int64)]
[array([], dtype=int64)]
[array([], dtype=int64)]
[array([], dtype=int64)]
[array([], dtype=int64)]
[array([], dtype=int64)]
[array([], dtype=int64)]
[array([], dtype=int64)]
[array([], dtype=int64)]
[array([], dtype=int64)]
[array([], dtype=int64)]
[array([], dtype=int64)]
[array([], dtype=int64)]
[array([], dtype=int64)]
[array([], dtype=int64)]
[array([], dtype=int64)]
[array([], dtype=int64)]
[array([], dtype=int64)]
[array([], dtype=int64)]
[array([], dtype=int64)]
[array([], dtype=int64)]
[array([], dtype=int64)]
[array([], dtype=int64)]
[array([], dtype=int64)]
[array([], dtype=int64)]
[array([], dtype=int64)]
[array([], dtype=int64)]
[array([], dtype=int64)]
[array([], dtype=int64)]
[array([], dtype=int64)]
[array([], dtype=int64)]


In [None]:
r2.head()

In [None]:
round1.head()

In [None]:
round2.head()

## TEM Intraobserver Error

In [None]:
column_names = ['Trait Code', 'n pairs', 'TEM', 'Relative TEM']       # since NaNs were replace with zeros, n=45 for every variable
TEMoutput = pd.DataFrame(columns = column_names)
column1 = list(round1.columns)
column2 = list(round2.columns)
for i in range(len(column1)):
        ob1 = round1[column1[i]]
        ob2 = round2[column2[i]]
        y = (ob1 - ob2)**2
        TEM = (sum(y))/(2*(len(y)))
        z1 = np.mean(ob1)
        z2 = np.mean(ob2)
        VAV = np.mean([z1 , z2])
        relTEM=(TEM/VAV)*100
        print(column2[i], '\t', len(ob1), '\t', round(TEM, 5), '\t', round(relTEM, 5))
        df2 = {'Trait Code': column2[i], 'n pairs': len(ob1), 'TEM': round(TEM,5), 'Relative TEM': round(relTEM,5)}
        TEMoutput.loc[i] = [ column2[i],  len(ob1), round(TEM,5), round(relTEM,5)]

GOL 	 4 	 0.375 	 0.28116
NOL 	 4 	 0.0 	 0.0
BNL 	 4 	 0.125 	 0.12579
BBH 	 4 	 0.0 	 0.0
XCB 	 4 	 0.0 	 0.0
XFB 	 4 	 0.0 	 0.0
WFB 	 4 	 0.25 	 0.25974
ZYB 	 4 	 0.0 	 0.0
AUB 	 4 	 21.375 	 16.33238
ASB 	 4 	 0.125 	 0.14728
BPL 	 4 	 0.125 	 0.13245
NPH 	 4 	 0.25 	 0.37037
NLH 	 4 	 0.125 	 0.24213
JUB 	 4 	 0.25 	 0.21413
NLB 	 4 	 0.125 	 0.62112
MAB 	 4 	 0.125 	 0.35587
MAL 	 4 	 0.625 	 1.47493
MDH 	 4 	 0.875 	 2.5641
OBH 	 4 	 0.0 	 0.0
OBB 	 4 	 0.125 	 0.28818
DKB 	 4 	 0.0 	 0.0
NDS 	 4 	 0.0 	 0.0
WNB 	 4 	 0.02 	 0.3252
SIS 	 4 	 0.0275 	 1.20879
ZMB 	 4 	 0.125 	 0.12837
SSS 	 4 	 0.0 	 0.0
FMB 	 4 	 0.125 	 0.12392
NAS 	 4 	 0.25 	 1.5625
EKB 	 4 	 0.125 	 0.12361
DKS 	 4 	 0.125 	 1.05263
IML 	 4 	 0.25 	 0.74074
XML 	 4 	 0.625 	 1.14943
MLS 	 4 	 0.125 	 0.95238
WMH 	 4 	 0.125 	 0.54054
GLS 	 4 	 0.125 	 2.32558
STB 	 4 	 0.125 	 0.14599
FRC 	 4 	 0.0 	 0.0
FRS 	 4 	 0.125 	 0.5848
FRF 	 4 	 3.375 	 8.51735
PAC 	 4 	 0.125 	 0.15175
PAS 	 4 	 0.375 	 2.0979
PA

In [None]:
TEMoutput.head(20)

Unnamed: 0,Trait Code,n pairs,TEM,Relative TEM
0,GOL,4,0.375,0.28116
1,NOL,4,0.0,0.0
2,BNL,4,0.125,0.12579
3,BBH,4,0.0,0.0
4,XCB,4,0.0,0.0
5,XFB,4,0.0,0.0
6,WFB,4,0.25,0.25974
7,ZYB,4,0.0,0.0
8,AUB,4,21.375,16.33238
9,ASB,4,0.125,0.14728


In [None]:
TEMoutput_sorted = TEMoutput.sort_values(by='Relative TEM', ascending=False)

In [None]:
TEMoutput_sorted.head(15)

Unnamed: 0,Trait Code,n pairs,TEM,Relative TEM
74,SIA,4,35.0,32.55814
8,AUB,4,21.375,16.33238
41,PAF,4,6.375,15.13353
60,BAR,4,1.5,10.52632
38,FRF,4,3.375,8.51735
44,OCF,4,2.25,5.29412
59,OSR,4,1.25,3.0303
17,MDH,4,0.875,2.5641
47,NAR,4,2.25,2.41935
34,GLS,4,0.125,2.32558


In [None]:
# identify variables with a Relative TEM greater than 5 to remove these columns

TEMoutput_sorted.loc[TEMoutput_sorted['Relative TEM'] > 5]

Unnamed: 0,Trait Code,n pairs,TEM,Relative TEM
74,SIA,4,35.0,32.55814
8,AUB,4,21.375,16.33238
41,PAF,4,6.375,15.13353
60,BAR,4,1.5,10.52632
38,FRF,4,3.375,8.51735
44,OCF,4,2.25,5.29412


### List of traits to remove

In [None]:
remove_traits = TEMoutput_sorted.loc[TEMoutput['Relative TEM'] > 5]

### Add trait codes and names to TEMoutput file

In [None]:
codes_names = pd.read_excel('craniometric trait codes and names.xlsx')
codes_names.head()

Unnamed: 0,Trait Code,Trait Name
0,GOL,Glabello-occipital length
1,NOL,Nasio-occipital length
2,BNL,Basion-nasion length
3,BBH,Basion-bregma height
4,XCB,Maximum cranial breadth


In [None]:
# remove random space before the measurement variable names

for c in codes_names.columns:
  print(c.strip())
  codes_names
  codes_names.rename(columns={c:c.strip()}, inplace=True)

Trait Code
Trait Name


In [None]:
TEMresults_named = pd.merge(TEMoutput, codes_names, on='Trait Code')
TEMresults_named.head()

Unnamed: 0,Trait Code,n pairs,TEM,Relative TEM,Trait Name


In [None]:
TEMresults_named.tail()

Unnamed: 0,Trait Code,n pairs,TEM,Relative TEM,Trait Name


In [None]:
from google.colab import  drive
drive.mount('/drive')

Drive already mounted at /drive; to attempt to forcibly remount, call drive.mount("/drive", force_remount=True).


In [None]:
TEMresults_named.to_csv('/drive/My Drive/Colab Notebooks/Pre-statistical treatments/1_Observer error/cranmet_intraobserver_TEM_results.csv')

---

## Merge craniometric dfs with demographics

#### US data

In [None]:
cranmet_US = pd.read_excel('USCran_for_analysis.xlsx')
cranmet_US.info(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48 entries, 0 to 47
Data columns (total 87 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   SkelID      48 non-null     int64  
 1   Collection  48 non-null     object 
 2    GOL        44 non-null     float64
 3    NOL        44 non-null     float64
 4    BNL        47 non-null     float64
 5    BBH        46 non-null     float64
 6    XCB        48 non-null     int64  
 7    XFB        47 non-null     float64
 8    WFB        48 non-null     int64  
 9    ZYB        48 non-null     int64  
 10   AUB        48 non-null     int64  
 11   ASB        41 non-null     float64
 12   BPL        42 non-null     float64
 13   NPH        41 non-null     float64
 14   NLH        47 non-null     float64
 15   JUB        48 non-null     int64  
 16   NLB        45 non-null     float64
 17   MAB        26 non-null     float64
 18   MAL        41 non-null     float64
 19   MDH        48 non-null     int

In [None]:
cranmet_US_demographics = pd.read_excel('US demographics.xlsx')
cranmet_US_demographics.info(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 241 entries, 0 to 240
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   SkelID       241 non-null    int64 
 1   Sex          241 non-null    object
 2   Age          241 non-null    int64 
 3   Population   241 non-null    object
 4   Population2  44 non-null     object
 5   Population3  13 non-null     object
 6   Population4  1 non-null      object
dtypes: int64(2), object(5)
memory usage: 13.3+ KB


In [None]:
cranmet_US_merged = pd.merge(cranmet_US, cranmet_US_demographics, on='SkelID')

In [None]:
cranmet_US_merged.head(10)

In [None]:
from google.colab import  drive
drive.mount('/drive')

Mounted at /drive


In [None]:
cranmet_US_merged.to_csv('/drive/My Drive/Colab Notebooks/Pre-statistical treatments/cranmet_US_merged.csv')

In [None]:
del cranmet_US_merged['Population']
del cranmet_US_merged['Population2']
del cranmet_US_merged['Population3']
del cranmet_US_merged['Population4']


#### Japan data

In [None]:
cranmet_Japan = pd.read_excel('JapanCran_for_analysis.xlsx')

In [None]:
cranmet_Japan.head()

In [None]:
cranmet_Japan_demographics = pd.read_excel('Japan_demographics.xlsx')
cranmet_Japan_demographics.head()

In [None]:
del cranmet_Japan_demographics['Notes']
del cranmet_Japan_demographics['Date']
del cranmet_Japan_demographics['DOB']
del cranmet_Japan_demographics['DOD']
del cranmet_Japan_demographics['Year of death']
del cranmet_Japan_demographics['Year/ind# of death (Japan)']

In [None]:
cranmet_Japan_demographics.head()

In [None]:
cranmet_Japan_merged = pd.merge(cranmet_Japan, cranmet_Japan_demographics, on='SkelID')

In [None]:
cranmet_Japan_merged.head()

In [None]:
cranmet_Japan_merged.to_csv('/drive/My Drive/Colab Notebooks/Pre-statistical treatments/cranmet_Japan_merged.csv')

### Merge US and Japan craniometrics

In [None]:
cranmet_Japan_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 34 entries, 0 to 33
Data columns (total 89 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   SkelID      34 non-null     int64  
 1   Collection  34 non-null     object 
 2    GOL        34 non-null     int64  
 3    NOL        34 non-null     int64  
 4    BNL        34 non-null     int64  
 5    BBH        34 non-null     int64  
 6    XCB        34 non-null     int64  
 7    XFB        34 non-null     int64  
 8    WFB        34 non-null     int64  
 9    ZYB        34 non-null     int64  
 10   AUB        34 non-null     int64  
 11   ASB        32 non-null     float64
 12   BPL        33 non-null     float64
 13   NPH        33 non-null     float64
 14   NLH        34 non-null     int64  
 15   JUB        34 non-null     int64  
 16   NLB        34 non-null     int64  
 17   MAB        34 non-null     int64  
 18   MAL        33 non-null     float64
 19   MDH        34 non-null     int

In [None]:
cranmet_US_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48 entries, 0 to 47
Data columns (total 89 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   SkelID      48 non-null     int64  
 1   Collection  48 non-null     object 
 2    GOL        44 non-null     float64
 3    NOL        44 non-null     float64
 4    BNL        47 non-null     float64
 5    BBH        46 non-null     float64
 6    XCB        48 non-null     int64  
 7    XFB        47 non-null     float64
 8    WFB        48 non-null     int64  
 9    ZYB        48 non-null     int64  
 10   AUB        48 non-null     int64  
 11   ASB        41 non-null     float64
 12   BPL        42 non-null     float64
 13   NPH        41 non-null     float64
 14   NLH        47 non-null     float64
 15   JUB        48 non-null     int64  
 16   NLB        45 non-null     float64
 17   MAB        26 non-null     float64
 18   MAL        41 non-null     float64
 19   MDH        48 non-null     int

In [None]:
cranmet_merged = cranmet_US_merged.append(cranmet_Japan_merged)

In [None]:
cranmet_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 82 entries, 0 to 33
Data columns (total 89 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   SkelID      82 non-null     int64  
 1   Collection  82 non-null     object 
 2    GOL        78 non-null     float64
 3    NOL        78 non-null     float64
 4    BNL        81 non-null     float64
 5    BBH        80 non-null     float64
 6    XCB        82 non-null     int64  
 7    XFB        81 non-null     float64
 8    WFB        82 non-null     int64  
 9    ZYB        82 non-null     int64  
 10   AUB        82 non-null     int64  
 11   ASB        73 non-null     float64
 12   BPL        75 non-null     float64
 13   NPH        74 non-null     float64
 14   NLH        81 non-null     float64
 15   JUB        82 non-null     int64  
 16   NLB        79 non-null     float64
 17   MAB        60 non-null     float64
 18   MAL        74 non-null     float64
 19   MDH        82 non-null     int

In [None]:
cranmet_merged.head()

In [None]:
cranmet_merged.tail()

##### Remove random space before the measurement variable names

In [None]:
cranmet_merged.columns

Index(['SkelID', 'Collection', ' GOL', ' NOL', ' BNL', ' BBH', ' XCB', ' XFB',
       ' WFB', ' ZYB', ' ASB', ' BPL', ' NPH', ' NLH', ' JUB', ' NLB', ' MAB',
       ' MAL', ' MDH', ' OBH', ' OBB', ' DKB', ' NDS', ' WNB', ' SIS', ' ZMB',
       ' SSS', ' FMB', ' NAS', ' EKB', ' DKS', ' IML', ' XML', ' MLS', ' WMH',
       ' GLS', ' STB', ' FRC', ' FRS', ' FRF', ' PAC', ' PAS', ' PAF', ' OCC',
       ' OCS', ' OCF', ' FOL', ' FOB', ' NAR', ' SSR', ' PRR', ' DKR', ' ZOR',
       ' FMR', ' EKR', ' ZMR', ' AVR', ' BRR', ' VRR', ' LAR', ' OSR', ' BAR',
       ' MOW', ' UFBR', ' UFHT', ' NAA', ' PRA', ' BAA', ' NBA', ' BBA',
       ' BRA', ' SSA', ' NFA', ' DKA', ' NDA', ' SIA', ' FRA', ' PAA', ' OCA',
       ' RFA', ' RPA', ' ROA', ' BSA', ' SBA', ' SLA', ' TBA', 'Sex', 'Age'],
      dtype='object')

In [None]:
# remove random space before the measurement variable names

for c in cranmet_merged.columns:
  print(c.strip())
  cranmet_merged
  cranmet_merged.rename(columns={c:c.strip()}, inplace=True)

SkelID
Collection
GOL
NOL
BNL
BBH
XCB
XFB
WFB
ZYB
AUB
ASB
BPL
NPH
NLH
JUB
NLB
MAB
MAL
MDH
OBH
OBB
DKB
NDS
WNB
SIS
ZMB
SSS
FMB
NAS
EKB
DKS
IML
XML
MLS
WMH
GLS
STB
FRC
FRS
FRF
PAC
PAS
PAF
OCC
OCS
OCF
FOL
FOB
NAR
SSR
PRR
DKR
ZOR
FMR
EKR
ZMR
AVR
BRR
VRR
LAR
OSR
BAR
MOW
UFBR
UFHT
NAA
PRA
BAA
NBA
BBA
BRA
SSA
NFA
DKA
NDA
SIA
FRA
PAA
OCA
RFA
RPA
ROA
BSA
SBA
SLA
TBA
Sex
Age


In [None]:
cranmet_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 82 entries, 0 to 33
Data columns (total 89 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   SkelID      82 non-null     int64  
 1   Collection  82 non-null     object 
 2   GOL         78 non-null     float64
 3   NOL         78 non-null     float64
 4   BNL         81 non-null     float64
 5   BBH         80 non-null     float64
 6   XCB         82 non-null     int64  
 7   XFB         81 non-null     float64
 8   WFB         82 non-null     int64  
 9   ZYB         82 non-null     int64  
 10  AUB         82 non-null     int64  
 11  ASB         73 non-null     float64
 12  BPL         75 non-null     float64
 13  NPH         74 non-null     float64
 14  NLH         81 non-null     float64
 15  JUB         82 non-null     int64  
 16  NLB         79 non-null     float64
 17  MAB         60 non-null     float64
 18  MAL         74 non-null     float64
 19  MDH         82 non-null     int

---

## Remove traits

#### Remove traits with low intraobserver agreement (low Relative TEM)





In [None]:
# list of traits to be removed that have Relative TEM < 5

remove_traits

Unnamed: 0,Trait Name,n,TEM,Relative TEM
8,AUB,4,21.375,16.33238
38,FRF,4,3.375,8.51735
41,PAF,4,6.375,15.13353
44,OCF,4,2.25,5.29412
60,BAR,4,1.5,10.52632
74,SIA,4,35.0,32.55814


In [None]:
cranmet_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 82 entries, 0 to 33
Data columns (total 89 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   SkelID      82 non-null     int64  
 1   Collection  82 non-null     object 
 2   GOL         78 non-null     float64
 3   NOL         78 non-null     float64
 4   BNL         81 non-null     float64
 5   BBH         80 non-null     float64
 6   XCB         82 non-null     int64  
 7   XFB         81 non-null     float64
 8   WFB         82 non-null     int64  
 9   ZYB         82 non-null     int64  
 10  AUB         82 non-null     int64  
 11  ASB         73 non-null     float64
 12  BPL         75 non-null     float64
 13  NPH         74 non-null     float64
 14  NLH         81 non-null     float64
 15  JUB         82 non-null     int64  
 16  NLB         79 non-null     float64
 17  MAB         60 non-null     float64
 18  MAL         74 non-null     float64
 19  MDH         82 non-null     int

In [None]:
## Remove traits with low intraobserver agreement:
# AUB (Biauricular breadth)
# FRF (Frontal fraction/Nasion-subtense fraction)
# PAF (Parietal fraction/Bregma-subtense fraction)
# OCF (Occipital fraction/Lambda-subtense fraction)
# BAR (Basion radius)
# SIA (Simotic angle)

del cranmet_merged['AUB']
del cranmet_merged['FRF']
del cranmet_merged['PAF']
del cranmet_merged['OCF']
del cranmet_merged['BAR']
del cranmet_merged['SIA']

In [None]:
cranmet_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 82 entries, 0 to 33
Data columns (total 83 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   SkelID      82 non-null     int64  
 1   Collection  82 non-null     object 
 2   GOL         78 non-null     float64
 3   NOL         78 non-null     float64
 4   BNL         81 non-null     float64
 5   BBH         80 non-null     float64
 6   XCB         82 non-null     int64  
 7   XFB         81 non-null     float64
 8   WFB         82 non-null     int64  
 9   ZYB         82 non-null     int64  
 10  ASB         73 non-null     float64
 11  BPL         75 non-null     float64
 12  NPH         74 non-null     float64
 13  NLH         81 non-null     float64
 14  JUB         82 non-null     int64  
 15  NLB         79 non-null     float64
 16  MAB         60 non-null     float64
 17  MAL         74 non-null     float64
 18  MDH         82 non-null     int64  
 19  OBH         81 non-null     flo

In [None]:
from google.colab import  drive
drive.mount('/drive')

Drive already mounted at /drive; to attempt to forcibly remount, call drive.mount("/drive", force_remount=True).


In [None]:
cranmet_merged.to_csv('/drive/My Drive/Colab Notebooks/Pre-statistical treatments/1_Observer error/cranmet_merged.csv', index=False)

---

### **Output data files:**
*   *cranmet_merged.csv*
*   *cranmet_intraobserver_TEMoutput_results.csv*
*   *cranmet_TEMresults_named.csv*
*   The merged output file is in the (Intra)Observer Error folder and has had the traits removed with low IO agreement
*   The *cranmet_TEMresults_named.csv* file is the *cranmet_TEMoutput.csv* with the trait codes AND trait names