In [266]:
import pandas as pd 
import numpy as np
import glob

In [267]:
# Obtain all necessary files from one folder: R_files.
# glob.glob returns its matches in arbitrary, filesystem-dependent order, so
# the list is sorted to keep the order of the loaded dataframes (and of every
# per-file result derived from it) identical from run to run.
# NOTE(review): this is an absolute, machine-specific path -- consider moving
# it into a configurable data-directory constant.
path=sorted(glob.glob('/Users/dasha/Documents/Capstone_2_files/R_files/*.xlsx'))

In [268]:
# Read each Excel file into its own dataframe. The frames are collected in a
# list, in the same order as `path`, so every file can still be inspected and
# cleaned individually before they are combined.
df_list=[pd.read_excel(workbook) for workbook in path]

In [269]:
# Generate a function for elimination and engineering of features

# Columns removed from every file once the combined 'Element_Name' feature
# has been built from them.
irrelevant_columns=['Lower CI Bound','Upper CI Bound','Element Name','Scale Name','O*NET-SOC Code','N','Element ID','Scale ID','Recommend Suppress','Date','Domain Source','Not Relevant','Category']

def dropper(df):
    """Build the combined 'Element_Name' column and drop unneeded columns.

    The dataframe is modified in place and nothing is returned.

    * Files with a 'Category' column get
      ``'<Element Name>_Context_<Category>'``; a missing category contributes
      an empty string.
    * All other files get ``'<Element Name>_<Scale Name>'``.

    Afterwards every column listed in ``irrelevant_columns`` that is present
    in ``df`` is removed.

    Calling the function again on a frame it has already processed is a
    no-op, so the cell that applies it can safely be re-run.

    Parameters
    ----------
    df : pandas.DataFrame
        One loaded file. Must contain 'Element Name' and either 'Category'
        or 'Scale Name' (a ``KeyError`` is raised otherwise).
    """
    # Already processed on a previous run: the source columns are gone and the
    # engineered column exists, so only make sure nothing droppable is left.
    if 'Element Name' not in df.columns and 'Element_Name' in df.columns:
        df.drop(columns=irrelevant_columns,errors='ignore',inplace=True)
        return

    if 'Category' in df.columns:
        # Missing categories become blanks so the string concatenation below
        # does not turn the whole name into NaN. The values are worked on as a
        # separate Series instead of calling an in-place method on
        # df['Category'], which does not reliably write back to df.
        category=df['Category']
        category=category.astype(object).where(category.notna(),'').astype(str)
        df['Element_Name']=df['Element Name']+'_Context_'+category
    else:
        df['Element_Name']=df['Element Name']+'_'+df['Scale Name']

    # Drop all unnecessary columns in one call; names that are absent from
    # this particular file are skipped.
    df.drop(columns=irrelevant_columns,errors='ignore',inplace=True)

In [281]:
# Execute the dropper function on all the loaded files.
# dropper() modifies each dataframe in place and returns None, so a plain
# loop is used instead of building a throw-away list of its return values.
for frame in df_list:
    dropper(frame)

[None, None, None, None, None, None]

In [298]:
# Per-file summary of the engineered 'Element_Name' column: how many distinct
# names each file has, and how many rows carry each name.
element_name_columns=[frame['Element_Name'] for frame in df_list]

num_unique_elements=[names.nunique() for names in element_name_columns]
# NOTE(review): the counts are halved before printing, presumably because each
# element is expected to appear under two scales -- confirm; files that do not
# follow that pattern produce fractional values.
halved_counts=[count/2 for count in num_unique_elements]
print('Number of unique elements: ',halved_counts)

unique_elements=[names.value_counts() for names in element_name_columns]
print(unique_elements)

Number of unique elements:  [52.0, 35.0, 41.0, 33.0, 4.5, 169.0]
[Far Vision_Importance                     967
Visual Color Discrimination_Importance    967
Peripheral Vision_Importance              967
Night Vision_Importance                   967
Rate Control_Level                        967
Oral Comprehension_Level                  967
Visual Color Discrimination_Level         967
Oral Comprehension_Importance             967
Auditory Attention_Importance             967
Deductive Reasoning_Importance            967
Dynamic Flexibility_Importance            967
Mathematical Reasoning_Importance         967
Static Strength_Level                     967
Gross Body Equilibrium_Level              967
Trunk Strength_Importance                 967
Selective Attention_Level                 967
Glare Sensitivity_Level                   967
Control Precision_Importance              967
Written Comprehension_Importance          967
Written Comprehension_Level               967
Oral Expressio

In [282]:
# Concatenate all the files together into a single dataframe
df=pd.concat(df_list,axis=0,ignore_index=True)
# Re-order the columns by name rather than by position: positional indices
# only work while the frame has exactly these four columns in alphabetical
# order, and would silently pick the wrong ones otherwise. Selecting by name
# fails loudly (KeyError) if an expected column is missing.
df=df[['Title','Element_Name','Data Value','Standard Error']]
df.head()

Unnamed: 0,Title,Element_Name,Data Value,Standard Error
0,Chief Executives,Oral Comprehension_Importance,4.5,0.19
1,Chief Executives,Oral Comprehension_Level,4.88,0.13
2,Chief Executives,Written Comprehension_Importance,4.25,0.16
3,Chief Executives,Written Comprehension_Level,4.62,0.18
4,Chief Executives,Oral Expression_Importance,4.38,0.18


In [284]:
# Reshape from long to wide: one row per 'Title', and for each of
# 'Data Value' and 'Standard Error' one column per 'Element_Name'.
# pivot_table aggregates with its default (the mean) when a
# Title/Element_Name pair occurs more than once.
df_expanded=pd.pivot_table(
    df,
    index=['Title'],
    columns='Element_Name',
    values=['Data Value','Standard Error'],
)

Unnamed: 0_level_0,Data Value,Data Value,Data Value,Data Value,Data Value,Data Value,Data Value,Data Value,Data Value,Data Value,...,Standard Error,Standard Error,Standard Error,Standard Error,Standard Error,Standard Error,Standard Error,Standard Error,Standard Error,Standard Error
Element_Name,Active Learning_Importance,Active Learning_Level,Active Listening_Importance,Active Listening_Level,Administration and Management_Importance,Administration and Management_Level,Analyzing Data or Information_Importance,Analyzing Data or Information_Level,Arm-Hand Steadiness_Importance,Arm-Hand Steadiness_Level,...,Work With Work Group or Team_Context_5.0,Work With Work Group or Team_Context__,Wrist-Finger Speed_Importance,Wrist-Finger Speed_Level,Writing_Importance,Writing_Level,Written Comprehension_Importance,Written Comprehension_Level,Written Expression_Importance,Written Expression_Level
Title,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Accountants,3.12,3.25,3.88,4.00,3.11,3.40,3.93,4.93,1.38,0.75,...,15.22,0.55,0.18,0.32,0.18,0.16,0.00,0.16,0.16,0.13
Actors,2.62,2.62,3.75,3.62,2.56,2.23,1.78,1.38,2.00,1.75,...,0.00,0.00,0.13,0.25,0.13,0.18,0.13,0.19,0.13,0.18
Actuaries,3.38,4.12,4.00,4.12,3.31,4.72,4.88,6.50,1.12,0.12,...,,,0.16,0.16,0.18,0.00,0.19,0.18,0.16,0.19
Acupuncturists,3.12,3.50,3.75,3.62,2.67,2.57,3.06,3.54,3.12,3.25,...,12.65,0.34,0.18,0.32,0.16,0.19,0.19,0.23,0.16,0.16
Acute Care Nurses,3.75,3.88,4.00,3.88,2.89,2.96,3.48,4.37,1.75,1.00,...,,,0.18,0.18,0.19,0.18,0.00,0.00,0.16,0.00
Adapted Physical Education Specialists,3.50,3.62,4.12,4.12,2.44,2.56,3.59,4.22,3.12,2.75,...,,,0.25,0.38,0.18,0.16,0.19,0.13,0.13,0.13
Adhesive Bonding Machine Operators and Tenders,2.62,2.38,3.12,2.75,2.74,2.42,2.58,2.28,3.62,3.00,...,15.70,0.39,0.16,0.18,0.16,0.16,0.13,0.00,0.16,0.16
"Administrative Law Judges, Adjudicators, and Hearing Officers",3.50,4.25,4.12,5.75,2.96,3.69,4.68,5.47,1.50,0.50,...,10.89,0.39,0.12,0.25,0.12,0.19,0.12,0.12,0.12,0.19
Administrative Services Managers,3.12,3.38,4.00,4.00,4.24,4.56,3.50,3.74,2.12,2.00,...,8.75,0.10,0.16,0.26,0.16,0.16,0.00,0.00,0.13,0.00
Adult Basic and Secondary Education and Literacy Teachers and Instructors,3.50,3.25,3.75,3.75,2.91,3.02,2.85,2.68,1.00,0.00,...,12.80,0.21,0.00,0.00,0.18,0.13,0.00,0.00,0.00,0.00
