In [2]:
import pandas as pd 
import numpy as np
import glob

In [3]:
# Obtain all necessary files from one folder: R_files.
# glob.glob returns matches in arbitrary (filesystem-dependent) order, so the
# result is sorted to make the order of the loaded files reproducible.
# NOTE(review): the absolute, user-specific path only works on one machine.
path=sorted(glob.glob('/Users/dasha/Documents/Capstone_2_files/R_files/*.xlsx'))

In [4]:
# Load every Excel workbook listed in `path` into its own DataFrame;
# df_list keeps the frames in the same order as `path`.
df_list=[pd.read_excel(workbook) for workbook in path]

In [5]:
# Generate a function for elimination and engineering of features

irrelevant_columns=['Lower CI Bound','Upper CI Bound','Element Name','Scale Name','O*NET-SOC Code','N','Element ID','Scale ID','Recommend Suppress','Date','Domain Source','Not Relevant','Category']

def dropper(df):
    """Build the combined 'Element_Name' feature and drop unneeded columns.

    The frame is modified in place and nothing is returned.

    * If the frame has a 'Category' column, 'Element_Name' becomes
      '<Element Name>_Context_<Category>', with missing categories rendered
      as an empty string.
    * Otherwise 'Element_Name' becomes '<Element Name>_<Scale Name>'.
    * Every column listed in ``irrelevant_columns`` that is present is dropped.

    Calling the function again on an already processed frame is a no-op, so
    the notebook cell that applies it can safely be re-run.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean; must contain 'Element Name' (plus 'Scale Name' when
        there is no 'Category' column) unless it was already processed.

    Raises
    ------
    KeyError
        If the frame has neither 'Element Name' nor a previously built
        'Element_Name' column.
    """
    if 'Element Name' in df.columns:
        if 'Category' in df.columns:
            # Missing categories become blanks. A new Series is built instead of
            # calling replace(..., inplace=True) on df['Category']: that chained
            # in-place call does not update df under pandas Copy-on-Write and
            # would leave NaN to be stringified as 'nan'.
            category=df['Category'].fillna('').astype(str)
            df['Element_Name']=df['Element Name']+'_Context_'+category
        else:
            df['Element_Name']=df['Element Name']+'_'+df['Scale Name']
    elif 'Element_Name' not in df.columns:
        # Same exception type the original column lookup would have raised.
        raise KeyError('Element Name')
    # Drop unnecessary columns in one call; labels that are absent are ignored.
    df.drop(columns=irrelevant_columns,errors='ignore',inplace=True)

In [6]:
# Execute the dropper function on all the loaded files.
# dropper modifies each DataFrame in place and returns None, so a plain loop is
# used instead of a list comprehension that would only build a list of Nones.
for frame in df_list:
    dropper(frame)

[None, None, None, None, None, None]

In [7]:
# How many distinct Element_Name labels does each file contain, and how often
# does each label occur?
element_columns=[frame['Element_Name'] for frame in df_list]
num_unique_elements=[column.nunique() for column in element_columns]
# NOTE(review): dividing by 2 presumably assumes every element appears with
# exactly two suffixes (e.g. Importance and Level); the recorded output shows a
# fractional value (4.5) for one file, so that assumption does not hold everywhere.
halved_counts=[unique/2 for unique in num_unique_elements]
print('Number of unique elements: ',halved_counts)
unique_elements=[column.value_counts() for column in element_columns]
print(unique_elements)

Number of unique elements:  [52.0, 35.0, 41.0, 33.0, 4.5, 169.0]
[Visual Color Discrimination_Importance    967
Written Comprehension_Importance          967
Speech Recognition_Importance             967
Arm-Hand Steadiness_Importance            967
Gross Body Coordination_Importance        967
Memorization_Level                        967
Peripheral Vision_Importance              967
Oral Comprehension_Importance             967
Wrist-Finger Speed_Importance             967
Manual Dexterity_Level                    967
Selective Attention_Level                 967
Oral Comprehension_Level                  967
Perceptual Speed_Level                    967
Written Expression_Level                  967
Wrist-Finger Speed_Level                  967
Explosive Strength_Importance             967
Reaction Time_Level                       967
Category Flexibility_Level                967
Problem Sensitivity_Importance            967
Depth Perception_Level                    967
Spatial Orient

In [8]:
# Concatenate all the files together into a single dataframe
df=pd.concat(df_list,axis=0,ignore_index=True)
# Re-order the columns by name rather than by position, so the selection does
# not depend on the alphabetical column order of the concatenated frame.
df=df[['Title','Element_Name','Data Value','Standard Error']]
df.head()

Unnamed: 0,Title,Element_Name,Data Value,Standard Error
0,Chief Executives,Oral Comprehension_Importance,4.5,0.19
1,Chief Executives,Oral Comprehension_Level,4.88,0.13
2,Chief Executives,Written Comprehension_Importance,4.25,0.16
3,Chief Executives,Written Comprehension_Level,4.62,0.18
4,Chief Executives,Oral Expression_Importance,4.38,0.18


In [9]:
# Reshape to one row per Title: every Element_Name label becomes a column under
# each of the 'Data Value' and 'Standard Error' top-level headers.
# pivot_table aggregates with its default (mean) if a Title/Element_Name pair repeats.
df_expanded=df.pivot_table(
    index=['Title'],
    columns='Element_Name',
    values=['Data Value','Standard Error'],
)