In [527]:
import pandas as pd 
import numpy as np
import glob

In [528]:
# Collect every Excel workbook (*.xlsx) found in the source-data folder.
# NOTE(review): this absolute path is machine-specific; point it at your own
# copy of the data before running.
# glob returns matches in an arbitrary, filesystem-dependent order, so the
# result is sorted to make the load (and concatenation) order reproducible.
path=sorted(glob.glob('/Users/dasha/Documents/Capstone_2_files/Original_files2/*.xlsx'))

In [529]:
# Load each Excel workbook into its own DataFrame; df_list holds one frame per
# file, in the same order as `path`.
df_list = [pd.read_excel(workbook_path) for workbook_path in path]

In [530]:
# Function for eliminating and engineering features.

# Columns removed from every file; a file that lacks some of them is fine.
irrelevant_columns=['Standard Error','Lower CI Bound','Upper CI Bound','Element Name','Scale Name','O*NET-SOC Code','N','Element ID','Scale ID','Recommend Suppress','Date','Domain Source','Not Relevant','Category']

def dropper(df):
    """Add the combined ``Element_Name`` feature and drop unneeded columns, in place.

    For a frame with a ``Category`` column the new feature is
    ``<Element Name>_Context_<Category>``, where a missing category contributes
    an empty string. Otherwise it is ``<Element Name>_<Scale Name>``.
    Every column listed in ``irrelevant_columns`` that is present is then dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``Element Name`` column and either a ``Category`` or a
        ``Scale Name`` column. It is modified in place.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If ``Element Name`` is missing, or if both ``Category`` and
        ``Scale Name`` are missing (e.g. the frame was already processed).
    """
    if 'Category' in df.columns:
        # Build the text form of Category without writing back into df:
        # a chained `df['Category'].replace(..., inplace=True)` does not update
        # df under pandas Copy-on-Write, which would turn NaN into 'nan' here.
        category = df['Category']
        category_text = category.astype(object).where(category.notna(), '').astype(str)
        df['Element_Name'] = df['Element Name'] + '_Context_' + category_text
    else:
        df['Element_Name'] = df['Element Name'] + '_' + df['Scale Name']
    # Drop all unnecessary columns that are present, in a single call.
    present_irrelevant = [column for column in df.columns if column in irrelevant_columns]
    df.drop(columns=present_irrelevant, inplace=True)

In [531]:
# Run dropper on every loaded file. dropper modifies each frame in place and
# returns None, so a plain loop is used instead of collecting return values.
# NOTE: this cell cannot be re-run without reloading df_list, because dropper
# removes the columns it reads.
for loaded_frame in df_list:
    dropper(loaded_frame)

[None, None, None, None, None, None, None]

In [532]:
# Concatenate all the files into a single long-format dataframe.
# Columns are selected by name (not by position) so an unexpected extra column
# in any file cannot silently change which columns end up in `df`.
# Rows whose Element_Name contains the literal text '_Level' are discarded.
df = (
    pd.concat(df_list, axis=0, ignore_index=True, sort=True)
    .loc[:, ['Title', 'Element_Name', 'Data Value']]
    .loc[lambda frame: ~frame['Element_Name'].str.contains('_Level', regex=False)]
)
df.head()

Unnamed: 0,Title,Element_Name,Data Value
0,Chief Executives,Oral Comprehension_Importance,4.5
2,Chief Executives,Written Comprehension_Importance,4.25
4,Chief Executives,Oral Expression_Importance,4.38
6,Chief Executives,Written Expression_Importance,4.12
8,Chief Executives,Fluency of Ideas_Importance,3.88


In [533]:
# Reshape from long to wide: one row per Title and one column per Element_Name,
# with cells taken from 'Data Value'. Passing `values` as a list keeps
# 'Data Value' as the top level of the resulting column MultiIndex.
# Note: pivot_table averages duplicate (Title, Element_Name) pairs by default.
df_expanded = df.pivot_table(
    values=['Data Value'],
    index=['Title'],
    columns='Element_Name',
)
df_expanded.head()

Unnamed: 0_level_0,Data Value,Data Value,Data Value,Data Value,Data Value,Data Value,Data Value,Data Value,Data Value,Data Value,Data Value,Data Value,Data Value,Data Value,Data Value,Data Value,Data Value,Data Value,Data Value,Data Value,Data Value
Element_Name,Achievement/Effort_Importance,Active Learning_Importance,Active Listening_Importance,Adaptability/Flexibility_Importance,Administration and Management_Importance,Analytical Thinking_Importance,Analyzing Data or Information_Importance,Arm-Hand Steadiness_Importance,Artistic_Occupational Interests,Assisting and Caring for Others_Importance,...,Work With Work Group or Team_Context_,Work With Work Group or Team_Context_1.0,Work With Work Group or Team_Context_2.0,Work With Work Group or Team_Context_3.0,Work With Work Group or Team_Context_4.0,Work With Work Group or Team_Context_5.0,Wrist-Finger Speed_Importance,Writing_Importance,Written Comprehension_Importance,Written Expression_Importance
Title,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Accountants,3.63,3.12,3.88,3.78,3.11,4.21,3.93,1.38,1.0,2.44,...,3.66,16.3,17.37,6.01,4.52,55.81,1.38,3.62,4.0,3.75
Actors,4.73,2.62,3.75,4.78,2.56,3.25,1.78,2.0,6.67,2.39,...,5.0,0.0,0.0,0.0,0.0,100.0,1.12,2.88,3.88,2.88
Actuaries,4.44,3.38,4.0,3.81,3.31,4.81,4.88,1.12,1.67,1.69,...,3.97,0.0,6.45,19.35,45.16,29.03,1.25,3.38,4.0,3.75
Acupuncturists,3.65,3.12,3.75,3.81,2.67,3.95,3.06,3.12,2.33,4.32,...,3.56,13.6,3.14,29.95,20.79,32.53,1.38,3.25,3.5,3.25
Acute Care Nurses,4.25,3.75,4.0,4.71,2.89,4.11,3.48,1.75,2.0,4.78,...,4.77,0.0,0.0,3.85,15.38,80.77,1.38,3.5,4.0,3.75


In [534]:
# Number of professions (rows) with a missing value in at least one column.
print(int(df_expanded.isna().any(axis=1).sum()))

8


In [535]:
# Open question: why are some professions missing values? Possibly there were
# too few measurements -- not confirmed. For now, keep only the rows that are
# fully populated and verify that no NaN remains.
df_expanded = df_expanded.dropna(axis='index', how='any')
assert df_expanded.isna().sum().sum() == 0

In [536]:
# Take the 'Data Value' level of the column MultiIndex, leaving a frame indexed
# by Title with one column per Element_Name. Title is already the index, so no
# reset_index/set_index round trip is needed; the explicit copy makes df_dv
# independent of df_expanded.
df_dv = df_expanded['Data Value'].copy()
df_dv.head()

Element_Name,Achievement/Effort_Importance,Active Learning_Importance,Active Listening_Importance,Adaptability/Flexibility_Importance,Administration and Management_Importance,Analytical Thinking_Importance,Analyzing Data or Information_Importance,Arm-Hand Steadiness_Importance,Artistic_Occupational Interests,Assisting and Caring for Others_Importance,...,Work With Work Group or Team_Context_,Work With Work Group or Team_Context_1.0,Work With Work Group or Team_Context_2.0,Work With Work Group or Team_Context_3.0,Work With Work Group or Team_Context_4.0,Work With Work Group or Team_Context_5.0,Wrist-Finger Speed_Importance,Writing_Importance,Written Comprehension_Importance,Written Expression_Importance
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Accountants,3.63,3.12,3.88,3.78,3.11,4.21,3.93,1.38,1.0,2.44,...,3.66,16.3,17.37,6.01,4.52,55.81,1.38,3.62,4.0,3.75
Actors,4.73,2.62,3.75,4.78,2.56,3.25,1.78,2.0,6.67,2.39,...,5.0,0.0,0.0,0.0,0.0,100.0,1.12,2.88,3.88,2.88
Actuaries,4.44,3.38,4.0,3.81,3.31,4.81,4.88,1.12,1.67,1.69,...,3.97,0.0,6.45,19.35,45.16,29.03,1.25,3.38,4.0,3.75
Acupuncturists,3.65,3.12,3.75,3.81,2.67,3.95,3.06,3.12,2.33,4.32,...,3.56,13.6,3.14,29.95,20.79,32.53,1.38,3.25,3.5,3.25
Acute Care Nurses,4.25,3.75,4.0,4.71,2.89,4.11,3.48,1.75,2.0,4.78,...,4.77,0.0,0.0,3.85,15.38,80.77,1.38,3.5,4.0,3.75
