In [1]:
import os

import matplotlib.pyplot
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Load the data
# The data directory can be overridden with the ECON_INDEX_DATA_DIR environment
# variable so the notebook runs on other machines; when it is not set, the
# original local directory is used.
path = os.environ.get(
    'ECON_INDEX_DATA_DIR',
    '/Users/eddiehuang/Documents/Other/Anthropic/economic index report v2/',
)
# Keep `path` a plain string that ends in a path separator, so that
# `path + filename` works whether or not the override had a trailing slash.
path = os.path.join(path, '')

# O*NET reference tables
onet = pd.read_csv(path + 'onet_task_statements.csv')
soc_struct = pd.read_csv(path + 'SOC_Structure.csv')

# Task-level usage shares and thinking fractions
task_v1 = pd.read_csv(path + 'task_pct_v1.csv')
task_v2 = pd.read_csv(path + 'task_pct_v2.csv')
task_think = pd.read_csv(path + 'task_thinking_fractions.csv')

# Automation vs. augmentation breakdowns (overall and per task)
auto_aug_v1 = pd.read_csv(path + 'automation_vs_augmentation_v1.csv')
auto_aug_v2 = pd.read_csv(path + 'automation_vs_augmentation_v2.csv')
auto_aug_task = pd.read_csv(path + 'automation_vs_augmentation_by_task.csv')

In [None]:
# Explore the data
# Point `df` at the frame to inspect. This is a plain assignment, not a copy:
# `df` and `onet` refer to the same DataFrame object, so any in-place change
# made through one name is visible through the other.
df = onet

In [None]:
# Snippets used to explore whichever DataFrame is currently bound to `df`.
# A notebook cell only renders its final expression automatically, so each
# intermediate result is passed to display() (provided by the IPython kernel)
# to make every one of them appear in the output.
# NOTE: the column names used below ('Domain Source', 'Date', 'Title', 'Task')
# are the ones listed for onet_task_statements; adjust them for other frames.
print(df.shape)

display(df.head())

display(df.dtypes)

display(df['Domain Source'].unique())

display(df.describe())

# How many rows fall on each distinct 'Date' value, ordered by that value.
display(df['Date'].value_counts().sort_index())

# Number of distinct tasks per title, as a one-column DataFrame.
display(df.groupby('Title')['Task'].nunique().to_frame())

# Number of distinct values in every column.
for column in df.columns:
    print(f'Column: {column} | Unique Values: {df[column].nunique()}')

Unnamed: 0_level_0,Task
Title,Unnamed: 1_level_1
Accountants,16
Actors,19
Actuaries,14
Acupuncturists,18
Acute Care Nurses,27
...,...
Wind Energy Project Managers,15
Wind Turbine Service Technicians,13
"Woodworking Machine Setters, Operators, and Tenders, Except Sawing",26
Word Processors and Typists,19


# Notes

### onet_task_statements
Table is a dictionary of SOC titles and the ONET tasks that are relevant to the title.
- Shape: (19530, 8)
- Columns: ['O*NET-SOC Code', 'Title', 'Task ID', 'Task', 'Task Type', 'Incumbents Responding', 'Date', 'Domain Source']
- Key: 'O*NET-SOC Code'
- Title: 974 unique job titles
- Task ID: every row is a unique task ID
- Task: Description of the task
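- Task Type: Whether the task is core or supplemental to the job ['Core' (13487), 'Supplemental' (5385)]. These counts sum to 18,872, so the remaining 658 of the 19,530 rows appear to have no Task Type value.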
- Incumbents Responding
- Date: Ranges from Mar '03 to Jul '15, without consistent intervals. Most data comes from 2010 onwards. Need to convert dtype to datetime.
- Domain Source: Indicates how reliable or representative the data point might be. Incumbents are people working in the job, Occupational Experts (OEs) are subject matter experts, and Analysts are O*NET research analysts ['Incumbent' (14511), 'Occupational Expert' (4361), 'Analyst' (658)]

### SOC_Structure
Table is a list of SOC Titles, with columns showing the categorization of the title. It is structured in a waterfall format: Major Group is an umbrella to Minor Group, which is an umbrella to Broad Occupation, and so on. The umbrella categories do not repeat down the waterfall, leading to many NaNs (i.e., Chief Executives is a Broad Occupation 11-1010, but the Major Group and Minor Group are NaN even though we know it's part of Major Group 11-0000 and Minor Group 11-1000 based on its coding).
- Shape: (1596, 6)
- Columns: ['Major Group', 'Minor Group', 'Broad Occupation', 'Detailed Occupation', 'Detailed O*NET-SOC', 'SOC or O*NET-SOC 2019 Title']
- Key: 'SOC or O*NET-SOC 2019 Title'
- dTypes: all objects
- Note: 'SOC or O*NET-SOC 2019 Title' can have duplicates, because the title can be both a Broad Occupation and Detailed Occupation, for example.


### task_pct_v1 and task_pct_v2
List of X tasks and the percent share of total use cases
- Shape: (3514, 2)
- Columns: ['task_name','pct']
- Key: task_name

### automation_vs_augmentation_v1 and automation_vs_augmentation_v2
Categorizes the type of interaction the user gives the AI tool
- Shape: (6, 2)
- Columns: ['interaction_type','pct']
- Key: interaction_type [directive, feedback loop, learning, none, task iteration, validation]

### automation_vs_augmentation_by_task
Splits each task by the % share of the interaction types below. However, it has a column that is not among the interaction types in the automation_vs_augmentation_vX files (i.e., 'filtered').
- Shape: (3364, 7)
- Columns: ['task_name','feedback_loop','directive','task_iteration','validation','learning','filtered']
- Key: task_name

### task_thinking_fractions
Lists all tasks and the fraction of thinking time they require?
- Columns: ['task_name','thinking_fraction']
- Key: task_name

In [None]:
# Data Cleaning
# Parse the 'Date' column with pd.to_datetime and write the result back onto
# `onet` itself. No explicit format is passed, so parsing is left to pandas.
# Because the exploration cell binds `df` to this same object (`df = onet`),
# the converted column is also what `df['Date']` returns afterwards.
onet['Date'] = pd.to_datetime(onet['Date'])