# Occupation

### Introduction:

Special thanks to: https://github.com/justmarkham for sharing the dataset and materials.

### Step 1. Import the necessary libraries

In [1]:
import numpy as np
import pandas as pd

### Step 2. Import the dataset from this [address](https://raw.githubusercontent.com/justmarkham/DAT8/master/data/u.user). 

### Step 3. Assign it to a variable called users.

In [4]:
url = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/u.user'
users = pd.read_csv(url, sep='|')
users.head()

Unnamed: 0,user_id,age,gender,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


### Step 4. Discover what is the mean age per occupation

In [6]:
# get a look at basic info
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 943 entries, 0 to 942
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   user_id     943 non-null    int64 
 1   age         943 non-null    int64 
 2   gender      943 non-null    object
 3   occupation  943 non-null    object
 4   zip_code    943 non-null    object
dtypes: int64(2), object(3)
memory usage: 37.0+ KB


In [7]:
# no nulls, pythonic column names, but user_id is essentially a repeated index
# and is an int. Get back to that if necessary.
users.occupation.unique()

array(['technician', 'other', 'writer', 'executive', 'administrator',
       'student', 'lawyer', 'educator', 'scientist', 'entertainment',
       'programmer', 'librarian', 'homemaker', 'artist', 'engineer',
       'marketing', 'none', 'healthcare', 'retired', 'salesman', 'doctor'],
      dtype=object)

In [11]:
users.groupby('occupation').mean()[['age']].sort_values('age')

Unnamed: 0_level_0,age
occupation,Unnamed: 1_level_1
student,22.081633
none,26.555556
entertainment,29.222222
artist,31.392857
homemaker,32.571429
programmer,33.121212
technician,33.148148
other,34.52381
scientist,35.548387
salesman,35.666667


### Step 5. Discover the Male ratio per occupation and sort it from the most to the least

In [13]:
# take a look at values for gender:
users.gender.unique()

array(['M', 'F'], dtype=object)

In [37]:
# make a boolean column for is_male
users['is_male'] = users.gender == 'M'
# groupby occupation and aggregate on the mean of is_male
users.groupby('occupation').mean()['is_male'].sort_values(ascending = False)

occupation
doctor           1.000000
engineer         0.970149
technician       0.962963
retired          0.928571
programmer       0.909091
executive        0.906250
scientist        0.903226
entertainment    0.888889
lawyer           0.833333
salesman         0.750000
educator         0.726316
student          0.693878
other            0.657143
marketing        0.615385
writer           0.577778
none             0.555556
administrator    0.544304
artist           0.535714
librarian        0.431373
healthcare       0.312500
homemaker        0.142857
Name: is_male, dtype: float64

### Step 6. For each occupation, calculate the minimum and maximum ages

In [28]:
min_ages = users.groupby('occupation').age.min()
max_ages = users.groupby('occupation').age.max()
pd.merge(left=min_ages, right=max_ages, left_index=True, right_index=True, suffixes=['_min', '_max'])

Unnamed: 0_level_0,age_min,age_max
occupation,Unnamed: 1_level_1,Unnamed: 2_level_1
administrator,21,70
artist,19,48
doctor,28,64
educator,23,63
engineer,22,70
entertainment,15,50
executive,22,69
healthcare,22,62
homemaker,20,50
lawyer,21,53


### Step 7. For each combination of occupation and gender, calculate the mean age

In [32]:
users.groupby(['occupation', 'gender']).age.mean()

occupation     gender
administrator  F         40.638889
               M         37.162791
artist         F         30.307692
               M         32.333333
doctor         M         43.571429
educator       F         39.115385
               M         43.101449
engineer       F         29.500000
               M         36.600000
entertainment  F         31.000000
               M         29.000000
executive      F         44.000000
               M         38.172414
healthcare     F         39.818182
               M         45.400000
homemaker      F         34.166667
               M         23.000000
lawyer         F         39.500000
               M         36.200000
librarian      F         40.000000
               M         40.000000
marketing      F         37.200000
               M         37.875000
none           F         36.500000
               M         18.600000
other          F         35.472222
               M         34.028986
programmer     F         32.16666

### Step 8.  For each occupation present the percentage of women and men

In [47]:
# make a boolean column for is_female
users['is_female'] = users.gender == 'F'
occupation_by_percent_gender = users.groupby(['occupation']).mean()[['is_male', 'is_female']]
(occupation_by_percent_gender * 100).round(2)
occupation_by_percent_gender.rename(columns={'is_male': 'percent_male', 'is_female': 'percent_female'}, inplace=True)

In [48]:
occupation_by_percent_gender

Unnamed: 0_level_0,percent_male,percent_female
occupation,Unnamed: 1_level_1,Unnamed: 2_level_1
administrator,0.544304,0.455696
artist,0.535714,0.464286
doctor,1.0,0.0
educator,0.726316,0.273684
engineer,0.970149,0.029851
entertainment,0.888889,0.111111
executive,0.90625,0.09375
healthcare,0.3125,0.6875
homemaker,0.142857,0.857143
lawyer,0.833333,0.166667
