# Occupation

### Introduction:

Special thanks to: https://github.com/justmarkham for sharing the dataset and materials.

### Step 1. Import the necessary libraries

In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
# Create the Spark session for this notebook, or reuse one that already exists,
# and display it as the cell output.
spark = (
    SparkSession.builder
    .appName("occupation")
    .getOrCreate()
)
spark

### Step 2. Import the dataset from this [address](https://raw.githubusercontent.com/justmarkham/DAT8/master/data/u.user). 

### Step 3. Assign it to a variable called users.

In [2]:
from pyspark import SparkFiles

In [3]:
# Register the remote pipe-delimited file with Spark, then read the fetched
# copy into a DataFrame (first row is the header; column types are inferred).
url = "https://raw.githubusercontent.com/justmarkham/DAT8/master/data/u.user"
spark.sparkContext.addFile(url)
users = spark.read.csv(
    SparkFiles.get("u.user"),
    sep="|",
    header=True,
    inferSchema=True,
)
users.show(5)

+-------+---+------+----------+--------+
|user_id|age|gender|occupation|zip_code|
+-------+---+------+----------+--------+
|      1| 24|     M|technician|   85711|
|      2| 53|     F|     other|   94043|
|      3| 23|     M|    writer|   32067|
|      4| 24|     M|technician|   43537|
|      5| 33|     F|     other|   15213|
+-------+---+------+----------+--------+
only showing top 5 rows



### Step 4. Discover the mean age per occupation

In [6]:
# Average age per occupation. Passing "age" to mean() restricts the aggregation
# to that one column, instead of averaging every numeric column (user_id
# included) and then selecting a single result column.
users.groupBy("occupation").mean("age").show(5)

+----------+------------------+
|occupation|          avg(age)|
+----------+------------------+
| librarian|              40.0|
|   retired| 63.07142857142857|
|    lawyer|             36.75|
|      none|26.555555555555557|
|    writer| 36.31111111111111|
+----------+------------------+
only showing top 5 rows



### Step 5. Discover the male ratio (percentage of men) per occupation and sort it from highest to lowest

In [13]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [20]:
def gender_num(x):
    """Map a gender code to an integer flag.

    Returns 1 for "M", 0 for "F", and None for any other value.
    """
    return 1 if x == "M" else 0 if x == "F" else None
# Expose gender_num as a Spark UDF that yields an integer column. The function
# is passed directly: the extra lambda layer added nothing and hid the
# function's name from the UDF.
udf_gender_num = udf(gender_num, IntegerType())

In [22]:
# Add an integer gender flag: "M" -> 1, "F" -> 0, anything else -> null.
# A native when() expression yields the same values and integer type as
# udf_gender_num, but is evaluated by Spark itself rather than by calling a
# Python function for every row.
users_df = users.withColumn(
    "gender_num",
    F.when(F.col("gender") == "M", 1).when(F.col("gender") == "F", 0),
)
users_df.show(5)

+-------+---+------+----------+--------+----------+
|user_id|age|gender|occupation|zip_code|gender_num|
+-------+---+------+----------+--------+----------+
|      1| 24|     M|technician|   85711|         1|
|      2| 53|     F|     other|   94043|         0|
|      3| 23|     M|    writer|   32067|         1|
|      4| 24|     M|technician|   43537|         1|
|      5| 33|     F|     other|   15213|         0|
+-------+---+------+----------+--------+----------+
only showing top 5 rows



In [23]:
# Print the column names and types of users_df to check the added gender_num column.
users_df.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- zip_code: string (nullable = true)
 |-- gender_num: integer (nullable = true)



In [68]:
# Number of male users in each occupation, stored in a column named male_count.
users_male = (
    users_df
    .where(users_df["gender"] == "M")
    .groupBy("occupation")
    .count()
    .withColumnRenamed("count", "male_count")
)
users_male.show(5)

+----------+----------+
|occupation|male_count|
+----------+----------+
| librarian|        22|
|   retired|        13|
|    lawyer|        10|
|      none|         5|
|    writer|        26|
+----------+----------+
only showing top 5 rows



In [69]:
# Print the column names and types of the per-occupation male counts.
users_male.printSchema()

root
 |-- occupation: string (nullable = true)
 |-- male_count: long (nullable = false)



In [70]:
# Total number of users in each occupation, stored in a column named occ_total.
users_all = (
    users_df
    .groupBy("occupation")
    .count()
    .withColumnRenamed("count", "occ_total")
)
users_all.show(5)

+----------+---------+
|occupation|occ_total|
+----------+---------+
| librarian|       51|
|   retired|       14|
|    lawyer|       12|
|      none|        9|
|    writer|       45|
+----------+---------+
only showing top 5 rows



In [71]:
# Print the column names and types of the per-occupation totals.
users_all.printSchema()

root
 |-- occupation: string (nullable = true)
 |-- occ_total: long (nullable = false)



In [74]:
# Combine the male counts with the occupation totals.
#
# Joining on the column *name* (a list) keeps a single "occupation" column;
# joining on users_male["occupation"] == users_all["occupation"] would keep
# both copies.
#
# A right join keeps every occupation present in users_all. With the default
# inner join, an occupation that has no male users is absent from users_male
# and would silently vanish from the result instead of showing a 0% ratio.
# Those rows get a null male_count from the join, which is replaced by 0.
result = (
    users_male
    .join(users_all, ["occupation"], "right")
    .fillna(0, subset=["male_count"])
)
result.show(5)

+----------+----------+---------+
|occupation|male_count|occ_total|
+----------+----------+---------+
| librarian|        22|       51|
|   retired|        13|       14|
|    lawyer|        10|       12|
|      none|         5|        9|
|    writer|        26|       45|
+----------+----------+---------+
only showing top 5 rows



In [77]:
# Percentage of male users per occupation, shown from highest to lowest.
male_share = result["male_count"] / result["occ_total"]
result_male_ratio = result.withColumn("male_ratio", male_share * 100)
result_male_ratio.orderBy(result_male_ratio["male_ratio"].desc()).show(5)

+----------+----------+---------+-----------------+
|occupation|male_count|occ_total|       male_ratio|
+----------+----------+---------+-----------------+
|    doctor|         7|        7|            100.0|
|  engineer|        65|       67|97.01492537313433|
|technician|        26|       27|96.29629629629629|
|   retired|        13|       14|92.85714285714286|
|programmer|        60|       66| 90.9090909090909|
+----------+----------+---------+-----------------+
only showing top 5 rows



### Step 6. For each occupation, calculate the minimum and maximum ages

In [80]:
# Minimum and maximum age per occupation.
#
# The dict form agg({"age": "min", "age": "max"}) cannot express this: a dict
# literal with a repeated key keeps only the last entry, so only max would run.
#
# min/max are taken from the functions module explicitly (F.min / F.max), so
# this cell does not rely on a wildcard import having replaced Python's
# built-in min and max.
users.groupBy("occupation").agg(F.min("age"), F.max("age")).show(5)

+----------+--------+--------+
|occupation|min(age)|max(age)|
+----------+--------+--------+
| librarian|      23|      69|
|   retired|      51|      73|
|    lawyer|      21|      53|
|      none|      11|      55|
|    writer|      18|      60|
+----------+--------+--------+
only showing top 5 rows



### Step 7. For each combination of occupation and gender, calculate the mean age

In [83]:
# Average age for every (occupation, gender) pair, listed by occupation.
age_by_occupation_gender = users.groupBy("occupation", "gender").mean("age")
age_by_occupation_gender.orderBy("occupation").show(5)

+-------------+------+------------------+
|   occupation|gender|          avg(age)|
+-------------+------+------------------+
|administrator|     M| 37.16279069767442|
|administrator|     F|40.638888888888886|
|       artist|     M|32.333333333333336|
|       artist|     F|30.307692307692307|
|       doctor|     M| 43.57142857142857|
+-------------+------+------------------+
only showing top 5 rows



### Step 8. For each occupation, present the percentage of women and men

In [85]:
# Count the users in each (occupation, gender) group and name the result
# column gender_count.
# pandas equivalent: users.groupby(['occupation', 'gender']).agg({'gender': 'count'})
gender_ocup = (
    users
    .groupBy("occupation", "gender")
    .agg({"gender": "count"})
    .withColumnRenamed("count(gender)", "gender_count")
)
gender_ocup.show(5)

+-------------+------+------------+
|   occupation|gender|gender_count|
+-------------+------+------------+
|   technician|     M|          26|
|     educator|     F|          26|
|       lawyer|     F|           2|
|entertainment|     F|           2|
|       lawyer|     M|          10|
+-------------+------+------------+
only showing top 5 rows



In [87]:
# Count the users in each occupation and name the result column occup_count.
# pandas equivalent: users.groupby(['occupation']).agg('count')
occup_count = (
    users
    .groupBy("occupation")
    .count()
    .withColumnRenamed("count", "occup_count")
)
occup_count.show(5)

+----------+-----------+
|occupation|occup_count|
+----------+-----------+
| librarian|         51|
|   retired|         14|
|    lawyer|         12|
|      none|          9|
|    writer|         45|
+----------+-----------+
only showing top 5 rows



In [89]:
# Attach each occupation's total to its per-gender counts so the percentage can
# be computed row by row in the next step.
# pandas equivalent: gender_ocup.div(occup_count, level="occupation") * 100
occup_gender = gender_ocup.join(occup_count, on="occupation")
occup_gender.show(5)

+-------------+------+------------+-----------+
|   occupation|gender|gender_count|occup_count|
+-------------+------+------------+-----------+
|   technician|     M|          26|         27|
|     educator|     F|          26|         95|
|       lawyer|     F|           2|         12|
|entertainment|     F|           2|         18|
|       lawyer|     M|          10|         12|
+-------------+------+------------+-----------+
only showing top 5 rows



In [93]:
# Share of each gender within its occupation, as a percentage, listed by
# occupation and then gender in ascending order.
gender_share = occup_gender["gender_count"] / occup_gender["occup_count"]
occup_gender = occup_gender.withColumn("percent_presence", gender_share * 100)
occup_gender.orderBy("occupation", "gender").show(5)

+-------------+------+------------+-----------+-----------------+
|   occupation|gender|gender_count|occup_count| percent_presence|
+-------------+------+------------+-----------+-----------------+
|administrator|     F|          36|         79|45.56962025316456|
|administrator|     M|          43|         79|54.43037974683544|
|       artist|     F|          13|         28|46.42857142857143|
|       artist|     M|          15|         28|53.57142857142857|
|       doctor|     M|           7|          7|            100.0|
+-------------+------+------------+-----------+-----------------+
only showing top 5 rows

