In [1]:
import pyspark 
from pyspark.sql import SQLContext, functions as F
from pyspark import SparkFiles


# Reuse the active SparkContext if one exists so this cell can be re-run safely;
# constructing a second SparkContext in the same kernel raises
# "Cannot run multiple SparkContexts at once".
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setMaster('local[*]'))
# SQL entry point used by the cells below to read the CSV into a DataFrame.
sqlContext = SQLContext(sc)

#### Load data 

In [2]:
# Remote location of the CSV analysed in this notebook.
data_url = "https://raw.githubusercontent.com/thomaspernet/data_csv_r/master/data/adult.csv"

# Have Spark fetch the file, then look up the local path it was stored under.
sc.addFile(data_url)
local_csv_path = SparkFiles.get("adult.csv")

# Treat the first row as column names and let Spark infer the column types.
df = (
    sqlContext.read
    .options(header=True, inferSchema=True)
    .csv(local_csv_path)
)

#### Operations 

In [3]:
# Number of rows in the loaded DataFrame.
df.count()

32561

In [4]:
# Print each column's name, inferred type and nullability.
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: integer (nullable = true)
 |-- education: string (nullable = true)
 |-- education_num: integer (nullable = true)
 |-- marital: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- capital_gain: integer (nullable = true)
 |-- capital_loss: integer (nullable = true)
 |-- hours_week: integer (nullable = true)
 |-- native_country: string (nullable = true)
 |-- label: string (nullable = true)



In [5]:
# Preview the first 10 rows.
df.show(n=10)

+---+----------------+------+---------+-------------+--------------------+-----------------+-------------+-----+------+------------+------------+----------+--------------+-----+
|age|       workclass|fnlwgt|education|education_num|             marital|       occupation| relationship| race|   sex|capital_gain|capital_loss|hours_week|native_country|label|
+---+----------------+------+---------+-------------+--------------------+-----------------+-------------+-----+------+------------+------------+----------+--------------+-----+
| 39|       State-gov| 77516|Bachelors|           13|       Never-married|     Adm-clerical|Not-in-family|White|  Male|        2174|           0|        40| United-States|<=50K|
| 50|Self-emp-not-inc| 83311|Bachelors|           13|  Married-civ-spouse|  Exec-managerial|      Husband|White|  Male|           0|           0|        13| United-States|<=50K|
| 38|         Private|215646|  HS-grad|            9|            Divorced|Handlers-cleaners|Not-in-family|Whit

In [6]:
# List every occupation value exactly once.
occupations = df.select("occupation").distinct()
occupations.show()

+-----------------+
|       occupation|
+-----------------+
|            Sales|
|  Exec-managerial|
|   Prof-specialty|
|Handlers-cleaners|
|  Farming-fishing|
|     Craft-repair|
| Transport-moving|
|  Priv-house-serv|
|  Protective-serv|
|    Other-service|
|     Tech-support|
|Machine-op-inspct|
|     Armed-Forces|
|                ?|
|     Adm-clerical|
+-----------------+



In [7]:
# Number of rows for each occupation value.
occupation_counts = df.groupBy("occupation").count()
occupation_counts.show()

+-----------------+-----+
|       occupation|count|
+-----------------+-----+
|            Sales| 3650|
|  Exec-managerial| 4066|
|   Prof-specialty| 4140|
|Handlers-cleaners| 1370|
|  Farming-fishing|  994|
|     Craft-repair| 4099|
| Transport-moving| 1597|
|  Priv-house-serv|  149|
|  Protective-serv|  649|
|    Other-service| 3295|
|     Tech-support|  928|
|Machine-op-inspct| 2002|
|     Armed-Forces|    9|
|                ?| 1843|
|     Adm-clerical| 3770|
+-----------------+-----+



In [8]:
# Rows per occupation, largest group first.
(
    df.groupBy("occupation")
    .count()
    .orderBy(F.desc("count"))
    .show()
)

+-----------------+-----+
|       occupation|count|
+-----------------+-----+
|   Prof-specialty| 4140|
|     Craft-repair| 4099|
|  Exec-managerial| 4066|
|     Adm-clerical| 3770|
|            Sales| 3650|
|    Other-service| 3295|
|Machine-op-inspct| 2002|
|                ?| 1843|
| Transport-moving| 1597|
|Handlers-cleaners| 1370|
|  Farming-fishing|  994|
|     Tech-support|  928|
|  Protective-serv|  649|
|  Priv-house-serv|  149|
|     Armed-Forces|    9|
+-----------------+-----+



In [9]:
# Summary statistics (count, mean, stddev, min, max) for the age column.
df.describe("age").show()

+-------+------------------+
|summary|               age|
+-------+------------------+
|  count|             32561|
|   mean| 38.58164675532078|
| stddev|13.640432553581356|
|    min|                17|
|    max|                90|
+-------+------------------+



In [23]:
# Count rows with age above 25, with the predicate built as a Column expression.
df.where(F.col("age") > 25).count()

26150

In [22]:
# Count rows with age above 25, with the predicate written as a SQL expression string.
df.filter("age > 25").count()

26150

### Convert to Pandas

In [11]:
# Materialise the Spark DataFrame as a pandas DataFrame in the driver process.
# Fine at this size (32,561 rows); avoid on data that will not fit in driver memory.
pandas_df = df.toPandas()

In [1]:
# Preview only the first rows instead of rendering the whole frame.
pandas_df.head()

NameError: name 'pandas_df' is not defined

### Stop the Spark context

In [2]:
# Shut down the SparkContext created in the first cell.
sc.stop()

NameError: name 'sc' is not defined