In [4]:
# Initialize PySpark
import os, sys
os.environ["JAVA_HOME"]="/lrz/sys/compilers/java/jdk1.8.0_112"
APP_NAME = "PySpark Lecture"
SPARK_MASTER="local[1]"
import pyspark
import pyspark.sql
from pyspark.sql import Row
conf=pyspark.SparkConf()
conf=pyspark.SparkConf().setAppName(APP_NAME).set("spark.local.dir", os.path.join(os.getcwd(), "tmp"))
sc = pyspark.SparkContext(master=SPARK_MASTER, conf=conf)
spark = pyspark.sql.SparkSession(sc).builder.appName(APP_NAME).getOrCreate()

print("PySpark initiated...")

PySpark initiated...


In [28]:
# Load the text file using the SparkContext
csv_lines = sc.textFile("../data/example.csv")

# Map the data to split the lines into a list
data = csv_lines.map(lambda line: line.split(","))

# Collect the dataset into local RAM
data.collect()

[['Russell Jurney', 'Relato', 'CEO'],
 ['Florian Liebert', 'Mesosphere', 'CEO'],
 ['Don Brown', 'Rocana', 'CIO'],
 ['Steve Jobs', 'Apple', 'CEO'],
 ['Donald Trump', 'The Trump Organization', 'CEO'],
 ['Russell Jurney', 'Data Syndrome', 'Principal Consultant']]

In [62]:
# Turn the CSV lines into objects
def csv_to_record(line):
    parts = line.split(",")
    record = {
      "name": parts[0],
      "company": parts[1],
      "title": parts[2]
    }
    return record

# Apply the function to every record
records = csv_lines.map(csv_to_record)

# Inspect the first item in the dataset
records.first()

{'name': 'Russell Jurney', 'company': 'Relato', 'title': 'CEO'}

In [63]:
# Group the records by the name of the person
grouped_records = records.groupBy(lambda x: x["name"])

# Show the first group
grouped_records.first()

# Count the groups
job_counts = grouped_records.map(
  lambda x: {
    "name": x[0],
    "job_count": len(x[1])
  }
)

job_counts.first()

job_counts.collect()

[{'name': 'Russell Jurney', 'job_count': 2},
 {'name': 'Florian Liebert', 'job_count': 1},
 {'name': 'Don Brown', 'job_count': 1},
 {'name': 'Steve Jobs', 'job_count': 1},
 {'name': 'Donald Trump', 'job_count': 1}]

In [64]:
# Compute a relation of words by line
words_by_line = csv_lines\
  .map(lambda line: line.split(","))

print(words_by_line.collect())

# Compute a relation of words
flattened_words = csv_lines\
  .map(lambda line: line.split(","))\
  .flatMap(lambda x: x)

flattened_words.collect()

[['Russell Jurney', 'Relato', 'CEO'], ['Florian Liebert', 'Mesosphere', 'CEO'], ['Don Brown', 'Rocana', 'CIO'], ['Steve Jobs', 'Apple', 'CEO'], ['Donald Trump', 'The Trump Organization', 'CEO'], ['Russell Jurney', 'Data Syndrome', 'Principal Consultant']]


['Russell Jurney',
 'Relato',
 'CEO',
 'Florian Liebert',
 'Mesosphere',
 'CEO',
 'Don Brown',
 'Rocana',
 'CIO',
 'Steve Jobs',
 'Apple',
 'CEO',
 'Donald Trump',
 'The Trump Organization',
 'CEO',
 'Russell Jurney',
 'Data Syndrome',
 'Principal Consultant']

In [108]:
# Compute a relation of words by line
words_by_line = csv_lines\
  .map(lambda line: line.split(","))

# Compute a relation of words
flattened_words = csv_lines\
  .map(lambda line: line.split(","))\
  .flatMap(lambda x: x)

count = flattened_words.map(lambda x: (x,1))
count = count.reduceByKey(lambda x, y: x+y)
uniq = count.map(lambda x: ("word",1))
uniq = uniq.reduceByKey(lambda x, y: x+y)
#count = count.map(lambda x: x)


print("(word,count): " +str(count.collect()))

print("uniq wc: " + str(uniq.first()[1]))


(word,count): [('Russell Jurney', 2), ('Relato', 1), ('CEO', 4), ('Florian Liebert', 1), ('Mesosphere', 1), ('Don Brown', 1), ('Rocana', 1), ('CIO', 1), ('Steve Jobs', 1), ('Apple', 1), ('Donald Trump', 1), ('The Trump Organization', 1), ('Data Syndrome', 1), ('Principal Consultant', 1)]
uniq wc: 14


In [36]:
nasa = sc.textFile("../data/nasa/NASA_access_log_Jul95")

# Compute a relation of words
flattened_words = nasa\
  .map(lambda line: line.split(" "))\
  .map(lambda x: (x[(len(x)-2)],1))

flattened_words = flattened_words.reduceByKey(lambda x, y: x+y)
flattened_words.collect()

[('304', 132627),
 ('404', 10845),
 ('200', 1701534),
 ('302', 46573),
 ('501', 14),
 ('400', 5),
 ('alyssa.p', 1),
 ('500', 62),
 ('403', 54)]