In [2]:
from pyspark.sql import Row
import pyspark

## Check Spark context loaded

In [3]:
# `sc` is not created anywhere in this notebook; evaluating it confirms that
# the environment has already supplied a Spark context.
sc

## Create RDD from Iris data

In [4]:
# Read the Iris data file into an RDD whose elements are lines of text.
rdd = sc.textFile('data/iris.data.txt')

In [5]:
rdd.first()
# Each RDD element is one line of the file, held as a single string

u'5.1,3.5,1.4,0.2,Iris-setosa'

In [7]:
# Split every comma-separated line into a list of string tokens.
lines = rdd.map(lambda record: record.split(','))

In [8]:
lines.first()
# Each line is now split into a list of string tokens

[u'5.1', u'3.5', u'1.4', u'0.2', u'Iris-setosa']

In [9]:
# Convert each token list into a named Row: the four measurements are cast to
# float and the species label is kept as a string.
# The keyword arguments are listed alphabetically on purpose. PySpark sorts Row
# kwargs on Spark < 3.0 but preserves their insertion order on Spark >= 3.0;
# alphabetical order gives the same field layout in both cases, so the Rows
# line up positionally with the alphabetically ordered schema passed to
# createDataFrame later in the notebook.
parsedLines = lines.map(lambda x: Row(petalLength=float(x[2]),
                                      petalWidth=float(x[3]),
                                      sepalLength=float(x[0]),
                                      sepalWidth=float(x[1]),
                                      species=x[4]))

In [10]:
parsedLines.first()
# Each line is now a Row with named fields, parsed as float or string

Row(petalLength=1.4, petalWidth=0.2, sepalLength=5.1, sepalWidth=3.5, species=u'Iris-setosa')

In [11]:
parsedLines.first()['species']
# Row fields can be looked up by name

u'Iris-setosa'

In [12]:
# Drop the names of the intermediate RDDs; parsedLines stays bound and is what
# the following cells use.
del rdd, lines

## Create an Unstructured RDD

In [23]:
# Rows in a single RDD need not share field names: the first Row has one field,
# the second has two differently named fields.
sc.parallelize([
    Row(test=1),
    Row(test1='a', test2=11),
]).collect()

[Row(test=1), Row(test1='a', test2=11)]

## Turn RDD into DataFrame

In [13]:
from pyspark.sql.types import StructField,StructType, FloatType,StringType

In [14]:
# Explicit schema for the Iris rows. Fields are listed alphabetically, the same
# order in which parsedLines.first() displays them.
# FloatType is single precision, so a parsed value such as 1.4 is stored as the
# nearest 32-bit float.
schema = StructType([
    StructField('petalLength', FloatType()),
    StructField('petalWidth', FloatType()),
    StructField('sepalLength', FloatType()),
    StructField('sepalWidth', FloatType()),
    StructField('species', StringType()),
])

In [127]:
# Build a DataFrame from the parsed Row RDD using the explicit schema.
# Uses the `spark` session (the same entry point the cities JSON load relies
# on) rather than the legacy `sqlContext` global, so the notebook depends on a
# single session object.
df = spark.createDataFrame(parsedLines, schema=schema)

In [128]:
# First row of the DataFrame built with the FloatType schema.
df.first()

Row(petalLength=1.399999976158142, petalWidth=0.20000000298023224, sepalLength=5.099999904632568, sepalWidth=3.5, species=u'Iris-setosa')

In [129]:
# First four rows, returned as a list of Row objects.
df.head(4)

[Row(petalLength=1.399999976158142, petalWidth=0.20000000298023224, sepalLength=5.099999904632568, sepalWidth=3.5, species=u'Iris-setosa'),
 Row(petalLength=1.399999976158142, petalWidth=0.20000000298023224, sepalLength=4.900000095367432, sepalWidth=3.0, species=u'Iris-setosa'),
 Row(petalLength=1.2999999523162842, petalWidth=0.20000000298023224, sepalLength=4.699999809265137, sepalWidth=3.200000047683716, species=u'Iris-setosa'),
 Row(petalLength=1.5, petalWidth=0.20000000298023224, sepalLength=4.599999904632568, sepalWidth=3.0999999046325684, species=u'Iris-setosa')]

In [120]:
# Inspect the schema attached to the DataFrame.
df.schema

StructType(List(StructField(petalLength,FloatType,true),StructField(petalWidth,FloatType,true),StructField(sepalLength,FloatType,true),StructField(sepalWidth,FloatType,true),StructField(species,StringType,true)))

## Create DataFrame from cities JSON file

In [146]:
# Load the cities JSON file into a DataFrame; no schema is passed here.
# Note: this rebinds `df`, so the Iris DataFrame is no longer reachable under
# that name.
df = spark.read.json('data/city.list.clean.json')

In [147]:
# Number of records loaded from the JSON file.
df.count()

209579

In [148]:
# First three rows; `coord` is a nested Row holding lat/lon.
df.head(3)

[Row(coord=Row(lat=44.549999, lon=34.283333), country=u'UA', id=707860, name=u'Hurzuf'),
 Row(coord=Row(lat=55.683334, lon=37.666668), country=u'RU', id=519188, name=u'Novinki'),
 Row(coord=Row(lat=28.0, lon=84.633331), country=u'NP', id=1283378, name=u'Gorkh\u0101')]

In [149]:
# Top-level column names of the cities DataFrame.
df.columns

['coord', 'country', 'id', 'name']