In [2]:
import pyspark

In [3]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
sc=SparkContext()
spark=SparkSession(sc)

## CSV to dataframe 

In [4]:
brew_df=spark.read.csv('breweriesUS.csv',header=True,inferSchema=True,comment=None,encoding='UTF-8')

In [5]:
brew_df.show()

+--------------------+---------------+--------------------+--------------------+----------+---------------+
|        brewery_name|           type|             address|             website|     state|state_breweries|
+--------------------+---------------+--------------------+--------------------+----------+---------------+
|  Valley Brewing Co.|        Brewpub|PO Box 4653, Stoc...|http://www.valley...|california|            284|
|  Valley Brewing Co.|        Brewpub|157 Adams St., St...|http://www.valley...|california|            284|
|   Valley Brewing Co|   Microbrewery|1950 W Freemont, ...|http://www.valley...|california|            284|
|Ukiah Brewing Com...|        Brewpub|102 S. State St.,...|http://www.ukiahb...|california|            284|
|  Tustin Brewing Co.|        Brewpub|13011 Newport Ave...|http://www.tustin...|california|            284|
|     Trumer Brauerei|   Microbrewery|1404 4th St., Ber...|http://www.trumer...|california|            284|
|     Trumer Brauerei|Region

## dataframe to RDD

Each element in RDD is a pyspark.sql.Row which contains key,value pairs.  

In [6]:
#dataframe can be converted to RDD using rdd() function
brew_df.rdd.take(2)

[Row(brewery_name='Valley Brewing Co.', type='Brewpub', address='PO Box 4653, Stockton, California, 95204', website='http://www.valleybrew.com/', state='california', state_breweries='284'),
 Row(brewery_name='Valley Brewing Co.', type='Brewpub', address='157 Adams St., Stockton, California, 95204', website='http://www.valleybrew.com/', state='california', state_breweries='284')]

In [7]:
brew_map=brew_df.rdd.map(lambda x:(x['brewery_name'],x['type']))
brew_map.take(4)

[('Valley Brewing Co.', 'Brewpub'),
 ('Valley Brewing Co.', 'Brewpub'),
 ('Valley Brewing Co', 'Microbrewery'),
 ('Ukiah Brewing Company', 'Brewpub')]

## RDD to Dataframe

A RDD can be converted to dataframe using toDF() function with columns names given in a list

In [8]:
df=brew_map.toDF(['name','type'])

In [9]:
df.show()

+--------------------+---------------+
|                name|           type|
+--------------------+---------------+
|  Valley Brewing Co.|        Brewpub|
|  Valley Brewing Co.|        Brewpub|
|   Valley Brewing Co|   Microbrewery|
|Ukiah Brewing Com...|        Brewpub|
|  Tustin Brewing Co.|        Brewpub|
|     Trumer Brauerei|   Microbrewery|
|     Trumer Brauerei|RegionalBrewery|
|Triple Rock Brewi...|        Brewpub|
|Tied House Cafe &...|        Brewpub|
|Tied House Cafe &...|        Brewpub|
|Thirsty Bear Brew...|   Microbrewery|
|Third Street Ale ...|        Brewpub|
|       The Brewhouse|        Brewpub|
|    The Beach Chalet|        Brewpub|
|Telegraph Brewing...|   Microbrewery|
|Taylor's Restaura...|        Brewpub|
|Taps Fishouse & B...|        Brewpub|
|Tailgate Brewery,...|ContractBrewery|
|Sudwerk Privatbra...|        Brewpub|
|Sudwerk Brewery &...|        Brewpub|
+--------------------+---------------+
only showing top 20 rows



we can create an RDD using a textfile and later converting it to dataframe using SparkSession.createDataFrame

In [10]:
#Each row in rdd is a Row object
rdd=sc.textFile('breweriesUS.csv')
rdd.take(3)

['brewery_name,type,address,website,state,state_breweries',
 'Valley Brewing Co.,Brewpub,"PO Box 4653, Stockton, California, 95204",http://www.valleybrew.com/,california,284',
 'Valley Brewing Co.,Brewpub,"157 Adams St., Stockton, California, 95204",http://www.valleybrew.com/,california,284']

In [11]:
#first row contains column names
header=rdd.map(lambda x:x.split(',')).collect()[0]
header

['brewery_name', 'type', 'address', 'website', 'state', 'state_breweries']

In [19]:
#save other rows to an rdd
first_row=rdd.first()
rdd_data=rdd.filter(lambda x:x!= first_row).map(lambda x:x.split(','))
rdd_data.take(2)

[['Valley Brewing Co.',
  'Brewpub',
  '"PO Box 4653',
  ' Stockton',
  ' California',
  ' 95204"',
  'http://www.valleybrew.com/',
  'california',
  '284'],
 ['Valley Brewing Co.',
  'Brewpub',
  '"157 Adams St.',
  ' Stockton',
  ' California',
  ' 95204"',
  'http://www.valleybrew.com/',
  'california',
  '284']]

rdd_data contains lists of values which doesn't have column names assigned to it.We need to convert 
the list rows into Row objects. Below function is used to form key,value pairs for each row in the rdd.
We can use a dictionary as an argument list and use the ** to unpack the argument list.


In [23]:
from pyspark.sql import Row
my_dict=dict(zip(['a','b','c'],range(1,4)))
Row(**my_dict) # ** is used to unpack the argument list into a Row object

Row(a=1, b=2, c=3)

In [24]:
#let's define a function
def list_to_row(keys,values):
    row_dict=dict(zip(keys,values))
    return Row(**row_dict)

In [25]:
rdd_rows=rdd_data.map(lambda x:list_to_row(header,x))
rdd_rows.take(2)

[Row(address='"PO Box 4653', brewery_name='Valley Brewing Co.', state=' California', state_breweries=' 95204"', type='Brewpub', website=' Stockton'),
 Row(address='"157 Adams St.', brewery_name='Valley Brewing Co.', state=' California', state_breweries=' 95204"', type='Brewpub', website=' Stockton')]

In [26]:
df_data=spark.createDataFrame(rdd_rows)
df_data.show()

+--------------------+--------------------+-----------+---------------+---------------+----------------+
|             address|        brewery_name|      state|state_breweries|           type|         website|
+--------------------+--------------------+-----------+---------------+---------------+----------------+
|        "PO Box 4653|  Valley Brewing Co.| California|         95204"|        Brewpub|        Stockton|
|      "157 Adams St.|  Valley Brewing Co.| California|         95204"|        Brewpub|        Stockton|
|    "1950 W Freemont|   Valley Brewing Co| California|         95203"|   Microbrewery|        Stockton|
|   "102 S. State St.|Ukiah Brewing Com...| California|         95482"|        Brewpub|           Ukiah|
|"13011 Newport Av...|  Tustin Brewing Co.| California|         92780"|        Brewpub|          Tustin|
|       "1404 4th St.|     Trumer Brauerei| California|         94608"|   Microbrewery|        Berkeley|
|    "1404 Fourth St.|     Trumer Brauerei| California|