In [1]:
# List the uploaded posts file in the Databricks file system
# (presumably a sanity check that the path exists before loading it).
dbutils.fs.ls('/FileStore/tables/pc5ly1131502251351157/italianPosts.csv')

#### italianPosts.csv
1. commentCount—Number of comments related to the question/answer
1. lastActivityDate—Date and time of the last modification
1. ownerUserId—User ID of the owner
1. body—Textual contents of the question/answer
1. score—Total score based on upvotes and downvotes
1. creationDate—Date and time of creation
1. viewCount—View count
1. title—Title of the question
1. tags—Set of tags the question has been marked with
1. answerCount—Number of related answers
1. acceptedAnswerId—For a question, the ID of its accepted answer
1. postTypeId—Type of the post; 1 is for questions, 2 for answers
1. id—Post’s unique ID

In [3]:
# Read the file line by line and break every line into its '~'-separated fields
rddItalianPosts_1 = (
    sc.textFile('/FileStore/tables/pc5ly1131502251351157/italianPosts.csv')
      .map(lambda row: row.split('~'))
)
rddItalianPosts_1.count()   # runs an action over the whole RDD
rddItalianPosts_1.take(1)   # peek at the first parsed record

#### 5.1.1
#### Creating a DataFrame from an RDD of tuples

In [5]:
# Turn each list of fields into a tuple of its first 13 elements
rddItalianPosts_2 = rddItalianPosts_1.map(
    lambda fields: tuple(fields[i] for i in range(13))
)
rddItalianPosts_2.take(1)

In [6]:
# Name the tuple positions, then build a DataFrame from the RDD of tuples
italianPostColumns = [
    'commentCount',
    'lastActivityDate',
    'ownerUserId',
    'body',
    'score',
    'creationDate',
    'viewCount',
    'title',
    'tags',
    'answerCount',
    'acceptedAnswerId',
    'postTypeId',
    'id',
]
dfItalianPosts = rddItalianPosts_2.toDF(italianPostColumns)
dfItalianPosts.show(2)

##### Note that the field data types are incorrect: every column was loaded as a string

In [8]:
# Print the schema of the DataFrame that was built from tuples of split strings
dfItalianPosts.printSchema()

We'll need to define the schema manually to set the proper data types.

#### Converting RDDs to DataFrames by specifying a schema

In [11]:
from datetime import datetime

def toIntSafe(val):
  """Convert ``val`` to an ``int``, or return ``None`` if that is not possible.

  Args:
    val: Value to convert, typically one text field of a split line.

  Returns:
    The integer value, or ``None`` when ``val`` is not a valid integer
    literal (e.g. an empty string) or not a convertible type (e.g. ``None``).
  """
  try:
    return int(val)
  except (ValueError, TypeError):
    # ValueError: unparsable text such as ''.
    # TypeError: None or another non-numeric, non-string value.
    return None
  
def toLongSafe(val):
  """Convert ``val`` to a long integer, or return ``None`` if that is not possible.

  Works on both Python 2 (uses the built-in ``long``) and Python 3 (where
  ``long`` no longer exists and ``int`` is arbitrary precision).

  Args:
    val: Value to convert, typically one text field of a split line.

  Returns:
    The integer value, or ``None`` when ``val`` is not a valid integer
    literal (e.g. an empty string) or not a convertible type (e.g. ``None``).
  """
  try:
    to_long = long  # Python 2: keep returning the native long type
  except NameError:
    to_long = int   # Python 3: `long` was removed; int has no size limit
  try:
    return to_long(val)
  except (ValueError, TypeError):
    # ValueError: unparsable text such as ''.
    # TypeError: None or another non-numeric, non-string value.
    return None
  
def toTimeSafe(val):
  """Parse a ``YYYY-MM-DD HH:MM:SS.fff`` timestamp, or return ``None``.

  Args:
    val: Text to parse with the format ``"%Y-%m-%d %H:%M:%S.%f"``.

  Returns:
    A ``datetime`` for a well-formed timestamp, or ``None`` when ``val`` does
    not match the format (e.g. an empty string) or is not text (e.g. ``None``).
  """
  try:
    return datetime.strptime(val, "%Y-%m-%d %H:%M:%S.%f")
  except (ValueError, TypeError):
    # ValueError: text that does not match the format.
    # TypeError: strptime was handed a non-string such as None.
    return None
  
# Method to convert string to a Row
from pyspark.sql import Row
def stringToPost(string):
  """Parse one '~'-delimited line of the posts file into a positional Row.

  The 13 fields are converted with the safe converters, so a field that
  cannot be parsed becomes ``None`` instead of raising.

  Args:
    string: One line of text from the posts file.

  Returns:
    A ``Row`` whose values are, in order: commentCount, lastActivityDate,
    ownerUserId, body, score, creationDate, viewCount, title, tags,
    answerCount, acceptedAnswerId, postTypeId, id.

  Raises:
    IndexError: If the line has fewer than 13 '~'-separated fields.
  """
  # Encode only text that is not already a native `str` (Python 2 `unicode`).
  # On Python 3 the text is already `str`; encoding it would produce `bytes`,
  # and bytes.split('~') raises TypeError.
  if not isinstance(string, str):
    string = string.encode('utf8')
  l_s = string.strip().split('~')
  return Row(
    toIntSafe(l_s[0]),   # commentCount
    toTimeSafe(l_s[1]),  # lastActivityDate
    toLongSafe(l_s[2]),  # ownerUserId
    l_s[3],              # body
    toIntSafe(l_s[4]),   # score
    toTimeSafe(l_s[5]),  # creationDate
    toIntSafe(l_s[6]),   # viewCount
    l_s[7],              # title
    l_s[8],              # tags
    toIntSafe(l_s[9]),   # answerCount
    toLongSafe(l_s[10]), # acceptedAnswerId
    toLongSafe(l_s[11]), # postTypeId
    toLongSafe(l_s[12]), # id
  )
  
# Define schema
from pyspark.sql.types import *
# Column specs as (name, Spark type, nullable), in the same order as the
# values produced by stringToPost. Only `id` is declared non-nullable.
_postFieldSpecs = (
    ('commentCount',     IntegerType(),   True),
    ('lastActivityDate', TimestampType(), True),
    ('ownerUserId',      LongType(),      True),
    ('body',             StringType(),    True),
    ('score',            IntegerType(),   True),
    ('creationDate',     TimestampType(), True),
    ('viewCount',        IntegerType(),   True),
    ('title',            StringType(),    True),
    ('tags',             StringType(),    True),
    ('answerCount',      IntegerType(),   True),
    ('acceptedAnswerId', LongType(),      True),
    ('postTypeId',       LongType(),      True),
    ('id',               LongType(),      False),
)
postSchema = StructType(list(StructField(*spec) for spec in _postFieldSpecs))

In [12]:
# Re-read the file, this time parsing every line into a Row of typed values
rddItalianPosts_3 = (
    sc.textFile('/FileStore/tables/pc5ly1131502251351157/italianPosts.csv')
      .map(stringToPost)
)
rddItalianPosts_3.count()

In [13]:
# Build the DataFrame from the Row RDD, applying the explicit schema
dfItalianPosts_1 = sqlContext.createDataFrame(
    rddItalianPosts_3,
    postSchema,
)
dfItalianPosts_1.take(1)

In [14]:
# Print the schema of the DataFrame created with postSchema to confirm the declared column types
dfItalianPosts_1.printSchema()

In [15]:
# Column names of the DataFrame created with postSchema
dfItalianPosts_1.columns

In [16]:
# Column names paired with their data type names for the DataFrame created with postSchema
dfItalianPosts_1.dtypes