In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, FloatType
from pyspark.sql import Row

In [8]:
spark=SparkSession.builder \
    .appName("") \
    .enableHiveSupport() \
    .getOrCreate()

In [9]:
rdd0=spark.read.text("data.txt").rdd
rdd0.collect()

[Row(value='name,age,gender'),
 Row(value='Paul,40,male'),
 Row(value='John,40,male'),
 Row(value='David,15,male'),
 Row(value='Susan,40,female'),
 Row(value='Karen,34,female')]

In [10]:
rdd1=rdd0.collect()[1:-1]
rdd1

[Row(value='Paul,40,male'),
 Row(value='John,40,male'),
 Row(value='David,15,male'),
 Row(value='Susan,40,female')]

In [11]:
rdd2=spark.sparkContext.parallelize(rdd1)
rdd2.collect()

[Row(value='Paul,40,male'),
 Row(value='John,40,male'),
 Row(value='David,15,male'),
 Row(value='Susan,40,female')]

In [6]:
schema = StructType([
    StructField("name", StringType(), True),
    StructField("age", FloatType(), True),
    StructField("gender", StringType(), True)
])

In [7]:
rdd3=rdd2.map(lambda line:Row(line[0].split(",")[0],float(line[0].split(",")[1]),line[0].split(",")[2]))
rdd3.collect()

[<Row('Paul', 40.0, 'male')>,
 <Row('John', 40.0, 'male')>,
 <Row('David', 15.0, 'male')>,
 <Row('Susan', 40.0, 'female')>]

In [8]:
df=spark.createDataFrame(rdd3,schema)
df.show(truncate=False)

+-----+----+------+
|name |age |gender|
+-----+----+------+
|Paul |40.0|male  |
|John |40.0|male  |
|David|15.0|male  |
|Susan|40.0|female|
+-----+----+------+



In [9]:
df.createOrReplaceTempView("people")

In [10]:
spark.sql("select * from people").show()

+-----+----+------+
| name| age|gender|
+-----+----+------+
| Paul|40.0|  male|
| John|40.0|  male|
|David|15.0|  male|
|Susan|40.0|female|
+-----+----+------+



In [11]:
spark.sql("select * from people where gender=='male'").show()

+-----+----+------+
| name| age|gender|
+-----+----+------+
| Paul|40.0|  male|
| John|40.0|  male|
|David|15.0|  male|
+-----+----+------+



In [12]:
spark.sql("select * from people where gender=='female'").show()

+-----+----+------+
| name| age|gender|
+-----+----+------+
|Susan|40.0|female|
+-----+----+------+



In [13]:
spark.sql("select * from people where age > 20.0").show()

+-----+----+------+
| name| age|gender|
+-----+----+------+
| Paul|40.0|  male|
| John|40.0|  male|
|Susan|40.0|female|
+-----+----+------+



In [17]:
spark.stop()