In [1]:
import os 

# Kafka connector that spark-submit has to download before the shell starts.
packages = "org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.1"

# These variables are set before pyspark is imported in the next cell:
# the interpreter used for workers and driver, plus the extra submit arguments.
os.environ.update({
    "PYSPARK_PYTHON": "python3.7",
    "PYSPARK_DRIVER_PYTHON": "python3.7",
    "PYSPARK_SUBMIT_ARGS": "--packages {0} pyspark-shell".format(packages),
})

In [2]:
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Kafka endpoint and destination topic used by the write at the end of the notebook.
KAFKA_BROKER = "kafka:9092"
KAFKA_TOPIC = "fixcer"

# Attach to the standalone Spark master, reusing an already running session if there is one.
spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .appName("Kafka")
    .getOrCreate()
)

In [3]:
# Load the JSON input from HDFS. FAILFAST makes the read raise on a malformed
# record rather than silently producing nulls.
df = (
    spark.read.format("json")
    .option("mode", "FAILFAST")
    .option("inferSchema", "true")
    .load("hdfs://namenode/user/root/input/data10.json")
)

In [4]:
# Print the column names and types of the DataFrame as loaded.
df.printSchema()

root
 |-- androidVersion: string (nullable = true)
 |-- category: string (nullable = true)
 |-- comments: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- contentRating: string (nullable = true)
 |-- currentVersion: string (nullable = true)
 |-- installs: long (nullable = true)
 |-- lastUpdate: long (nullable = true)
 |-- price: double (nullable = true)
 |-- ratings: long (nullable = true)
 |-- reviews: long (nullable = true)
 |-- score: double (nullable = true)
 |-- size: string (nullable = true)
 |-- title: string (nullable = true)



In [5]:
df = (
    df.na.drop()         # discard rows that have a null in any column
    .dropDuplicates()    # then keep a single copy of fully identical rows
)

In [6]:
# Keep only rows whose `comments` array holds at least 30 entries.
comment_count = size(df["comments"])
df = df.filter(comment_count >= 30)

In [7]:
import re

def convertString(string):
    """Strip every non-digit character from ``string`` and parse the rest as one integer.

    All remaining digits are concatenated before parsing, so ``"4.1 and up"``
    becomes ``41`` and ``"4.3.5.5.2"`` becomes ``43552``.

    Args:
        string: Text to parse. ``None`` and other non-text values are tolerated.

    Returns:
        The parsed integer, or ``0`` when the input is not text or contains no digits.
    """
    try:
        # Raw string: '\D' in a plain literal is an invalid escape sequence.
        return int(re.sub(r'\D', '', string))
    except (TypeError, ValueError):
        # TypeError: input is not text (e.g. None). ValueError: no digits were left.
        # Anything else (KeyboardInterrupt, genuine bugs) is allowed to propagate.
        return 0
        

# Expose the Python parser to Spark as a UDF that produces an integer column.
convert = udf(convertString, IntegerType())

# Overwrite each free-text column with its parsed integer value.
for column_name in ("androidVersion", "currentVersion", "size"):
    df = df.withColumn(column_name, convert(col(column_name)))

In [8]:
# Preview the DataFrame; show() without arguments prints at most 20 rows and truncates long cell values.
df.show()

+--------------+-----------------+----------------------------------+-------------+--------------+--------+----------+-----+-------+-------+---------+----+-------------------------------+
|androidVersion|         category|                          comments|contentRating|currentVersion|installs|lastUpdate|price|ratings|reviews|    score|size|                          title|
+--------------+-----------------+----------------------------------+-------------+--------------+--------+----------+-----+-------+-------+---------+----+-------------------------------+
|            50|          Weather|              [Recent updates a...| Everyone 10+|         43552|   10000|1602129706|  0.0|    104|     51|     3.51|  32|           KX Storm Team - N...|
|            41|Books & Reference|              [no Arabic audio,...|     Everyone|           163|    5000|1559180891|  0.0|    247|     80|     4.75|  59|           Quran Audio - Urd...|
|            41|         Business|              [Not to happ

In [9]:
# Print the schema again to check the column types at this point in the pipeline.
df.printSchema()

root
 |-- androidVersion: integer (nullable = true)
 |-- category: string (nullable = true)
 |-- comments: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- contentRating: string (nullable = true)
 |-- currentVersion: integer (nullable = true)
 |-- installs: long (nullable = true)
 |-- lastUpdate: long (nullable = true)
 |-- price: double (nullable = true)
 |-- ratings: long (nullable = true)
 |-- reviews: long (nullable = true)
 |-- score: double (nullable = true)
 |-- size: integer (nullable = true)
 |-- title: string (nullable = true)



In [10]:
# Serialise all data columns of each row into one JSON document stored in `value`.
# `value` itself is left out of the struct so that re-running this cell does not
# embed the previously generated JSON string inside the new payload.
payload_columns = [df[name] for name in df.columns if name != "value"]
df = df.withColumn("value", to_json(struct(payload_columns)))

# Preview the key/value pairs in the shape the Kafka sink expects.
df.selectExpr("CAST(title AS STRING) as key", "CAST(value AS STRING) as value").show(5)

+--------------------+--------------------+
|                 key|               value|
+--------------------+--------------------+
|KX Storm Team - N...|{"androidVersion"...|
|Quran Audio - Urd...|{"androidVersion"...|
|Reflexis ESS - Be...|{"androidVersion"...|
|          Műsorújság|{"androidVersion"...|
|            e-Título|{"androidVersion"...|
+--------------------+--------------------+
only showing top 5 rows



In [11]:
# Batch-write the records to Kafka: the app title is the message key and the
# JSON document is the message value.
(
    df.selectExpr("CAST(title AS STRING) as key", "CAST(value AS STRING)")
    .write
    .format("kafka")
    .option("kafka.bootstrap.servers", KAFKA_BROKER)
    .option("topic", KAFKA_TOPIC)
    .save()
)

In [12]:
# Shut down the Spark session now that the write has finished.
spark.stop()