# Đọc dữ liệu từ HDFS

In [1]:
import json
from functools import reduce
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Connect to the Spark master at spark://spark-master:7077 and obtain (or
# reuse) the session that all later cells access through the `spark` name.
spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .appName("Tutorial")
    .getOrCreate()
)

In [2]:
def unionAll(*dfs):
    """Stack any number of DataFrames into one by unioning them left to right.

    Args:
        *dfs: One or more DataFrames. They are combined pairwise with
            ``DataFrame.union``, which matches columns by position.

    Returns:
        The union of all inputs (duplicate rows are kept). A single input is
        returned unchanged.

    Raises:
        TypeError: If called without any DataFrame.
    """
    # Imported locally so that the star import from pyspark.sql.functions at
    # the top of the notebook cannot shadow it (recent PySpark releases export
    # their own `reduce`).
    import functools

    if not dfs:
        raise TypeError("unionAll() requires at least one DataFrame")
    # `union` is the non-deprecated spelling of `unionAll`; same semantics.
    return functools.reduce(lambda left, right: left.union(right), dfs)

In [3]:
# Load the ten JSON files data1.json ... data10.json from HDFS, keeping one
# DataFrame per file in a list. multiLine allows a single JSON record to span
# several lines of a file.
df = []
for shard in range(1, 11):
    shard_path = "hdfs://namenode/user/root/input/data{}.json".format(shard)
    df.append(spark.read.format("json").load(shard_path, multiLine="true"))

AnalysisException: Path does not exist: hdfs://namenode/user/root/input/data1.json;

In [4]:
# Merge the per-file DataFrames into one. Unpacking the list keeps this cell
# in step with however many files the loading cell read, instead of repeating
# ten hard-coded indices. After this line `df` is a DataFrame, not a list.
df = unionAll(*df)

In [4]:
# Read only data10.json; this overwrites whatever `df` held before.
df = (
    spark.read
    .format("json")
    .option("multiLine", "true")
    .load("hdfs://namenode/user/root/input/data10.json")
)

In [5]:
# Print the column names and types Spark inferred from the JSON input.
df.printSchema()

root
 |-- androidVersion: string (nullable = true)
 |-- category: string (nullable = true)
 |-- comments: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- contentRating: string (nullable = true)
 |-- currentVersion: string (nullable = true)
 |-- installs: long (nullable = true)
 |-- lastUpdate: long (nullable = true)
 |-- price: double (nullable = true)
 |-- ratings: long (nullable = true)
 |-- reviews: long (nullable = true)
 |-- score: double (nullable = true)
 |-- size: string (nullable = true)
 |-- title: string (nullable = true)



In [6]:
# Discard every row that has a null in any column, then remove exact
# duplicate rows.
df = df.dropna().distinct()

In [7]:
# Expose the current DataFrame to Spark SQL under the name "dfTable". The view
# is tied to the DataFrame registered here; reassigning `df` later does not
# change what the view returns unless this call is repeated.
df.createOrReplaceTempView("dfTable")

In [8]:
# Number of rows currently in `df` (this triggers a Spark job).
df.count()

7695

In [47]:
# Keep only rows whose `comments` array holds at least 30 entries.
df = df.where(size(col("comments")) >= 30)

In [9]:
# Frequency of each androidVersion value, most common first
# (show() prints the first 20 rows).
(
    df.groupBy("androidVersion")
    .count()
    .orderBy("count", ascending=False)
    .show()
)

+--------------+-----+
|androidVersion|count|
+--------------+-----+
|           5.0| 1723|
|           4.1| 1630|
|           4.4| 1281|
|         4.0.3|  538|
|           4.2|  428|
|        Varies|  392|
|           6.0|  369|
|           4.0|  341|
|           7.0|  338|
|           4.3|  207|
|           5.1|  113|
|           2.3|   68|
|           3.0|   59|
|         2.3.3|   50|
|           2.2|   49|
|          4.4W|   32|
|           8.0|   28|
|           2.1|   14|
|           3.2|    9|
|           1.6|    9|
+--------------+-----+
only showing top 20 rows



In [None]:
# Convert androidVersion to a number. A plain cast turns every value that is
# not a bare decimal (e.g. "4.0.3", "4.4W") into null, so first keep only the
# leading major[.minor] part. Values without a leading number (e.g. "Varies")
# still become null, because the extracted empty string does not cast to float.
df = df.withColumn(
    "androidVersion",
    regexp_extract(col("androidVersion"), r"^(\d+(?:\.\d+)?)", 1).cast("float"),
)

In [None]:
# Same frequency table as before, now over the numeric androidVersion column.
(
    df.groupBy("androidVersion")
    .count()
    .orderBy("count", ascending=False)
    .show()
)

In [None]:
import re

def remove_character(size):
    """Strip every non-digit character from a size label and return an int.

    Args:
        size: Size text such as ``"32M"``; may be ``None``.

    Returns:
        The remaining digits as an ``int``, or ``None`` when ``size`` is
        ``None`` or contains no digits at all.
    """
    if size is None:
        return None
    digits = re.sub(r'\D', '', str(size))
    # NOTE(review): the decimal point and the unit suffix are dropped along
    # with everything else, so "2.5M" becomes 25 and "512k" / "512M" both
    # become 512 -- confirm that is acceptable for the analysis.
    # An int (not a str) is returned because the UDF built from this function
    # declares IntegerType, and Spark does not coerce a str result.
    return int(digits) if digits else None

# Wrap remove_character as a Spark UDF whose declared result type is IntegerType.
# NOTE(review): Spark does not coerce Python UDF results to the declared type;
# the wrapped function has to return a Python int (or None) -- confirm it does.
convert_to_number = udf(remove_character, IntegerType())

In [None]:
# Overwrite the textual `size` column with the UDF's numeric result and
# preview the first two rows.
df = df.withColumn("size", convert_to_number(df["size"]))
df.show(n=2)

In [None]:
# The ten most frequent `size` values, most common first.
(
    df.groupBy("size")
    .count()
    .orderBy("count", ascending=False)
    .show(10)
)

In [6]:
# Query the "dfTable" temp view with SQL and print two rows. The view refers
# to the DataFrame that was registered under that name, not to whatever `df`
# is bound to now, so transformations applied to `df` after registration are
# not visible here unless the view is re-registered.
spark.sql("Select size from dfTable").show(2)

+----+
|size|
+----+
| 32M|
| 32M|
+----+
only showing top 2 rows



In [49]:
# Pull the `comments` column of every row to the driver as a list of Row objects.
# NOTE(review): collect() materialises the whole column in driver memory;
# if only the first row is inspected, fetching a single row would be enough.
tmp = df.select('comments').collect()

In [57]:
# First collected Row -> its only field (the comments array) -> the element
# at index 29, i.e. the 30th comment of that row.
tmp[0][0][29]

'Great'