In [1]:
from pyspark.sql import SparkSession

In [2]:
# Start (or reuse) the Spark session that every later cell relies on.
session_builder = SparkSession.builder.appName("Data_wrangling")
spark = session_builder.getOrCreate()

In [3]:
# Load the retail CSV (comma separated, with a header row) and let Spark
# infer each column's type from the data.
df = spark.read.csv(
    "./data/Online_Retail.csv",
    inferSchema=True,
    header=True,
    sep=",",
)

In [4]:
# Evaluate the DataFrame as the cell's last expression to display its repr
# (column names with their inferred types).
df

DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: timestamp, UnitPrice: double, CustomerID: int, Country: string]

In [5]:
# Preview the first rows; long string values are shortened with "..." in this view.
df.show()

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|     17850|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|     17850|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|2010-12-01 08:26:00|     2.75|     17850|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|2010-12-01 08:26:00|     3.39|     17850|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|2010-12-01 08:26:00|     3.39|     17850|United Kingdom|
|   536365|    22752|SET 7 BABUSHKA NE...|       2|2010-12-01 08:26:00|     7.65|     17850|United Kingdom|
|   536365|    21730|GLASS S

In [6]:
# Check the Python type of the object returned by the CSV reader.
type(df)

pyspark.sql.dataframe.DataFrame

In [7]:
# Print the schema as a tree: one line per column with its type and nullable flag.
df.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: timestamp (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Country: string (nullable = true)



# スキーマを指定してファイル読み込み

In [8]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, DateType

In [9]:
# Report the Spark version of the running session.
spark.version

'3.5.0'

In [10]:
# Explicit schema for Online_Retail.csv.
# Every field is declared nullable: the data contains NULLs (Description and
# CustomerID show NULL in the outputs of this notebook), so nullable=False
# would misstate what the file actually holds.
# NOTE(review): DateType keeps only the date part of InvoiceDate (the inferred
# schema was a timestamp), and FloatType is single precision (6 * 2.55 later
# displays as 15.299999) -- consider TimestampType / DoubleType if the time of
# day or exact amounts matter.
schema = StructType([
    StructField("InvoiceNo", StringType(), True),
    StructField("StockCode", StringType(), True),
    StructField("Description", StringType(), True),
    StructField("Quantity", IntegerType(), True),
    StructField("InvoiceDate", DateType(), True),
    StructField("UnitPrice", FloatType(), True),
    StructField("CustomerID", IntegerType(), True),
    StructField("Country", StringType(), True),
])

In [11]:
# Re-read the same CSV, this time applying the explicit schema instead of
# inferring the column types.
csv_reader = (
    spark.read.format("csv")
    .schema(schema)
    .option("header", "True")
    .option("sep", ",")
)
df = csv_reader.load("./data/Online_Retail.csv")

In [12]:
# Show the first rows without truncating long string values.
df.show(truncate=False)

+---------+---------+-----------------------------------+--------+-----------+---------+----------+--------------+
|InvoiceNo|StockCode|Description                        |Quantity|InvoiceDate|UnitPrice|CustomerID|Country       |
+---------+---------+-----------------------------------+--------+-----------+---------+----------+--------------+
|536365   |85123A   |WHITE HANGING HEART T-LIGHT HOLDER |6       |2010-12-01 |2.55     |17850     |United Kingdom|
|536365   |71053    |WHITE METAL LANTERN                |6       |2010-12-01 |3.39     |17850     |United Kingdom|
|536365   |84406B   |CREAM CUPID HEARTS COAT HANGER     |8       |2010-12-01 |2.75     |17850     |United Kingdom|
|536365   |84029G   |KNITTED UNION FLAG HOT WATER BOTTLE|6       |2010-12-01 |3.39     |17850     |United Kingdom|
|536365   |84029E   |RED WOOLLY HOTTIE WHITE HEART.     |6       |2010-12-01 |3.39     |17850     |United Kingdom|
|536365   |22752    |SET 7 BABUSHKA NESTING BOXES       |2       |2010-12-01 |7.

In [13]:
# List (column name, type) pairs to confirm the explicit schema was applied.
df.dtypes

[('InvoiceNo', 'string'),
 ('StockCode', 'string'),
 ('Description', 'string'),
 ('Quantity', 'int'),
 ('InvoiceDate', 'date'),
 ('UnitPrice', 'float'),
 ('CustomerID', 'int'),
 ('Country', 'string')]

In [14]:
# Project only the product code and its description.
product_columns = ["StockCode", "Description"]
df.select(product_columns).show()

+---------+--------------------+
|StockCode|         Description|
+---------+--------------------+
|   85123A|WHITE HANGING HEA...|
|    71053| WHITE METAL LANTERN|
|   84406B|CREAM CUPID HEART...|
|   84029G|KNITTED UNION FLA...|
|   84029E|RED WOOLLY HOTTIE...|
|    22752|SET 7 BABUSHKA NE...|
|    21730|GLASS STAR FROSTE...|
|    22633|HAND WARMER UNION...|
|    22632|HAND WARMER RED P...|
|    84879|ASSORTED COLOUR B...|
|    22745|POPPY'S PLAYHOUSE...|
|    22748|POPPY'S PLAYHOUSE...|
|    22749|FELTCRAFT PRINCES...|
|    22310|IVORY KNITTED MUG...|
|    84969|BOX OF 6 ASSORTED...|
|    22623|BOX OF VINTAGE JI...|
|    22622|BOX OF VINTAGE AL...|
|    21754|HOME BUILDING BLO...|
|    21755|LOVE BUILDING BLO...|
|    21777|RECIPE BOX WITH M...|
+---------+--------------------+
only showing top 20 rows



In [15]:
# Count the total number of rows in the DataFrame.
df.count()

541909

# filter: 条件を満たすレコードの抽出

In [16]:
# Keep only the rows whose unit price is above 30.
is_expensive = df.UnitPrice > 30
df.filter(is_expensive).show()

+---------+---------+--------------------+--------+-----------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-----------+---------+----------+--------------+
|   536392|    22827|RUSTIC  SEVENTEEN...|       1| 2010-12-01|    165.0|     13705|United Kingdom|
|   536396|    22803|IVORY EMBROIDERED...|       2| 2010-12-01|    35.75|     17850|United Kingdom|
|   536406|    22803|IVORY EMBROIDERED...|       2| 2010-12-01|    35.75|     17850|United Kingdom|
|   536540|       C2|            CARRIAGE|       1| 2010-12-01|     50.0|     14911|          EIRE|
|   536544|    22769|CHALKBOARD KITCHE...|       1| 2010-12-01|    51.02|      NULL|United Kingdom|
|   536544|    22847|BREAD BIN DINER S...|       1| 2010-12-01|     34.0|      NULL|United Kingdom|
|   536544|      DOT|      DOTCOM POSTAGE|       1| 2010-12-01|   569.77|      NULL|United Kingdom|


In [17]:
# Keep only the rows recorded for Switzerland.
is_swiss = df.Country == "Switzerland"
df.filter(is_swiss).show()

+---------+---------+--------------------+--------+-----------+---------+----------+-----------+
|InvoiceNo|StockCode|         Description|Quantity|InvoiceDate|UnitPrice|CustomerID|    Country|
+---------+---------+--------------------+--------+-----------+---------+----------+-----------+
|   536858|    22326|ROUND SNACK BOXES...|      30| 2010-12-03|     2.95|     13520|Switzerland|
|   536858|    22554|PLASTERS IN TIN W...|      36| 2010-12-03|     1.65|     13520|Switzerland|
|   536858|    21731|RED TOADSTOOL LED...|      24| 2010-12-03|     1.65|     13520|Switzerland|
|   536858|    20677|  PINK POLKADOT BOWL|      16| 2010-12-03|     1.25|     13520|Switzerland|
|   536858|    20750|RED RETROSPOT MIN...|       2| 2010-12-03|     7.95|     13520|Switzerland|
|   536858|     POST|             POSTAGE|       2| 2010-12-03|     40.0|     13520|Switzerland|
|   539488|    22837|HOT WATER BOTTLE ...|       8| 2010-12-20|     4.65|     12377|Switzerland|
|   539488|    22112|CHOCOLATE

In [18]:
# Count how many rows were recorded for Switzerland.
swiss_rows = df.filter(df.Country == "Switzerland")
swiss_rows.count()

2002

# where: 条件を満たすレコードの抽出

In [19]:
# Select rows whose description contains the substring "WATER".
mentions_water = df.Description.contains("WATER")
df.where(mentions_water).show(truncate=False)

+---------+---------+-----------------------------------+--------+-----------+---------+----------+--------------+
|InvoiceNo|StockCode|Description                        |Quantity|InvoiceDate|UnitPrice|CustomerID|Country       |
+---------+---------+-----------------------------------+--------+-----------+---------+----------+--------------+
|536365   |84029G   |KNITTED UNION FLAG HOT WATER BOTTLE|6       |2010-12-01 |3.39     |17850     |United Kingdom|
|536373   |84029G   |KNITTED UNION FLAG HOT WATER BOTTLE|6       |2010-12-01 |3.39     |17850     |United Kingdom|
|536375   |84029G   |KNITTED UNION FLAG HOT WATER BOTTLE|6       |2010-12-01 |3.39     |17850     |United Kingdom|
|536376   |22114    |HOT WATER BOTTLE TEA AND SYMPATHY  |48      |2010-12-01 |3.45     |15291     |United Kingdom|
|536390   |21485    |RETROSPOT HEART HOT WATER BOTTLE   |24      |2010-12-01 |4.25     |17511     |United Kingdom|
|536390   |84029G   |KNITTED UNION FLAG HOT WATER BOTTLE|24      |2010-12-01 |3.

In [20]:
# Count the rows whose description contains the substring "WATER".
water_rows = df.where(df.Description.contains("WATER"))
water_rows.count()

11374

# 列の作成と削除

In [21]:
# Preview a derived "amount" column (Quantity * UnitPrice); df itself is not reassigned here.
line_total = df.Quantity * df.UnitPrice
df.withColumn("amount", line_total).show(truncate=False)

+---------+---------+-----------------------------------+--------+-----------+---------+----------+--------------+---------+
|InvoiceNo|StockCode|Description                        |Quantity|InvoiceDate|UnitPrice|CustomerID|Country       |amount   |
+---------+---------+-----------------------------------+--------+-----------+---------+----------+--------------+---------+
|536365   |85123A   |WHITE HANGING HEART T-LIGHT HOLDER |6       |2010-12-01 |2.55     |17850     |United Kingdom|15.299999|
|536365   |71053    |WHITE METAL LANTERN                |6       |2010-12-01 |3.39     |17850     |United Kingdom|20.34    |
|536365   |84406B   |CREAM CUPID HEARTS COAT HANGER     |8       |2010-12-01 |2.75     |17850     |United Kingdom|22.0     |
|536365   |84029G   |KNITTED UNION FLAG HOT WATER BOTTLE|6       |2010-12-01 |3.39     |17850     |United Kingdom|20.34    |
|536365   |84029E   |RED WOOLLY HOTTIE WHITE HEART.     |6       |2010-12-01 |3.39     |17850     |United Kingdom|20.34    |


In [22]:
# Keep the derived "amount" column (Quantity * UnitPrice) by reassigning df.
line_total = df.Quantity * df.UnitPrice
df = df.withColumn("amount", line_total)

In [23]:
# Display the reassigned DataFrame, which now carries the "amount" column.
df.show(truncate=False)

+---------+---------+-----------------------------------+--------+-----------+---------+----------+--------------+---------+
|InvoiceNo|StockCode|Description                        |Quantity|InvoiceDate|UnitPrice|CustomerID|Country       |amount   |
+---------+---------+-----------------------------------+--------+-----------+---------+----------+--------------+---------+
|536365   |85123A   |WHITE HANGING HEART T-LIGHT HOLDER |6       |2010-12-01 |2.55     |17850     |United Kingdom|15.299999|
|536365   |71053    |WHITE METAL LANTERN                |6       |2010-12-01 |3.39     |17850     |United Kingdom|20.34    |
|536365   |84406B   |CREAM CUPID HEARTS COAT HANGER     |8       |2010-12-01 |2.75     |17850     |United Kingdom|22.0     |
|536365   |84029G   |KNITTED UNION FLAG HOT WATER BOTTLE|6       |2010-12-01 |3.39     |17850     |United Kingdom|20.34    |
|536365   |84029E   |RED WOOLLY HOTTIE WHITE HEART.     |6       |2010-12-01 |3.39     |17850     |United Kingdom|20.34    |


In [24]:
# Preview the frame without "amount"; the result is only shown, df is not reassigned here.
df.drop("amount").show()

+---------+---------+--------------------+--------+-----------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-----------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6| 2010-12-01|     2.55|     17850|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6| 2010-12-01|     3.39|     17850|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8| 2010-12-01|     2.75|     17850|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6| 2010-12-01|     3.39|     17850|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6| 2010-12-01|     3.39|     17850|United Kingdom|
|   536365|    22752|SET 7 BABUSHKA NE...|       2| 2010-12-01|     7.65|     17850|United Kingdom|
|   536365|    21730|GLASS STAR FROSTE...|       6| 2010-12-01|     4.25|     17850|United Kingdom|


In [25]:
# Remove the "amount" column for good by reassigning df, then display the result.
df = df.drop("amount")
df.show(truncate=False)

+---------+---------+-----------------------------------+--------+-----------+---------+----------+--------------+
|InvoiceNo|StockCode|Description                        |Quantity|InvoiceDate|UnitPrice|CustomerID|Country       |
+---------+---------+-----------------------------------+--------+-----------+---------+----------+--------------+
|536365   |85123A   |WHITE HANGING HEART T-LIGHT HOLDER |6       |2010-12-01 |2.55     |17850     |United Kingdom|
|536365   |71053    |WHITE METAL LANTERN                |6       |2010-12-01 |3.39     |17850     |United Kingdom|
|536365   |84406B   |CREAM CUPID HEARTS COAT HANGER     |8       |2010-12-01 |2.75     |17850     |United Kingdom|
|536365   |84029G   |KNITTED UNION FLAG HOT WATER BOTTLE|6       |2010-12-01 |3.39     |17850     |United Kingdom|
|536365   |84029E   |RED WOOLLY HOTTIE WHITE HEART.     |6       |2010-12-01 |3.39     |17850     |United Kingdom|
|536365   |22752    |SET 7 BABUSHKA NESTING BOXES       |2       |2010-12-01 |7.

In [26]:
# Preview the frame with several columns dropped in one call.
unwanted_columns = ["InvoiceNo", "StockCode"]
df.drop(*unwanted_columns).show()

+--------------------+--------+-----------+---------+----------+--------------+
|         Description|Quantity|InvoiceDate|UnitPrice|CustomerID|       Country|
+--------------------+--------+-----------+---------+----------+--------------+
|WHITE HANGING HEA...|       6| 2010-12-01|     2.55|     17850|United Kingdom|
| WHITE METAL LANTERN|       6| 2010-12-01|     3.39|     17850|United Kingdom|
|CREAM CUPID HEART...|       8| 2010-12-01|     2.75|     17850|United Kingdom|
|KNITTED UNION FLA...|       6| 2010-12-01|     3.39|     17850|United Kingdom|
|RED WOOLLY HOTTIE...|       6| 2010-12-01|     3.39|     17850|United Kingdom|
|SET 7 BABUSHKA NE...|       2| 2010-12-01|     7.65|     17850|United Kingdom|
|GLASS STAR FROSTE...|       6| 2010-12-01|     4.25|     17850|United Kingdom|
|HAND WARMER UNION...|       6| 2010-12-01|     1.85|     17850|United Kingdom|
|HAND WARMER RED P...|       6| 2010-12-01|     1.85|     17850|United Kingdom|
|ASSORTED COLOUR B...|      32| 2010-12-

# 列名の変更

In [27]:
# Preview the frame with "Country" renamed to "CountryName" (df is not reassigned).
renamed = df.withColumnRenamed("Country", "CountryName")
renamed.show()

+---------+---------+--------------------+--------+-----------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|InvoiceDate|UnitPrice|CustomerID|   CountryName|
+---------+---------+--------------------+--------+-----------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6| 2010-12-01|     2.55|     17850|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6| 2010-12-01|     3.39|     17850|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8| 2010-12-01|     2.75|     17850|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6| 2010-12-01|     3.39|     17850|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6| 2010-12-01|     3.39|     17850|United Kingdom|
|   536365|    22752|SET 7 BABUSHKA NE...|       2| 2010-12-01|     7.65|     17850|United Kingdom|
|   536365|    21730|GLASS STAR FROSTE...|       6| 2010-12-01|     4.25|     17850|United Kingdom|


In [28]:
# List the column names after the rename to confirm "CountryName" replaced "Country".
renamed = df.withColumnRenamed("Country", "CountryName")
renamed.columns

['InvoiceNo',
 'StockCode',
 'Description',
 'Quantity',
 'InvoiceDate',
 'UnitPrice',
 'CustomerID',
 'CountryName']

# キャスト (型変換)

In [29]:
# Cast Quantity to integer, replacing the column in place.
# The explicit schema already reads Quantity as int, so this only demonstrates the cast API.
quantity_as_int = df["Quantity"].cast(IntegerType())
df = df.withColumn("Quantity", quantity_as_int)

In [30]:
# Display the rows after the Quantity cast.
df.show()

+---------+---------+--------------------+--------+-----------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-----------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6| 2010-12-01|     2.55|     17850|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6| 2010-12-01|     3.39|     17850|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8| 2010-12-01|     2.75|     17850|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6| 2010-12-01|     3.39|     17850|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6| 2010-12-01|     3.39|     17850|United Kingdom|
|   536365|    22752|SET 7 BABUSHKA NE...|       2| 2010-12-01|     7.65|     17850|United Kingdom|
|   536365|    21730|GLASS STAR FROSTE...|       6| 2010-12-01|     4.25|     17850|United Kingdom|


In [31]:
# Confirm the column types after the cast; Quantity is reported as int.
df.dtypes

[('InvoiceNo', 'string'),
 ('StockCode', 'string'),
 ('Description', 'string'),
 ('Quantity', 'int'),
 ('InvoiceDate', 'date'),
 ('UnitPrice', 'float'),
 ('CustomerID', 'int'),
 ('Country', 'string')]

# 日付の処理

In [32]:
from pyspark.sql.functions import year, month, dayofmonth

In [33]:
# Derive the calendar year of each invoice into a new column on a working copy.
invoice_year = year(df["InvoiceDate"])
df_tmp = df.withColumn("purchased_year", invoice_year)
df_tmp.show()

+---------+---------+--------------------+--------+-----------+---------+----------+--------------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|InvoiceDate|UnitPrice|CustomerID|       Country|purchased_year|
+---------+---------+--------------------+--------+-----------+---------+----------+--------------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6| 2010-12-01|     2.55|     17850|United Kingdom|          2010|
|   536365|    71053| WHITE METAL LANTERN|       6| 2010-12-01|     3.39|     17850|United Kingdom|          2010|
|   536365|   84406B|CREAM CUPID HEART...|       8| 2010-12-01|     2.75|     17850|United Kingdom|          2010|
|   536365|   84029G|KNITTED UNION FLA...|       6| 2010-12-01|     3.39|     17850|United Kingdom|          2010|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6| 2010-12-01|     3.39|     17850|United Kingdom|          2010|
|   536365|    22752|SET 7 BABUSHKA NE...|       2| 2010-12-01|     7.65|     17

In [34]:
# Add the month number of each invoice date to the working copy.
invoice_month = month(df_tmp["InvoiceDate"])
df_tmp = df_tmp.withColumn("purchased_month", invoice_month)
df_tmp.show()

+---------+---------+--------------------+--------+-----------+---------+----------+--------------+--------------+---------------+
|InvoiceNo|StockCode|         Description|Quantity|InvoiceDate|UnitPrice|CustomerID|       Country|purchased_year|purchased_month|
+---------+---------+--------------------+--------+-----------+---------+----------+--------------+--------------+---------------+
|   536365|   85123A|WHITE HANGING HEA...|       6| 2010-12-01|     2.55|     17850|United Kingdom|          2010|             12|
|   536365|    71053| WHITE METAL LANTERN|       6| 2010-12-01|     3.39|     17850|United Kingdom|          2010|             12|
|   536365|   84406B|CREAM CUPID HEART...|       8| 2010-12-01|     2.75|     17850|United Kingdom|          2010|             12|
|   536365|   84029G|KNITTED UNION FLA...|       6| 2010-12-01|     3.39|     17850|United Kingdom|          2010|             12|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6| 2010-12-01|     3.39|     17850

In [35]:
# Add the day of the month of each invoice date to the working copy.
invoice_day = dayofmonth(df_tmp["InvoiceDate"])
df_tmp = df_tmp.withColumn("purchased_day", invoice_day)
df_tmp.show()

+---------+---------+--------------------+--------+-----------+---------+----------+--------------+--------------+---------------+-------------+
|InvoiceNo|StockCode|         Description|Quantity|InvoiceDate|UnitPrice|CustomerID|       Country|purchased_year|purchased_month|purchased_day|
+---------+---------+--------------------+--------+-----------+---------+----------+--------------+--------------+---------------+-------------+
|   536365|   85123A|WHITE HANGING HEA...|       6| 2010-12-01|     2.55|     17850|United Kingdom|          2010|             12|            1|
|   536365|    71053| WHITE METAL LANTERN|       6| 2010-12-01|     3.39|     17850|United Kingdom|          2010|             12|            1|
|   536365|   84406B|CREAM CUPID HEART...|       8| 2010-12-01|     2.75|     17850|United Kingdom|          2010|             12|            1|
|   536365|   84029G|KNITTED UNION FLA...|       6| 2010-12-01|     3.39|     17850|United Kingdom|          2010|             12|

# 欠損値処理

In [36]:
# Re-check the column types before looking at missing values.
df.dtypes

[('InvoiceNo', 'string'),
 ('StockCode', 'string'),
 ('Description', 'string'),
 ('Quantity', 'int'),
 ('InvoiceDate', 'date'),
 ('UnitPrice', 'float'),
 ('CustomerID', 'int'),
 ('Country', 'string')]

In [37]:
# Build the "Description IS NULL" Column expression; the cell displays the expression itself, not rows.
df["Description"].isNull()

Column<'(Description IS NULL)'>

In [38]:
# Show the rows whose Description is NULL.
description_missing = df["Description"].isNull()
df.filter(description_missing).show()

+---------+---------+-----------+--------+-----------+---------+----------+--------------+
|InvoiceNo|StockCode|Description|Quantity|InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+-----------+--------+-----------+---------+----------+--------------+
|   536414|    22139|       NULL|      56| 2010-12-01|      0.0|      NULL|United Kingdom|
|   536545|    21134|       NULL|       1| 2010-12-01|      0.0|      NULL|United Kingdom|
|   536546|    22145|       NULL|       1| 2010-12-01|      0.0|      NULL|United Kingdom|
|   536547|    37509|       NULL|       1| 2010-12-01|      0.0|      NULL|United Kingdom|
|   536549|   85226A|       NULL|       1| 2010-12-01|      0.0|      NULL|United Kingdom|
|   536550|    85044|       NULL|       1| 2010-12-01|      0.0|      NULL|United Kingdom|
|   536552|    20950|       NULL|       1| 2010-12-01|      0.0|      NULL|United Kingdom|
|   536553|    37461|       NULL|       3| 2010-12-01|      0.0|      NULL|United Kingdom|

In [39]:
# Count the rows whose Description is NULL.
description_missing = df["Description"].isNull()
df.filter(description_missing).count()

1454

In [40]:
# nan check -> isnan
from pyspark.sql.functions import isnan

In [41]:
# Filter on NaN in Description; as the last expression this displays the
# resulting DataFrame's repr rather than any rows.
description_is_nan = isnan(df["Description"])
df.filter(description_is_nan)

DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: date, UnitPrice: float, CustomerID: int, Country: string]

In [42]:
# Count NaN values in Description. NaN is a different thing from NULL: this
# returns 0 even though the isNull() check found NULL descriptions.
description_is_nan = isnan(df["Description"])
df.filter(description_is_nan).count()

0

In [43]:
# Look for NaN values in Quantity; the recorded result is an empty table.
quantity_is_nan = isnan(df["Quantity"])
df.filter(quantity_is_nan).show()

+---------+---------+-----------+--------+-----------+---------+----------+-------+
|InvoiceNo|StockCode|Description|Quantity|InvoiceDate|UnitPrice|CustomerID|Country|
+---------+---------+-----------+--------+-----------+---------+----------+-------+
+---------+---------+-----------+--------+-----------+---------+----------+-------+



In [44]:
# Fill missing values -> fillna
# Replace NULL descriptions with the placeholder "unknown" in a new DataFrame.
df_fillna = df.na.fill("unknown", subset=["Description"])

In [45]:
# Show the rows that now carry the "unknown" placeholder.
is_placeholder = df_fillna.Description == "unknown"
df_fillna.filter(is_placeholder).show()

+---------+---------+-----------+--------+-----------+---------+----------+--------------+
|InvoiceNo|StockCode|Description|Quantity|InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+-----------+--------+-----------+---------+----------+--------------+
|   536414|    22139|    unknown|      56| 2010-12-01|      0.0|      NULL|United Kingdom|
|   536545|    21134|    unknown|       1| 2010-12-01|      0.0|      NULL|United Kingdom|
|   536546|    22145|    unknown|       1| 2010-12-01|      0.0|      NULL|United Kingdom|
|   536547|    37509|    unknown|       1| 2010-12-01|      0.0|      NULL|United Kingdom|
|   536549|   85226A|    unknown|       1| 2010-12-01|      0.0|      NULL|United Kingdom|
|   536550|    85044|    unknown|       1| 2010-12-01|      0.0|      NULL|United Kingdom|
|   536552|    20950|    unknown|       1| 2010-12-01|      0.0|      NULL|United Kingdom|
|   536553|    37461|    unknown|       3| 2010-12-01|      0.0|      NULL|United Kingdom|

In [46]:
# Verify that no NULL descriptions remain after the fill.
still_missing = df_fillna["Description"].isNull()
df_fillna.filter(still_missing).count()

0

In [47]:
# Drop missing values -> dropna
# First count how many rows have a NULL CustomerID.
customer_missing = df["CustomerID"].isNull()
df.filter(customer_missing).count()

135080

In [48]:
# Total row count before dropping rows with missing values, for comparison.
df.count()

541909

In [49]:
# Drop every row that has a NULL in any column, then count what is left.
df_dropna = df.na.drop(how="any")
df_dropna.count()

406829

# 集計 groupby

In [50]:
# Count how many rows share each Description.
rows_per_description = df.groupBy("Description").count()
rows_per_description.show()

+--------------------+-----+
|         Description|count|
+--------------------+-----+
|10 COLOUR SPACEBO...|  327|
|SET/10 BLUE POLKA...|  249|
|POTTING SHED SOW ...|    2|
|PAPERWEIGHT KINGS...|   24|
|WOVEN BERRIES CUS...|   89|
|WHITE/PINK MINI C...|    7|
|SET/3 RED GINGHAM...|  494|
|MAGNETS PACK OF 4...|  128|
|WHITE CHRYSANTHEM...|    9|
|WHITE FRANGIPANI ...|   18|
|SILVER FABRIC MIRROR|   46|
|PINK  HONEYCOMB P...|   70|
|PINK BOUDOIR T-LI...|    1|
| BLACK CHERRY LIGHTS|    3|
|GLASS CAKE COVER ...|    2|
|IVORY ENCHANTED F...|   93|
|ANTIQUE SILVER TE...|  227|
|BLUE FELT HANGING...|    2|
|PACK OF 12 COLOUR...|  223|
|   CLAM SHELL SMALL |   48|
+--------------------+-----+
only showing top 20 rows



In [51]:
from pyspark.sql.functions import desc, asc

In [52]:
# The grouped count is itself a DataFrame; display its repr (Description, count).
df.groupBy("Description").count()

DataFrame[Description: string, count: bigint]

In [53]:
# Rank descriptions from most to least frequent.
rows_per_description = df.groupBy("Description").count()
rows_per_description.orderBy(desc("count")).show()

+--------------------+-----+
|         Description|count|
+--------------------+-----+
|WHITE HANGING HEA...| 2369|
|REGENCY CAKESTAND...| 2200|
|JUMBO BAG RED RET...| 2159|
|       PARTY BUNTING| 1727|
|LUNCH BAG RED RET...| 1638|
|ASSORTED COLOUR B...| 1501|
|SET OF 3 CAKE TIN...| 1473|
|                NULL| 1454|
|PACK OF 72 RETROS...| 1385|
|LUNCH BAG  BLACK ...| 1350|
|NATURAL SLATE HEA...| 1280|
|             POSTAGE| 1252|
|JUMBO BAG PINK PO...| 1251|
|HEART OF WICKER S...| 1237|
|JAM MAKING SET WI...| 1229|
|JUMBO STORAGE BAG...| 1214|
|PAPER CHAIN KIT 5...| 1210|
|JUMBO SHOPPER VIN...| 1202|
| LUNCH BAG CARS BLUE| 1197|
|LUNCH BAG SPACEBO...| 1192|
+--------------------+-----+
only showing top 20 rows



In [54]:
# Rank descriptions from least to most frequent.
rows_per_description = df.groupBy("Description").count()
rows_per_description.orderBy(asc("count")).show()

+--------------------+-----+
|         Description|count|
+--------------------+-----+
|PINK BERTIE MOBIL...|    1|
|SET OF 3 PINK FLY...|    1|
|        amazon sales|    1|
|PINK BOUDOIR T-LI...|    1|
|BREAD BIN, DINER ...|    1|
|PINK PAINTED KASH...|    1|
|ETCHED GLASS STAR...|    1|
|CREAM SWEETHEART ...|    1|
|4 GOLD FLOCK CHRI...|    1|
|TEA TIME BREAKFAS...|    1|
|PEG BAG APPLE DESIGN|    1|
|JARDIN ETCHED GLA...|    1|
|DUSTY PINK CHRIST...|    1|
|BLACKCHRISTMAS TR...|    1|
|     FLAMINGO LIGHTS|    1|
|VINTAGE BLUE TINS...|    1|
|CURIOUS IMAGES SC...|    1|
|CAKESTAND, 3 TIER...|    1|
|TEA TIME CAKE STA...|    1|
|        label mix up|    1|
+--------------------+-----+
only showing top 20 rows

