In [1]:
from pyspark.sql import SparkSession

In [3]:
# Obtain the Spark entry point for this notebook: getOrCreate() hands back
# a session for the application name "Data_wrangling".
spark = (
    SparkSession.builder
    .appName("Data_wrangling")
    .getOrCreate()
)

In [4]:
# First load: read the comma-separated file with a header row and let Spark
# infer each column's type from the data.
df = (
    spark.read
    .format("csv")
    .options(inferSchema="True", header="True", sep=",")
    .load("./data/Online_Retail.csv")
)

In [5]:
# Echo the DataFrame itself: the notebook prints its column names and types,
# not its rows.
df

DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: timestamp, UnitPrice: double, CustomerID: int, Country: string]

In [6]:
# Print the leading rows as a text table; long strings are abbreviated
# with "..." in this default mode.
df.show()

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|     17850|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|     17850|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|2010-12-01 08:26:00|     2.75|     17850|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|2010-12-01 08:26:00|     3.39|     17850|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|2010-12-01 08:26:00|     3.39|     17850|United Kingdom|
|   536365|    22752|SET 7 BABUSHKA NE...|       2|2010-12-01 08:26:00|     7.65|     17850|United Kingdom|
|   536365|    21730|GLASS S

In [9]:
# Confirm that the reader returned a PySpark DataFrame.
type(df)

pyspark.sql.dataframe.DataFrame

In [10]:
# Show the inferred schema as a tree: one line per column with its type
# and nullability.
df.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: timestamp (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Country: string (nullable = true)



# スキーマを指定してファイル読み込み

In [13]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, DateType

In [14]:
# Record which Spark version this notebook ran against.
spark.version

'3.5.0'

In [19]:
# Explicit schema for Online_Retail.csv, used instead of type inference.
#
# Every field is declared nullable (third argument True). The file contains
# rows with no CustomerID, which surface as NULL in query results, and the
# inferred schema also reports every column as nullable, so declaring the
# fields non-nullable would misstate the data.
#
# NOTE(review): DateType keeps only the calendar date of InvoiceDate and
# FloatType is single precision, whereas inferSchema picked timestamp and
# double for these two columns - confirm that is acceptable for any
# downstream arithmetic or time-of-day analysis.
schema = StructType([
    StructField("InvoiceNo", StringType(), True),
    StructField("StockCode", StringType(), True),
    StructField("Description", StringType(), True),
    StructField("Quantity", IntegerType(), True),
    StructField("InvoiceDate", DateType(), True),
    StructField("UnitPrice", FloatType(), True),
    StructField("CustomerID", IntegerType(), True),
    StructField("Country", StringType(), True),
])

In [20]:
# Second load: read the same file, this time applying the hand-written
# `schema` instead of inferring column types. Rebinds `df`.
df = (
    spark.read
    .format("csv")
    .schema(schema)
    .option("header", "True")
    .option("sep", ",")
    .load("./data/Online_Retail.csv")
)

In [32]:
# Preview the rows with truncate=False so long values such as Description
# are printed in full rather than abbreviated.
df.show(truncate=False)

+---------+---------+-----------------------------------+--------+-----------+---------+----------+--------------+
|InvoiceNo|StockCode|Description                        |Quantity|InvoiceDate|UnitPrice|CustomerID|Country       |
+---------+---------+-----------------------------------+--------+-----------+---------+----------+--------------+
|536365   |85123A   |WHITE HANGING HEART T-LIGHT HOLDER |6       |2010-12-01 |2.55     |17850     |United Kingdom|
|536365   |71053    |WHITE METAL LANTERN                |6       |2010-12-01 |3.39     |17850     |United Kingdom|
|536365   |84406B   |CREAM CUPID HEARTS COAT HANGER     |8       |2010-12-01 |2.75     |17850     |United Kingdom|
|536365   |84029G   |KNITTED UNION FLAG HOT WATER BOTTLE|6       |2010-12-01 |3.39     |17850     |United Kingdom|
|536365   |84029E   |RED WOOLLY HOTTIE WHITE HEART.     |6       |2010-12-01 |3.39     |17850     |United Kingdom|
|536365   |22752    |SET 7 BABUSHKA NESTING BOXES       |2       |2010-12-01 |7.

In [22]:
# List (column, type) pairs to check that the explicit schema was applied.
df.dtypes

[('InvoiceNo', 'string'),
 ('StockCode', 'string'),
 ('Description', 'string'),
 ('Quantity', 'int'),
 ('InvoiceDate', 'date'),
 ('UnitPrice', 'float'),
 ('CustomerID', 'int'),
 ('Country', 'string')]

In [23]:
# Project only the product code and its description, then preview the
# result (show() prints the top 20 rows).
df.select(["StockCode", "Description"]).show()

+---------+--------------------+
|StockCode|         Description|
+---------+--------------------+
|   85123A|WHITE HANGING HEA...|
|    71053| WHITE METAL LANTERN|
|   84406B|CREAM CUPID HEART...|
|   84029G|KNITTED UNION FLA...|
|   84029E|RED WOOLLY HOTTIE...|
|    22752|SET 7 BABUSHKA NE...|
|    21730|GLASS STAR FROSTE...|
|    22633|HAND WARMER UNION...|
|    22632|HAND WARMER RED P...|
|    84879|ASSORTED COLOUR B...|
|    22745|POPPY'S PLAYHOUSE...|
|    22748|POPPY'S PLAYHOUSE...|
|    22749|FELTCRAFT PRINCES...|
|    22310|IVORY KNITTED MUG...|
|    84969|BOX OF 6 ASSORTED...|
|    22623|BOX OF VINTAGE JI...|
|    22622|BOX OF VINTAGE AL...|
|    21754|HOME BUILDING BLO...|
|    21755|LOVE BUILDING BLO...|
|    21777|RECIPE BOX WITH M...|
+---------+--------------------+
only showing top 20 rows



In [24]:
# Total number of rows in the DataFrame.
df.count()

541909

# filter: 条件を満たすレコードの抽出

In [25]:
# Keep only the rows whose unit price is above 30 and preview them.
df.filter(df.UnitPrice > 30).show()

+---------+---------+--------------------+--------+-----------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-----------+---------+----------+--------------+
|   536392|    22827|RUSTIC  SEVENTEEN...|       1| 2010-12-01|    165.0|     13705|United Kingdom|
|   536396|    22803|IVORY EMBROIDERED...|       2| 2010-12-01|    35.75|     17850|United Kingdom|
|   536406|    22803|IVORY EMBROIDERED...|       2| 2010-12-01|    35.75|     17850|United Kingdom|
|   536540|       C2|            CARRIAGE|       1| 2010-12-01|     50.0|     14911|          EIRE|
|   536544|    22769|CHALKBOARD KITCHE...|       1| 2010-12-01|    51.02|      NULL|United Kingdom|
|   536544|    22847|BREAD BIN DINER S...|       1| 2010-12-01|     34.0|      NULL|United Kingdom|
|   536544|      DOT|      DOTCOM POSTAGE|       1| 2010-12-01|   569.77|      NULL|United Kingdom|


In [26]:
# Keep only the rows whose Country is exactly "Switzerland" and preview them.
df.filter(df.Country == "Switzerland").show()

+---------+---------+--------------------+--------+-----------+---------+----------+-----------+
|InvoiceNo|StockCode|         Description|Quantity|InvoiceDate|UnitPrice|CustomerID|    Country|
+---------+---------+--------------------+--------+-----------+---------+----------+-----------+
|   536858|    22326|ROUND SNACK BOXES...|      30| 2010-12-03|     2.95|     13520|Switzerland|
|   536858|    22554|PLASTERS IN TIN W...|      36| 2010-12-03|     1.65|     13520|Switzerland|
|   536858|    21731|RED TOADSTOOL LED...|      24| 2010-12-03|     1.65|     13520|Switzerland|
|   536858|    20677|  PINK POLKADOT BOWL|      16| 2010-12-03|     1.25|     13520|Switzerland|
|   536858|    20750|RED RETROSPOT MIN...|       2| 2010-12-03|     7.95|     13520|Switzerland|
|   536858|     POST|             POSTAGE|       2| 2010-12-03|     40.0|     13520|Switzerland|
|   539488|    22837|HOT WATER BOTTLE ...|       8| 2010-12-20|     4.65|     12377|Switzerland|
|   539488|    22112|CHOCOLATE

In [27]:
# Count the rows whose Country is exactly "Switzerland".
df.filter(df.Country == "Switzerland").count()

2002

# where: 条件を満たすレコードの抽出

In [30]:
# where() is used here the same way filter() was above: keep the rows whose
# Description contains the substring "WATER", printing values in full.
(
    df
    .where(df.Description.contains("WATER"))
    .show(truncate=False)
)

+---------+---------+-----------------------------------+--------+-----------+---------+----------+--------------+
|InvoiceNo|StockCode|Description                        |Quantity|InvoiceDate|UnitPrice|CustomerID|Country       |
+---------+---------+-----------------------------------+--------+-----------+---------+----------+--------------+
|536365   |84029G   |KNITTED UNION FLAG HOT WATER BOTTLE|6       |2010-12-01 |3.39     |17850     |United Kingdom|
|536373   |84029G   |KNITTED UNION FLAG HOT WATER BOTTLE|6       |2010-12-01 |3.39     |17850     |United Kingdom|
|536375   |84029G   |KNITTED UNION FLAG HOT WATER BOTTLE|6       |2010-12-01 |3.39     |17850     |United Kingdom|
|536376   |22114    |HOT WATER BOTTLE TEA AND SYMPATHY  |48      |2010-12-01 |3.45     |15291     |United Kingdom|
|536390   |21485    |RETROSPOT HEART HOT WATER BOTTLE   |24      |2010-12-01 |4.25     |17511     |United Kingdom|
|536390   |84029G   |KNITTED UNION FLAG HOT WATER BOTTLE|24      |2010-12-01 |3.

In [31]:
# Count the rows whose Description contains the substring "WATER".
df.where(df.Description.contains("WATER")).count()

11374

# 列の作成と削除

In [34]:
# Preview a derived "amount" column (Quantity x UnitPrice) without rebinding
# df. UnitPrice is a single-precision float, so some products display
# rounding artifacts (e.g. 6 x 2.55 is shown as 15.299999).
(
    df
    .withColumn("amount", df.Quantity * df.UnitPrice)
    .show(truncate=False)
)

+---------+---------+-----------------------------------+--------+-----------+---------+----------+--------------+---------+
|InvoiceNo|StockCode|Description                        |Quantity|InvoiceDate|UnitPrice|CustomerID|Country       |amount   |
+---------+---------+-----------------------------------+--------+-----------+---------+----------+--------------+---------+
|536365   |85123A   |WHITE HANGING HEART T-LIGHT HOLDER |6       |2010-12-01 |2.55     |17850     |United Kingdom|15.299999|
|536365   |71053    |WHITE METAL LANTERN                |6       |2010-12-01 |3.39     |17850     |United Kingdom|20.34    |
|536365   |84406B   |CREAM CUPID HEARTS COAT HANGER     |8       |2010-12-01 |2.75     |17850     |United Kingdom|22.0     |
|536365   |84029G   |KNITTED UNION FLAG HOT WATER BOTTLE|6       |2010-12-01 |3.39     |17850     |United Kingdom|20.34    |
|536365   |84029E   |RED WOOLLY HOTTIE WHITE HEART.     |6       |2010-12-01 |3.39     |17850     |United Kingdom|20.34    |


In [36]:
# Persist the derived column: rebind df to a DataFrame that carries
# "amount" = Quantity x UnitPrice.
df = df.withColumn(
    "amount",
    df.Quantity * df.UnitPrice,
)

In [38]:
# Confirm that df now includes the "amount" column (values printed in full).
df.show(truncate=False)

+---------+---------+-----------------------------------+--------+-----------+---------+----------+--------------+---------+
|InvoiceNo|StockCode|Description                        |Quantity|InvoiceDate|UnitPrice|CustomerID|Country       |amount   |
+---------+---------+-----------------------------------+--------+-----------+---------+----------+--------------+---------+
|536365   |85123A   |WHITE HANGING HEART T-LIGHT HOLDER |6       |2010-12-01 |2.55     |17850     |United Kingdom|15.299999|
|536365   |71053    |WHITE METAL LANTERN                |6       |2010-12-01 |3.39     |17850     |United Kingdom|20.34    |
|536365   |84406B   |CREAM CUPID HEARTS COAT HANGER     |8       |2010-12-01 |2.75     |17850     |United Kingdom|22.0     |
|536365   |84029G   |KNITTED UNION FLAG HOT WATER BOTTLE|6       |2010-12-01 |3.39     |17850     |United Kingdom|20.34    |
|536365   |84029E   |RED WOOLLY HOTTIE WHITE HEART.     |6       |2010-12-01 |3.39     |17850     |United Kingdom|20.34    |


In [40]:
# Preview df without the "amount" column; df itself is not rebound here.
df.drop("amount").show()

+---------+---------+--------------------+--------+-----------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-----------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6| 2010-12-01|     2.55|     17850|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6| 2010-12-01|     3.39|     17850|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8| 2010-12-01|     2.75|     17850|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6| 2010-12-01|     3.39|     17850|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6| 2010-12-01|     3.39|     17850|United Kingdom|
|   536365|    22752|SET 7 BABUSHKA NE...|       2| 2010-12-01|     7.65|     17850|United Kingdom|
|   536365|    21730|GLASS STAR FROSTE...|       6| 2010-12-01|     4.25|     17850|United Kingdom|


In [42]:
# Remove "amount" for good by rebinding df to the result of drop().
df = df.drop("amount")
# Verify the column is gone (values printed in full).
df.show(truncate=False)

+---------+---------+-----------------------------------+--------+-----------+---------+----------+--------------+
|InvoiceNo|StockCode|Description                        |Quantity|InvoiceDate|UnitPrice|CustomerID|Country       |
+---------+---------+-----------------------------------+--------+-----------+---------+----------+--------------+
|536365   |85123A   |WHITE HANGING HEART T-LIGHT HOLDER |6       |2010-12-01 |2.55     |17850     |United Kingdom|
|536365   |71053    |WHITE METAL LANTERN                |6       |2010-12-01 |3.39     |17850     |United Kingdom|
|536365   |84406B   |CREAM CUPID HEARTS COAT HANGER     |8       |2010-12-01 |2.75     |17850     |United Kingdom|
|536365   |84029G   |KNITTED UNION FLAG HOT WATER BOTTLE|6       |2010-12-01 |3.39     |17850     |United Kingdom|
|536365   |84029E   |RED WOOLLY HOTTIE WHITE HEART.     |6       |2010-12-01 |3.39     |17850     |United Kingdom|
|536365   |22752    |SET 7 BABUSHKA NESTING BOXES       |2       |2010-12-01 |7.

In [43]:
# drop() accepts several column names at once: preview df without
# InvoiceNo and StockCode (df itself is not rebound).
df.drop("InvoiceNo", "StockCode").show()

+--------------------+--------+-----------+---------+----------+--------------+
|         Description|Quantity|InvoiceDate|UnitPrice|CustomerID|       Country|
+--------------------+--------+-----------+---------+----------+--------------+
|WHITE HANGING HEA...|       6| 2010-12-01|     2.55|     17850|United Kingdom|
| WHITE METAL LANTERN|       6| 2010-12-01|     3.39|     17850|United Kingdom|
|CREAM CUPID HEART...|       8| 2010-12-01|     2.75|     17850|United Kingdom|
|KNITTED UNION FLA...|       6| 2010-12-01|     3.39|     17850|United Kingdom|
|RED WOOLLY HOTTIE...|       6| 2010-12-01|     3.39|     17850|United Kingdom|
|SET 7 BABUSHKA NE...|       2| 2010-12-01|     7.65|     17850|United Kingdom|
|GLASS STAR FROSTE...|       6| 2010-12-01|     4.25|     17850|United Kingdom|
|HAND WARMER UNION...|       6| 2010-12-01|     1.85|     17850|United Kingdom|
|HAND WARMER RED P...|       6| 2010-12-01|     1.85|     17850|United Kingdom|
|ASSORTED COLOUR B...|      32| 2010-12-

# 列名の変更

In [44]:
# Preview df with the Country column renamed to CountryName; the result is
# not assigned, so df keeps its original column name.
(
    df
    .withColumnRenamed(existing="Country", new="CountryName")
    .show()
)

+---------+---------+--------------------+--------+-----------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|InvoiceDate|UnitPrice|CustomerID|   CountryName|
+---------+---------+--------------------+--------+-----------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6| 2010-12-01|     2.55|     17850|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6| 2010-12-01|     3.39|     17850|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8| 2010-12-01|     2.75|     17850|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6| 2010-12-01|     3.39|     17850|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6| 2010-12-01|     3.39|     17850|United Kingdom|
|   536365|    22752|SET 7 BABUSHKA NE...|       2| 2010-12-01|     7.65|     17850|United Kingdom|
|   536365|    21730|GLASS STAR FROSTE...|       6| 2010-12-01|     4.25|     17850|United Kingdom|


In [45]:
# List the column names that result from renaming Country to CountryName.
df.withColumnRenamed(existing="Country", new="CountryName").columns

['InvoiceNo',
 'StockCode',
 'Description',
 'Quantity',
 'InvoiceDate',
 'UnitPrice',
 'CustomerID',
 'CountryName']

# キャスト (型変換)

In [62]:
# Demonstrate cast(): overwrite Quantity with itself converted to "int".
# Quantity is already an int column under the explicit schema, so the
# reported dtype is the same before and after this cell.
df = df.withColumn(
    "Quantity",
    df.Quantity.cast("int"),
)

In [63]:
# Preview the rows after the cast.
df.show()

+---------+---------+--------------------+--------+-----------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-----------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6| 2010-12-01|     2.55|     17850|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6| 2010-12-01|     3.39|     17850|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8| 2010-12-01|     2.75|     17850|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6| 2010-12-01|     3.39|     17850|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6| 2010-12-01|     3.39|     17850|United Kingdom|
|   536365|    22752|SET 7 BABUSHKA NE...|       2| 2010-12-01|     7.65|     17850|United Kingdom|
|   536365|    21730|GLASS STAR FROSTE...|       6| 2010-12-01|     4.25|     17850|United Kingdom|


In [64]:
# Check the column types after the cast.
df.dtypes

[('InvoiceNo', 'string'),
 ('StockCode', 'string'),
 ('Description', 'string'),
 ('Quantity', 'int'),
 ('InvoiceDate', 'date'),
 ('UnitPrice', 'float'),
 ('CustomerID', 'int'),
 ('Country', 'string')]

# 日付の処理

In [65]:
from pyspark.sql.functions import year, month, dayofmonth

In [67]:
# Start a working copy that adds the year component of InvoiceDate as
# "purchased_year", then preview it.
df_tmp = df.withColumn(
    "purchased_year",
    year(df["InvoiceDate"]),
)
df_tmp.show()

+---------+---------+--------------------+--------+-----------+---------+----------+--------------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|InvoiceDate|UnitPrice|CustomerID|       Country|purchased_year|
+---------+---------+--------------------+--------+-----------+---------+----------+--------------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6| 2010-12-01|     2.55|     17850|United Kingdom|          2010|
|   536365|    71053| WHITE METAL LANTERN|       6| 2010-12-01|     3.39|     17850|United Kingdom|          2010|
|   536365|   84406B|CREAM CUPID HEART...|       8| 2010-12-01|     2.75|     17850|United Kingdom|          2010|
|   536365|   84029G|KNITTED UNION FLA...|       6| 2010-12-01|     3.39|     17850|United Kingdom|          2010|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6| 2010-12-01|     3.39|     17850|United Kingdom|          2010|
|   536365|    22752|SET 7 BABUSHKA NE...|       2| 2010-12-01|     7.65|     17

In [68]:
# Extend the working copy with the month component of InvoiceDate as
# "purchased_month", then preview it.
df_tmp = df_tmp.withColumn(
    "purchased_month",
    month(df_tmp["InvoiceDate"]),
)
df_tmp.show()

+---------+---------+--------------------+--------+-----------+---------+----------+--------------+--------------+---------------+
|InvoiceNo|StockCode|         Description|Quantity|InvoiceDate|UnitPrice|CustomerID|       Country|purchased_year|purchased_month|
+---------+---------+--------------------+--------+-----------+---------+----------+--------------+--------------+---------------+
|   536365|   85123A|WHITE HANGING HEA...|       6| 2010-12-01|     2.55|     17850|United Kingdom|          2010|             12|
|   536365|    71053| WHITE METAL LANTERN|       6| 2010-12-01|     3.39|     17850|United Kingdom|          2010|             12|
|   536365|   84406B|CREAM CUPID HEART...|       8| 2010-12-01|     2.75|     17850|United Kingdom|          2010|             12|
|   536365|   84029G|KNITTED UNION FLA...|       6| 2010-12-01|     3.39|     17850|United Kingdom|          2010|             12|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6| 2010-12-01|     3.39|     17850

In [69]:
# Extend the working copy with the day-of-month component of InvoiceDate as
# "purchased_day", then preview it.
df_tmp = df_tmp.withColumn(
    "purchased_day",
    dayofmonth(df_tmp["InvoiceDate"]),
)
df_tmp.show()

+---------+---------+--------------------+--------+-----------+---------+----------+--------------+--------------+---------------+-------------+
|InvoiceNo|StockCode|         Description|Quantity|InvoiceDate|UnitPrice|CustomerID|       Country|purchased_year|purchased_month|purchased_day|
+---------+---------+--------------------+--------+-----------+---------+----------+--------------+--------------+---------------+-------------+
|   536365|   85123A|WHITE HANGING HEA...|       6| 2010-12-01|     2.55|     17850|United Kingdom|          2010|             12|            1|
|   536365|    71053| WHITE METAL LANTERN|       6| 2010-12-01|     3.39|     17850|United Kingdom|          2010|             12|            1|
|   536365|   84406B|CREAM CUPID HEART...|       8| 2010-12-01|     2.75|     17850|United Kingdom|          2010|             12|            1|
|   536365|   84029G|KNITTED UNION FLA...|       6| 2010-12-01|     3.39|     17850|United Kingdom|          2010|             12|

# 欠損値処理

In [70]:
# Review the column types as the starting point for the missing-value
# handling section.
df.dtypes

[('InvoiceNo', 'string'),
 ('StockCode', 'string'),
 ('Description', 'string'),
 ('Quantity', 'int'),
 ('InvoiceDate', 'date'),
 ('UnitPrice', 'float'),
 ('CustomerID', 'int'),
 ('Country', 'string')]