In [1]:
from pyspark import *
from pyspark.sql import *

import pandas as pd
import os
import sys
import pyarrow as pa
import pyarrow.parquet as pq
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
os.environ['PYSPARK_SUBMIT_ARGS'] = """--name job_name --master local --conf spark.dynamicAllocation.enabled=true pyspark-shell"""

spark = SparkSession.builder.appName("DataFrame").getOrCreate()
conf = SparkConf().setMaster("local").setAppName("Assignment 11")
sc = SparkContext.getOrCreate(conf=conf)
sqlContext = SQLContext(sc)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/04/23 13:42:46 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/04/23 13:42:47 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [2]:
sales = spark.read.format("csv")\
.option("header", "true")\
.option("inferSchema", "true")\
.load("retail-data/by-day/2011-10-04.csv")\
.coalesce(5)\
.where("Description IS NOT NULL")

                                                                                

In [3]:
sales.show(5)

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|   569397|    23200|     JUMBO BAG PEARS|      10|2011-10-04 08:26:00|     2.08|   12747.0|United Kingdom|
|   569397|    23201|  JUMBO BAG ALPHABET|      10|2011-10-04 08:26:00|     2.08|   12747.0|United Kingdom|
|   569397|    23199|    JUMBO BAG APPLES|      10|2011-10-04 08:26:00|     2.08|   12747.0|United Kingdom|
|   569397|   85099F|JUMBO BAG STRAWBERRY|      10|2011-10-04 08:26:00|     2.08|   12747.0|United Kingdom|
|   569397|   85099B|JUMBO BAG RED RET...|      10|2011-10-04 08:26:00|     2.08|   12747.0|United Kingdom|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
only showing top 5 rows



In [4]:
#tokenize string from Description column, splitting on a given character
from pyspark.ml.feature import Tokenizer
tkn = Tokenizer().setInputCol("Description").setOutputCol("DescOut")
tokenized = tkn.transform(sales.select("Description"))
tokenized.show(5, False)

+-----------------------+----------------------------+
|Description            |DescOut                     |
+-----------------------+----------------------------+
|JUMBO BAG PEARS        |[jumbo, bag, pears]         |
|JUMBO BAG ALPHABET     |[jumbo, bag, alphabet]      |
|JUMBO BAG APPLES       |[jumbo, bag, apples]        |
|JUMBO BAG STRAWBERRY   |[jumbo, bag, strawberry]    |
|JUMBO BAG RED RETROSPOT|[jumbo, bag, red, retrospot]|
+-----------------------+----------------------------+
only showing top 5 rows



In [5]:
from pyspark.ml.feature import StopWordsRemover
englishStopWords = StopWordsRemover.loadDefaultStopWords("english")
print(englishStopWords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'no

In [6]:
from pyspark.ml.feature import StopWordsRemover
englishStopWords = StopWordsRemover.loadDefaultStopWords("english")
stops = StopWordsRemover()\
.setStopWords(englishStopWords)\
.setInputCol("DescOut")
stops.transform(tokenized).show(5,False)

+-----------------------+----------------------------+-------------------------------------+
|Description            |DescOut                     |StopWordsRemover_d5f3719445d6__output|
+-----------------------+----------------------------+-------------------------------------+
|JUMBO BAG PEARS        |[jumbo, bag, pears]         |[jumbo, bag, pears]                  |
|JUMBO BAG ALPHABET     |[jumbo, bag, alphabet]      |[jumbo, bag, alphabet]               |
|JUMBO BAG APPLES       |[jumbo, bag, apples]        |[jumbo, bag, apples]                 |
|JUMBO BAG STRAWBERRY   |[jumbo, bag, strawberry]    |[jumbo, bag, strawberry]             |
|JUMBO BAG RED RETROSPOT|[jumbo, bag, red, retrospot]|[jumbo, bag, red, retrospot]         |
+-----------------------+----------------------------+-------------------------------------+
only showing top 5 rows



In [16]:
data = stops.transform(tokenized)

In [40]:
from pyspark.ml.feature import IndexToString, StringIndexer

indexer = StringIndexer(inputCol="Description", outputCol="DescIndex")
model = indexer.fit(data)
indexed = model.transform(data)

converter = IndexToString(inputCol="DescIndex", outputCol="originalDescription")
converted = converter.transform(indexed)

converted.select("Description", "DescOut", "DescIndex", "originalDescription").show(5)

+--------------------+--------------------+---------+--------------------+
|         Description|             DescOut|DescIndex| originalDescription|
+--------------------+--------------------+---------+--------------------+
|     JUMBO BAG PEARS| [jumbo, bag, pears]|    503.0|     JUMBO BAG PEARS|
|  JUMBO BAG ALPHABET|[jumbo, bag, alph...|    106.0|  JUMBO BAG ALPHABET|
|    JUMBO BAG APPLES|[jumbo, bag, apples]|    107.0|    JUMBO BAG APPLES|
|JUMBO BAG STRAWBERRY|[jumbo, bag, stra...|    505.0|JUMBO BAG STRAWBERRY|
|JUMBO BAG RED RET...|[jumbo, bag, red,...|      3.0|JUMBO BAG RED RET...|
+--------------------+--------------------+---------+--------------------+
only showing top 5 rows



In [41]:
from pyspark.ml.feature import OneHotEncoder

encoder = OneHotEncoder(inputCols=["DescIndex"],
                        outputCols=["DescVec"])
model = encoder.fit(converted)
encoded = model.transform(converted)
encoded.show(5)

+--------------------+--------------------+-------------------------------------+---------+--------------------+------------------+
|         Description|             DescOut|StopWordsRemover_d5f3719445d6__output|DescIndex| originalDescription|           DescVec|
+--------------------+--------------------+-------------------------------------+---------+--------------------+------------------+
|     JUMBO BAG PEARS| [jumbo, bag, pears]|                  [jumbo, bag, pears]|    503.0|     JUMBO BAG PEARS|(1199,[503],[1.0])|
|  JUMBO BAG ALPHABET|[jumbo, bag, alph...|                 [jumbo, bag, alph...|    106.0|  JUMBO BAG ALPHABET|(1199,[106],[1.0])|
|    JUMBO BAG APPLES|[jumbo, bag, apples]|                 [jumbo, bag, apples]|    107.0|    JUMBO BAG APPLES|(1199,[107],[1.0])|
|JUMBO BAG STRAWBERRY|[jumbo, bag, stra...|                 [jumbo, bag, stra...|    505.0|JUMBO BAG STRAWBERRY|(1199,[505],[1.0])|
|JUMBO BAG RED RET...|[jumbo, bag, red,...|                 [jumbo, bag, red

### 4. Represent every Description text with a sum of those vectors.

In [50]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(
    inputCols=["DescVec"],
    outputCol="features")

output = assembler.transform(encoded)

output.select("features", "Description", "DescOut").show(5, truncate=False)

+------------------+-----------------------+----------------------------+
|features          |Description            |DescOut                     |
+------------------+-----------------------+----------------------------+
|(1199,[503],[1.0])|JUMBO BAG PEARS        |[jumbo, bag, pears]         |
|(1199,[106],[1.0])|JUMBO BAG ALPHABET     |[jumbo, bag, alphabet]      |
|(1199,[107],[1.0])|JUMBO BAG APPLES       |[jumbo, bag, apples]        |
|(1199,[505],[1.0])|JUMBO BAG STRAWBERRY   |[jumbo, bag, strawberry]    |
|(1199,[3],[1.0])  |JUMBO BAG RED RETROSPOT|[jumbo, bag, red, retrospot]|
+------------------+-----------------------+----------------------------+
only showing top 5 rows



In [53]:
sales.filter(sales.Description == "SET OF 12 MINI LOAF BAKING CASES").show(truncate=False)

+---------+---------+--------------------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|Description                     |Quantity|InvoiceDate        |UnitPrice|CustomerID|Country       |
+---------+---------+--------------------------------+--------+-------------------+---------+----------+--------------+
|569415   |23295    |SET OF 12 MINI LOAF BAKING CASES|3       |2011-10-04 10:14:00|2.46     |null      |United Kingdom|
|569479   |23295    |SET OF 12 MINI LOAF BAKING CASES|24      |2011-10-04 12:41:00|0.83     |13408.0   |United Kingdom|
|569523   |23295    |SET OF 12 MINI LOAF BAKING CASES|8       |2011-10-04 14:41:00|0.83     |16033.0   |United Kingdom|
|569532   |23295    |SET OF 12 MINI LOAF BAKING CASES|16      |2011-10-04 15:19:00|0.83     |13552.0   |United Kingdom|
|569545   |23295    |SET OF 12 MINI LOAF BAKING CASES|2       |2011-10-04 16:37:00|2.46     |null      |United Kingdom|
|569546   |23295    |SET OF 12 MINI LOAF

In [42]:
import numpy as np
from pyspark.ml.linalg import *
from pyspark.sql.types import * 
from pyspark.sql.functions import *

In [51]:
def cos_sim(a,b):
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

In [52]:
df = output.withColumn("coSim", udf(cos_sim, FloatType())(col("features"), array([list(v) for v in static_vector])))
df.limit(10).toPandas()

TypeError: 'numpy.float64' object is not iterable

In [66]:
tokenizer = Tokenizer(inputCol="Description", outputCol="descWords")
descData = tokenizer.transform(sales)

In [73]:
descData.show()

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+--------------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|           descWords|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+--------------------+
|   569397|    23200|     JUMBO BAG PEARS|      10|2011-10-04 08:26:00|     2.08|   12747.0|United Kingdom| [jumbo, bag, pears]|
|   569397|    23201|  JUMBO BAG ALPHABET|      10|2011-10-04 08:26:00|     2.08|   12747.0|United Kingdom|[jumbo, bag, alph...|
|   569397|    23199|    JUMBO BAG APPLES|      10|2011-10-04 08:26:00|     2.08|   12747.0|United Kingdom|[jumbo, bag, apples]|
|   569397|   85099F|JUMBO BAG STRAWBERRY|      10|2011-10-04 08:26:00|     2.08|   12747.0|United Kingdom|[jumbo, bag, stra...|
|   569397|   85099B|JUMBO BAG RED RET...|      10|2011-10-04 08:26:00|     2.08|   12747.0|Unite

In [70]:
from pyspark.ml.feature import StopWordsRemover
englishStopWords = StopWordsRemover.loadDefaultStopWords("english")
print(englishStopWords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'no

In [74]:
from pyspark.ml.feature import StopWordsRemover
englishStopWords = StopWordsRemover.loadDefaultStopWords("english")
stops = StopWordsRemover()\
.setStopWords(englishStopWords)\
.setInputCol("descWords")
stops.transform(descData).show(5,False)

+---------+---------+-----------------------+--------+-------------------+---------+----------+--------------+----------------------------+-------------------------------------+
|InvoiceNo|StockCode|Description            |Quantity|InvoiceDate        |UnitPrice|CustomerID|Country       |descWords                   |StopWordsRemover_62764232d3d1__output|
+---------+---------+-----------------------+--------+-------------------+---------+----------+--------------+----------------------------+-------------------------------------+
|569397   |23200    |JUMBO BAG PEARS        |10      |2011-10-04 08:26:00|2.08     |12747.0   |United Kingdom|[jumbo, bag, pears]         |[jumbo, bag, pears]                  |
|569397   |23201    |JUMBO BAG ALPHABET     |10      |2011-10-04 08:26:00|2.08     |12747.0   |United Kingdom|[jumbo, bag, alphabet]      |[jumbo, bag, alphabet]               |
|569397   |23199    |JUMBO BAG APPLES       |10      |2011-10-04 08:26:00|2.08     |12747.0   |United Kingdom|

In [79]:
data = stops.transform(descData)

In [86]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
hashingTF = HashingTF(inputCol="descWords", outputCol="rawFeatures", numFeatures=20)
featurizedData = hashingTF.transform(descData)

featurizedData.show(5)

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+--------------------+--------------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|           descWords|         rawFeatures|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+--------------------+--------------------+
|   569397|    23200|     JUMBO BAG PEARS|      10|2011-10-04 08:26:00|     2.08|   12747.0|United Kingdom| [jumbo, bag, pears]|(20,[4,12,15],[1....|
|   569397|    23201|  JUMBO BAG ALPHABET|      10|2011-10-04 08:26:00|     2.08|   12747.0|United Kingdom|[jumbo, bag, alph...|(20,[3,4,15],[1.0...|
|   569397|    23199|    JUMBO BAG APPLES|      10|2011-10-04 08:26:00|     2.08|   12747.0|United Kingdom|[jumbo, bag, apples]|(20,[4,5,15],[1.0...|
|   569397|   85099F|JUMBO BAG STRAWBERRY|      10|2011-10-04 08:26:00|     2.08|   12747.0|United K

In [88]:
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
rescaledData.select("Description", "descWords", "features").show(5)

+--------------------+--------------------+--------------------+
|         Description|           descWords|            features|
+--------------------+--------------------+--------------------+
|     JUMBO BAG PEARS| [jumbo, bag, pears]|(20,[4,12,15],[1....|
|  JUMBO BAG ALPHABET|[jumbo, bag, alph...|(20,[3,4,15],[1.5...|
|    JUMBO BAG APPLES|[jumbo, bag, apples]|(20,[4,5,15],[1.5...|
|JUMBO BAG STRAWBERRY|[jumbo, bag, stra...|(20,[4,12,15],[1....|
|JUMBO BAG RED RET...|[jumbo, bag, red,...|(20,[4,8,12,15],[...|
+--------------------+--------------------+--------------------+
only showing top 5 rows



In [56]:
sales.filter(sales.Description == "SET OF 12 MINI LOAF BAKING CASES").show(truncate=False)

+---------+---------+--------------------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|Description                     |Quantity|InvoiceDate        |UnitPrice|CustomerID|Country       |
+---------+---------+--------------------------------+--------+-------------------+---------+----------+--------------+
|569415   |23295    |SET OF 12 MINI LOAF BAKING CASES|3       |2011-10-04 10:14:00|2.46     |null      |United Kingdom|
|569479   |23295    |SET OF 12 MINI LOAF BAKING CASES|24      |2011-10-04 12:41:00|0.83     |13408.0   |United Kingdom|
|569523   |23295    |SET OF 12 MINI LOAF BAKING CASES|8       |2011-10-04 14:41:00|0.83     |16033.0   |United Kingdom|
|569532   |23295    |SET OF 12 MINI LOAF BAKING CASES|16      |2011-10-04 15:19:00|0.83     |13552.0   |United Kingdom|
|569545   |23295    |SET OF 12 MINI LOAF BAKING CASES|2       |2011-10-04 16:37:00|2.46     |null      |United Kingdom|
|569546   |23295    |SET OF 12 MINI LOAF

In [91]:
tfIdfIn = data\
.where("array_contains(descWords, 'loaf')")\
.select("descWords")
tfIdfIn.show()

+--------------------+
|           descWords|
+--------------------+
|[set, of, 12, min...|
|[set, of, 12, min...|
|[set, of, 12, min...|
|[set, of, 6, snac...|
|[set, of, 12, min...|
|[set, of, 6, snac...|
|[set, of, 12, min...|
|[set, of, 12, min...|
+--------------------+



In [95]:
#tokenize string from Description column, splitting on a given character
from pyspark.ml.feature import Tokenizer
tkn = Tokenizer().setInputCol("Description").setOutputCol("DescOut")
tokenized = tkn.transform(sales.select("Description"))
tokenized.show(5, False)

+-----------------------+----------------------------+
|Description            |DescOut                     |
+-----------------------+----------------------------+
|JUMBO BAG PEARS        |[jumbo, bag, pears]         |
|JUMBO BAG ALPHABET     |[jumbo, bag, alphabet]      |
|JUMBO BAG APPLES       |[jumbo, bag, apples]        |
|JUMBO BAG STRAWBERRY   |[jumbo, bag, strawberry]    |
|JUMBO BAG RED RETROSPOT|[jumbo, bag, red, retrospot]|
+-----------------------+----------------------------+
only showing top 5 rows



In [96]:
from pyspark.ml.feature import StopWordsRemover
englishStopWords = StopWordsRemover.loadDefaultStopWords("english")
print(englishStopWords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'no

In [97]:
from pyspark.ml.feature import StopWordsRemover
englishStopWords = StopWordsRemover.loadDefaultStopWords("english")
stops = StopWordsRemover()\
.setStopWords(englishStopWords)\
.setInputCol("DescOut")
stops.transform(tokenized).show(5,False)

+-----------------------+----------------------------+-------------------------------------+
|Description            |DescOut                     |StopWordsRemover_1e470ba46cc6__output|
+-----------------------+----------------------------+-------------------------------------+
|JUMBO BAG PEARS        |[jumbo, bag, pears]         |[jumbo, bag, pears]                  |
|JUMBO BAG ALPHABET     |[jumbo, bag, alphabet]      |[jumbo, bag, alphabet]               |
|JUMBO BAG APPLES       |[jumbo, bag, apples]        |[jumbo, bag, apples]                 |
|JUMBO BAG STRAWBERRY   |[jumbo, bag, strawberry]    |[jumbo, bag, strawberry]             |
|JUMBO BAG RED RETROSPOT|[jumbo, bag, red, retrospot]|[jumbo, bag, red, retrospot]         |
+-----------------------+----------------------------+-------------------------------------+
only showing top 5 rows



In [98]:
data = stops.transform(tokenized)

In [104]:
from pyspark.ml.feature import Word2Vec
word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="DescOut",
outputCol="result")

word2Vec

Word2Vec_d05a514d93a0

In [101]:
model = word2Vec.fit(data)
result = model.transform(data)
result.show(5)

+--------------------+--------------------+-------------------------------------+--------------------+
|         Description|             DescOut|StopWordsRemover_1e470ba46cc6__output|              result|
+--------------------+--------------------+-------------------------------------+--------------------+
|     JUMBO BAG PEARS| [jumbo, bag, pears]|                  [jumbo, bag, pears]|[0.30474084119002...|
|  JUMBO BAG ALPHABET|[jumbo, bag, alph...|                 [jumbo, bag, alph...|[0.46335827310880...|
|    JUMBO BAG APPLES|[jumbo, bag, apples]|                 [jumbo, bag, apples]|[0.40410287181536...|
|JUMBO BAG STRAWBERRY|[jumbo, bag, stra...|                 [jumbo, bag, stra...|[0.38322112957636...|
|JUMBO BAG RED RET...|[jumbo, bag, red,...|                 [jumbo, bag, red,...|[0.26491938793333...|
+--------------------+--------------------+-------------------------------------+--------------------+
only showing top 5 rows



In [105]:
sales.filter(sales.Description == "SET OF 12 MINI LOAF BAKING CASES").show(truncate=False)

+---------+---------+--------------------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|Description                     |Quantity|InvoiceDate        |UnitPrice|CustomerID|Country       |
+---------+---------+--------------------------------+--------+-------------------+---------+----------+--------------+
|569415   |23295    |SET OF 12 MINI LOAF BAKING CASES|3       |2011-10-04 10:14:00|2.46     |null      |United Kingdom|
|569479   |23295    |SET OF 12 MINI LOAF BAKING CASES|24      |2011-10-04 12:41:00|0.83     |13408.0   |United Kingdom|
|569523   |23295    |SET OF 12 MINI LOAF BAKING CASES|8       |2011-10-04 14:41:00|0.83     |16033.0   |United Kingdom|
|569532   |23295    |SET OF 12 MINI LOAF BAKING CASES|16      |2011-10-04 15:19:00|0.83     |13552.0   |United Kingdom|
|569545   |23295    |SET OF 12 MINI LOAF BAKING CASES|2       |2011-10-04 16:37:00|2.46     |null      |United Kingdom|
|569546   |23295    |SET OF 12 MINI LOAF

In [106]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [107]:
retail_df = pd.read_csv("retail-data/by-day/2011-10-04.csv")
retail_df.head(5)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,569397,23200,JUMBO BAG PEARS,10,2011-10-04 08:26:00,2.08,12747.0,United Kingdom
1,569397,23201,JUMBO BAG ALPHABET,10,2011-10-04 08:26:00,2.08,12747.0,United Kingdom
2,569397,23199,JUMBO BAG APPLES,10,2011-10-04 08:26:00,2.08,12747.0,United Kingdom
3,569397,85099F,JUMBO BAG STRAWBERRY,10,2011-10-04 08:26:00,2.08,12747.0,United Kingdom
4,569397,85099B,JUMBO BAG RED RETROSPOT,10,2011-10-04 08:26:00,2.08,12747.0,United Kingdom


In [120]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

retail_df['Description']= retail_df['Description'].apply(str)

desc = list(retail_df['Description'])

def get_tokens(text):
    tokens = word_tokenize(text)
    return tokens

desc_tokens = [get_tokens(text) for text in desc]

[nltk_data] Downloading package punkt to /home/f_dev/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [121]:
import nltk
nltk.download('stopwords')
english_stopwords = set(nltk.corpus.stopwords.words('english'))
print(english_stopwords)

{'no', "mustn't", 'himself', 'what', 'isn', 'that', 'she', 'being', 'didn', "mightn't", 'aren', 'i', "she's", 'will', 'are', "didn't", 'o', 'itself', 'ma', 'for', 'once', 'so', 'which', 'such', 'we', 'd', "you'll", 'doesn', 'mustn', 'nor', "aren't", "it's", 've', 'the', 'having', "isn't", 'under', 'of', 'more', 't', 'll', 're', 'through', 'theirs', 'to', 'shan', "needn't", 'after', 'during', 'most', 'yourselves', 'don', 'ourselves', 'just', 'had', 'there', 'it', 'won', 'whom', 'how', 'hadn', 'herself', "hadn't", 'in', 'mightn', 'over', 'where', 'was', 'shouldn', 'is', 'ours', 'our', 'this', 'and', 'myself', 'these', "won't", 'before', 's', 'do', 'me', 'did', 'both', 'why', "shan't", 'until', 'themselves', 'be', 'yours', 'while', 'against', 'own', 'between', 'on', 'been', 'its', 'again', 'about', 'some', 'out', 'who', "shouldn't", 'weren', 'wasn', 'hasn', 'their', 'your', 'here', 'haven', 'his', "couldn't", 'ain', 'my', 'off', "that'll", 'you', 'at', 'too', 'y', 'an', 'were', 'any', 'fe

[nltk_data] Downloading package stopwords to /home/f_dev/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
