In [14]:
# Locate the Spark installation and make pyspark importable.
# SPARK_HOME is honoured when it is set (and non-empty); otherwise fall back
# to the install path this notebook was originally written against.
import os

import findspark
findspark.init(os.environ.get("SPARK_HOME") or "/usr/local/spark")
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

# getOrCreate() returns the already-running session if there is one, so
# re-running this cell is safe.
spark = SparkSession.builder.appName("Python Spark SQL basic example").getOrCreate()

In [3]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

In [4]:
# Load the ratings file. The first row supplies the column names and Spark
# infers each column's type from the data.
df2 = spark.read.csv(
    path="ratings.csv",
    header=True,
    inferSchema=True,
)


In [5]:
# Preview the first 20 rows (long cell values are truncated for display).
df2.show(n=20, truncate=True)

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
|     1|      6|   4.0|964982224|
|     1|     47|   5.0|964983815|
|     1|     50|   5.0|964982931|
|     1|     70|   3.0|964982400|
|     1|    101|   5.0|964980868|
|     1|    110|   4.0|964982176|
|     1|    151|   5.0|964984041|
|     1|    157|   5.0|964984100|
|     1|    163|   5.0|964983650|
|     1|    216|   5.0|964981208|
|     1|    223|   3.0|964980985|
|     1|    231|   5.0|964981179|
|     1|    235|   4.0|964980908|
|     1|    260|   5.0|964981680|
|     1|    296|   3.0|964982967|
|     1|    316|   3.0|964982310|
|     1|    333|   5.0|964981179|
|     1|    349|   4.0|964982563|
+------+-------+------+---------+
only showing top 20 rows



In [6]:
# Hold out roughly 20% of the ratings for evaluation.
# A fixed seed is passed so the split -- and therefore the RMSE reported
# further down -- can be reproduced instead of changing on every run.
(training, test) = df2.randomSplit([0.8, 0.2], seed=42)

In [7]:
# Register the ratings DataFrame as a temporary view so it can be queried
# with spark.sql(...) under the name "movies".
df2.createOrReplaceTempView(name="movies")

In [8]:
# Print each column's name, inferred type and nullability.
df2.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)



In [9]:

# Build the recommendation model using ALS on the training data.
# coldStartStrategy="drop" removes predictions for users/items that were not
# seen during training, so the evaluation metric does not come out as NaN.
# seed is fixed so the fitted factors (and the resulting metrics and
# recommendations) are reproducible from run to run.
als = ALS(
    maxIter=5,
    regParam=0.01,
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    coldStartStrategy="drop",
    seed=42,
)
model = als.fit(training)

In [10]:

# Score the fitted model on the held-out ratings and report the RMSE
# between the actual "rating" column and the model's "prediction" column.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction",
)
predictions = model.transform(test)
rmse = evaluator.evaluate(predictions)
print(f"Root-mean-square error = {rmse}")

Root-mean-square error = 1.0784787008679773


In [11]:
# For every user, the 10 highest-scoring movies.
userRecs = model.recommendForAllUsers(numItems=10)
# For every movie, the 10 highest-scoring users.
movieRecs = model.recommendForAllItems(numUsers=10)

In [12]:
# Preview the per-user recommendations (first 20 users, truncated display).
userRecs.show(n=20, truncate=True)

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|   471|[[932, 7.629805],...|
|   463|[[1232, 7.6249375...|
|   496|[[2935, 8.62207],...|
|   148|[[2135, 7.0532713...|
|   540|[[932, 6.678624],...|
|   392|[[1916, 9.89174],...|
|   243|[[674, 10.915435]...|
|    31|[[232, 9.808979],...|
|   516|[[66934, 10.28180...|
|   580|[[56145, 6.779917...|
|   251|[[70994, 10.51590...|
|   451|[[5066, 8.173663]...|
|    85|[[5066, 7.413784]...|
|   137|[[3983, 5.1053014...|
|    65|[[6155, 7.1636295...|
|   458|[[89118, 12.17054...|
|   481|[[1218, 5.385297]...|
|    53|[[2600, 8.327575]...|
|   255|[[102481, 10.6803...|
|   588|[[2261, 8.387946]...|
+------+--------------------+
only showing top 20 rows



In [19]:
# Select the rows whose ISBN is shorter than 8 characters.
# NOTE(review): no cell shown in this notebook registers a `book` view (the
# only view created above is "movies") -- confirm where `book` is defined.
result = spark.sql(
    "SELECT * FROM book where LENGTH(ISBN) < 8"
)

In [20]:
# Preview the first 20 matching rows (truncated display).
result.show(n=20, truncate=True)

+-------+-------+-----------+
|User-ID|   ISBN|Book-Rating|
+-------+-------+-----------+
| 276798|6379702|          5|
| 276875| 273755|          7|
| 276925|6511929|          0|
| 277051|6514251|          0|
| 277186|6514219|          0|
| 277186|6542379|          0|
| 277186|6543545|          7|
| 277186|6550983|          0|
| 277209|5531774|          0|
| 277334|6530400|          6|
| 277371|7137699|          6|
| 277378|6485294|          0|
| 277399| 254794|          9|
| 277478|6715184|          0|
| 277565|6551238|          5|
| 277565|7651864|          6|
| 277701|6546684|          0|
| 277941|6531156|          7|
| 277962|2231115|          0|
| 277962|2232766|          0|
+-------+-------+-----------+
only showing top 20 rows



In [22]:
from pyspark.sql.types import *
# Two-column schema: a nullable string "InvoiceNo" and a nullable "Desc"
# column holding an array of (nullable) strings.
schema = StructType(
    [
        StructField("InvoiceNo", StringType(), nullable=True),
        StructField(
            "Desc",
            ArrayType(StringType(), containsNull=True),
            nullable=True,
        ),
    ]
)

ValueError: Cannot convert column into bool: please use '&' for 'and', '|' for 'or', '~' for 'not' when building DataFrame boolean expressions.

In [None]:
# 80/20 train/test split with a fixed seed so the split is reproducible.
# NOTE(review): `ratings` is not defined in any cell shown above (the loaded
# frame is named `df2`) -- confirm which DataFrame this is meant to split.
(training, test) = ratings.randomSplit([0.8, 0.2], seed=42)