In [4]:
# Attach Google Drive to the Colab runtime so the course data under
# "My Drive" can be read through the local filesystem.
from google.colab import drive

mount_point = '/content/gdrive'
drive.mount(mount_point)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
DIR = '/content/gdrive/My Drive/Spark_course/data/'

!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-eu.apache.org/dist/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz
!tar xf spark-2.4.5-bin-hadoop2.7.tgz
!pip install -q findspark

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.5-bin-hadoop2.7"

import findspark
findspark.init("spark-2.4.5-bin-hadoop2.7")# SPARK_HOME

In [0]:
from __future__ import print_function

from pyspark.ml.regression import LinearRegression

from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors

In [0]:
# Create (or reuse) the SparkSession for the linear regression example.
# No extra .config(...) call is made here; the builder runs with its defaults.
spark = SparkSession.builder.appName("LinearRegression").getOrCreate()

In [0]:
# Load up our data and convert it to the format MLLib expects:
# one (label, features) pair per input line.
def _parse_regression_line(line):
    """Turn one "label,feature" text line into (float label, DenseVector([feature]))."""
    fields = line.split(",")
    return (float(fields[0]), Vectors.dense(float(fields[1])))


# Build the path from the shared DIR constant instead of repeating the
# absolute Drive path; DIR already ends with a path separator.
inputLines = spark.sparkContext.textFile(DIR + "regression.txt")
data = inputLines.map(_parse_regression_line)

In [19]:
# Preview the first 12 parsed (label, features) pairs.
# take(12) brings only the rows we print back to the driver, instead of
# collect()-ing the entire RDD and then discarding everything after row 12.
for result in data.take(12):
    print(result)

(-1.74, DenseVector([1.66]))
(1.24, DenseVector([-1.18]))
(0.29, DenseVector([-0.4]))
(-0.13, DenseVector([0.09]))
(-0.39, DenseVector([0.38]))
(-1.79, DenseVector([1.73]))
(0.71, DenseVector([-0.77]))
(1.39, DenseVector([-1.48]))
(1.15, DenseVector([-1.43]))
(0.13, DenseVector([-0.07]))
(0.05, DenseVector([-0.07]))
(1.9, DenseVector([-1.8]))


In [0]:
# Turn the RDD of (label, features) tuples into a DataFrame whose two
# columns carry the names given in colNames.
colNames = ["label", "features"]
df = spark.createDataFrame(data, schema=colNames)

In [23]:
# Display the first 20 rows, truncating long cell values (the show() defaults).
df.show(n=20, truncate=True)

+-----+--------+
|label|features|
+-----+--------+
|-1.74|  [1.66]|
| 1.24| [-1.18]|
| 0.29|  [-0.4]|
|-0.13|  [0.09]|
|-0.39|  [0.38]|
|-1.79|  [1.73]|
| 0.71| [-0.77]|
| 1.39| [-1.48]|
| 1.15| [-1.43]|
| 0.13| [-0.07]|
| 0.05| [-0.07]|
|  1.9|  [-1.8]|
| 1.48| [-1.42]|
| 0.32|  [-0.3]|
|-1.11|   [1.0]|
| 0.51| [-0.62]|
|-1.58|  [1.45]|
|-0.46|  [0.44]|
|-0.49|  [0.37]|
| 0.31|  [-0.3]|
+-----+--------+
only showing top 20 rows



In [0]:
# Note, there are lots of cases where you can avoid going from an RDD to a DataFrame.
# Perhaps you're importing data from a real database. Or you are using structured streaming
# to get your data.

# Seed for the random split, so the train/test partition (and therefore the
# fitted model and the printed predictions) is the same on every run.
SPLIT_SEED = 42

# Let's split our data into training data and testing data (50% / 50%)
trainTest = df.randomSplit([0.5, 0.5], seed=SPLIT_SEED)
trainingDF, testDF = trainTest

In [0]:
# Now create our linear regression model
lir = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

# Train the model using our training data
model = lir.fit(trainingDF)

# Now see if we can predict values in our test data.
# Generate predictions using our linear regression model for all features in our
# test dataframe; cache the result because it is read more than once below.
fullPredictions = model.transform(testDF).cache()

# Single-column views of the predictions and the "known" correct labels
# (lazy RDDs; nothing is computed until an action is run on them).
predictions = fullPredictions.select("prediction").rdd.map(lambda row: row[0])
labels = fullPredictions.select("label").rdd.map(lambda row: row[0])

# Collect (prediction, label) pairs straight from the rows of one DataFrame.
# Selecting both columns together keeps each prediction with its own label
# by construction, rather than re-pairing two separately derived RDDs with
# RDD.zip, which depends on both sides having identical partitioning.
predictionAndLabel = (
    fullPredictions.select("prediction", "label")
    .rdd.map(lambda row: (row[0], row[1]))
    .collect()
)

In [26]:
# Show every (predicted, actual) pair, one per line
for pair in predictionAndLabel:
    print(pair)

# Shut the Spark session down now that the regression example is finished
spark.stop()

(-2.6675460087431055, -3.74)
(-1.86944641571897, -2.36)
(-1.6699215174629363, -2.29)
(-1.5986626252286384, -2.26)
(-1.449018951536613, -2.07)
(-1.4347671730897535, -2.0)
(-1.3777600593023152, -1.94)
(-1.4062636161960342, -1.94)
(-1.2993752778445877, -1.91)
(-1.4062636161960342, -1.87)
(-1.3065011670680176, -1.8)
(-1.2281163856102897, -1.79)
(-1.1782351610462813, -1.77)
(-1.1568574933759919, -1.65)
(-1.0285914873542559, -1.58)
(-1.1711092718228515, -1.58)
(-1.1924869394931408, -1.53)
(-1.0357173765776857, -1.47)
(-1.0000879304605368, -1.46)
(-1.128353936482273, -1.42)
(-1.0000879304605368, -1.36)
(-0.9217031490028093, -1.34)
(-1.0499691550245454, -1.33)
(-0.8076889214279327, -1.3)
(-0.8076889214279327, -1.29)
(-1.0428432658011155, -1.29)
(-0.8433183675450816, -1.27)
(-0.8290665890982221, -1.26)
(-0.8504442567685114, -1.25)
(-0.7720594753107839, -1.24)
(-0.8433183675450816, -1.23)
(-0.8504442567685114, -1.22)
(-0.8789478136622305, -1.17)
(-0.8860737028856603, -1.16)
(-0.8148148106513624,

In [0]:
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql import Row, SparkSession
from pyspark.sql.functions import regexp_extract

In [0]:
# Create (or reuse) the SparkSession for the structured streaming example.
# No extra .config(...) call is made here; the builder runs with its defaults.
spark = SparkSession.builder.appName("StructuredStreaming").getOrCreate()

# Monitor the logs directory for new log data, and read in the raw lines as accessLines.
# The directory is resolved from the shared DIR constant instead of repeating
# the absolute Drive path.
accessLines = spark.readStream.text(os.path.join(DIR, "logs"))

In [0]:
# Regular expressions that pull individual fields out of one access-log line
# (common log format). Each pattern exposes the wanted text as a capture group.

# Trailing run of digits at the very end of the line.
contentSizeExp = r'\s(\d+)$'
# Three digits surrounded by whitespace.
statusExp = r'\s(\d{3})\s'
# Quoted request: group 1 = method, group 2 = endpoint, group 3 = protocol (may be empty).
generalExp = r'\"(\S+)\s(\S+)\s*(\S*)\"'
# Bracketed timestamp such as [01/Aug/1995:00:00:01 -0400].
# The UTC offset may be positive or negative; accepting only "-" made every
# "+hhmm" timestamp fail to match and extract as an empty string.
timeExp = r'\[(\d{2}/\w{3}/\d{4}:\d{2}:\d{2}:\d{2} [+-]\d{4})\]'
# Leading non-whitespace token that contains at least one dot.
# NOTE(review): `[\S+\.]` is a character class, not a repeated group, so this is
# looser than a strict host/IP pattern. Left unchanged to preserve matching; confirm intent.
hostExp = r'(^\S+\.[\S+\.]+\S+)\s'

In [0]:
# Project each raw log line (column `value`) into one column per field.
# status and content_size are cast to integers; the rest stay as strings.
logColumns = [
    regexp_extract('value', hostExp, 1).alias('host'),
    regexp_extract('value', timeExp, 1).alias('timestamp'),
    regexp_extract('value', generalExp, 1).alias('method'),
    regexp_extract('value', generalExp, 2).alias('endpoint'),
    regexp_extract('value', generalExp, 3).alias('protocol'),
    regexp_extract('value', statusExp, 1).cast('integer').alias('status'),
    regexp_extract('value', contentSizeExp, 1).cast('integer').alias('content_size'),
]
logsDF = accessLines.select(*logColumns)

In [0]:
# Maintain a running tally of accesses per HTTP status code
statusCountsDF = logsDF.groupBy("status").count()

# Start the streaming query: the complete result table is written to the
# console sink, under the query name "counts"
query = (
    statusCountsDF.writeStream
    .outputMode("complete")
    .format("console")
    .queryName("counts")
    .start()
)

In [39]:
# Run forever until terminated. Interrupting the cell raises out of
# awaitTermination(), so the cleanup lives in `finally`; otherwise the
# streaming query and the Spark session would be left running.
try:
    query.awaitTermination()
finally:
    # Stop the streaming query first, then cleanly shut down the session
    query.stop()
    spark.stop()

KeyboardInterrupt: ignored