In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
DIR = '/content/gdrive/My Drive/Spark_course/data/'

!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-eu.apache.org/dist/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz
!tar xf spark-2.4.5-bin-hadoop2.7.tgz
!pip install -q findspark

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.5-bin-hadoop2.7"

import findspark
findspark.init("spark-2.4.5-bin-hadoop2.7")# SPARK_HOME


In [0]:
from pyspark.sql import SparkSession
from pyspark.sql import Row

import collections

# Create a SparkSession (Note, the config section is only for Windows!)
spark = SparkSession.builder.appName("SparkSQL").getOrCreate()

In [0]:
def mapper(line):
    fields = line.split(',')
    return Row(ID=int(fields[0]), name=str(fields[1].encode("utf-8")), age=int(fields[2]), numFriends=int(fields[3]))

In [0]:
lines = spark.sparkContext.textFile("/content/gdrive/My Drive/Spark_course/data/fakefriends.csv")
people = lines.map(mapper)

In [10]:
i = 0

for result in people.collect():
  print(result)

  if i > 10:
    break

  i +=1

Row(ID=0, age=33, name="b'Will'", numFriends=385)
Row(ID=1, age=26, name="b'Jean-Luc'", numFriends=2)
Row(ID=2, age=55, name="b'Hugh'", numFriends=221)
Row(ID=3, age=40, name="b'Deanna'", numFriends=465)
Row(ID=4, age=68, name="b'Quark'", numFriends=21)
Row(ID=5, age=59, name="b'Weyoun'", numFriends=318)
Row(ID=6, age=37, name="b'Gowron'", numFriends=220)
Row(ID=7, age=54, name="b'Will'", numFriends=307)
Row(ID=8, age=38, name="b'Jadzia'", numFriends=380)
Row(ID=9, age=27, name="b'Hugh'", numFriends=181)
Row(ID=10, age=53, name="b'Odo'", numFriends=191)
Row(ID=11, age=57, name="b'Ben'", numFriends=372)


In [0]:
# Infer the schema, and register the DataFrame as a table.
schemaPeople = spark.createDataFrame(people).cache()
schemaPeople.createOrReplaceTempView("people")

In [14]:
schemaPeople.show()

+---+---+-----------+----------+
| ID|age|       name|numFriends|
+---+---+-----------+----------+
|  0| 33|    b'Will'|       385|
|  1| 26|b'Jean-Luc'|         2|
|  2| 55|    b'Hugh'|       221|
|  3| 40|  b'Deanna'|       465|
|  4| 68|   b'Quark'|        21|
|  5| 59|  b'Weyoun'|       318|
|  6| 37|  b'Gowron'|       220|
|  7| 54|    b'Will'|       307|
|  8| 38|  b'Jadzia'|       380|
|  9| 27|    b'Hugh'|       181|
| 10| 53|     b'Odo'|       191|
| 11| 57|     b'Ben'|       372|
| 12| 54|   b'Keiko'|       253|
| 13| 56|b'Jean-Luc'|       444|
| 14| 43|    b'Hugh'|        49|
| 15| 36|     b'Rom'|        49|
| 16| 22|  b'Weyoun'|       323|
| 17| 35|     b'Odo'|        13|
| 18| 45|b'Jean-Luc'|       455|
| 19| 60|  b'Geordi'|       246|
+---+---+-----------+----------+
only showing top 20 rows



In [15]:
# SQL can be run over DataFrames that have been registered as a table.
teenagers = spark.sql("SELECT * FROM people WHERE age >= 13 AND age <= 19")

# The results of SQL queries are RDDs and support all the normal RDD operations.
for teen in teenagers.collect():
  print(teen)

# We can also use functions instead of SQL queries:
schemaPeople.groupBy("age").count().orderBy("age").show()

Row(ID=21, age=19, name="b'Miles'", numFriends=268)
Row(ID=52, age=19, name="b'Beverly'", numFriends=269)
Row(ID=54, age=19, name="b'Brunt'", numFriends=5)
Row(ID=106, age=18, name="b'Beverly'", numFriends=499)
Row(ID=115, age=18, name="b'Dukat'", numFriends=397)
Row(ID=133, age=19, name="b'Quark'", numFriends=265)
Row(ID=136, age=19, name="b'Will'", numFriends=335)
Row(ID=225, age=19, name="b'Elim'", numFriends=106)
Row(ID=304, age=19, name="b'Will'", numFriends=404)
Row(ID=341, age=18, name="b'Data'", numFriends=326)
Row(ID=366, age=19, name="b'Keiko'", numFriends=119)
Row(ID=373, age=19, name="b'Quark'", numFriends=272)
Row(ID=377, age=18, name="b'Beverly'", numFriends=418)
Row(ID=404, age=18, name="b'Kasidy'", numFriends=24)
Row(ID=409, age=19, name="b'Nog'", numFriends=267)
Row(ID=439, age=18, name="b'Data'", numFriends=417)
Row(ID=444, age=18, name="b'Keiko'", numFriends=472)
Row(ID=492, age=19, name="b'Dukat'", numFriends=36)
Row(ID=494, age=18, name="b'Kasidy'", numFriends=194)

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql import functions

In [0]:
def loadMovieNames():
    movieNames = {}
    with open("/content/gdrive/My Drive/Spark_course/data/ml-100k/u.item", encoding = "ISO-8859-1") as f:
        for line in f:
            fields = line.split('|')
            movieNames[int(fields[0])] = fields[1]
    return movieNames

In [0]:
# Create a SparkSession (the config bit is only for Windows!)
spark = SparkSession.builder.appName("PopularMovies").getOrCreate()

In [0]:
# Load up our movie ID -> name dictionary
nameDict = loadMovieNames()

In [0]:
# Get the raw data
lines = spark.sparkContext.textFile("/content/gdrive/My Drive/Spark_course/data/ml-100k/u.data")
# Convert it to a RDD of Row objects
movies = lines.map(lambda x: Row(movieID =int(x.split()[1])))
# Convert that to a DataFrame
movieDataset = spark.createDataFrame(movies)

In [27]:
movieDataset.show()

+-------+
|movieID|
+-------+
|    242|
|    302|
|    377|
|     51|
|    346|
|    474|
|    265|
|    465|
|    451|
|     86|
|    257|
|   1014|
|    222|
|     40|
|     29|
|    785|
|    387|
|    274|
|   1042|
|   1184|
+-------+
only showing top 20 rows



In [0]:
# Some SQL-style magic to sort all movies by popularity in one line!
topMovieIDs = movieDataset.groupBy("movieID").count().orderBy("count", ascending=False).cache()

In [31]:
topMovieIDs.show(50)

+-------+-----+
|movieID|count|
+-------+-----+
|     50|  583|
|    258|  509|
|    100|  508|
|    181|  507|
|    294|  485|
|    286|  481|
|    288|  478|
|      1|  452|
|    300|  431|
|    121|  429|
|    174|  420|
|    127|  413|
|     56|  394|
|      7|  392|
|     98|  390|
|    237|  384|
|    117|  378|
|    172|  367|
|    222|  365|
|    313|  350|
|    204|  350|
|    405|  344|
|     79|  336|
|    210|  331|
|    151|  326|
|    173|  324|
|     69|  321|
|    748|  316|
|    168|  316|
|    269|  315|
|    257|  303|
|    195|  301|
|    423|  300|
|      9|  299|
|    276|  298|
|    318|  298|
|     22|  297|
|    302|  297|
|     96|  295|
|    328|  295|
|     25|  293|
|    118|  293|
|     15|  293|
|    183|  291|
|    216|  290|
|    176|  284|
|     64|  283|
|    202|  280|
|    234|  280|
|    191|  276|
+-------+-----+
only showing top 50 rows



In [32]:
# Grab the top 10
top10 = topMovieIDs.take(10)

# Print the results
print("\n")
for result in top10:
    # Each row has movieID, count as above.
    print("%s: %d" % (nameDict[result[0]], result[1]))

# Stop the session
spark.stop()



Star Wars (1977): 583
Contact (1997): 509
Fargo (1996): 508
Return of the Jedi (1983): 507
Liar Liar (1997): 485
English Patient, The (1996): 481
Scream (1996): 478
Toy Story (1995): 452
Air Force One (1997): 431
Independence Day (ID4) (1996): 429
