In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import Row


# Reuse the active Spark session if one exists, otherwise create one
# registered under the application name 'SparkSQL'.
spark = (
    SparkSession.builder
    .appName('SparkSQL')
    .getOrCreate()
)

def mapper(line):
    """Turn one comma-separated text line into a ``Row``.

    The first four comma-separated fields are used, in this order:
    ``ID`` (int), ``name`` (str), ``age`` (int) and ``numFriends`` (int).
    Any further fields are ignored.

    ``int()`` raises ``ValueError`` for a field that is not an integer, and
    indexing raises ``IndexError`` when the line has fewer than four fields.
    """
    parts = line.split(',')
    # The dict literal is evaluated top to bottom, so the fields are converted
    # in the same order as the keyword arguments they become.
    record = {
        'ID': int(parts[0]),
        'name': str(parts[1]),
        'age': int(parts[2]),
        'numFriends': int(parts[3]),
    }
    return Row(**record)
    
# Load the raw file as an RDD of text lines.
lines = spark.sparkContext.textFile("../resources/sources/fakefriends.csv")

# Other cells of this notebook read this same path with header='true', i.e.
# they expect a header line.  mapper() converts the first field with int(),
# which raises ValueError on such a line, so drop the first line when its ID
# field is not an integer.  A file without a header is left untouched.
first_lines = lines.take(1)
if first_lines:
    header = first_lines[0]
    try:
        int(header.split(',')[0])
    except ValueError:
        # Bind the header text as a default so the lambda does not depend on
        # the notebook-level name at execution time.
        lines = lines.filter(lambda line, header=header: line != header)

# Parse every remaining line into a Row(ID, name, age, numFriends).
people = lines.map(mapper)

# Build a (cached) DataFrame from the Rows and expose it to SQL as 'people'.
schemaPeople = spark.createDataFrame(people).cache()
schemaPeople.createOrReplaceTempView('people')

# Rows whose age lies in the inclusive range 13..19; display the first five.
teenagers = spark.sql('SELECT * FROM people WHERE age >= 13 and age <= 19')
teenagers.show(5)

spark.stop()

+---+--------+---+----------+
| ID|    name|age|numFriends|
+---+--------+---+----------+
|  0|    Will| 33|       385|
|  1|Jean-Luc| 26|         2|
|  2|    Hugh| 55|       221|
|  3|  Deanna| 40|       465|
|  4|   Quark| 68|        21|
+---+--------+---+----------+
only showing top 5 rows



In [6]:
from pyspark.sql import SparkSession

# Reuse the active Spark session or create one named 'SparkSQL'.
spark = SparkSession.builder.appName('SparkSQL').getOrCreate()

# Read the CSV, taking column names from its first line and letting Spark
# infer the column types.
people = (
    spark.read
    .option('header', 'true')
    .option('inferSchema', 'true')
    .csv("../resources/sources/fakefriends.csv")
)

print('Here is our inferred schema:')
people.printSchema()

print('Lets display the name column:')
people.select('name').show()

print('Filter out anyone over 21:')
# "Over 21" means age > 21, so 21-year-olds have to be kept: the comparison
# must be <= 21 (the previous < 21 also dropped everyone aged exactly 21).
people.filter(people.age <= 21).show()

print('Group by age')
people.groupBy('age').count().show()

print('Make everyone 10 years older:')
people.select(people.name, people.age + 10).show()

spark.stop()

Group by age
+---+-----+
|age|count|
+---+-----+
| 31|    8|
| 65|    5|
| 53|    7|
| 34|    6|
| 28|   10|
| 26|   17|
| 27|    8|
| 44|   12|
| 22|    7|
| 47|    9|
| 52|   11|
| 40|   17|
| 20|    5|
| 57|   12|
| 54|   13|
| 48|   10|
| 19|   11|
| 64|   12|
| 41|    9|
| 43|    7|
+---+-----+
only showing top 20 rows



In [16]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import try_avg,round

# Reuse the active Spark session or create one named 'numberOfFriendsByAge'.
spark = SparkSession.builder.appName('numberOfFriendsByAge').getOrCreate()

# Read the CSV, taking column names from its first line and letting Spark
# infer the column types.
people = (
    spark.read
    .option('header', 'true')
    .option('inferSchema', 'true')
    .csv("../resources/sources/fakefriends.csv")
)

people.createOrReplaceTempView('people')

# Keep only the two columns the aggregation needs.
filteredPeople = spark.sql('select age,number_of_friends from people')

# Average number of friends per age, rounded to two decimals, ordered by age.
# DataFrame.show() prints and returns None, so the DataFrame is built first
# and displayed in a separate statement; finalDataframe stays usable afterwards.
finalDataframe = (
    filteredPeople
    .groupBy('age')
    .agg(round(try_avg('number_of_friends'), 2).alias('avg_friends'))
    .sort('age')
)
finalDataframe.show()

spark.stop()

+---+-----------+
|age|avg_friends|
+---+-----------+
| 18|     343.38|
| 19|     213.27|
| 20|      165.0|
| 21|     350.88|
| 22|     206.43|
| 23|      246.3|
| 24|      233.8|
| 25|     197.45|
| 26|     242.06|
| 27|     228.13|
| 28|      209.1|
| 29|     215.92|
| 30|     235.82|
| 31|     267.25|
| 32|     207.91|
| 33|     325.33|
| 34|      245.5|
| 35|     211.63|
| 36|      246.6|
| 37|     249.33|
+---+-----------+
only showing top 20 rows



In [24]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as fun 
from pyspark.sql.types import StructType,StructField,StringType,IntegerType,FloatType

# Reuse the active Spark session or create one named 'minimumTemperatures'.
spark = (
    SparkSession.builder
    .appName('minimumTemperatures')
    .getOrCreate()
)

# Explicit schema applied to the CSV columns in order; every column is nullable.
schema = StructType([
    StructField('stationID', StringType(), True),
    StructField('date', IntegerType(), True),
    StructField('measureType', StringType(), True),
    StructField('temperature', FloatType(), True),
])

df = spark.read.schema(schema).csv('../resources/sources/1800.csv')

# Keep only the rows whose measureType is 'TMIN'.
minTemps = df.filter(df.measureType == 'TMIN')

# Only the station and the reading are needed from here on.
stationTemps = minTemps.select('stationID', 'temperature')

# Smallest reading per station; Spark names the result column 'min(temperature)'.
minTempsByStation = stationTemps.groupBy('stationID').min('temperature')

# Add a 'temperature' column holding the minimum scaled by 0.1 and rounded to
# two decimals.  The raw 'min(temperature)' column is kept alongside it.
minTempsByStationF = minTempsByStation.withColumn(
    'temperature',
    fun.round(fun.col('min(temperature)') * 0.1, 2),
)

minTempsByStationF.show()

spark.stop()

+-----------+----------------+-----------+
|  stationID|min(temperature)|temperature|
+-----------+----------------+-----------+
|ITE00100554|          -148.0|      -14.8|
|EZE00100082|          -135.0|      -13.5|
+-----------+----------------+-----------+



In [53]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as fun 
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType

# Reuse the active Spark session or create one named 'aggregateByCustomer'.
spark = (
    SparkSession.builder
    .appName('aggregateByCustomer')
    .getOrCreate()
)

# Explicit schema applied to the CSV columns in order; every column is nullable.
schema = StructType([
    StructField("userID", IntegerType(), True),
    StructField("itemID", IntegerType(), True),
    StructField("amount", DoubleType(), True),
])

# The file is read without a header line, using the schema above.
df = (
    spark.read
    .schema(schema)
    .option("header", "false")
    .csv("../resources/sources/customer-orders.csv")
)

# Sum of 'amount' per userID, rounded to two decimals, largest total first.
transformedDf = (
    df
    .groupBy("userID")
    .agg(fun.round(fun.sum("amount"), 2).alias("totalAmount"))
    .orderBy(fun.desc("totalAmount"))
)

transformedDf.show()

spark.stop()


+------+-----------+
|userID|totalAmount|
+------+-----------+
|    68|    6375.45|
|    73|     6206.2|
|    39|    6193.11|
|    54|    6065.39|
|    71|    5995.66|
|     2|    5994.59|
|    97|    5977.19|
|    46|    5963.11|
|    42|    5696.84|
|    59|    5642.89|
|    41|    5637.62|
|     0|    5524.95|
|     8|    5517.24|
|    85|    5503.43|
|    61|    5497.48|
|    32|    5496.05|
|    58|    5437.73|
|    63|    5415.15|
|    15|    5413.51|
|     6|    5397.88|
+------+-----------+
only showing top 20 rows



In [30]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as fun
from pyspark.sql.types import IntegerType, StringType

# Local Spark session using all available cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("popularMovieDf")
    .getOrCreate()
)

# u.data is read as plain text lines and split on tabs below.
df = (
    spark.read
    .text("../resources/ml-100k/u.data")
)

# u.item is read through the CSV reader so a charset can be specified: the
# MovieSimilarities cell of this notebook reads this same file as ISO-8859-1,
# whereas a plain text read decodes it as UTF-8 and would garble any
# non-ASCII title.  Quoting is disabled so that '|' is the only character
# with a structural meaning, as it was with the former plain split.
dfMovieNames = (
    spark.read
    .option("sep", "|")
    .option("charset", "ISO-8859-1")
    .option("quote", "")
    .csv("../resources/ml-100k/u.item")
)

# Split each rating line once and reuse the expression for all four columns.
rating_fields = fun.split(df.value, "\t")
df_formatted = df.select(
    rating_fields.getItem(0).cast(StringType()).alias("userID"),
    rating_fields.getItem(1).cast(StringType()).alias("movieID"),
    rating_fields.getItem(2).cast(IntegerType()).alias("rating"),
    rating_fields.getItem(3).cast(IntegerType()).alias("timestamp"),
)

# Without a schema the CSV reader names the columns _c0, _c1, ... and types
# them as strings, so movieID stays a string like df_formatted.movieID.
dfMovieNames_formatted = dfMovieNames.select(
    fun.col("_c0").alias("movieID"),
    fun.col("_c1").alias("movieName"),
)

# Number of ratings per movie.
df_ratings_count = df_formatted.groupBy("movieID").count().withColumnRenamed("count", "numRatings")

# Highest rating count, pulled to the driver as a plain Python value.
max_ratings = df_ratings_count.agg(fun.max("numRatings").alias("max_numRatings")).collect()[0]["max_numRatings"]

# Every movie that reaches that count (ties are kept), joined with its name.
df_most_rated = df_ratings_count.filter(fun.col("numRatings") == max_ratings)
joinedDf = df_most_rated.join(dfMovieNames_formatted, on="movieID", how="left")

joinedDf.show()

spark.stop()


+-------+----------+----------------+
|movieID|numRatings|       movieName|
+-------+----------+----------------+
|     50|       583|Star Wars (1977)|
+-------+----------+----------------+



In [5]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as fun

# Local Spark session using all available cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("popularSuperhero")
    .getOrCreate()
)

# Both files are read as plain text, one row per line in a 'value' column.
df = (
    spark.read
    .text("../resources/marvel-dataset/Marvel_Graph.txt")
)

dfNames = (
    spark.read
    .text("../resources/marvel-dataset/Marvel_Names.txt")
)

# One row per space-separated token of every graph line.
df_splitted = df.select(fun.split(fun.col("value"), " ").alias("split_values"))
df_exploded = df_splitted.select(fun.explode(fun.col("split_values")).alias("id"))

# Count how often each id occurs.  Empty tokens are dropped before the
# aggregation, and no ordering is applied here: the join below does not
# guarantee row order, so the only sort that matters is the final one.
df_id_count = (
    df_exploded
    .filter(fun.col('id') != '')
    .groupBy("id")
    .count()
)

# Split each names line once into at most two parts: the id and the rest.
name_parts = fun.split(fun.col("value"), " ", 2)
dfNamesFormatted = dfNames.select(
    name_parts.getItem(0).alias("id"),
    # Remove every double-quote character from the remainder of the line
    # (not only leading/trailing ones).
    fun.regexp_replace(name_parts.getItem(1), '"', "").alias("heroName"),
)

# Attach names to the counts (ids without a name are kept) and show the most
# frequent ids first.
finalDf = df_id_count.join(dfNamesFormatted, how='left', on='id').sort('count', ascending=False)

finalDf.show()

spark.stop()

+----+-----+--------------------+
|  id|count|            heroName|
+----+-----+--------------------+
| 859| 1937|     CAPTAIN AMERICA|
|5306| 1745|SPIDER-MAN/PETER PAR|
|2664| 1532|IRON MAN/TONY STARK |
|5716| 1429|THING/BENJAMIN J. GR|
|6306| 1397|    WOLVERINE/LOGAN |
|3805| 1389|MR. FANTASTIC/REED R|
|2557| 1374|HUMAN TORCH/JOHNNY S|
|4898| 1348|SCARLET WITCH/WANDA |
|5736| 1292|THOR/DR. DONALD BLAK|
| 403| 1283|BEAST/HENRY &HANK& P|
|6066| 1266|             VISION |
|2650| 1247|INVISIBLE WOMAN/SUE |
|2399| 1179|                HAWK|
|1289| 1107|CYCLOPS/SCOTT SUMMER|
|5467| 1098|STORM/ORORO MUNROE S|
| 133| 1097|ANGEL/WARREN KENNETH|
|6148| 1096|WASP/JANET VAN DYNE |
| 154| 1095|ANT-MAN/DR. HENRY J.|
|5046| 1083|SHE-HULK/JENNIFER WA|
|1602| 1082|DR. STRANGE/STEPHEN |
+----+-----+--------------------+
only showing top 20 rows



In [22]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as func
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, LongType
import sys

def computeCosineSimilarity(spark, data):
    """Compute a cosine-similarity score for every (movie1, movie2) pair.

    Args:
        spark: accepted for interface compatibility; not used by the body.
        data: DataFrame with the columns ``movie1``, ``movie2``, ``rating1``
            and ``rating2`` (one row per pair of ratings).

    Returns:
        DataFrame with the columns ``movie1``, ``movie2``, ``score`` and
        ``numPairs``.  ``score`` is sum(rating1*rating2) divided by
        sqrt(sum(rating1^2)) * sqrt(sum(rating2^2)), or 0 when the
        ``denominator != 0`` condition does not hold; ``numPairs`` is the
        count of rating pairs in the group.
    """
    rating1 = func.col("rating1")
    rating2 = func.col("rating2")

    # Per-row products needed by the aggregation below.
    products = (
        data
        .withColumn("xx", rating1 * rating1)
        .withColumn("yy", rating2 * rating2)
        .withColumn("xy", rating1 * rating2)
    )

    # Per movie pair: dot product, product of the two norms, and pair count.
    dot_product = func.sum(func.col("xy"))
    norm1 = func.sqrt(func.sum(func.col("xx")))
    norm2 = func.sqrt(func.sum(func.col("yy")))
    aggregated = products.groupBy("movie1", "movie2").agg(
        dot_product.alias("numerator"),
        (norm1 * norm2).alias("denominator"),
        func.count(func.col("xy")).alias("numPairs"),
    )

    # Guard the division: fall back to 0 unless the denominator is non-zero.
    ratio = func.col("numerator") / func.col("denominator")
    score = func.when(func.col("denominator") != 0, ratio).otherwise(0)

    return (
        aggregated
        .withColumn("score", score)
        .select("movie1", "movie2", "score", "numPairs")
    )

# Get movie name by given movie id
def getMovieName(movieNames, movieId):
    """Return the ``movieTitle`` of the first row whose ``movieID`` equals ``movieId``.

    Args:
        movieNames: DataFrame with ``movieID`` and ``movieTitle`` columns.
        movieId: id to look up.

    Returns:
        The value of the ``movieTitle`` column of the first matching row.

    Raises:
        IndexError: if no row matches.  The exception type is the one the
            previous ``collect()[0]`` produced; it now carries a message
            naming the missing id.
    """
    # first() fetches at most one row instead of collecting every match.
    match = (
        movieNames
        .filter(func.col("movieID") == movieId)
        .select("movieTitle")
        .first()
    )
    if match is None:
        raise IndexError("No movie found with movieID {}".format(movieId))

    return match[0]


# Local Spark session using all available cores.
spark = SparkSession.builder.appName("MovieSimilarities").master("local[*]").getOrCreate()

# Schema for the leading columns read from u.item.
movieNamesSchema = StructType([
    StructField("movieID", IntegerType(), True),
    StructField("movieTitle", StringType(), True),
])

# Schema for the columns of u.data.
moviesSchema = StructType([
    StructField("userID", IntegerType(), True),
    StructField("movieID", IntegerType(), True),
    StructField("rating", IntegerType(), True),
    StructField("timestamp", LongType(), True),
])


# Load movieID / movieTitle from the '|'-separated u.item file.
# The file is decoded with the ISO-8859-1 charset.
movieNames = (
    spark.read
    .option("sep", "|")
    .option("charset", "ISO-8859-1")
    .schema(movieNamesSchema)
    .csv("../resources/ml-100k/u.item")
)

# Load the tab-separated ratings file.
movies = (
    spark.read
    .option("sep", "\t")
    .schema(moviesSchema)
    .csv("../resources/ml-100k/u.data")
)


# Column names are spelled exactly as declared in moviesSchema so the lookups
# do not depend on Spark's case-insensitive column resolution.
ratings = movies.select("userID", "movieID", "rating")

# Emit every pair of movies rated by the same user.
# Self-join on the user; the '<' on movieID keeps each unordered pair once
# and excludes a movie paired with itself.
# Select the movie pair and the corresponding rating pair.
moviePairs = (
    ratings.alias("ratings1")
    .join(
        ratings.alias("ratings2"),
        (func.col("ratings1.userID") == func.col("ratings2.userID"))
        & (func.col("ratings1.movieID") < func.col("ratings2.movieID")),
    )
    .select(
        func.col("ratings1.movieID").alias("movie1"),
        func.col("ratings2.movieID").alias("movie2"),
        func.col("ratings1.rating").alias("rating1"),
        func.col("ratings2.rating").alias("rating2"),
    )
)


# Cached because the similarity table is filtered and sorted afterwards.
moviePairSimilarities = computeCosineSimilarity(spark, moviePairs).cache()

# The former `if len(sys.argv) > 1` guard was removed: the movie id is fixed
# below rather than read from the command line, so the guard only decided
# whether the report ran at all depending on unrelated process arguments.

# Quality thresholds a pair must exceed to be reported.
scoreThreshold = 0.97
coOccurrenceThreshold = 50.0

# Movie to find similar movies for.
movieID = 313

# Pairs that involve this movie and pass both quality thresholds.
filteredResults = moviePairSimilarities.filter(
    ((func.col("movie1") == movieID) | (func.col("movie2") == movieID))
    & (func.col("score") > scoreThreshold)
    & (func.col("numPairs") > coOccurrenceThreshold)
)

# Sort by quality score and keep the ten best.
results = filteredResults.sort(func.col("score").desc()).take(10)

print ("Top 10 similar movies for " + getMovieName(movieNames, movieID))

for result in results:
    # Display the member of the pair that isn't the movie we're looking at
    similarMovieID = result.movie1
    if (similarMovieID == movieID):
        similarMovieID = result.movie2

    print(getMovieName(movieNames, similarMovieID) + "\tscore: " \
          + str(result.score) + "\tstrength: " + str(result.numPairs))

# Stop the session like every other cell of this notebook does; this also
# releases the cached similarity table so a re-run starts from a clean state.
spark.stop()
        


24/12/22 19:47:12 WARN CacheManager: Asked to cache already cached data.


Top 10 similar movies for Titanic (1997)
Shawshank Redemption, The (1994)	score: 0.9774914218747776	strength: 98
Braveheart (1995)	score: 0.9773906047511945	strength: 114
In the Line of Fire (1993)	score: 0.9750576027310787	strength: 70
Sling Blade (1996)	score: 0.974991844395603	strength: 54
Edge, The (1997)	score: 0.9739215608821148	strength: 67
Die Hard (1988)	score: 0.9738113931276092	strength: 87
Primal Fear (1996)	score: 0.9736181709820277	strength: 87
Good Will Hunting (1997)	score: 0.9715894131404956	strength: 160
Fugitive, The (1993)	score: 0.9714155141969335	strength: 119
Glory (1989)	score: 0.9713991588484581	strength: 71
