In [5]:
import findspark
findspark.init()

from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql import functions as func
from pyspark.sql.types import StructType,StructField,StringType,IntegerType,FloatType,LongType
import codecs

In [2]:
spark = SparkSession.builder.appName("PopularHero").getOrCreate()

schema = StructType([\
                    StructField("id",IntegerType(),True),\
                    StructField("name",StringType(),True)\
    ])

names = spark.read.schema(schema).option("sep"," ").csv("file:///SparkCourse/Marvel-names.txt")

lines = spark.read.text("file:///SparkCourse/Marvel-graph.txt")

# we trim each line of whitespace
connections = lines.withColumn("id",func.split(func.col("value"), " ")[0]) \
    .withColumn("connections",func.size(func.split(func.col("value")," "))-1) \
        .groupby("id").agg(func.sum("connections").alias("connections"))

mostPopular = connections.sort(func.col("connections").desc()).first()
mostPopularName = names.filter(func.col("id") == mostPopular[0]).select("name").first()

print(mostPopularName[0] + "is the most popular hero" + str(mostPopular[1]))



CAPTAIN AMERICAis the most popular hero1937


In [5]:
spark.stop()

In [23]:
spark = SparkSession.builder\
                    .master("local[2]")\
                    .appName("Examples")\
                    .getOrCreate()

In [6]:
columns = ["language","users_count"]
data = [("Java", "20000"), ("Python", "100000"), ("Scala", "3000")]
# creating dataframe from rdd
rdd = spark.sparkContext.parallelize(data)

In [7]:
dfFromRDD1 = rdd.toDF()
dfFromRDD1.printSchema()

root
 |-- _1: string (nullable = true)
 |-- _2: string (nullable = true)



In [8]:
columns = ["language","user_count"]
dfFromRDD1 = rdd.toDF(columns)
dfFromRDD1.printSchema()

root
 |-- language: string (nullable = true)
 |-- user_count: string (nullable = true)



In [None]:
# otra forma
dfFromRDD2 = spark.createDataFrame(rdd).toDF(*columns)
dfFromRDD2.printSchema()

# crear df vacio
df_vacio = spark.createDataFrame(spark.sparkContext.emptyRDD(),schema) # schema seria la estructura:dtype,nombre columnas..
df_vacio1 = spark.sparkContext.parallelize([]).toDF(schema)
df_vacio2 = spark.createDataFrame([],schema)

In [15]:
spark.stop()

In [20]:
# pandas: se puede convertir un pyspark dataframe a panda, para operar con el con ml, pero despues habra que convertirlo de nuevo a 
# pyspark dataframe para que tengas todas las ventajas de procesamiento de spark ( con pandas es mas lento)
data = [("James","","Smith","36636","M",60000),
        ("Michael","Rose","","40288","M",70000),
        ("Robert","","Williams","42114","",400000),
        ("Maria","Anne","Jones","39192","F",500000),
        ("Jen","Mary","Brown","","F",0)]
columns = ["first_name","middle_name","last_name","dob","gender","salary"]
#df
pysparkDF = spark.createDataFrame(data=data,schema=columns)
pysparkDF.printSchema()
pysparkDF.show(truncate=True)

root
 |-- first_name: string (nullable = true)
 |-- middle_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)

+----------+-----------+---------+-----+------+------+
|first_name|middle_name|last_name|  dob|gender|salary|
+----------+-----------+---------+-----+------+------+
|     James|           |    Smith|36636|     M| 60000|
|   Michael|       Rose|         |40288|     M| 70000|
|    Robert|           | Williams|42114|      |400000|
|     Maria|       Anne|    Jones|39192|     F|500000|
|       Jen|       Mary|    Brown|     |     F|     0|
+----------+-----------+---------+-----+------+------+



### Convert PySpark Dataframe to Pandas DataFrame
PySpark DataFrame provides a method toPandas() to convert it Python Pandas DataFrame.

toPandas() results in the collection of all records in the PySpark DataFrame to the driver program and should be done on a small subset of the data. running on larger dataset’s results in memory error and crashes the application.

In [21]:
pandasDf = pysparkDF.toPandas()
print(pandasDf)

  first_name middle_name last_name    dob gender  salary
0      James                 Smith  36636      M   60000
1    Michael        Rose            40288      M   70000
2     Robert              Williams  42114         400000
3      Maria        Anne     Jones  39192      F  500000
4        Jen        Mary     Brown             F       0


In [22]:
spark.stop()

In [26]:
rdd = spark.sparkContext.textFile(r"C:\SparkCourse\book.txt")
# flatmap
rdd2 = rdd.flatMap(lambda x:x.split(" "))

In [30]:
# map
rdd3 = rdd2.map(lambda x:(x,1))
# reduce by key
rdd4 = rdd3.reduceByKey(lambda a,b:(a+b))
# sort
rdd5 = rdd4.map(lambda x:(x[1],x[0])).sortByKey()


In [31]:
final_df = spark.createDataFrame(data=rdd5,schema=["palabras","count"])

In [32]:
final_df.show(5)

+--------+----------------+
|palabras|           count|
+--------+----------------+
|       1|Self-Employment:|
|       1|      Disclaimer|
|       1|        Decision|
|       1|          Career|
|       1|             Ego|
+--------+----------------+
only showing top 5 rows



In [35]:
print("count " +str(final_df.count()))

count 6993


In [3]:
spark.stop()

NameError: name 'spark' is not defined

In [15]:
# repartition and colaesence
spark = SparkSession.builder.appName("ejemplos").master("local[5]").getOrCreate()


In [8]:
# broadcast variables
states = {"NY":"New York", "CA":"California", "FL":"Florida"}
broadcastStates = spark.sparkContext.broadcast(states)

data = [("James","Smith","USA","CA"),
("Michael","Rose","USA","NY"),
("Robert","Williams","USA","CA"),
("Maria","Jones","USA","FL")
]

columns = ["firstname","lastname","country","state"]

df = spark.createDataFrame(data=data,schema=columns)
df.printSchema()
df.show(truncate=False)

def state_convert(code):
    return broadcastStates.value[code]

result = df.rdd.map(lambda x:(x[0],x[1],x[2],state_convert(x[3]))).toDF(columns)
result.show(truncate=False)


root
 |-- firstname: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- country: string (nullable = true)
 |-- state: string (nullable = true)

+---------+--------+-------+-----+
|firstname|lastname|country|state|
+---------+--------+-------+-----+
|James    |Smith   |USA    |CA   |
|Michael  |Rose    |USA    |NY   |
|Robert   |Williams|USA    |CA   |
|Maria    |Jones   |USA    |FL   |
+---------+--------+-------+-----+

+---------+--------+-------+----------+
|firstname|lastname|country|state     |
+---------+--------+-------+----------+
|James    |Smith   |USA    |California|
|Michael  |Rose    |USA    |New York  |
|Robert   |Williams|USA    |California|
|Maria    |Jones   |USA    |Florida   |
+---------+--------+-------+----------+



In [9]:
spark.stop()

In [11]:
# acumulator
accum = spark.sparkContext.accumulator(0)
rdd = spark.sparkContext.parallelize([1,2,3,4,5])
rdd.foreach(lambda x:accum.add(x))
print(accum.value)

accuSum = spark.sparkContext.accumulator(0)
def countFun(x):
    global accuSum
    accuSum+=x

rdd.foreach(countFun)
print(accuSum.value)

accumCount = spark.sparkContext.accumulator(0)
rdd2 = spark.sparkContext.parallelize([1,2,3,4,5])
rdd2.foreach(lambda x:accumCount.add(x))
print(accumCount.value)





15
15
15


In [13]:
spark.stop()    

In [14]:
row = Row("James",40)
print(row[0] + "," + str(row[1]))

row2 = Row(name="Alice",age=34)
print(row2.name)

James,40
Alice


In [16]:
# select
data = [("James","Smith","USA","CA"),
    ("Michael","Rose","USA","NY"),
    ("Robert","Williams","USA","CA"),
    ("Maria","Jones","USA","FL")
  ]
columns = ["firstname","lastname","country","state"]

df = spark.createDataFrame(data=data,schema=columns)
df.show(truncate=False)

+---------+--------+-------+-----+
|firstname|lastname|country|state|
+---------+--------+-------+-----+
|James    |Smith   |USA    |CA   |
|Michael  |Rose    |USA    |NY   |
|Robert   |Williams|USA    |CA   |
|Maria    |Jones   |USA    |FL   |
+---------+--------+-------+-----+



In [17]:
df.select("firstname").show() # igualmente puedo usar df.firstname o func.col("firstname")

+---------+
|firstname|
+---------+
|    James|
|  Michael|
|   Robert|
|    Maria|
+---------+



In [18]:
# collect (recupera los datos, por eso mejor usarlo en small data y despues de hacer groupby, filter...)
dataCollect = df.collect()
print(dataCollect)

[Row(firstname='James', lastname='Smith', country='USA', state='CA'), Row(firstname='Michael', lastname='Rose', country='USA', state='NY'), Row(firstname='Robert', lastname='Williams', country='USA', state='CA'), Row(firstname='Maria', lastname='Jones', country='USA', state='FL')]


In [19]:
for row in dataCollect:
    print("my name is ",row["firstname"], " y vivo en ",row["state"])

my name is  James  y vivo en  CA
my name is  Michael  y vivo en  NY
my name is  Robert  y vivo en  CA
my name is  Maria  y vivo en  FL


In [20]:
spark.stop()

In [3]:
from pyspark.ml.recommendation import ALS
import sys
import codecs

def loadMoviesNames():
    movieNames = {}
    with codecs.open("C:/SparkCourse/ml-100k/u.ITEM","r",encoding='ISO-8859-1',errors='ignore' ) as f:
        for line in f:
            fields = line.split('|')
            movieNames[int(fields[0])] = fields[1]
    return movieNames

In [8]:
spark = SparkSession.builder.appName("Alsexample").getOrCreate()

moviesSchema = StructType([ \
                     StructField("userID", IntegerType(), True), \
                     StructField("movieID", IntegerType(), True), \
                     StructField("rating", IntegerType(), True), \
                     StructField("timestamp", LongType(), True)])

names = loadMoviesNames()

In [9]:
ratings = spark.read.option("sep","\t").schema(moviesSchema)\
    .csv("C:/SparkCourse/ml-100k/u.data")

print("Training recomendation model ...")

als = ALS().setMaxIter(5).setRegParam(0.01).setUserCol("userID")\
    .setItemCol("movieID").setRatingCol("rating")

model = als.fit(ratings)


Training recomendation model ...


In [18]:
# manually construct a dataframe of the user ID we wants rec for
userID = 10
userSchema = StructType([StructField("userID",IntegerType(),True)])
users = spark.createDataFrame([[userID,]],userSchema)

recommendations = model.recommendForUserSubset(users, 10).collect()

print("Top 10 recommendations for user ID" + str(userID))

for userRecs in recommendations:
    myRecs = userRecs[1]
    for rec in myRecs:
        movie = rec[0]
        rating = rec[1]
        movieName = names[movie]
        print(movieName + str(rating))

Top 10 recommendations for user ID1
Secret Agent, The (1996)7.2776384353637695
American Dream (1990)6.270341873168945
Traveller (1997)6.146881580352783
Dangerous Beauty (1998)6.122129440307617
Angel Baby (1995)6.0209856033325195
Pather Panchali (1955)6.001928329467773
Underneath, The (1995)5.852176666259766
Aparajito (1956)5.84736442565918
Fear of a Black Hat (1993)5.591094493865967
Four Days in September (1997)5.564853191375732


In [19]:
spark.stop()

In [3]:
from __future__ import print_function
from pyspark.ml.regression import LinearRegression
from pyspark.ml.linalg import Vectors

if __name__ == "__main__":
    #create spark session
    spark = SparkSession.builder.appName("linear").getOrCreate()
    # load data and convert it to the formal mlib expects
    inputLines = spark.sparkContext.textFile("C:/SparkCourse/regression.txt")
    data = inputLines.map(lambda x: x.split(",")).map(lambda x:(float(x[0]),Vectors.dense(float(x[1]))))
    # convert this rdd to a dataframe
    colNames = ["label","features"]
    df = data.toDF(colNames)
    # there a lot of cases where you can avoid going from an Rdd to a dataframe(e.g you are importing from a real dataset o streaming)
    # split data
    trainTest = df.randomSplit([0.5,0.5])
    trainingDF = trainTest[0]
    testDf = trainTest[1]
    # now create linear regresion model
    lir = LinearRegression(maxIter=10,regParam=0.3,elasticNetParam=0.8)
    # train model
    model = lir.fit(trainingDF)
    # now lets see if we can predict with out test data
    fullPrediction = model.transform(testDf).cache()
    predictions = fullPrediction.select("prediction").rdd.map(lambda x:x[0])
    labels = fullPrediction.select("label").rdd.map(lambda x:x[0])
    # zip together
    predictionAndLabel = predictions.zip(labels).collect()
    # print
    for prediction in predictionAndLabel:
        print(prediction)

    spark.stop()





(-2.6468024342930363, -3.74)
(-1.814069449462319, -2.58)
(-1.658814147205745, -2.29)
(-1.5459012001100543, -2.27)
(-1.39064589785348, -2.09)
(-1.4400453122078445, -2.07)
(-1.4259311938208832, -2.0)
(-1.369474720273038, -1.94)
(-1.2918470691447508, -1.91)
(-1.3130182467251927, -1.91)
(-1.2212764772099443, -1.79)
(-1.1930482404360216, -1.75)
(-1.0307358789859664, -1.67)
(-1.2142194180164636, -1.61)
(-1.1648200036620988, -1.58)
(-1.1859911812425408, -1.53)
(-0.9742794054381211, -1.48)
(-1.0307358789859664, -1.47)
(-0.995450583018563, -1.46)
(-0.995450583018563, -1.36)
(-0.917822931890276, -1.34)
(-0.8049099847945854, -1.3)
(-0.833138221568508, -1.3)
(-0.8401952807619887, -1.27)
(-0.85430939914895, -1.26)
(-0.8472523399554693, -1.25)
(-0.7696246888271823, -1.24)
(-0.8401952807619887, -1.23)
(-0.8825376359228726, -1.16)
(-0.8119670439880661, -1.14)
(-0.7766817480206629, -1.12)
(-0.8684235175359113, -1.11)
(-0.6496546825380111, -1.1)
(-0.7625676296337016, -1.1)
(-0.833138221568508, -1.09)
(-

In [4]:
spark.stop()

### decision tree

In [15]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import DecisionTreeRegressor

if __name__ == "__main__":

    # Create a SparkSession (Note, the config section is only for Windows!)
    spark = SparkSession.builder.appName("DecisionTree").getOrCreate()

    
    # Load up data as dataframe
    data = spark.read.option("header", "true").option("inferSchema", "true")\
        .csv('C:\\SparkCourse\\realestate.csv')

    assembler = VectorAssembler().setInputCols(["HouseAge", "DistanceToMRT", \
                               "NumberConvenienceStores"]).setOutputCol("features")
    
    df = assembler.transform(data).select("PriceOfUnitArea", "features")

    # Let's split our data into training data and testing data
    trainTest = df.randomSplit([0.5, 0.5])
    trainingDF = trainTest[0]
    testDF = trainTest[1]

    # Now create our decision tree
    dtr = DecisionTreeRegressor().setFeaturesCol("features").setLabelCol("PriceOfUnitArea")

    # Train the model using our training data
    model = dtr.fit(trainingDF)

    # Now see if we can predict values in our test data.
    # Generate predictions using our decision tree model for all features in our
    # test dataframe:
    fullPredictions = model.transform(testDF).cache()

    # Extract the predictions and the "known" correct labels.
    predictions = fullPredictions.select("prediction").rdd.map(lambda x: x[0])
    labels = fullPredictions.select("PriceOfUnitArea").rdd.map(lambda x: x[0])

    # Zip them together
    predictionAndLabel = predictions.zip(labels).collect()

    # Print out the predicted and actual values for each point
    for prediction in predictionAndLabel:
      print(prediction)


    # Stop the session
    spark.stop()
    
    spark.stop()

(40.0, 7.6)
(16.36, 11.6)
(24.69999999999999, 12.2)
(16.36, 12.8)
(18.9, 13.0)
(14.233333333333334, 13.7)
(14.033333333333331, 14.4)
(14.033333333333331, 15.0)
(16.36, 15.6)
(14.033333333333331, 15.6)
(16.36, 15.9)
(16.36, 16.7)
(14.033333333333331, 17.4)
(24.69999999999999, 17.4)
(23.75714285714286, 17.7)
(16.36, 18.2)
(14.233333333333334, 18.3)
(16.36, 18.8)
(24.69999999999999, 18.8)
(14.233333333333334, 19.0)
(14.233333333333334, 19.1)
(18.9, 19.2)
(16.36, 20.0)
(16.36, 20.7)
(23.75714285714286, 20.7)
(32.56, 20.8)
(27.772727272727273, 20.9)
(27.772727272727273, 21.3)
(27.772727272727273, 21.7)
(27.772727272727273, 21.8)
(35.69999999999999, 22.0)
(27.772727272727273, 22.1)
(14.033333333333331, 22.1)
(32.56, 22.3)
(14.033333333333331, 22.6)
(27.772727272727273, 22.8)
(27.772727272727273, 22.9)
(40.0, 23.0)
(27.772727272727273, 23.1)
(14.233333333333334, 23.1)
(27.772727272727273, 23.2)
(27.772727272727273, 23.6)
(27.772727272727273, 23.7)
(14.233333333333334, 23.8)
(27.77272727272727

In [9]:
spark.stop()