In [0]:
from pyspark.sql import SparkSession #Para poder trabajar con Spark vamos a tener que levantar una sesión
import pyspark.sql.functions as f #Esta línea es muy importante ya que aqui se encuentran todas las funciones que vamos a poder usar
from pyspark.sql.types import IntegerType

In [0]:
#Leo una tabla de un fileStore
file_location = "/FileStore/tables/titanic_train.txt"
# The applied options are for CSV files. For other file types, these will be ignored.
df = spark.read.format("csv") \
  .option("inferSchema", "false") \
  .option("header", "true") \
  .option("sep", ",") \
  .load(file_location)

In [0]:
df.columns

Out[4]: ['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [0]:
for column in ["Age", "Survived", "SibSp", "Pclass", "Fare"]:
    df = df.withColumn(column, df[column].cast(IntegerType()))
df.printSchema()

root
 |-- PassengerId: string (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: string (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: integer (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [0]:
#Vamos a usar la funcion agg para poder crear muchas agregaciones al mismo tiempo
#Vamos a usar las funciones:
#1) f.min y f.max, nos darán los mínimos y máximos del campo que deseamos
#2) f.alias, esta función servirá para renombrar el campo 
display(df.groupBy("Sex").agg(f.min(f.col("Age")).alias("minAge"), f.max(f.col("Age")).alias("maxAge"), 
                      f.median(f.col("Age")).alias("medianAge"), f.mean("Fare").alias("meanFare"),
                      f.mean(f.col("Age"))))


Sex,minAge,maxAge,medianAge,meanFare,avg(Age)
female,0,63,27.0,44.06369426751592,27.90421455938697
male,0,80,29.0,25.103986135181977,30.70198675496689


In [0]:
#Vamos a trabajar con el sort. A la función se le pasa el nombre o los nombres de campos y se dice si es ascendente o descendente 
display(df.groupby("Sex").count().sort("count", ascending = False))

Sex,count
male,577
female,314


In [0]:
#Para crear una tabla de doble entrada puedo trabajar con la función .crosstab 
display(df.crosstab("Sex", "Survived"))

Sex_Survived,0,1
female,81,233
male,468,109


In [0]:
#Otra forma de filtrar es utilizando la opción where
display(df.where("Pclass == 3"))

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7,,S
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7,,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8,,S
6,0,3,"Moran, Mr. James",male,,0,0,330877,8,,Q
8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21,,S
9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11,,S
11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,16,G6,S
13,0,3,"Saundercock, Mr. William Henry",male,20.0,0,0,A/5. 2151,8,,S
14,0,3,"Andersson, Mr. Anders Johan",male,39.0,1,5,347082,31,,S
15,0,3,"Vestrom, Miss. Hulda Amanda Adolfina",female,14.0,0,0,350406,7,,S


In [0]:
df.createOrReplaceTempView("titanic")
a = spark.sql("select * from titanic where Pclass = 3")
display(a)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7,,S
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7,,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8,,S
6,0,3,"Moran, Mr. James",male,,0,0,330877,8,,Q
8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21,,S
9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11,,S
11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,16,G6,S
13,0,3,"Saundercock, Mr. William Henry",male,20.0,0,0,A/5. 2151,8,,S
14,0,3,"Andersson, Mr. Anders Johan",male,39.0,1,5,347082,31,,S
15,0,3,"Vestrom, Miss. Hulda Amanda Adolfina",female,14.0,0,0,350406,7,,S


In [0]:
b = spark.sql ("select a.*, row_number() over(partition by Sex order by Age desc) as ranking from titanic a ")
display(b)

In [0]:
#Para realizar correlaciones, se puede usar la función corr

display(df.select(f.corr("Age", "Fare")))

"corr(Age, Fare)"
0.0965038252651772


In [0]:
#Trabajar con quantiles
bucketsNumber = 10
buckets = [x / bucketsNumber for x in range(1, bucketsNumber)]
column_name = "Age"
quantiles = df.approxQuantile(column_name, buckets, 0.0)
df = df.withColumn("{}_quantile".format(column_name), f.when(f.col(column_name) < quantiles[0], 0)\
                    .otherwise(-1))
for x in range(0, len(quantiles)-1):
    df = df.withColumn("{}_quantile".format(column_name), 
                    f.when(f.col(column_name).between(quantiles[x], quantiles[x+1]),x+1)\
                    .otherwise(f.col("{}_quantile".format(column_name))))

df = df.withColumn("{}_quantile".format(column_name), f.when(f.col(column_name) > quantiles[-1], x+2)\
                   .otherwise(f.col("{}_quantile".format(column_name))))

display(df.groupBy("{}_quantile".format(column_name)).count())

Age_quantile,count
-1,177
1,68
6,69
3,74
5,91
9,64
4,59
8,84
7,69
2,65
