# Apache Spark SQL

In [None]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("ChurnPrediction").getOrCreate()
df = spark.read.csv("ChurnModelling.csv", header=True, inferSchema=True)

In [54]:
df.createOrReplaceTempView("clients")

In [55]:
spark.sql("""
    SELECT Age, Balance, EstimatedSalary
    FROM clients
    WHERE Age > 50
""").show()

+---+---------+---------------+
|Age|  Balance|EstimatedSalary|
+---+---------+---------------+
| 58|      0.0|       32790.02|
| 56|209767.31|      150694.42|
| 52|109922.61|       96823.32|
| 51| 123372.3|      115429.32|
| 59|126224.87|        4423.63|
| 55|      0.0|       13288.46|
| 58|110597.76|      160122.66|
| 55|119618.42|       29861.13|
| 59|      0.0|      160849.43|
| 54|138547.97|       70196.23|
| 76|128410.71|      181718.73|
| 71|142550.25|      122506.78|
| 59|122781.51|      140166.95|
| 64|      0.0|       13181.37|
| 51|      0.0|       69101.23|
| 51|117808.74|       67311.12|
| 54|126113.28|      112777.38|
| 62| 98854.34|       86920.97|
| 52|107304.39|       28806.32|
| 61|157201.48|       50368.63|
+---+---------+---------------+
only showing top 20 rows



In [56]:
spark.sql("""
    SELECT Geography, AVG(Balance) AS AvgBalance, COUNT(*) AS TotalClients
    FROM clients
    GROUP BY Geography
""").show()


+---------+------------------+------------+
|Geography|        AvgBalance|TotalClients|
+---------+------------------+------------+
|  Germany|119730.11613391797|        2509|
|   France| 62092.63651575591|        5014|
|    Spain| 61818.14776342345|        2477|
+---------+------------------+------------+



In [60]:
spark.sql("""
    SELECT Age, EstimatedSalary,
           Balance / (EstimatedSalary + 1) AS BalanceToSalaryRatio
    FROM clients
""").show()


+---+---------------+--------------------+
|Age|EstimatedSalary|BalanceToSalaryRatio|
+---+---------------+--------------------+
| 24|      167256.35|   0.608804755067565|
| 58|       32790.02|                 0.0|
| 56|      150694.42|  1.3919952577191794|
| 36|       47271.61|  2.7446874627823594|
| 33|       43932.54|  3.9813026676202283|
| 52|       96823.32|  1.1352789257905451|
| 31|      179453.66|                 0.0|
| 48|      151310.16|   1.113747128764329|
| 51|      115429.32|   1.068803239911316|
| 36|      161668.15|                 0.0|
| 59|        4423.63|  28.527779723954318|
| 41|      131710.59|  0.7214308171361382|
| 39|       32615.21|                 0.0|
| 36|      196142.26|  0.3726259061871409|
| 46|      162643.15|  0.7256458962710924|
| 25|       38772.82|  2.0514767438441712|
| 39|        55556.3|   2.190608974878189|
| 33|       42171.13|                 0.0|
| 45|       10908.33|                 0.0|
| 55|       13288.46|                 0.0|
+---+------

# 10 Most Common Classes [Core Classes, Spark Session, Configuration, Input/Output, DataFrame, Column, Data Types, Functions, Grouping, Catalog, UDF, UDTF]

In [2]:
from pyspark.sql import Row
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

schema = StructType([
    StructField("Name", StringType(), True),
    StructField("Age", IntegerType(), True)
])

data = [Row("Alice", 30), Row("Bob", 45)]
df = spark.createDataFrame(data, schema=schema)
df

NameError: name 'spark' is not defined

In [None]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("ChurnPrediction") \
    .getOrCreate()

In [None]:
from pyspark import SparkConf, SparkContext

conf = SparkConf().setAppName("AppName").setMaster("local[2]")
sc = SparkContext(conf=conf)

In [None]:
# Citire CSV
df = spark.read.option("header", "true").csv("churn.csv")

# Scriere în Parquet
df.write.mode("overwrite").parquet("output/churn_data")

In [None]:
df.select("Age", "Balance").filter(df["Age"] > 50).show(5)

In [None]:
from pyspark.sql.functions import col, when

df = df.withColumn("IsSenior", when(col("Age") > 60, 1).otherwise(0))
df.show(5)

In [None]:
from pyspark.sql.types import DoubleType

df = df.withColumn("Balance", col("Balance").cast(DoubleType()))
df.show(5)

In [None]:
from pyspark.sql.functions import avg, count

df.groupBy("Geography").agg(
    avg("Balance").alias("AvgBalance"),
    count("*").alias("NrClienti")
).show()

In [None]:
df.groupBy("Gender").count().show()


In [None]:
df.createOrReplaceTempView("clients")
spark.catalog.listTables()

In [None]:
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType


def age_group(age): return 1 if age > 60 else 0


age_udf = udf(age_group, IntegerType())
df = df.withColumn("AgeGroup", age_udf(col("Age")))
df.show()

In [None]:
from pyspark.sql.functions import explode, split

df = df.withColumn("Tags", explode(split(col("Interests"), ",")))
df.show()

# 5 Most Common Pandas Api on Apache Spark

In [None]:
import pyspark.pandas as ps

df = ps.read_csv("churn.csv")
df.show(5)

In [None]:
small_df = df.head(100)  # eșantion
pandas_df = small_df.to_pandas()
type(pandas_df)

In [None]:
avg_balance = df.groupby("Geography")["Balance"].mean()
avg_balance.show()

In [None]:
df["RiskTag"] = df["CreditScore"].apply(lambda x: "High" if x < 500 else "Low")
df.show(5)

In [None]:
merged_df = df.merge(small_df, on="CustomerID", how="left")

# 5 Most Common Api for Apache Spark Core 

In [None]:
from pyspark import SparkContext

sc = SparkContext(appName="ChurnAnalysis")

In [None]:
rdd = sc.parallelize([1, 2, 3, 4])


In [None]:
mapped_rdd = rdd.map(lambda x: x * 2)
filtered_rdd = rdd.filter(lambda x: x > 2)


In [None]:
results = rdd.collect()
subset = rdd.take(3)


In [None]:
filtered_rdd.cache()



# SparkSession – Cum se creează o sesiune SparkSession. Particularități si Exemple

In [None]:
from pyspark.sql import SparkSession


In [None]:
spark = SparkSession.builder \
    .appName("ChurnPrediction") \
    .master("local[*]") \
    .getOrCreate()


In [None]:
print(spark.version)


# 4.2 SparkContext – Cum se creează o sesiune SparkContext (pe pași). Particularități si Exemple

In [None]:
from pyspark import SparkContext, SparkConf

In [None]:
conf = SparkConf().setAppName("ChurnApp").setMaster("local[*]")

In [None]:
sc = SparkContext(conf=conf)

In [None]:
data = [10, 20, 30, 40, 50]
rdd = sc.parallelize(data)
print(rdd.collect())


In [None]:
squared = rdd.map(lambda x: x * x)
print(squared.collect())  # Output: [100, 400, 900, 1600, 2500]


In [None]:
total = rdd.reduce(lambda a, b: a + b)
print("Suma totală:", total)


# 4.3 SparkSubmit – Cum se creează o sesiune SparkSubmit (pe pași). Particularități si Exemple

In [None]:
# churn_prediction.py
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("ChurnPrediction").getOrCreate()
df = spark.read.option("header", True).csv("ChurnModelling.csv")
df.groupBy("Geography").count().show()
spark.stop()
