<a href="https://colab.research.google.com/github/codal-tshah/data-practices-2024/blob/15_apr/PySpark/PySpark_Basics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **PySpark Installation**

In [2]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=fddf3e0353b3aec592bfaf3905fe92eee46a24ed3ec043f91b320ff51d968343
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [None]:
import pyspark

In [None]:
from pyspark.sql import SparkSession


# **DataFrame Creation**

In [None]:
spark = SparkSession.builder.appName("PySpark Example").getOrCreate()

df = spark.sql("SELECT 'Py' as low")

df.show()

+---+
|low|
+---+
| Py|
+---+



In [None]:
spark = SparkSession.builder.getOrCreate()
df = spark.sql('''select 'PySpark' as hello ''')
df.show()

+-------+
|  hello|
+-------+
|PySpark|
+-------+



In [None]:
from pyspark.conf import SparkConf
from pyspark.context import SparkContext

# Create a SparkConf object
conf = SparkConf().setAppName("PySpark Demo App").setMaster('local[1]')

# Get the value of 'spark.master' property
print(conf.get('spark.master'))  # Output: 'local[2]'

# Get the value of 'spark.app.name' property
print(conf.get('spark.app.name'))  # Output: 'PySpark Demo App'


local[1]
PySpark Demo App


In [None]:
spark

In [None]:
type(spark)

In [None]:
dir(spark)

['Builder',
 '__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__enter__',
 '__eq__',
 '__exit__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_activeSession',
 '_conf',
 '_convert_from_pandas',
 '_createFromLocal',
 '_createFromRDD',
 '_create_dataframe',
 '_create_from_pandas_with_arrow',
 '_create_shell_session',
 '_getActiveSessionOrCreate',
 '_get_numpy_record_dtype',
 '_inferSchema',
 '_inferSchemaFromList',
 '_instantiatedSession',
 '_jconf',
 '_jsc',
 '_jsparkSession',
 '_jvm',
 '_repr_html_',
 '_sc',
 'active',
 'addArtifact',
 'addArtifacts',
 'addTag',
 'builder',
 'catalog',
 'clearTags',
 'client',
 'conf',
 'copyFromLocalToFs',
 'createDataFrame',
 'getActiveSession',
 'getTags',
 'interru

In [None]:
 help(spark.createDataFrame)

Help on method createDataFrame in module pyspark.sql.session:

createDataFrame(data: Union[pyspark.rdd.RDD[Any], Iterable[Any], ForwardRef('PandasDataFrameLike'), ForwardRef('ArrayLike')], schema: Union[pyspark.sql.types.AtomicType, pyspark.sql.types.StructType, str, NoneType] = None, samplingRatio: Optional[float] = None, verifySchema: bool = True) -> pyspark.sql.dataframe.DataFrame method of pyspark.sql.session.SparkSession instance
    Creates a :class:`DataFrame` from an :class:`RDD`, a list, a :class:`pandas.DataFrame`
    or a :class:`numpy.ndarray`.
    
    .. versionadded:: 2.0.0
    
    .. versionchanged:: 3.4.0
        Supports Spark Connect.
    
    Parameters
    ----------
    data : :class:`RDD` or iterable
        an RDD of any kind of SQL data representation (:class:`Row`,
        :class:`tuple`, ``int``, ``boolean``, etc.), or :class:`list`,
        :class:`pandas.DataFrame` or :class:`numpy.ndarray`.
    schema : :class:`pyspark.sql.types.DataType`, str or list, op

In [None]:
spark.createDataFrame([('Alice', 1)]).show()

+-----+---+
|   _1| _2|
+-----+---+
|Alice|  1|
+-----+---+



In [None]:
spark.createDataFrame([('MARK', 52)]).show()

+----+---+
|  _1| _2|
+----+---+
|MARK| 52|
+----+---+



In [None]:
d = [{'name': 'Alice', 'age': 1}]
spark.createDataFrame(d).show()

+---+-----+
|age| name|
+---+-----+
|  1|Alice|
+---+-----+



In [None]:
d = [{'name': 'Alice', 'age': 1}]
spark.createDataFrame([('MARK', 52)]).show()

+----+---+
|  _1| _2|
+----+---+
|MARK| 52|
+----+---+



In [None]:
spark.createDataFrame([('Alice', 1)], ['name', 'age']).show()

+-----+---+
| name|age|
+-----+---+
|Alice|  1|
+-----+---+



In [None]:
from pyspark.sql import Row
Person = Row('name', 'age')
df = spark.createDataFrame([Person("Alice", 1)])
df.show()

+-----+---+
| name|age|
+-----+---+
|Alice|  1|
+-----+---+



In [None]:
data = [(1,'mahesh'),(2,'rajesh'),(3,'suresh')]
df = spark.createDataFrame(data=data)
df.show()

+---+------+
| _1|    _2|
+---+------+
|  1|mahesh|
|  2|rajesh|
|  3|suresh|
+---+------+



In [None]:
data = [(1,'mahesh'),(2,'rajesh'),(3,'suresh')]
df = spark.createDataFrame(data=data, schema = ['id','name'])
df.show()
df.printSchema()

+---+------+
| id|  name|
+---+------+
|  1|mahesh|
|  2|rajesh|
|  3|suresh|
+---+------+

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)



In [None]:
from pyspark.sql.types import *
help(StructType)

**Converting the datatypes**

In [None]:
data = [(1,'mahesh'),(2,'rajesh'),(3,'suresh')]
df = spark.createDataFrame(data=data, schema = ['id','name'])
schema  = StructType([StructField(name='id',dataType=IntegerType()),
                      StructField(name='name',dataType=StringType())])
df.show()
df.printSchema()

+---+------+
| id|  name|
+---+------+
|  1|mahesh|
|  2|rajesh|
|  3|suresh|
+---+------+

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)



In [9]:
from pyspark.sql import SparkSession

# Create a SparkSession
spark = SparkSession.builder \
    .master("yarn") \
    .appName("MySparkApp") \
    .getOrCreate()

# Create a DataFrame
data = [("Alice", 34), ("Bob", 45), ("Charlie", 25)]
df = spark.createDataFrame(data, ["Name", "Age"])

# Perform operations on the DataFrame
df.show()

# Stop the SparkSession
spark.stop()

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/lib/python3.10/dist-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [8]:
from pyspark.sql import SparkSession

# Create a SparkSession
spark = SparkSession.builder \
    .master("local") \
    .appName("MySparkApp") \
    .getOrCreate()

# Create a DataFrame
data = [("Alice", 34), ("Bob", 45), ("Charlie", 25)]
df = spark.createDataFrame(data, ["Name", "Age"])

# Perform operations on the DataFrame
df.show()

# Stop the SparkSession
spark.stop()


+-------+---+
|   Name|Age|
+-------+---+
|  Alice| 34|
|    Bob| 45|
|Charlie| 25|
+-------+---+

