
# Create a schema
#### Step 1: Create a SparkSession

First, initialize the Spark session. This session serves as the entry point for reading and manipulating data in Spark.

```python
from pyspark.sql import SparkSession

# Build the session (getOrCreate returns the existing one when present)
spark = (
    SparkSession.builder
    .appName("ReadCSVWithSchema")
    .getOrCreate()
)
```

#### Step 2: Define the Schema

Define a schema for the CSV file using `StructType` and `StructField`. This schema explicitly specifies the data types for each column.

```python
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

# Explicit column layout for the CSV file; every column is nullable
schema = StructType(
    [
        StructField("Name", StringType(), nullable=True),    # string column
        StructField("Age", IntegerType(), nullable=True),    # integer column
        StructField("Salary", DoubleType(), nullable=True),  # double column
    ]
)
```

#### Step 3: Read the CSV File with the Defined Schema

Use the `spark.read.csv()` method to read the CSV file into a DataFrame, passing the schema as an argument.

```python
# Location of the input file (placeholder value)
csv_file_path = "path/to/your/file.csv"

# Load the file with the predefined schema; the first line is a header row
df = spark.read.csv(
    path=csv_file_path,
    schema=schema,
    header=True,
)

# Print the DataFrame content (for demonstration)
df.show()
```

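- **`header=True`**: Indicates that the first line of the CSV file contains column headers. Because an explicit schema is supplied, that line is skipped and the column names come from the schema.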
- **`schema=schema`**: Specifies the schema defined earlier, which Spark will use to parse the CSV data.



In [None]:
# NOTE: disabled variant of the Hive session setup. It differs from the active
# cell only by setting the warehouse directory and metastore URI explicitly.
# from pyspark.sql import SparkSession

# # Initialize SparkSession with Hive support
# spark = SparkSession.builder \
#     .appName("HiveConnectionCheck") \
#     .config("spark.sql.warehouse.dir","/user/hive/warehouse")\
#     .config("hive.metastore.uris","thrift://localhost:9083")\
#     .config("spark.sql.catalogImplementation", "hive") \
#     .enableHiveSupport() \
#     .getOrCreate()

# spark.sparkContext.setLogLevel("ERROR")

In [None]:
from pyspark.sql import SparkSession

# Build a Hive-enabled session so SQL statements go through the Hive catalog
spark = (
    SparkSession.builder
    .appName("HiveConnectionCheck")
    .config("spark.sql.catalogImplementation", "hive")
    .enableHiveSupport()
    .getOrCreate()
)

# Restrict Spark's log output to ERROR level
spark.sparkContext.setLogLevel("ERROR")

In [None]:
# List the databases visible to this session
spark.sql("SHOW DATABASES").show()

In [21]:
# Create the database only when it is missing, so re-running this cell does
# not fail because spark_db already exists
spark.sql("CREATE DATABASE IF NOT EXISTS spark_db")

# Drop the existing table if it exists
spark.sql("DROP TABLE IF EXISTS spark_db.emp")

# Recreate the table with the desired format (parquet) at an explicit location.
# NOTE(review): the LOCATION URI has four slashes after `hdfs:`; confirm it
# resolves to the intended directory before changing it.
# NOTE(review): presumably files already under LOCATION survive the DROP above
# (a table created with an explicit LOCATION is external) -- verify, since a
# re-run followed by the append cell could then duplicate rows.
spark.sql("""
    CREATE TABLE spark_db.emp (
        name STRING,
        age INT,
        salary DOUBLE
    )
    USING parquet
    LOCATION 'hdfs:////user/maneelcha49dgre/Spark/spark_db'
""")

# Inspect the table metadata to confirm the columns, format and location
spark.sql("DESCRIBE FORMATTED spark_db.emp").show(truncate=False)



+----------------------------+--------------------------------------------------------------+-------+
|col_name                    |data_type                                                     |comment|
+----------------------------+--------------------------------------------------------------+-------+
|name                        |string                                                        |null   |
|age                         |int                                                           |null   |
|salary                      |double                                                        |null   |
|                            |                                                              |       |
|# Detailed Table Information|                                                              |       |
|Database                    |spark_db                                                      |       |
|Table                       |emp                                                 

In [22]:
# List tables in the session's current database (the statement names no database)
spark.sql("show tables").show()

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
+--------+---------+-----------+



In [None]:
# Make spark_db the current database so `emp` can be referenced unqualified
spark.sql("use spark_db")

In [None]:
# Display the rows of emp; the unqualified name resolves against the current database
spark.sql("select * from emp").show()

In [23]:
from pyspark.sql import Row

# Step 1: Create a simple Spark DataFrame
# Raw (name, age, salary) records, turned into Rows with named fields below
employees = [
    ("Alice", 30, 3000.0),
    ("Bob", 25, 2500.0),
    ("Charlie", 35, 3500.0),
]
data = [
    Row(name=emp_name, age=emp_age, salary=emp_salary)
    for emp_name, emp_age, emp_salary in employees
]

# Column names and types are inferred from the Row objects
df = spark.createDataFrame(data)
df.show()

+-------+---+------+
|   name|age|salary|
+-------+---+------+
|  Alice| 30|3000.0|
|    Bob| 25|2500.0|
|Charlie| 35|3500.0|
+-------+---+------+



                                                                                

In [24]:
# Step 2: Append the DataFrame rows to the Hive table spark_db.emp
df.write.mode("append").saveAsTable("spark_db.emp")

                                                                                

In [27]:
# Read the Hive table back to confirm the appended rows are present
verify_query = "SELECT * FROM spark_db.emp"
result_df = spark.sql(verify_query)
result_df.show()

# The session is left running because later cells still use `spark`
# spark.stop()

+-------+---+------+
|   name|age|salary|
+-------+---+------+
|  Alice| 30|3000.0|
|    Bob| 25|2500.0|
|Charlie| 35|3500.0|
+-------+---+------+



In [28]:
# Insert one more row into spark_db.emp with a SQL INSERT statement
spark.sql("insert into table spark_db.emp values('Max',34,34000.0)")

DataFrame[]

In [29]:
# Re-read the whole table to confirm the row inserted via SQL is included
result_df = spark.table("spark_db.emp")
result_df.show()

+-------+---+-------+
|   name|age| salary|
+-------+---+-------+
|  Alice| 30| 3000.0|
|    Bob| 25| 2500.0|
|Charlie| 35| 3500.0|
|    Max| 34|34000.0|
+-------+---+-------+

