# Preparing PySpark and imports

In [62]:
!pip install pyspark #Actually works unlike using [!wget -q "URL"]



In [80]:
import os
import time
import pandas as pd
import matplotlib.pyplot as plt
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, FloatType

In [81]:
# Build the notebook's SparkSession under the application name 'test'
# (getOrCreate hands back an existing session if one is already active).
spark = (
    SparkSession.builder
    .appName('test')
    .getOrCreate()
)

***This code looks for any CSV file in the current working directory and prints its size***

In [82]:
# Report the size (in MB) of every CSV file in the current working directory.
for file in os.listdir():
  # Match on the extension only: a substring test would also pick up names such as
  # 'data.csv.crc' or 'data.csv.bak'. Directories are skipped because os.stat on a
  # directory does not report the size of the files inside it.
  if file.endswith('.csv') and os.path.isfile(file):
    print(f'The size of the {file} file is {os.stat(file).st_size/(1024*1024):,.2f} MB')

The size of the global_temperatures.csv file is 0.19 MB


***This `os.listdir` call lists the files inside the "read_test" folder instead***

In [83]:
# Report the size (in MB) of every CSV file inside the "read_test" folder.
for file in os.listdir('read_test'):
  # Build the path portably instead of concatenating strings.
  csv_path = os.path.join('read_test', file)
  # Match on the extension only (a substring test would also accept 'x.csv.crc')
  # and skip directories, whose os.stat size does not reflect their contents.
  if file.endswith('.csv') and os.path.isfile(csv_path):
    print(f'The size of the {file} file is {os.stat(csv_path).st_size/(1024*1024):,.2f} MB')

The size of the global_temperatures.csv file is 0.19 MB


# Reading the CSV file

In [84]:
# Load the CSV from the working directory: the first row supplies the column
# names and Spark infers each column's type from the data.
spark_df = spark.read.csv(
    'global_temperatures.csv',
    inferSchema=True,
    header=True,
)


In [85]:
# Preview the first 10 rows of the inferred-schema DataFrame.
spark_df.show(n=10)

+----------+----------------------+---------------------------------+------------------+-----------------------------+------------------+-----------------------------+------------------------------+-----------------------------------------+
|        dt|LandAverageTemperature|LandAverageTemperatureUncertainty|LandMaxTemperature|LandMaxTemperatureUncertainty|LandMinTemperature|LandMinTemperatureUncertainty|LandAndOceanAverageTemperature|LandAndOceanAverageTemperatureUncertainty|
+----------+----------------------+---------------------------------+------------------+-----------------------------+------------------+-----------------------------+------------------------------+-----------------------------------------+
|1776-08-01|                14.837|                            3.437|              NULL|                         NULL|              NULL|                         NULL|                          NULL|                                     NULL|
|1777-08-01|                12.815| 

In [86]:
# Summarise the DataFrame's dimensions: number of columns and number of rows.
column_total = len(spark_df.columns)
row_total = spark_df.count()
print(f'There are {column_total} columns and {row_total:,} rows in the Pyspark DataFrame')

There are 9 columns and 3,192 rows in the Pyspark DataFrame


In [87]:
# Print each column's name, type and nullability as inferred when the CSV was read.
spark_df.printSchema()

root
 |-- dt: date (nullable = true)
 |-- LandAverageTemperature: double (nullable = true)
 |-- LandAverageTemperatureUncertainty: double (nullable = true)
 |-- LandMaxTemperature: double (nullable = true)
 |-- LandMaxTemperatureUncertainty: double (nullable = true)
 |-- LandMinTemperature: double (nullable = true)
 |-- LandMinTemperatureUncertainty: double (nullable = true)
 |-- LandAndOceanAverageTemperature: double (nullable = true)
 |-- LandAndOceanAverageTemperatureUncertainty: double (nullable = true)



# Using Schema

In [93]:
# Explicit schema for the global temperatures CSV, with friendlier column names.
# Each entry is (column name, Spark type); every field is declared nullable.
# NOTE(review): 'Date' and the two 'Land Avg Temp' columns are declared as strings
# while the remaining measurements are floats - confirm this is intentional.
column_types = [
    ('Date', StringType()),
    ('Land Avg Temp', StringType()),
    ('Land Avg Temp(Uncertainty)', StringType()),
    ('Land Max Temp', FloatType()),
    ('Land Max Temp(Uncertainty)', FloatType()),
    ('Land Min Temp', FloatType()),
    ('Land Min Temp(Uncertainty)', FloatType()),
    ('Land and Ocean Avg Temp', FloatType()),
    ('Land and Ocean Avg Temp(Uncertainty)', FloatType()),
]
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in column_types
])


In [94]:
# Read the copy of the CSV stored under read_test/, applying the explicit
# schema instead of letting Spark infer the column types.
spark_df2 = spark.read.csv(
    'read_test/global_temperatures.csv',
    header=True,
    schema=schema,
)

In [95]:
# Preview the first 6 rows of the DataFrame read with the explicit schema.
spark_df2.show(n=6)

+----------+------------------+--------------------------+-------------+--------------------------+-------------+--------------------------+-----------------------+------------------------------------+
|      Date|     Land Avg Temp|Land Avg Temp(Uncertainty)|Land Max Temp|Land Max Temp(Uncertainty)|Land Min Temp|Land Min Temp(Uncertainty)|Land and Ocean Avg Temp|Land and Ocean Avg Temp(Uncertainty)|
+----------+------------------+--------------------------+-------------+--------------------------+-------------+--------------------------+-----------------------+------------------------------------+
|1776-08-01|            14.837|                     3.437|         NULL|                      NULL|         NULL|                      NULL|                   NULL|                                NULL|
|1777-08-01|            12.815|                     1.269|         NULL|                      NULL|         NULL|                      NULL|                   NULL|                            