In [4]:
!pip install pyspark

Collecting pyspark
[?25l  Downloading https://files.pythonhosted.org/packages/f0/26/198fc8c0b98580f617cb03cb298c6056587b8f0447e20fa40c5b634ced77/pyspark-3.0.1.tar.gz (204.2MB)
[K     |████████████████████████████████| 204.2MB 65kB/s 
[?25hCollecting py4j==0.10.9
[?25l  Downloading https://files.pythonhosted.org/packages/9e/b6/6a4fb90cd235dc8e265a6a2067f2a2c99f0d91787f06aca4bcf7c23f3f80/py4j-0.10.9-py2.py3-none-any.whl (198kB)
[K     |████████████████████████████████| 204kB 45.7MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.0.1-py2.py3-none-any.whl size=204612243 sha256=711d789b022b3263e70cacc64c28eb639444de7bd58a0b84d01baf46aec37169
  Stored in directory: /root/.cache/pip/wheels/5e/bd/07/031766ca628adec8435bb40f0bd83bb676ce65ff4007f8e73f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.0.1


In [1]:
# import PySpark Libraries

from pyspark.sql import SparkSession

In [2]:
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType

In [3]:
from pyspark.sql.functions import udf

In [4]:
# Obtain the Spark session for this notebook: reuse an active one if present,
# otherwise start a new one under the application name "Birds".
spark = (
    SparkSession.builder
    .appName("Birds")
    .getOrCreate()
)

In [5]:
# Location of the source CSV file.
input_path = '/content/sample_data/birds.csv'

# Explicit schema applied when reading the CSV: three string columns followed
# by one double column. Every column is nullable (the StructField default,
# spelled out here).
input_schema = StructType([
    StructField("Species", StringType(), nullable=True),
    StructField("Category", StringType(), nullable=True),
    StructField("Period", StringType(), nullable=True),
    StructField("Annual Percentage Change", DoubleType(), nullable=True),
])

In [6]:
# Load the CSV into a DataFrame: the first line of the file is treated as a
# header row, and the explicit schema is applied instead of inferring types.
df = (
    spark.read
    .schema(input_schema)
    .option("header", True)
    .csv(input_path)
)

In [7]:
# Print the DataFrame schema as a tree: one line per column showing its name,
# data type and nullability.
df.printSchema()

root
 |-- Species: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Period: string (nullable = true)
 |-- Annual Percentage Change: double (nullable = true)



In [8]:
# Count the rows in the DataFrame; as the last expression of the cell, the
# result is displayed as the cell output.
df.count()

132

In [9]:
# Preview the raw data: at most the first 20 rows, with long string values
# truncated (these are the defaults of show(), spelled out).
df.show(n=20, truncate=True)

+--------------------+--------------+-----------+------------------------+
|             Species|      Category|     Period|Annual Percentage Change|
+--------------------+--------------+-----------+------------------------+
|Greenfinch (Chlor...|Farmland birds|(1970-2014)|                   -1.13|
|Jackdaw (Corvus m...|Farmland birds|(1970-2014)|                    2.12|
|Kestrel (Falco ti...|Farmland birds|(1970-2014)|                   -1.49|
|Reed Bunting (Emb...|Farmland birds|(1970-2014)|                   -0.86|
|Rook (Corvus frug...|Farmland birds|(1970-2014)|                    0.17|
|Woodpigeon (Colum...|Farmland birds|(1970-2014)|                    1.85|
|Yellow Wagtail (M...|Farmland birds|(1970-2014)|                   -2.56|
|Corn Bunting (Emb...|Farmland birds|(1970-2014)|                   -5.02|
|Goldfinch (Cardue...|Farmland birds|(1970-2014)|                    2.14|
|Grey Partridge (P...|Farmland birds|(1970-2014)|                   -5.46|
|Lapwing (Vanellus...|Far

In [10]:
def get_english_name(species):
  """Extract the English name from a combined "English (Latin)" species string.

  Everything before the first '(' is kept and surrounding whitespace is
  removed, e.g. 'Greenfinch (Chloris chloris)' -> 'Greenfinch'. A value
  without '(' is returned stripped but otherwise unchanged.

  Args:
    species: the species text, or None.

  Returns:
    The English name as a string, or None when `species` is None.
  """
  # A null column value reaches a Python UDF as None; return None (null)
  # instead of failing the whole job with an AttributeError.
  if species is None:
    return None

  return species.partition('(')[0].strip()

# print('test: {}'.format(get_english_name('Greenfinch (Chloris chloris)')))

def get_start_year(period):
  """Return the first year of a period written as '(start-end)'.

  The text before the first '-' is taken, then surrounding whitespace and
  the opening parenthesis are removed, e.g. '(1970-2014)' -> '1970'.

  Args:
    period: the period text, or None.

  Returns:
    The start year as a string, or None when `period` is None.
  """
  # A null column value reaches a Python UDF as None; return None (null)
  # instead of failing the whole job with an AttributeError.
  if period is None:
    return None

  start = period.split('-')[0]
  # Strip whitespace on both sides of the parenthesis so padded values such
  # as ' ( 1970 - 2014)' still yield a clean year.
  return start.strip().strip('(').strip()

# print('test: {}'.format(get_start_year('(1970-2014)')))

def get_trend(annual_percentage_change):
  """Classify an annual percentage change into a named trend band.

  Bands (the boundary values belong to the "weak" bands):
    value < -3.0             -> 'strong decline'
    -3.0  <= value <= -0.50  -> 'weak decline'
    -0.50 <  value <  0.50   -> 'no change'
    0.50  <= value <= 3.0    -> 'weak increase'
    value > 3.0              -> 'strong increase'

  Args:
    annual_percentage_change: the change as a number, or None.

  Returns:
    The band name, or 'unknown' when the value is None or NaN.
  """
  # A null column value reaches a Python UDF as None, and comparing None
  # with a float raises TypeError in Python 3, so handle it before comparing.
  if annual_percentage_change is None:
    return 'unknown'

  # Each check only needs an upper bound: the lower bound is implied by the
  # previous check having failed.
  if annual_percentage_change < -3.0:
    return 'strong decline'
  if annual_percentage_change <= -0.50:
    return 'weak decline'
  if annual_percentage_change < 0.50:
    return 'no change'
  if annual_percentage_change <= 3.0:
    return 'weak increase'
  if annual_percentage_change > 3.0:
    return 'strong increase'

  # Only NaN fails every comparison above.
  return 'unknown'

# print('test: {}'.format(get_trend(0.44)))


In [11]:
# Make get_english_name callable from Spark SQL under the same name; the UDF
# returns a string.
spark.udf.register(
    name="get_english_name",
    f=get_english_name,
    returnType=StringType(),
)

<function __main__.get_english_name>

In [12]:
# Make get_start_year callable from Spark SQL under the same name; the UDF
# returns a string.
spark.udf.register(
    name="get_start_year",
    f=get_start_year,
    returnType=StringType(),
)

<function __main__.get_start_year>

In [13]:
# Make get_trend callable from Spark SQL under the same name; the UDF
# returns a string.
spark.udf.register(
    name="get_trend",
    f=get_trend,
    returnType=StringType(),
)

<function __main__.get_trend>

In [14]:
# Expose the DataFrame to Spark SQL as a temporary view named "birds_data"
# (created if absent, replaced if it already exists) so it can be queried
# with spark.sql().
df.createOrReplaceTempView("birds_data")

In [15]:
# Transform the data with the registered UDFs through a SQL query over the
# birds_data view. Output columns:
#   species                  - get_english_name(Species)
#   category                 - passed through unchanged
#   collect_from_year        - get_start_year(Period)
#   annual_percentage_change - the original `Annual Percentage Change` value
#   trend                    - get_trend(`Annual Percentage Change`)
# The backticks are needed because the source column name contains spaces.
# NOTE: the query is a single string literal continued with trailing
# backslashes; nothing (not even whitespace) may follow a backslash.
birds_sql = spark.sql("SELECT get_english_name(Species) as species \
                        , category \
                        , get_start_year(Period) as collect_from_year \
                        , `Annual Percentage Change` as annual_percentage_change \
                        , get_trend(`Annual Percentage Change`) as trend \
                        FROM birds_data")

In [16]:
# Preview the transformed data: at most the first 20 rows, with long string
# values truncated (these are the defaults of show(), spelled out).
birds_sql.show(n=20, truncate=True)

+--------------+--------------+-----------------+------------------------+--------------+
|       species|      category|collect_from_year|annual_percentage_change|         trend|
+--------------+--------------+-----------------+------------------------+--------------+
|    Greenfinch|Farmland birds|             1970|                   -1.13|  weak decline|
|       Jackdaw|Farmland birds|             1970|                    2.12| weak increase|
|       Kestrel|Farmland birds|             1970|                   -1.49|  weak decline|
|  Reed Bunting|Farmland birds|             1970|                   -0.86|  weak decline|
|          Rook|Farmland birds|             1970|                    0.17|     no change|
|    Woodpigeon|Farmland birds|             1970|                    1.85| weak increase|
|Yellow Wagtail|Farmland birds|             1970|                   -2.56|  weak decline|
|  Corn Bunting|Farmland birds|             1970|                   -5.02|strong decline|
|     Gold