# Using PySpark Lab

In [0]:
!python -m pip install --upgrade pip

Collecting pip
  Downloading pip-24.0-py3-none-any.whl (2.1 MB)
[?25l[K     |▏                               | 10 kB 15.9 MB/s eta 0:00:01[K     |▎                               | 20 kB 9.3 MB/s eta 0:00:01[K     |▌                               | 30 kB 12.8 MB/s eta 0:00:01[K     |▋                               | 40 kB 7.3 MB/s eta 0:00:01[K     |▊                               | 51 kB 6.2 MB/s eta 0:00:01[K     |█                               | 61 kB 7.3 MB/s eta 0:00:01[K     |█                               | 71 kB 7.8 MB/s eta 0:00:01[K     |█▎                              | 81 kB 6.6 MB/s eta 0:00:01[K     |█▍                              | 92 kB 7.2 MB/s eta 0:00:01[K     |█▌                              | 102 kB 6.9 MB/s eta 0:00:01[K     |█▊                              | 112 kB 6.9 MB/s eta 0:00:01[K     |█▉                              | 122 kB 6.9 MB/s eta 0:00:01[K     |██                              | 133 kB 6.9 MB/s eta 0:00:01[K     |██

In [0]:
# let's perform installing
!pip install -q findspark
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/317.0 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.5/317.0 MB[0m [31m14.6 MB/s[0m eta [36m0:00:22[0m[2K     [91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.0/317.0 MB[0m [31m54.4 MB/s[0m eta [36m0:00:06[0m[2K     [91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/317.0 MB[0m [31m68.9 MB/s[0m eta [36m0:00:05[0m[2K     [91m━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.9/317.0 MB[0m [31m86.2 MB/s[0m eta [36m0:00:04[0m[2K     [91m━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.0/317.0 MB[0m [31m84.2 MB/s[0m eta [36m0:00:04[0m[2K     [91m━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.2/317.0 MB[0m [31m78.3 MB/s[0m eta [36m0:00:04[0m[2K     [91m━━[0m

In [0]:
#let's import spark and pyspark
import findspark
findspark.init()

In [0]:
#let's import pyspark
from pyspark.sql import SparkSession

In [0]:
# let's create a variable to start the spark session
spark = SparkSession.builder.appName('imigration').getOrCreate()

In [0]:
# let's call the variable
spark

In [0]:
#let's import the libraries
import os
import sys
import pyspark
import numpy as np
import pandas as pd
import pyspark.pandas as ps
from pyspark.pandas import read_csv
from matplotlib import pyplot as plt
%matplotlib inline



In [0]:
print(pyspark.__version__)

3.3.2.dev0


In [0]:
#let's load (read) dataset using spark
df = spark.read.options(header='true', inferschema='true').csv('dbfs:/FileStore/tables/migration_Dec_2023.csv')

In [0]:
df.printSchema()

root
 |-- year_month: date (nullable = true)
 |-- month_of_release: date (nullable = true)
 |-- passenger_type: string (nullable = true)
 |-- direction: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- age: string (nullable = true)
 |-- estimate: integer (nullable = true)
 |-- standard_error: integer (nullable = true)
 |-- status: string (nullable = true)



In [0]:
# let's read the dataset using pyspark
psdf = read_csv('dbfs:/FileStore/tables/migration_Dec_2023.csv')

In [0]:
psdf.head()

Unnamed: 0,year_month,month_of_release,passenger_type,direction,sex,age,estimate,standard_error,status
0,2001-01-01,2020-09-01,Long-term migrant,Arrivals,Female,0-4 years,344,0,Final
1,2001-02-01,2020-09-01,Long-term migrant,Arrivals,Female,0-4 years,269,0,Final
2,2001-03-01,2020-09-01,Long-term migrant,Arrivals,Female,0-4 years,239,0,Final
3,2001-04-01,2020-09-01,Long-term migrant,Arrivals,Female,0-4 years,233,0,Final
4,2001-05-01,2020-09-01,Long-term migrant,Arrivals,Female,0-4 years,206,0,Final


In [0]:
#checking the type of data in psdf
type(psdf)

Out[16]: pyspark.pandas.frame.DataFrame

## Data Attributes

In [0]:
#let's check the shape of psdf
psdf.shape

Out[17]: (49136, 9)

In [0]:
#size displays total umber of values in the dataset: rows * columns
psdf.size

Out[18]: 442224

In [0]:
#number of data dimensionality
psdf.ndim

Out[20]: 2

In [0]:
psdf.columns

Out[21]: Index(['year_month', 'month_of_release', 'passenger_type', 'direction', 'sex',
       'age', 'estimate', 'standard_error', 'status'],
      dtype='object')

In [0]:
psdf.dtypes

Out[22]: year_month          object
month_of_release    object
passenger_type      object
direction           object
sex                 object
age                 object
estimate             int32
standard_error       int32
status              object
dtype: object

In [0]:
# getting information about the dataset
psdf.info

Out[23]: <bound method DataFrame.info of      year_month month_of_release     passenger_type direction     sex          age  estimate  standard_error       status
0    2001-01-01       2020-09-01  Long-term migrant  Arrivals  Female    0-4 years       344               0        Final
1    2001-02-01       2020-09-01  Long-term migrant  Arrivals  Female    0-4 years       269               0        Final
2    2001-03-01       2020-09-01  Long-term migrant  Arrivals  Female    0-4 years       239               0        Final
3    2001-04-01       2020-09-01  Long-term migrant  Arrivals  Female    0-4 years       233               0        Final
4    2001-05-01       2020-09-01  Long-term migrant  Arrivals  Female    0-4 years       206               0        Final
5    2001-06-01       2020-09-01  Long-term migrant  Arrivals  Female    0-4 years       253               0        Final
6    2001-07-01       2020-09-01  Long-term migrant  Arrivals  Female    0-4 years       242             

In [0]:
psdf.head()

Unnamed: 0,year_month,month_of_release,passenger_type,direction,sex,age,estimate,standard_error,status
0,2001-01-01,2020-09-01,Long-term migrant,Arrivals,Female,0-4 years,344,0,Final
1,2001-02-01,2020-09-01,Long-term migrant,Arrivals,Female,0-4 years,269,0,Final
2,2001-03-01,2020-09-01,Long-term migrant,Arrivals,Female,0-4 years,239,0,Final
3,2001-04-01,2020-09-01,Long-term migrant,Arrivals,Female,0-4 years,233,0,Final
4,2001-05-01,2020-09-01,Long-term migrant,Arrivals,Female,0-4 years,206,0,Final


In [0]:
psdf.isnull().sum()
#there is no missing 

Out[27]: year_month          0
month_of_release    0
passenger_type      0
direction           0
sex                 0
age                 0
estimate            0
standard_error      0
status              0
dtype: int64

In [0]:
#statistical summary
psdf.describe()

Unnamed: 0,estimate,standard_error
count,49136.0,49136.0
mean,458.944094,1.307778
std,1278.24969,11.599548
min,-4451.0,0.0
25%,18.0,0.0
50%,141.0,0.0
75%,381.0,0.0
max,24210.0,680.0


In [0]:
psdf.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
estimate,49136.0,458.944094,1278.24969,-4451.0,18.0,141.0,381.0,24210.0
standard_error,49136.0,1.307778,11.599548,0.0,0.0,0.0,0.0,680.0


## Conclusion:
--Please add 7 takeaways from doing this lab.
1. Upload PySpark and Spark libs
2. create a spark session
3. read/load dataset using spark and pyspark
4. see information, head, summary, missing value, describing statistical summary of table
5. find total number of columns and rows
6. number of data dimensionality od dataset
7. find shape and data type for dataset

#### End of the lab 6