# DataFrames, Formatting, Casting Data Types, and Correlation with PySpark

In [2]:
from pyspark.sql import SparkSession

# Bind the session explicitly instead of relying on an ambient `spark` global;
# getOrCreate() hands back the already-running session when one exists.
spark = SparkSession.builder.getOrCreate()

# Load data from a CSV: first line is the header, column types are inferred.
file_location = "/FileStore/tables/df_panel_fix.csv"
df = (
    spark.read.format("CSV")
    .option("inferSchema", True)
    .option("header", True)
    .load(file_location)
)

# Preview the first five rows.
display(df.take(5))

_c0,province,specific,general,year,gdp,fdi,rnr,rr,i,fr,reg,it
0,Anhui,147002.0,,1996,2093.3,50661,0.0,0.0,0.0,1128873,East China,631930
1,Anhui,151981.0,,1997,2347.32,43443,0.0,0.0,0.0,1356287,East China,657860
2,Anhui,174930.0,,1998,2542.96,27673,0.0,0.0,0.0,1518236,East China,889463
3,Anhui,285324.0,,1999,2712.34,26131,,,,1646891,East China,1227364
4,Anhui,195580.0,32100.0,2000,2902.09,31847,0.0,0.0,0.0,1601508,East China,1499110


In [3]:
# List the column names of the loaded DataFrame.
df.columns

In [4]:
# Print the column names and the data types that were inferred at load time.
df.printSchema()

In [5]:
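# Alternative preview: print the first five rows one at a time.
# Left disabled because the table shown after the load cell already covers this.
# for row in df.head(5):
#     print(row)
#     print('\n')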

In [6]:
# Show summary statistics for the DataFrame's columns.
df.describe().show()

In [7]:
# Inspect the column types of the summary frame itself; the next section
# casts these columns before formatting them.
df.describe().printSchema()

## Casting Data Types and Formatting Significant Digits

In [8]:
from pyspark.sql.functions import format_number

In [9]:
# Summary statistics reformatted for readability.
#
# - The `summary` column is kept so each row is labelled with its statistic
#   (count, mean, stddev, min, max).
# - Every numeric statistic is cast to double before formatting: casting to
#   int would drop the fractional part of mean/stddev (and is rejected under
#   ANSI cast rules), while float loses precision on large values.
result = df.describe()
numeric_cols = ['specific', 'general', 'year', 'gdp', 'rnr', 'rr', 'fdi', 'it']
result.select(
    result['summary'],
    result['province'],
    # Two decimal places with thousands separators for each numeric column.
    *[format_number(result[c].cast('double'), 2).alias(c) for c in numeric_cols],
    result['reg'].cast('string').alias('reg'),
).show()

## New Columns generated from extant columns using withColumn

In [10]:
# Derive a new column: "specific" divided by (gdp * 100).
ratio_col = df["specific"] / (df["gdp"] * 100)
df2 = df.withColumn("specific_gdp_ratio", ratio_col)

In [11]:
# Display only the derived ratio column.
df2.select(df2["specific_gdp_ratio"]).show()

In [12]:
# Row id (`_c0`) of the record with the smallest "specific" value.
# asc_nulls_last() keeps rows with a missing "specific" from sorting ahead of
# the true minimum (a plain asc() places nulls first), and the field is read
# by name so the result does not depend on column position.
df.orderBy(df["specific"].asc_nulls_last()).head(1)[0]["_c0"]

## Finding the Mean, Max, and Min

In [13]:
from pyspark.sql.functions import mean

# Average of "specific" over the whole DataFrame, as a one-row aggregate.
df.agg(mean("specific")).show()

In [14]:
from pyspark.sql.functions import max,min

In [15]:
# Largest and smallest "specific" values in a single aggregate.
# `max` and `min` are the pyspark.sql.functions versions imported in the
# previous cell, not the Python builtins.
df.agg(max("specific"), min("specific")).show()

In [16]:
# Number of rows with "specific" under 60000, using a SQL-string predicate.
df.where("specific < 60000").count()

In [17]:
# Number of rows with "specific" under 60000, using a Column predicate.
below_60k = df['specific'] < 60000
df.where(below_60k).count()

In [18]:
from pyspark.sql.functions import count

# Keep the filtered frame in `result`, then count its "specific" values
# with the `count` aggregate function.
result = df.where(df['specific'] < 60000)
result.agg(count('specific')).show()

In [19]:
# Percentage of rows whose gdp exceeds 8000.
high_gdp_rows = df.filter(df["gdp"] > 8000).count()
total_rows = df.count()
(float(high_gdp_rows) / total_rows) * 100

In [20]:
from pyspark.sql.functions import corr

# Correlation between gdp and fdi, computed as a one-row aggregate.
df.agg(corr("gdp", "fdi")).show()

## Finding the max value by Year

In [21]:
from pyspark.sql.functions import year
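# Left disabled: `year` in this dataset already holds plain year values
# (1996, 1997, ...), so there is no date to extract a year from.
#yeardf = df.withColumn("Year",year(df["year"]))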

In [22]:
# Group the rows by year and take the per-group maximum of the numeric columns.
yearly_groups = df.groupBy('year')
max_df = yearly_groups.max()

In [23]:
# Highest gdp per year. Row order after a groupBy is not guaranteed, so sort
# by year to get a stable, readable table.
max_df.select('year', 'max(gdp)').orderBy('year').show()


In [24]:
from pyspark.sql.functions import month

In [25]:
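# Left disabled: `df` has no "avg(gdp)" column; that name would only exist on
# an aggregated frame such as df.groupBy('year').avg().
#df.select("year","avg(gdp)").orderBy('year').show()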

This post includes code adapted from the [Spark and Python for Big Data Udemy course](https://udemy.com/course/spark-and-python-for-big-data-with-pyspark) and the [Spark and Python for Big Data notebooks](https://github.com/SuperJohn/spark-and-python-for-big-data-with-pyspark).