In [1]:
import pyspark
import pandas as pd
import numpy as np
from pydataset import data
from pyspark.sql.functions import concat, sum, avg, min, max, count, mean
from pyspark.sql.functions import lit
from pyspark.sql.functions import col, expr
from pyspark.sql.functions import month, year, quarter
from pyspark.sql.functions import regexp_extract, regexp_replace
from pyspark.sql.functions import *

# 1. Create a spark data frame that contains your favorite programming languages.

- The name of the column should be language
- View the schema of the dataframe
- Output the shape of the dataframe
- Show the first 5 records in the dataframe

In [2]:
# The name of the column should be language
df = pd.DataFrame({'language': ['python', 'HTML', 'java', 'sql','JavaScript']})
df.head()

Unnamed: 0,language
0,python
1,HTML
2,java
3,sql
4,JavaScript


In [3]:
# Creating the spark session
spark = pyspark.sql.SparkSession.builder.getOrCreate()

In [4]:
# View the schema of the dataframe
spark_df = spark.createDataFrame(df)
spark_df

DataFrame[language: string]

In [5]:
# View the schema of the dataframe
spark_df.printSchema()

root
 |-- language: string (nullable = true)



In [6]:
# Output the shape of the dataframe
spark_df.count()

5

In [7]:
# Show the first 5 records in the dataframe
spark_df.show(5)

+----------+
|  language|
+----------+
|    python|
|      HTML|
|      java|
|       sql|
|JavaScript|
+----------+



In [8]:
pd_v_spark = pd.DataFrame([['pd.read_csv("myfile.csv")', 
                            'spark.read.load("myfile.csv", format = "csv", sep = ",")'], 
                           ['pd.read_json("myfile.json")', 
                            'spark.read.load("myfile.json", format = "json") OR spark.read.json("myfile.json")']], 
                          index = ['csv', 'json'], 
                          columns = ['pandas', 'spark'])

# to display and see all text in dataframe
pd.set_option('display.max_colwidth', 10000)


pd_v_spark

Unnamed: 0,pandas,spark
csv,"pd.read_csv(""myfile.csv"")","spark.read.load(""myfile.csv"", format = ""csv"", sep = "","")"
json,"pd.read_json(""myfile.json"")","spark.read.load(""myfile.json"", format = ""json"") OR spark.read.json(""myfile.json"")"


In [9]:
pd_v_spark = pd_v_spark.append(pd.DataFrame([['pd_df.head()', 'sp_df.show(), .head(), .take()'],
                                             ['pd_df.head(1)', 'sp_df.first()'],
                                             ['pd_df.describe()', 'sp_df.describe()'],
                                             ['pd_df.columns', 'sp_df.columns'],
                                             ['len(pd_df)', 'sp_df.count()'],
                                             ['len(pd_df.drop_duplicates())', 'sp_df.distinct().count()'],
                                             ['pd_df.info()', 'sp_df.printSchema()']
                                            ],
                                            index = ['1st n rows', '1st row','summary statistics', 
                                                     'column names', '# rows', '# distinct rows', 
                                                     'df schema info'], 
                                            columns = ['pandas', 'spark']))
pd_v_spark

Unnamed: 0,pandas,spark
csv,"pd.read_csv(""myfile.csv"")","spark.read.load(""myfile.csv"", format = ""csv"", sep = "","")"
json,"pd.read_json(""myfile.json"")","spark.read.load(""myfile.json"", format = ""json"") OR spark.read.json(""myfile.json"")"
1st n rows,pd_df.head(),"sp_df.show(), .head(), .take()"
1st row,pd_df.head(1),sp_df.first()
summary statistics,pd_df.describe(),sp_df.describe()
column names,pd_df.columns,sp_df.columns
# rows,len(pd_df),sp_df.count()
# distinct rows,len(pd_df.drop_duplicates()),sp_df.distinct().count()
df schema info,pd_df.info(),sp_df.printSchema()


In [10]:
pd_v_spark = pd_v_spark.append(pd.DataFrame([['pd_df[["col1", "col2"]]', 
                                              'sp_df.select(sp_df.col1, sp_df.col2)']
                                            ],
                                            index = ['select columns'], 
                                            columns = ['pandas', 'spark']))

In [11]:
pd_v_spark = pd_v_spark.append(pd.DataFrame([['np.where(pd_df.c1.array > 0, "positive")', 
                                              'sp_df.select(df.c1, when(df.c1 > 0, "positive").alias("number_sign"))'],
                                            ],
                                            index = ['conditional assigning'], 
                                            columns = ['pandas', 'spark']))
pd_v_spark

Unnamed: 0,pandas,spark
csv,"pd.read_csv(""myfile.csv"")","spark.read.load(""myfile.csv"", format = ""csv"", sep = "","")"
json,"pd.read_json(""myfile.json"")","spark.read.load(""myfile.json"", format = ""json"") OR spark.read.json(""myfile.json"")"
1st n rows,pd_df.head(),"sp_df.show(), .head(), .take()"
1st row,pd_df.head(1),sp_df.first()
summary statistics,pd_df.describe(),sp_df.describe()
column names,pd_df.columns,sp_df.columns
# rows,len(pd_df),sp_df.count()
# distinct rows,len(pd_df.drop_duplicates()),sp_df.distinct().count()
df schema info,pd_df.info(),sp_df.printSchema()
select columns,"pd_df[[""col1"", ""col2""]]","sp_df.select(sp_df.col1, sp_df.col2)"


In [12]:
pd_v_spark = pd_v_spark.append(pd.DataFrame([['np.where(pd_df.c1.array > 0, "pos", "neg")', 
                                              'sp_df.select(df.c1, when(df.c1 > 0, "pos").otherwise("neg").alias("number_sign"))'],
                                            ],
                                            index = ['conditional assigning with else'], 
                                            columns = ['pandas', 'spark']))
pd_v_spark

Unnamed: 0,pandas,spark
csv,"pd.read_csv(""myfile.csv"")","spark.read.load(""myfile.csv"", format = ""csv"", sep = "","")"
json,"pd.read_json(""myfile.json"")","spark.read.load(""myfile.json"", format = ""json"") OR spark.read.json(""myfile.json"")"
1st n rows,pd_df.head(),"sp_df.show(), .head(), .take()"
1st row,pd_df.head(1),sp_df.first()
summary statistics,pd_df.describe(),sp_df.describe()
column names,pd_df.columns,sp_df.columns
# rows,len(pd_df),sp_df.count()
# distinct rows,len(pd_df.drop_duplicates()),sp_df.distinct().count()
df schema info,pd_df.info(),sp_df.printSchema()
select columns,"pd_df[[""col1"", ""col2""]]","sp_df.select(sp_df.col1, sp_df.col2)"


In [13]:
pd_v_spark = pd_v_spark.append(pd.DataFrame([['pd_df.sort_values(by=["c1"])', 
                                              'sp_df.sort(sp_df.c1)'],
                                             ['pd_df.sort_values(by=["c1","c2"])',
                                              'sp_df.sort(sp_df.c1, sp_df.c2)'],
                                             ['pd_df.sort_values(by=["c1","c2"], ascending=[False, True])',
                                              'sp_df.sort(sp_df.c1.desc(), sp_df.c2)'],
                                             ['pd_df.sort_values(by=["c1","c2"], ascending=False)', 
                                              'sp_df.sort(desc("c1"), desc("c2")) OR sp_df.sort(col("c1").desc(), col("c2").desc())']
                                            ],
                                            index = ['sort 1 col asc', 'sort 2+ cols asc', 'sort 2+ cols desc/asc', 'sort 2+ cols desc'], 
                                            columns = ['pandas', 'spark']))
pd_v_spark

Unnamed: 0,pandas,spark
csv,"pd.read_csv(""myfile.csv"")","spark.read.load(""myfile.csv"", format = ""csv"", sep = "","")"
json,"pd.read_json(""myfile.json"")","spark.read.load(""myfile.json"", format = ""json"") OR spark.read.json(""myfile.json"")"
1st n rows,pd_df.head(),"sp_df.show(), .head(), .take()"
1st row,pd_df.head(1),sp_df.first()
summary statistics,pd_df.describe(),sp_df.describe()
column names,pd_df.columns,sp_df.columns
# rows,len(pd_df),sp_df.count()
# distinct rows,len(pd_df.drop_duplicates()),sp_df.distinct().count()
df schema info,pd_df.info(),sp_df.printSchema()
select columns,"pd_df[[""col1"", ""col2""]]","sp_df.select(sp_df.col1, sp_df.col2)"


# 2. Load the mpg dataset as a spark dataframe.

A. Create 1 column of output that contains a message like the one below:

```The 1999 audi a4 has a 4 cylinder engine.```

For each vehicle.

B. Transform the trans column so that it only contains either manual or auto.



In [14]:
# Pandas df
mpg_pd = data('mpg')
mpg_pd.head()

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact
3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact
4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact
5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact


In [15]:
# Creating spark DF
mpg = spark.createDataFrame(data("mpg"))

In [16]:
mpg.show(5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



In [17]:
mpg.select(mpg.hwy, mpg.cty, mpg.model).show(5)

+---+---+-----+
|hwy|cty|model|
+---+---+-----+
| 29| 18|   a4|
| 29| 21|   a4|
| 31| 20|   a4|
| 30| 21|   a4|
| 26| 16|   a4|
+---+---+-----+
only showing top 5 rows



## A. Create 1 column of output that contains a message like the one below:
```The 1999 audi a4 has a 4 cylinder engine.```

In [18]:
#  Do one item at a time to see how it prints out!
mpg.select(concat(lit("The "), mpg.year, lit(" "), mpg.manufacturer, 
                  lit(" "), lit("has a "), mpg.cyl, lit(" cylinder engine."))
           .alias('Car Description')).show(5, truncate=False)

+--------------------------------------+
|Car Description                       |
+--------------------------------------+
|The 1999 audi has a 4 cylinder engine.|
|The 1999 audi has a 4 cylinder engine.|
|The 2008 audi has a 4 cylinder engine.|
|The 2008 audi has a 4 cylinder engine.|
|The 1999 audi has a 6 cylinder engine.|
+--------------------------------------+
only showing top 5 rows



## B. Transform the trans column so that it only contains either manual or auto.

In [19]:
#sp_df.sort(sp_df.c1, sp_df.c2) Sorting by transmission
mpg.sort(mpg.trans).show(3)

+------------+------+-----+----+---+--------+---+---+---+---+-------+
|manufacturer| model|displ|year|cyl|   trans|drv|cty|hwy| fl|  class|
+------------+------+-----+----+---+--------+---+---+---+---+-------+
|      nissan|altima|  2.5|2008|  4|auto(av)|  f| 23| 31|  r|midsize|
|      nissan|altima|  3.5|2008|  6|auto(av)|  f| 19| 26|  p|midsize|
|      nissan|maxima|  3.5|2008|  6|auto(av)|  f| 19| 25|  p|midsize|
+------------+------+-----+----+---+--------+---+---+---+---+-------+
only showing top 3 rows



In [20]:
# Filtered by trans
mpg.filter(mpg['trans'].like('%auto%')).show(3)

+------------+-----+-----+----+---+--------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|   trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+--------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+--------+---+---+---+---+-------+
only showing top 3 rows



In [21]:
mpg.select(
    'trans',
    regexp_extract('trans', r"^(\w+)",1).alias('manual')).show()

+----------+------+
|     trans|manual|
+----------+------+
|  auto(l5)|  auto|
|manual(m5)|manual|
|manual(m6)|manual|
|  auto(av)|  auto|
|  auto(l5)|  auto|
|manual(m5)|manual|
|  auto(av)|  auto|
|manual(m5)|manual|
|  auto(l5)|  auto|
|manual(m6)|manual|
|  auto(s6)|  auto|
|  auto(l5)|  auto|
|manual(m5)|manual|
|  auto(s6)|  auto|
|manual(m6)|manual|
|  auto(l5)|  auto|
|  auto(s6)|  auto|
|  auto(s6)|  auto|
|  auto(l4)|  auto|
|  auto(l4)|  auto|
+----------+------+
only showing top 20 rows



In [22]:
# Importing all the pyspark equations
from pyspark.sql.functions import *
mpg.withColumn("trans", when(mpg.trans.startswith("a"), "auto").otherwise("manual")).show(6)

+------------+-----+-----+----+---+------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl| trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto|  f| 16| 26|  p|compact|
|        audi|   a4|  2.8|1999|  6|manual|  f| 18| 26|  p|compact|
+------------+-----+-----+----+---+------+---+---+---+---+-------+
only showing top 6 rows



# 3. Load the tips dataset as a spark dataframe.

A. What percentage of observations are smokers?

B. Create a column that contains the tip percentage

C. Calculate the average tip percentage for each combination of sex and smoker.

In [23]:
# Creating spark DF
tips = spark.createDataFrame(data("tips"))
tips.show(3)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
+----------+----+------+------+---+------+----+
only showing top 3 rows



## A. What percentage of observations are smokers?

In [24]:
tips.groupBy('smoker').agg(round(count(tips.smoker)/ tips.count(),2).alias('smoker_percentages')).show()

+------+------------------+
|smoker|smoker_percentages|
+------+------------------+
|    No|              0.62|
|   Yes|              0.38|
+------+------------------+



## B. Create a column that contains the tip percentage

In [25]:
tip_percentage = tips.select(tips.tip/tips.total_bill).show()

+-------------------+
| (tip / total_bill)|
+-------------------+
|0.05944673337257211|
|0.16054158607350097|
|0.16658733936220846|
| 0.1397804054054054|
|0.14680764538430255|
|0.18623962040332148|
|0.22805017103762829|
|0.11607142857142858|
|0.13031914893617022|
| 0.2185385656292287|
| 0.1665043816942551|
|0.14180374361883155|
|0.10181582360570687|
|0.16277807921866522|
|0.20364126770060686|
|0.18164967562557924|
| 0.1616650532429816|
|0.22774708410067526|
|0.20624631703005306|
|0.16222760290556903|
+-------------------+
only showing top 20 rows



In [26]:
tips = tips.withColumn('tip_percentage', expr('Round((tip/total_bill) * 100)'))
tips.show()

+----------+----+------+------+---+------+----+--------------+
|total_bill| tip|   sex|smoker|day|  time|size|tip_percentage|
+----------+----+------+------+---+------+----+--------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|           6.0|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|          16.0|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|          17.0|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|          14.0|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|          15.0|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|          19.0|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|          23.0|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|          12.0|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|          13.0|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|          22.0|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2|          17.0|
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|          14.0|
|     15.42|1.57|  Male|    No|Sun|Dinner|   2|        

## C. Calculate the average tip percentage for each combination of sex and smoker.

In [27]:
tips.groupBy(tips.sex,tips.smoker).agg(round(mean(tips.tip),2)).show()

+------+------+------------------+
|   sex|smoker|round(avg(tip), 2)|
+------+------+------------------+
|  Male|    No|              3.11|
|  Male|   Yes|              3.05|
|Female|    No|              2.77|
|Female|   Yes|              2.93|
+------+------+------------------+



# 4. Use the seattle weather dataset referenced in the lesson to answer the questions below.

- Convert the temperatures to farenheight.
- Which month has the most rain, on average?
- Which year was the windiest?
- What is the most frequent type of weather in January?
- What is the average high and low temperature on sunny days in July in 2013 and 2014?
- What percentage of days were rainy in q3 of 2015?
- For each year, find what percentage of days it rained (had non-zero precipitation).

In [28]:
from vega_datasets import data
weather = data.seattle_weather()
weather = spark.createDataFrame(weather)

In [29]:
weather.show()

+-------------------+-------------+--------+--------+----+-------+
|               date|precipitation|temp_max|temp_min|wind|weather|
+-------------------+-------------+--------+--------+----+-------+
|2012-01-01 00:00:00|          0.0|    12.8|     5.0| 4.7|drizzle|
|2012-01-02 00:00:00|         10.9|    10.6|     2.8| 4.5|   rain|
|2012-01-03 00:00:00|          0.8|    11.7|     7.2| 2.3|   rain|
|2012-01-04 00:00:00|         20.3|    12.2|     5.6| 4.7|   rain|
|2012-01-05 00:00:00|          1.3|     8.9|     2.8| 6.1|   rain|
|2012-01-06 00:00:00|          2.5|     4.4|     2.2| 2.2|   rain|
|2012-01-07 00:00:00|          0.0|     7.2|     2.8| 2.3|   rain|
|2012-01-08 00:00:00|          0.0|    10.0|     2.8| 2.0|    sun|
|2012-01-09 00:00:00|          4.3|     9.4|     5.0| 3.4|   rain|
|2012-01-10 00:00:00|          1.0|     6.1|     0.6| 3.4|   rain|
|2012-01-11 00:00:00|          0.0|     6.1|    -1.1| 5.1|    sun|
|2012-01-12 00:00:00|          0.0|     6.1|    -1.7| 1.9|    

## Convert the temperatures to farenheight.
T(°F) = T(°C) × 9/5 + 32

In [30]:
weather = weather.withColumn("temp_max", round(weather.temp_max*9/5 + 32, 2))
weather = weather.withColumn("temp_min", round(weather.temp_min*9/5 + 32, 2))

In [31]:
weather.show()

+-------------------+-------------+--------+--------+----+-------+
|               date|precipitation|temp_max|temp_min|wind|weather|
+-------------------+-------------+--------+--------+----+-------+
|2012-01-01 00:00:00|          0.0|   55.04|    41.0| 4.7|drizzle|
|2012-01-02 00:00:00|         10.9|   51.08|   37.04| 4.5|   rain|
|2012-01-03 00:00:00|          0.8|   53.06|   44.96| 2.3|   rain|
|2012-01-04 00:00:00|         20.3|   53.96|   42.08| 4.7|   rain|
|2012-01-05 00:00:00|          1.3|   48.02|   37.04| 6.1|   rain|
|2012-01-06 00:00:00|          2.5|   39.92|   35.96| 2.2|   rain|
|2012-01-07 00:00:00|          0.0|   44.96|   37.04| 2.3|   rain|
|2012-01-08 00:00:00|          0.0|    50.0|   37.04| 2.0|    sun|
|2012-01-09 00:00:00|          4.3|   48.92|    41.0| 3.4|   rain|
|2012-01-10 00:00:00|          1.0|   42.98|   33.08| 3.4|   rain|
|2012-01-11 00:00:00|          0.0|   42.98|   30.02| 5.1|    sun|
|2012-01-12 00:00:00|          0.0|   42.98|   28.94| 1.9|    

## Which month has the most rain, on average?

In [35]:
# Parens allow to indent and tab code
(
    weather.withColumn("month", month("date"))
    .groupBy("month")
    .agg(sum("precipitation").alias("total_rainfall"))
    .sort("month")
    .show()
)

+-----+------------------+
|month|    total_rainfall|
+-----+------------------+
|    1|465.99999999999994|
|    2|             422.0|
|    3|             606.2|
|    4|             375.4|
|    5|             207.5|
|    6|             132.9|
|    7|              48.2|
|    8|             163.7|
|    9|235.49999999999997|
|   10|             503.4|
|   11|             642.5|
|   12| 622.7000000000002|
+-----+------------------+



## Which year was the windiest?

In [41]:
(
    weather.withColumn('year', year('date'))
    .groupBy('year')
    .agg(avg('wind').alias('avg_wind_speed'))
    .sort('year').show()
)

+----+------------------+
|year|    avg_wind_speed|
+----+------------------+
|2012| 3.400819672131148|
|2013|3.0158904109589058|
|2014| 3.387671232876714|
|2015| 3.159726027397261|
+----+------------------+



## What is the most frequent type of weather in January?

In [38]:
(
    weather.filter(month("date") == 1)
    .groupBy("weather")
    .agg(count("weather"))
    .show()
)

+-------+--------------+
|weather|count(weather)|
+-------+--------------+
|    fog|            38|
|drizzle|            10|
|   rain|            35|
|    sun|            33|
|   snow|             8|
+-------+--------------+



## What is the average high and low tempurature on sunny days in July in 2013 and 2014?

In [47]:
(
    weather.filter(month("date") == 7)
    .filter(year("date") > 2012)
    .filter(year('date') < 2015)
    .filter(weather.weather == 'sun')
    .agg(mean("temp_min"))
    .show()
)

+-----------------+
|    avg(temp_min)|
+-----------------+
|57.52884615384615|
+-----------------+



In [44]:
(
    weather.filter(month('date') == 7)
    .filter(year('date') == 2013 & 2014)
    .filter(weather.weather == 'sun')
    .agg(mean('temp_max'))
    .show()
)

+-------------+
|avg(temp_max)|
+-------------+
|        73.85|
+-------------+



## What percentage of days were rainy in q3 of 2015?

In [54]:
(
    weather.filter(year("date") == 2015)
    .filter(quarter('date') == 3)
    .select(when(col('weather') == 'rain', 1).otherwise(0).alias('rain'))
    .agg(mean('rain')).show()
)

+--------------------+
|           avg(rain)|
+--------------------+
|0.021739130434782608|
+--------------------+



## For each year, find what percentage of days it rained (had non-zero precipitation)

In [56]:
(
    weather.withColumn('year', year('date'))
    .select(when(col('precipitation') > 0,1). otherwise(0).alias('rain'), 'year')
    .groupby('year')
    .agg(mean('rain')).show()
)

+----+-------------------+
|year|          avg(rain)|
+----+-------------------+
|2015|0.39452054794520547|
|2013|0.41643835616438357|
|2014|  0.410958904109589|
|2012|0.48360655737704916|
+----+-------------------+

