# Student Alcohol Consumption

### Introduction:

This time you will download a dataset from the UCI Machine Learning Repository.

### Step 1. Import the necessary libraries

In [1]:
import pyspark
from pyspark.sql import SparkSession
# Create the Spark session for this notebook (or reuse one that already exists),
# registered under the application name "alcohol".
spark = (
    SparkSession.builder
    .appName("alcohol")
    .getOrCreate()
)
# Bare last expression so the notebook displays the session object.
spark

### Step 2. Import the dataset from this [address](https://raw.githubusercontent.com/guipsamora/pandas_exercises/master/04_Apply/Students_Alcohol_Consumption/student-mat.csv).

### Step 3. Assign it to a variable called df.

In [2]:
from pyspark import SparkFiles

In [3]:
url = "https://raw.githubusercontent.com/guipsamora/pandas_exercises/master/04_Apply/Students_Alcohol_Consumption/student-mat.csv"

# Register the remote CSV with Spark so it is downloaded and can be located via SparkFiles.
spark.sparkContext.addFile(url)

# Read the downloaded copy: first line is the header, column types are inferred.
df = spark.read.csv(
    SparkFiles.get("student-mat.csv"),
    header=True,
    inferSchema=True,
    sep=",",
)
df.head(2)

[Row(school='GP', sex='F', age=18, address='U', famsize='GT3', Pstatus='A', Medu=4, Fedu=4, Mjob='at_home', Fjob='teacher', reason='course', guardian='mother', traveltime=2, studytime=2, failures=0, schoolsup='yes', famsup='no', paid='no', activities='no', nursery='yes', higher='yes', internet='no', romantic='no', famrel=4, freetime=3, goout=4, Dalc=1, Walc=1, health=3, absences=6, G1=5, G2=6, G3=6),
 Row(school='GP', sex='F', age=17, address='U', famsize='GT3', Pstatus='T', Medu=1, Fedu=1, Mjob='at_home', Fjob='other', reason='course', guardian='father', traveltime=1, studytime=2, failures=0, schoolsup='no', famsup='yes', paid='no', activities='no', nursery='no', higher='yes', internet='yes', romantic='no', famrel=5, freetime=3, goout=3, Dalc=1, Walc=1, health=3, absences=4, G1=5, G2=5, G3=6)]

### Step 4. For the purpose of this exercise slice the dataframe from 'school' until the 'guardian' column

In [11]:
# Keep the contiguous run of columns from 'school' through 'guardian' (inclusive,
# hence the + 1 on the end index).
school_df = df.select(
    df.columns[
        df.columns.index("school") : df.columns.index("guardian") + 1
    ]
)
school_df.show(5)

+------+---+---+-------+-------+-------+----+----+-------+--------+------+--------+
|school|sex|age|address|famsize|Pstatus|Medu|Fedu|   Mjob|    Fjob|reason|guardian|
+------+---+---+-------+-------+-------+----+----+-------+--------+------+--------+
|    GP|  F| 18|      U|    GT3|      A|   4|   4|at_home| teacher|course|  mother|
|    GP|  F| 17|      U|    GT3|      T|   1|   1|at_home|   other|course|  father|
|    GP|  F| 15|      U|    LE3|      T|   1|   1|at_home|   other| other|  mother|
|    GP|  F| 15|      U|    GT3|      T|   4|   2| health|services|  home|  mother|
|    GP|  F| 16|      U|    GT3|      T|   3|   3|  other|   other|  home|  father|
+------+---+---+-------+-------+-------+----+----+-------+--------+------+--------+
only showing top 5 rows



### Step 5. Create a lambda function that will capitalize strings.

In [5]:
def capitalizer(x):
    """Return ``x`` with its first character upper-cased and the rest lower-cased.

    Args:
        x: The string to capitalize. May be empty.

    Returns:
        A new ``str``. An empty input yields an empty string.
    """
    # Slice with x[:1] rather than indexing x[0] so that an empty string
    # produces "" instead of raising IndexError.
    return x[:1].upper() + x[1:].lower()

print(capitalizer('lamBda'))

Lambda


In [6]:
# Lambda version requested by the exercise: delegates to the argument's own
# capitalize() method.
cap = lambda x : x.capitalize()

# Quick check with a mixed-case sample; prints "Lambda".
s1 = "lamBda"
print(cap(s1))

Lambda


### Step 6. Capitalize both Mjob and Fjob

In [7]:
from pyspark.sql.functions import *

In [8]:
from pyspark.sql.types import *

In [9]:
# Spark UDF that capitalizes a string value (first letter upper-case, the rest lower-case).
# Spark hands SQL NULL to a Python UDF as None, and None has no .capitalize(), so nulls
# are passed through unchanged instead of failing the job with an AttributeError.
udf_cap = udf(lambda x: x.capitalize() if x is not None else None, StringType())

In [17]:
# Replace both job columns with their capitalized versions in one chained expression.
school_df = (
    school_df
    .withColumn("Mjob", udf_cap(col("Mjob")))
    .withColumn("Fjob", udf_cap(col("Fjob")))
)
school_df.show(5)

+------+---+---+-------+-------+-------+----+----+-------+--------+------+--------+
|school|sex|age|address|famsize|Pstatus|Medu|Fedu|   Mjob|    Fjob|reason|guardian|
+------+---+---+-------+-------+-------+----+----+-------+--------+------+--------+
|    GP|  F| 18|      U|    GT3|      A|   4|   4|At_home| Teacher|course|  mother|
|    GP|  F| 17|      U|    GT3|      T|   1|   1|At_home|   Other|course|  father|
|    GP|  F| 15|      U|    LE3|      T|   1|   1|At_home|   Other| other|  mother|
|    GP|  F| 15|      U|    GT3|      T|   4|   2| Health|Services|  home|  mother|
|    GP|  F| 16|      U|    GT3|      T|   3|   3|  Other|   Other|  home|  father|
+------+---+---+-------+-------+-------+----+----+-------+--------+------+--------+
only showing top 5 rows



### Step 7. Print the last elements of the data set.

In [20]:
# Fetch the final two rows of the sliced frame; the result is a list of Row objects.
school_df.tail(2)

[Row(school='MS', sex='M', age=18, address='R', famsize='LE3', Pstatus='T', Medu=3, Fedu=2, Mjob='Services', Fjob='Other', reason='course', guardian='mother'),
 Row(school='MS', sex='M', age=19, address='U', famsize='LE3', Pstatus='T', Medu=1, Fedu=1, Mjob='Other', Fjob='At_home', reason='course', guardian='father')]

### Step 8. Did you notice the original dataframe is still lowercase? Why is that? Fix it and capitalize Mjob and Fjob.

In [21]:
# school_df was derived from df, so capitalizing school_df left df untouched.
# Apply the same capitalization to the full frame.
df = (
    df
    .withColumn("Mjob", udf_cap(col("Mjob")))
    .withColumn("Fjob", udf_cap(col("Fjob")))
)
df.show(5)

+------+---+---+-------+-------+-------+----+----+-------+--------+------+--------+----------+---------+--------+---------+------+----+----------+-------+------+--------+--------+------+--------+-----+----+----+------+--------+---+---+---+
|school|sex|age|address|famsize|Pstatus|Medu|Fedu|   Mjob|    Fjob|reason|guardian|traveltime|studytime|failures|schoolsup|famsup|paid|activities|nursery|higher|internet|romantic|famrel|freetime|goout|Dalc|Walc|health|absences| G1| G2| G3|
+------+---+---+-------+-------+-------+----+----+-------+--------+------+--------+----------+---------+--------+---------+------+----+----------+-------+------+--------+--------+------+--------+-----+----+----+------+--------+---+---+---+
|    GP|  F| 18|      U|    GT3|      A|   4|   4|At_home| Teacher|course|  mother|         2|        2|       0|      yes|    no|  no|        no|    yes|   yes|      no|      no|     4|       3|    4|   1|   1|     3|       6|  5|  6|  6|
|    GP|  F| 17|      U|    GT3|      T|

### Step 9. Create a function called majority that returns a boolean value, and use it to create a new column called legal_drinker (consider majority to be older than 17 years old)

In [26]:
def majority(x):
    """Flag whether an age is above the majority threshold (older than 17).

    Args:
        x: Age in years, or None when the age is missing.

    Returns:
        1 if ``x`` is greater than 17, 0 otherwise. None is returned for a
        None input so a missing age stays missing instead of raising
        TypeError on the comparison.
    """
    if x is None:
        return None
    return 1 if x > 17 else 0

print(majority(18))

1


In [27]:
# Wrap majority() as a Spark UDF producing an integer column. The lambda is kept so
# that majority is looked up when the UDF runs rather than when this cell is defined.
udf_majority = udf(f=lambda x: majority(x), returnType=IntegerType())

In [31]:
# Derive the legal_drinker flag from the age column and append it to the frame.
school_df = school_df.withColumn(
    "legal_drinker",
    udf_majority(col("age")),
)
school_df.show(5)

+------+---+---+-------+-------+-------+----+----+-------+--------+------+--------+-------------+
|school|sex|age|address|famsize|Pstatus|Medu|Fedu|   Mjob|    Fjob|reason|guardian|legal_drinker|
+------+---+---+-------+-------+-------+----+----+-------+--------+------+--------+-------------+
|    GP|  F| 18|      U|    GT3|      A|   4|   4|At_home| Teacher|course|  mother|            1|
|    GP|  F| 17|      U|    GT3|      T|   1|   1|At_home|   Other|course|  father|            0|
|    GP|  F| 15|      U|    LE3|      T|   1|   1|At_home|   Other| other|  mother|            0|
|    GP|  F| 15|      U|    GT3|      T|   4|   2| Health|Services|  home|  mother|            0|
|    GP|  F| 16|      U|    GT3|      T|   3|   3|  Other|   Other|  home|  father|            0|
+------+---+---+-------+-------+-------+----+----+-------+--------+------+--------+-------------+
only showing top 5 rows



### Step 10. Multiply every number of the dataset by 10. 
##### I know this makes no sense; don't forget it is just an exercise.

In [32]:
# Inspect the column types to see which columns hold numbers before scaling them.
school_df.printSchema()

root
 |-- school: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- address: string (nullable = true)
 |-- famsize: string (nullable = true)
 |-- Pstatus: string (nullable = true)
 |-- Medu: integer (nullable = true)
 |-- Fedu: integer (nullable = true)
 |-- Mjob: string (nullable = true)
 |-- Fjob: string (nullable = true)
 |-- reason: string (nullable = true)
 |-- guardian: string (nullable = true)
 |-- legal_drinker: integer (nullable = true)



In [41]:
# Probe how to read one column's type name: dtypes is a list of (name, type) pairs,
# so [0][1] is the type string of the single selected column.
c = 'sex'
school_df.select(c).dtypes[0][1]

'string'

In [46]:
# Multiply every numeric column by 10.
# Type names are read once from DataFrame.dtypes instead of running a select() per column.
# Matching only 'int' and 'float' would skip other numeric columns such as 'bigint',
# 'double' or 'decimal(p,s)', so all of Spark's numeric type names are accepted here.
# 'decimal' carries precision/scale in its name, hence the prefix test.
# NOTE: this reassigns school_df, so re-running the cell multiplies the values again.
numeric_type_names = {"tinyint", "smallint", "int", "bigint", "float", "double"}
for c, dtype in school_df.dtypes:
    if dtype in numeric_type_names or dtype.startswith("decimal"):
        school_df = school_df.withColumn(c, col(c) * 10)
school_df.show(5)

+------+---+---+-------+-------+-------+----+----+-------+--------+------+--------+-------------+
|school|sex|age|address|famsize|Pstatus|Medu|Fedu|   Mjob|    Fjob|reason|guardian|legal_drinker|
+------+---+---+-------+-------+-------+----+----+-------+--------+------+--------+-------------+
|    GP|  F|180|      U|    GT3|      A|  40|  40|At_home| Teacher|course|  mother|           10|
|    GP|  F|170|      U|    GT3|      T|  10|  10|At_home|   Other|course|  father|            0|
|    GP|  F|150|      U|    LE3|      T|  10|  10|At_home|   Other| other|  mother|            0|
|    GP|  F|150|      U|    GT3|      T|  40|  20| Health|Services|  home|  mother|            0|
|    GP|  F|160|      U|    GT3|      T|  30|  30|  Other|   Other|  home|  father|            0|
+------+---+---+-------+-------+-------+----+----+-------+--------+------+--------+-------------+
only showing top 5 rows

