# Iris

### Introduction:

This exercise may seem a little bit strange, but keep doing it.

### Step 1. Import the necessary libraries

In [1]:
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("iris").getOrCreate()
spark

### Step 2. Import the dataset from this [address](https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data). 

In [2]:
from pyspark import SparkFiles

### Step 3. Assign it to a variable called iris

In [5]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"

spark.sparkContext.addFile(url)

iris = spark.read.csv(SparkFiles.get("iris.data"), header=False, inferSchema=True, sep=',')

In [6]:
iris.show(5)

+---+---+---+---+-----------+
|_c0|_c1|_c2|_c3|        _c4|
+---+---+---+---+-----------+
|5.1|3.5|1.4|0.2|Iris-setosa|
|4.9|3.0|1.4|0.2|Iris-setosa|
|4.7|3.2|1.3|0.2|Iris-setosa|
|4.6|3.1|1.5|0.2|Iris-setosa|
|5.0|3.6|1.4|0.2|Iris-setosa|
+---+---+---+---+-----------+
only showing top 5 rows



### Step 4. Create columns for the dataset

In [57]:
# 1. sepal_length (in cm)
# 2. sepal_width (in cm)
# 3. petal_length (in cm)
# 4. petal_width (in cm)
# 5. class

In [7]:
iris.columns

['_c0', '_c1', '_c2', '_c3', '_c4']

In [21]:
new_names = ["sepal_length", 
             "sepal_width",
             "petal_length",
             "petal_width",
             "class"]

In [22]:
iris = iris.toDF(*new_names)
iris.show(5)

+------------+-----------+------------+-----------+-----------+
|sepal_length|sepal_width|petal_length|petal_width|      class|
+------------+-----------+------------+-----------+-----------+
|         5.1|        3.5|         1.4|        0.2|Iris-setosa|
|         4.9|        3.0|         1.4|        0.2|Iris-setosa|
|         4.7|        3.2|         1.3|        0.2|Iris-setosa|
|         4.6|        3.1|         1.5|        0.2|Iris-setosa|
|         5.0|        3.6|         1.4|        0.2|Iris-setosa|
+------------+-----------+------------+-----------+-----------+
only showing top 5 rows



### Step 5.  Is there any missing value in the dataframe?

In [13]:
import pyspark.sql.functions as F
def count_missings(spark_df,sort=True):
    """
    Counts number of nulls and nans in each column
    """
    df = spark_df.select([F.count(F.when(F.isnan(c) | F.isnull(c), c)).alias(c) for (c,c_type) in spark_df.dtypes if c_type not in ('timestamp', 'string', 'date')]).toPandas()

    if len(df) == 0:
        print("There are no any missing values!")
        return None

    if sort:
        return df.rename(index={0: 'count'}).T.sort_values("count",ascending=False)

    return df

In [23]:
count_missings(iris)

Unnamed: 0,count
sepal_length,0
sepal_width,0
petal_length,0
petal_width,0


### Step 6.  Lets set the values of the rows 10 to 29 of the column 'petal_length' to NaN

In [25]:
# iris.select('petal_length').collect()[10:30]

#trying to find an answer

### Step 7. Good, now lets substitute the NaN values to 1.0

In [None]:
#trying to find an answer

### Step 8. Now let's delete the column class

In [26]:
iris.drop("class").show(5)

+------------+-----------+------------+-----------+
|sepal_length|sepal_width|petal_length|petal_width|
+------------+-----------+------------+-----------+
|         5.1|        3.5|         1.4|        0.2|
|         4.9|        3.0|         1.4|        0.2|
|         4.7|        3.2|         1.3|        0.2|
|         4.6|        3.1|         1.5|        0.2|
|         5.0|        3.6|         1.4|        0.2|
+------------+-----------+------------+-----------+
only showing top 5 rows



### Step 9.  Set the first 3 rows as NaN

In [None]:
#trying to find an answer

### Step 10.  Delete the rows that have NaN

In [27]:
iris.dropna()

DataFrame[sepal_length: double, sepal_width: double, petal_length: double, petal_width: double, class: string]

### Step 11. Reset the index so it begins with 0 again

In [None]:
#trying to find an answer

### BONUS: Create your own question and answer it.