### Acquisition of the iris dataset

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd  # This is where we are going to store the explanatory variables
import seaborn as sns
from sklearn.datasets import load_iris  # scikit-learn is a machine learning library

# Load the bundled iris dataset; later cells read .data, .target,
# .target_names and .feature_names from this object.
data = load_iris()

# Show the loaded object as the cell output.
data

In [None]:
# List the fields available on the loaded dataset object.
data.keys()

In [None]:
# Raw feature values; these become the columns of X below.
data.data

In [None]:
# Names of the target classes; reused later as plot legend labels.
data.target_names

In [None]:
# Feature names as shipped with the dataset (X below uses snake_case names instead).
data.feature_names

### Variable X = Pandas DataFrame

In [None]:
# Explanatory variables: wrap the raw feature values in a DataFrame with
# snake_case column names, then preview the first rows.
feature_columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
X = pd.DataFrame(data.data, columns=feature_columns)
X.head()

### Variable Y = Objective

In [None]:
# Objective variable: the encoded class label of every sample, stored as a
# single-column DataFrame named 'species'.
Y = pd.DataFrame({'species': data.target})
Y.head()

### Problem 2 Combining data
##### A DataFrame variable (df) that combines X and Y

In [None]:
# Attach the 'species' column to the four feature columns. X and Y were both
# built without an explicit index, so their rows line up one-to-one.
df = X.join(Y)
df.head()

### Problem 3 Checking the data
##### Here we check what kind of data we will be working with
##### First we display the 4th sample from the beginning and see what each feature is like (numerical data, character data, etc.)

In [None]:
# The 4th sample sits at zero-based position 3; slicing (rather than scalar
# indexing) keeps the result a one-row DataFrame.
X4 = df.iloc[3:4]
X4

### see what each feature is like

In [None]:
# Print the column names, non-null counts and dtypes of the one-row frame.
X4.info()

### the total number of samples for each label

In [None]:
# Count how many samples carry each encoded species label.
df['species'].value_counts()

### Check whether any feature has missing values

In [None]:
# info() prints the non-null count and dtype of every column.
df.info()
# Per-column count of missing values (this is the cell's displayed output).
df.isnull().sum()

### Display the mean, standard deviation, and quartiles of the feature values at once

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column.
df.describe()

### Problem 4 Examining the iris dataset itself
This data set, known as Fisher’s or Anderson’s iris data set, dates back to 1936: the British statistician and biologist Ronald Fisher published it in his paper “The use of multiple measurements in taxonomic problems” as an example of linear discriminant analysis. It is sometimes called Anderson’s Iris data set because Edgar Anderson collected the data to quantify the morphologic variation of Iris flowers of three related species. Between them, these two scientists produced a data set that is still used today to learn the basics of data analysis (for example in RStudio). The iris data set gives the measurements in centimeters of the variables sepal length and width and petal length and width, respectively, for 50 flowers from each of 3 species of iris. The species are Iris setosa, versicolor, and virginica.
The Iris Dataset contains four features (length and width of sepals and petals) of 50 samples of three species of Iris (Iris setosa, Iris virginica and Iris versicolor). These measures were used to create a linear discriminant model to classify the species. The dataset is often used in data mining, classification and clustering examples and to test algorithms.

### Problem 5 Extracting the required data
Extract the `sepal_width` column in three different ways

In [None]:
df.loc[:,'sepal_width']  # method 1: label-based selection with .loc (all rows, one column label)

In [None]:
df.iloc[:,1:2].head()  # method 2: positional selection with .iloc (column position 1); only the first rows are shown

In [None]:
df['sepal_width']  # method 3: plain column lookup by name

#### Extract the 50th to 99th data

In [None]:
# .loc slices by index label and includes both endpoints: rows labelled 50 through 99.
df.loc[50:99]

#### Extract the 50th to 99th data of the `petal_length` column

In [None]:
# .iloc slices by position with an exclusive stop, so 50:100 covers positions 50-99;
# column position 2 is petal_length in the column order used to build X.
df.iloc[50:100, 2]


### Extract data with a `petal_width` value of 0.2

In [None]:
# Select every sample whose petal_width is 0.2, keeping all columns so the
# matching rows (not just a column of identical 0.2 values) are shown.
# A single boolean mask passed to .loc replaces the chained .loc[...][...] lookup,
# and np.isclose avoids relying on exact floating-point equality.
df.loc[np.isclose(df['petal_width'], 0.2)]


### Difference between .loc and .iloc
`.loc`: a label-based selector, which means that we pass the names (labels) of the rows or columns we want to select. A slice passed to `.loc` includes its last element, unlike `.iloc`. `.loc` also accepts a boolean mask (a boolean array, or a boolean Series aligned on the index).
`.iloc`: an integer-position-based selector, which means that we pass integer positions to select specific rows/columns. A slice passed to `.iloc` does not include its last element, unlike `.loc`. `.iloc` accepts a boolean array or list as a mask, but not a boolean Series.

### Problem 6 Creating a diagram
##### Make a pie chart of the number of samples per label (while showing percentages)

In [None]:
# Pie chart of the class balance: one wedge per species label, annotated with
# its share of all samples.

# Split the samples by encoded label, in label order 0, 1, 2 so the counts line
# up with the order of data.target_names.
sample_setosa = df[df['species'] == 0]
sample_versicolor = df[df['species'] == 1]
sample_virginica = df[df['species'] == 2]

# Each wedge size comes from its own subset.
samples = [len(sample_setosa), len(sample_versicolor), len(sample_virginica)]
# Take the wedge labels from the dataset itself rather than retyping the names.
labels = list(data.target_names)
explode = (0.1, 0.1, 0.1)  # pull every wedge slightly away from the centre
plt.pie(samples, explode=explode, labels=labels, autopct='%1.1f%%',
        shadow=True, startangle=90)
plt.title("A pie chart of the number of samples per label")
plt.show()

##### Select features one by one and visualize the distribution of data for each label using a box plot
#### Select features one by one and visualize the distribution of data for each label using a violin plot
#### for sepal length

In [None]:
# Box plot (left) and violin plot (right) of sepal_length for each species.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

sns.boxplot(x="species", y="sepal_length", data=df, ax=axes[0])
sns.violinplot(x="species", y="sepal_length", data=df, ax=axes[1])

# The x axis shows the encoded labels 0/1/2; relabel the ticks with the species
# names taken from the dataset so they cannot be misspelled.
species_names = list(data.target_names)
for ax, title in zip(axes, ('sepal_length boxplot', 'sepal_length violinplot')):
    ax.set_xticklabels(species_names)
    ax.set_title(title)

plt.show()

#### for petal length

In [None]:
# Box plot (left) and violin plot (right) of petal_length for each species.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

sns.boxplot(x="species", y="petal_length", data=df, ax=axes[0])
sns.violinplot(x="species", y="petal_length", data=df, ax=axes[1])

# The x axis shows the encoded labels 0/1/2; relabel the ticks with the species
# names taken from the dataset so they cannot be misspelled.
species_names = list(data.target_names)
for ax, title in zip(axes, ('petal_length boxplot', 'petal_length violinplot')):
    ax.set_xticklabels(species_names)
    ax.set_title(title)

plt.show()

### Box plots and violin plots display similar plots. Explain how they differ and what advantages each has

Violin plots are closely related to Tukey's (1977) box plots; they add useful information such as the distribution of the sample data (density trace).
By default, box plots draw data points lying more than 1.5 × the inter-quartile range beyond the quartiles as individual outliers, whereas violin plots show the whole range of the data. Moreover, a violin plot is more informative than a plain box plot: while a box plot only shows summary statistics such as the median and the interquartile range, the violin plot shows the full distribution of the data. The difference is particularly useful when the data distribution is multimodal (more than one peak). In this case a violin plot shows the presence of different peaks, their position and relative amplitude. The box plot, in turn, is simpler to read: the median, the quartiles and the outliers can be read off directly, and its shape does not depend on a smoothing setting. Like box plots, violin plots are used to represent comparison of a variable distribution (or sample distribution) across different "categories" (for example, temperature distribution compared between day and night, or distribution of car prices compared across different car makers).

### Problem 7 Confirming the relationship between features
Take one feature for each of the vertical and horizontal axes and create a color-coded scatter plot for each type (6).

In [None]:
# One sub-frame per encoded species label (0, 1, 2), used by the scatter plots below.
s1, s2, s3 = (df[df['species'] == code] for code in (0, 1, 2))

# Legend entries (species names) and axis names (feature columns, in column order).
labels = data.target_names
label = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

In [None]:
# Scatter plot of column 0 (sepal_length) against column 1 (sepal_width),
# drawn once per species so each gets its own colour and legend entry.
for subset, species_name in zip((s1, s2, s3), labels):
    plt.scatter(subset.iloc[:, 0], subset.iloc[:, 1], label=species_name)

# The axis names must use the same column positions as the plotted data.
plt.xlabel(label[0])
plt.ylabel(label[1])
plt.legend()
plt.show()

In [None]:
# Species names again, for reference: these are the legend labels used in the scatter plots.
data.target_names


In [None]:
# 2 x 3 grid of scatter plots, one panel per pair of features, with one
# colour per species in every panel.
fig, ((ax1, ax2, ax3), (ax4, ax5, ax6)) = plt.subplots(2, 3, figsize=(20, 10))

# (x, y) column positions shown in each panel, in panel order ax1..ax6.
column_pairs = [(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)]
panels = (ax1, ax2, ax3, ax4, ax5, ax6)
species_subsets = (s1, s2, s3)

for panel, (x_col, y_col) in zip(panels, column_pairs):
    # One scatter call per species so each gets its own colour and legend entry.
    for subset, species_name in zip(species_subsets, labels):
        panel.scatter(subset.iloc[:, x_col], subset.iloc[:, y_col], label=species_name)

    panel.set_title('scatter Plot of Feature Values')
    panel.set_xlabel(label[x_col])
    panel.set_ylabel(label[y_col])
    panel.legend()

plt.show()

#### Create a scatterplot matrix that displays all combinations of scatterplots at once (1)

In [None]:
# Scatterplot matrix of every pair of columns, coloured by species, with a
# kernel density estimate on the diagonal.
sns.pairplot(data=df, hue='species', diag_kind='kde')
plt.show()

#### Create a correlation coefficient matrix for 4 features

In [None]:
# Pairwise Pearson correlation between the four feature columns
# (the species column is not part of X, so it is excluded).
corrMatrix = X.corr(method='pearson')
corrMatrix

#### Make a heat map of the correlation coefficient matrix (1)

In [None]:
# Heat map of the correlation matrix, with each coefficient written in its cell.
sns.heatmap(data=corrMatrix, annot=True)
plt.show()