# ___

# [ Machine Learning in Geosciences ]

**Department of Applied Geoinformatics and Cartography, Charles University** 

*Lukas Brodsky lukas.brodsky@natur.cuni.cz*


___

# Scikit-learn (first steps)

Scikit-learn is a **Machine Learning Library for Python**. Scikit-learn is: 
* Simple and efficient tools for predictive data analysis
* Accessible to everybody, and reusable in various contexts
* Built on NumPy, SciPy, and matplotlib
* Open source, commercially usable - BSD license

After completing this lab, you should know:

* How to import sklearn modules. 
* How to run simple predictive analysis (classification and regression) with Scikit-learn in Python.

WITHOUT ANY AMBITION OF TUNING THE MODEL!!!  

## Documentation

    
Please refer to **[Scikit-learn official documentation](https://scikit-learn.org/)**, and use the **[Scikit-learn API Reference](https://scikit-learn.org/stable/modules/classes.html)**

Version used (should be 0.24.1):


In [None]:
# show the installed scikit-learn version (the lab text expects 0.24.1)
import sklearn
sklearn.__version__

## Using Scikit-learn for simple classification

In [None]:
# Sklearn API 

# data set splitting
from sklearn.model_selection import train_test_split
# label encoding
from sklearn.preprocessing import LabelEncoder

# classifier
from sklearn.tree import DecisionTreeClassifier
# from sklearn.neighbors import KNeighborsClassifier 
# from sklearn.svm import SVC 

# validation metrics
from sklearn import metrics
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.metrics import classification_report, confusion_matrix

# visualization
from sklearn.tree import plot_tree

# sample data
from sklearn import datasets

In [None]:
# data science libraries

# nd-arrays
import numpy as np

# dataframes
import pandas as pd

# plotting
import matplotlib.pyplot as plt 
# notebook solution
%matplotlib inline 
# seaborn works on top of matplotlib
import seaborn as sns

In [None]:
# load iris data from sklearn (alternative, kept for reference)
# iris = datasets.load_iris()

# or from seaborn; the result is kept in `df` and used by the cells below
df = sns.load_dataset('iris') 
# preview the first rows
df.head()

In [None]:
# view sample of the data set
import matplotlib.image as mpimg

# keep the image path in one place and reuse it, instead of assigning the
# path to `img` and then immediately overwriting it with the pixel data
img_path = 'iris-dataset.png'
img = mpimg.imread(img_path)
plt.figure(figsize=(15, 10), dpi=80)
plt.imshow(img)

In [None]:
# explore the data set: print a concise summary of the data frame
df.info()

In [None]:
# check the type of the object that holds the input data features
print(type(df))

In [None]:
# check dimensionality (rows, columns) of the data frame
# (sklearn variant) iris.data.shape
df.shape

In [None]:
# missing data: for each column, is there at least one null value?
df.isnull().any()

In [None]:
# attribute visualization - exploratory analysis
# (pairwise plots of the columns, colored by the 'species' column)
sns.pairplot(data=df, hue = 'species')

### Prepare features X and labels y for machine learning modelling

In [None]:
# Data preparation: list the available columns
df.columns

In [None]:
# inspect the 'species' column (used as the labels below)
df['species']

In [None]:
# labels: the 'species' column, kept separately as the reference classes
reference = df['species']
# features: a new frame without the label column (`df` itself stays untouched,
# because drop() returns a new object)
df1 = df.drop(columns='species')

In [None]:
# Defining the attributes: the feature table (all columns except 'species')
X = df1

In [None]:
# display the feature table
X

In [None]:
# display the reference labels
reference

In [None]:
# class encoding: turn the species labels into numeric codes
le = LabelEncoder()
num_codes = le.fit_transform(reference) 
# show the resulting codes
num_codes

In [None]:
# codes: put the species names side by side with their numeric codes
code_frame = pd.DataFrame(num_codes)
spec_code = pd.concat([df['species'], code_frame], axis=1)

# print the distinct values of each column (names first, then codes)
for _, column in spec_code.items():
    print(column.unique())

In [None]:
# the numeric class codes are the labels y for modelling
y = num_codes

In [None]:
# split data (randomly) into a training and a testing subset
# NOTE(review): test_size=0.7 holds out 70 % of the samples for testing and
# leaves only 30 % for training - confirm this is intended (0.3 is the more
# common choice).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.7,
    random_state=42,
)

# report the size of both subsets (training first, then testing)
for label, subset in (("Trenovaci mnozina ", X_train), ("Testovaci mnozina ", X_test)):
    print(label, subset.shape)

In [None]:
# model instance and parameters (collected in one place for readability)
tree_params = {
    'max_depth': 3,
    'min_samples_split': 2,
    'splitter': 'random',
    'random_state': 42,
}
my_tree = DecisionTreeClassifier(**tree_params)

In [None]:
# fitting the model on the training subset
my_tree.fit(X_train, y_train)

In [None]:
# tree cross validation: 5 folds on the training subset, scored with 'f1_macro'
cv_strom = cross_validate(
    my_tree,
    X_train,
    y_train,
    cv=5,
    scoring='f1_macro',
    return_estimator=True,
)
# average the per-fold test scores
mean_f1 = cv_strom['test_score'].mean()
print('Average F1-skore: {:.3f} '.format(mean_f1))

In [None]:
# macro-averaged F1-score of the fitted tree on the test data set
test_f1 = f1_score(y_test, my_tree.predict(X_test), average='macro')
# '{:.3f}' already rounds to three decimals, so no extra round() is needed
print('Accuracy F1-score on test set: {:.3f}'.format(test_f1))

In [None]:
# predicted classes for the test subset (used for the confusion matrix)
y_pred = my_tree.predict(X_test)

In [None]:
# confusion matrix: reference classes vs. predicted classes
cm = confusion_matrix(y_test, y_pred)

# draw it as an annotated heat map
plt.figure(figsize=(7, 7))
heatmap_style = dict(linewidths=.5, annot=True, square=True, cmap='Blues')
sns.heatmap(data=cm, **heatmap_style)
plt.ylabel('Reference')
plt.xlabel('Predicted class')


In [None]:
# Visualize tree
plt.figure(figsize=(9, 6), dpi=80)
rozhodovaci_strom = plot_tree(
    decision_tree=my_tree,
    # pass a plain list: some scikit-learn releases validate this parameter
    # strictly and do not accept a pandas Index
    feature_names=list(df1.columns),
    # take the class names from the fitted LabelEncoder so they are spelled
    # correctly and stay in the same order as the numeric codes used for y
    class_names=list(le.classes_),
    filled=True,
    precision=2,
    rounded=True,
)

### ---

## Using Scikit-learn for simple regression

(Support Vector Regression using linear and nonlinear kernels) 

In [None]:
# Sklearn API
from sklearn.svm import SVR

In [None]:
# generate non-linear sample data
# A dedicated, seeded generator makes the sample (and the RMSE values reported
# later) reproducible between runs, consistent with random_state=42 used in
# the classification part.
rng = np.random.RandomState(42)
n_samples = 40
X = np.sort(5 * rng.rand(n_samples, 1), axis=0)
y = np.sin(X).ravel()

# add some noise to every 5th target; the number of noise values is derived
# from the slice so it stays correct if n_samples is changed
y[::5] += 2 * (0.5 - rng.rand(y[::5].size))

In [None]:
# check the data dimensionality of the inputs X and the targets y
print(X.shape)
print(y.shape)

In [None]:
# plot the samples (black dots) to see what we are going to work with
plt.plot(X[:,0], y, 'k.')

In [None]:
# create model instance (support vector regression with a linear kernel)
# NOTE(review): per the scikit-learn SVR docs, gamma is the coefficient for
# the 'rbf', 'poly' and 'sigmoid' kernels, so it presumably has no effect here.
svr_lin = SVR(kernel='linear', C=100, gamma='auto')

In [None]:
# display the model instance
svr_lin

In [None]:
# fit the linear model to the data
svr_lin.fit(X, y)

In [None]:
# predict the values given X (the same X the model was fitted on)
lin_pred = svr_lin.predict(X)

In [None]:
# check the dimensionality of the predictions
lin_pred.shape

In [None]:
# diff = y - lin_pred
# print(diff)

In [None]:
# plot the data points (black dots) with your model results (blue dashed line)
plt.plot(X[:,0], y, 'k.')
plt.plot(X[:,0], lin_pred, 'b--')


In [None]:
# test your non-linear model (polynomial kernel)
# degree 3 or other 
svr_poly = SVR(kernel='poly', C=100, gamma='auto', 
               degree=3, epsilon=.1, coef0=1)

In [None]:
# fit the polynomial model to the data
svr_poly.fit(X, y)

In [None]:
# make predictions, given X (the same X the model was fitted on)
poly_pred = svr_poly.predict(X)

In [None]:
# poly_pred

In [None]:
# plot the input data (black dots) and your model (solid blue line)
plt.plot(X[:,0], y, 'k.')
plt.plot(X[:,0], poly_pred, 'b-')

In [None]:
# RMSE 
from sklearn.metrics import mean_squared_error

In [None]:
# RMSE lin vs. poly(3)
from sklearn.metrics import mean_squared_error as mse

# root of the mean squared error between the targets and each model's predictions
rmse_lin = np.sqrt(mse(y, lin_pred))
rmse_poly = np.sqrt(mse(y, poly_pred))

# report both values rounded to three decimals
print('RMSE linear     model: {}'.format(np.round(rmse_lin, 3)))
print('RMSE polynomial model: {}'.format(np.round(rmse_poly, 3)))

# That's all for now!