# sklearn -- from last week

In [None]:
!/home/atreju/.conda/envs/dhbw/bin/pip install scikit-learn

In [None]:
import numpy as np
import pandas as pd
import sklearn  # usually: import of individual submodules

## more difficult classification

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.metrics import matthews_corrcoef, ConfusionMatrixDisplay

In [None]:
# data source: https://www.kaggle.com/datasets/purusinghvi/email-spam-classification-dataset/
spam_df = pd.read_csv('../data/spam_emails.csv')

### split train and test datasets

In [None]:
spam_df

In [None]:
# Keep only the first 10,000 rows so that every cell below runs faster.
# The full dataset is loaded again near the end of this section.
spam_df = spam_df[:10000]

In [None]:
# Hold out 10% of the emails for the final evaluation, keeping the label
# proportions equal in both parts (stratify).
# A fixed random_state makes the split -- and therefore every score computed
# below -- reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    spam_df.text,
    spam_df.label,
    test_size=0.1,
    stratify=spam_df.label,
    random_state=42,
)

### turn text into numerical features

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

In [None]:
# calculate word counts for each word in training corpus for each document (=email)
vectorizer = CountVectorizer()
word_counts = vectorizer.fit_transform(x_train)

In [None]:
word_counts

In [None]:
# tfidf - normalize by document length and by word frequency in the corpus

In [None]:
tfidf_transformer = TfidfTransformer()
word_frequencies = tfidf_transformer.fit_transform(word_counts)

In [None]:
word_frequencies

In [None]:
classifier = tree.DecisionTreeClassifier()
classifier.fit(word_frequencies, y_train)

In [None]:
# expected quality on training set
matthews_corrcoef(y_train, classifier.predict(word_frequencies))

In [None]:
word_counts_test = vectorizer.transform(x_test)
word_frequencies_test = tfidf_transformer.transform(word_counts_test)
word_frequencies_test

In [None]:
ConfusionMatrixDisplay.from_estimator(classifier, word_frequencies_test, y_test)

In [None]:
matthews_corrcoef(y_test, classifier.predict(word_frequencies_test))

In [None]:
# training a less-overfit model

In [None]:
classifier = tree.DecisionTreeClassifier(max_depth=2)
classifier.fit(word_frequencies, y_train)

In [None]:
# but this one is (probably) underfit:
matthews_corrcoef(y_train, classifier.predict(word_frequencies))

In [None]:
matthews_corrcoef(y_test, classifier.predict(word_frequencies_test))

### hyperparameter tuning and cross-validation

In [None]:
# cross-validation with N folds (cross_val_score clones the classifier and re-fits it on every fold)
from sklearn.model_selection import cross_val_score
cross_val_score(classifier, word_frequencies, y_train, cv=10, scoring='matthews_corrcoef')

In [None]:
# Manual hyperparameter search over the tree depth.
# For each candidate depth, run 3-fold cross-validation on the training
# features and record (mean, standard deviation) of the Matthews correlation
# coefficient. The entries of mean_scores are in the same order as the
# depth list below.
mean_scores = []
for depth in [2, 5, 10, 20, 30]:
    print(f'Running cross-validation for {depth=}')
    # a fresh, unfitted tree per depth; note this rebinds the name `classifier`
    classifier = tree.DecisionTreeClassifier(max_depth=depth)
    scores = cross_val_score(classifier, word_frequencies, y_train, cv=3, scoring='matthews_corrcoef')
    mean_scores.append((scores.mean(), scores.std()))

In [None]:
mean_scores

In [None]:
final_classifier = tree.DecisionTreeClassifier(max_depth=20)
final_classifier.fit(word_frequencies, y_train)

In [None]:
# max_depth=20 seems a fair trade-off,
# but the resulting score is still not great
matthews_corrcoef(y_test, final_classifier.predict(word_frequencies_test))

In [None]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    'max_depth': [2, 5, 10, 20, 30],
    'min_samples_split':  [2, 10, 50]
}

In [None]:
grid = GridSearchCV(tree.DecisionTreeClassifier(), param_grid, n_jobs=4, cv=5, scoring='matthews_corrcoef', verbose=1)
grid

In [None]:
grid.fit(word_frequencies, y_train)

In [None]:
grid.best_score_

In [None]:
grid.best_params_

In [None]:
grid.best_estimator_

In [None]:
# again, final evaluation on the test set
matthews_corrcoef(y_test, grid.best_estimator_.predict(word_frequencies_test))

### Pipelines

In [None]:
# Many manual steps above, hard to reproduce, deploy, ...

In [None]:
from sklearn.pipeline import Pipeline

In [None]:
spam_pipeline = Pipeline([
    ('vectorize', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('classify', tree.DecisionTreeClassifier(max_depth=20))
])

In [None]:
# no more word_frequencies, just the text
spam_pipeline.fit(x_train, y_train)

In [None]:
matthews_corrcoef(y_test, spam_pipeline.predict(x_test))

### Comparing different Algorithms

In [None]:
class AlgorithmSelector:
    """Pipeline step that delegates to an exchangeable classifier.

    The wrapped classifier is chosen through ``set_params(classifier=...)``,
    which lets a parameter grid swap the whole algorithm. All other
    parameters passed to ``set_params`` are forwarded to the wrapped
    classifier.
    """

    def __init__(self, classifier=None):
        """Create the selector, optionally with an initial classifier."""
        self.classifier = classifier

    def _wrapped(self):
        """Return the wrapped classifier or fail with a clear message."""
        chosen = getattr(self, 'classifier', None)
        if chosen is None:
            raise AttributeError(
                'AlgorithmSelector has no classifier yet; '
                'call set_params(classifier=...) first'
            )
        return chosen

    def fit(self, X, y, **kwargs):
        """Fit the wrapped classifier on ``X``/``y`` and return ``self``."""
        self._wrapped().fit(X, y, **kwargs)
        return self

    def predict(self, X, y=None):
        """Return the wrapped classifier's predictions for ``X``.

        ``y`` is accepted but ignored.
        """
        return self._wrapped().predict(X)

    def score(self, X, y):
        """Return the wrapped classifier's own ``score`` for ``X``/``y``."""
        return self._wrapped().score(X, y)

    def set_params(self, **kwargs):
        """Select the classifier and/or forward parameters to it.

        ``classifier`` (optional) replaces the wrapped classifier. When it is
        omitted, the currently wrapped classifier is kept, so its
        hyperparameters can be updated on their own. Every remaining keyword
        is passed on to the wrapped classifier's ``set_params``.

        Raises ``AttributeError`` if parameters are given but no classifier
        has been selected yet. Returns ``self``.
        """
        if 'classifier' in kwargs:
            self.classifier = kwargs.pop('classifier')
        if kwargs:
            self._wrapped().set_params(**kwargs)
        return self

In [None]:
spam_pipeline = Pipeline([
    ('vectorize', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('classify', AlgorithmSelector())
])

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB

# One dict per algorithm to compare.
# 'classify__classifier' reaches AlgorithmSelector.set_params, which installs
# the given estimator as the wrapped classifier; the remaining 'classify__*'
# entries are forwarded by AlgorithmSelector.set_params to that classifier.
param_grid = [
    {
        # linear model trained with SGD; TF-IDF weighting is only varied in this grid
        'tfidf__use_idf': (True, False),
        'classify__classifier': [SGDClassifier()],
        'classify__max_iter': [50, 80],
    },
    {
        # decision tree with different maximum depths
        'classify__classifier': [tree.DecisionTreeClassifier()],
        'classify__max_depth': [3, 10, 20]
    },
    {
        # multinomial naive Bayes with different alpha values
        'classify__classifier': [MultinomialNB()],
        'classify__alpha': (1e-2, 1e-3, 1e-1)
    },
]

In [None]:
grid = GridSearchCV(spam_pipeline, param_grid, n_jobs=4, cv=3, scoring='matthews_corrcoef', verbose=1)
grid

In [None]:
grid.fit(x_train, y_train)

In [None]:
grid.best_params_

In [None]:
grid.best_estimator_

In [None]:
# SGDClassifier is even fast enough to fit on the full dataset

In [None]:
# Reload the complete dataset (the earlier cells only used the first 10,000 rows)
# and split it again, stratified by label.
# A fixed random_state makes the split and the final score reproducible.
spam_df = pd.read_csv('../data/spam_emails.csv')
x_train, x_test, y_train, y_test = train_test_split(
    spam_df.text,
    spam_df.label,
    test_size=0.1,
    stratify=spam_df.label,
    random_state=42,
)

In [None]:
grid.best_estimator_.fit(x_train, y_train)

In [None]:
matthews_corrcoef(y_test, grid.best_estimator_.predict(x_test))

# Recap: What did we talk about?

## Week 1: Datatypes

- **Syntax** and **Semantics**
- **Python Properties** (multi-paradigm, typing, whitespace)
- **Data Types** and their Properties
- **Operators**: Arithmetic, (augmented) assignment, comparison, logical, bitwise

## Week 2: Control Structures

- **compound statements**: clause, header, suite
- **conditionals**: if/elif/else
- **loops**: while/for, break/continue, else
- **comprehensions**: list/dict/set-comprehensions
- **pattern matching**: match, case
- **exception handling**: try/except/else/finally, raise
- **context managers**: with
- **functions**: definitions, arguments and returns, varargs, default values, docstrings
- **first-class functions**: anonymous functions, assigning functions to variables, functions as argument and return value
- **decorators**

## Week 3: Classes

- **codestyle**: PEP-8
- **scoping**: local/non-local/global/builtin
- **classes**:
  - **concepts**: class, instance, attribute, method
  - **advantages**: modular, reusable, maintainable, encapsulation, real-world modeling
  - **construction**: creation, class and instance attributes, protected and private attributes
  - **inheritance**
  - **dunder methods**

## Week 4 & 5: Standard Library

- **importing packages**
- **keywords vs builtins**
- **standard library packages**
  - file management
  - time & date
  - dealing with text
  - math and random numbers
  - file formats
  - generic services (os, sys, logging, time)

## Week 6: Advanced Concepts

- **Iterables and Iterators**: concepts, `__iter__` and `__next__`, generator functions, genexp, yield, coroutines
- **Dicts and Sets**: how do they work under the hood
- **Type Hints**: optional but useful

## Week 7: working with numbers

- **numpy**: ndarray, shape, indexing, broadcasting
- **matplotlib**: basic plotting in python, types of plots
- **scipy**: advanced math -- linear algebra, statistics, fft, interpolation, ...

## Week 8: working with tabular data

- **pandas**: DataFrame, filter, sort, group, aggregate, join, NaNs
- **plotnine**: high-level plotting
- **seaborn**: plotting for statistics

## Week 9: Machine Learning

- **nomenclature**: definition of terms and concepts
- **sklearn**: classification, regression, clustering

# Further Reading

## Data Science / ML

- **tensorflow/pytorch**: deep learning
- **keras**: simplified interface for tf/pytorch.
- **ray**: orchestrate machine learning on clusters
- **statsmodels**: statistical modeling and hypothesis testing
- **xgboost/catboost/lightgbm**: gradient boosting models, gradient boosted trees
- **huggingface/transformers**: website and libraries with datasets, pipelines and pre-trained models focussed on NLP

## Web development

- **requests**: client side library for REST requests
- **aiohttp**: asynchronous requests
- **fastapi**: server side library for REST interfaces
- **flask/django**: web frameworks including HTML templating etc.
- **beautifulsoup/scrapy**: web scraping
- **pyjwt, oauth2**: jwt tokens and authentication
- **streamlit/dash/gradio**: quickly build dashboards and ML model interfaces

## Others

- **tkinter**: part of the standard library for developing GUIs
- **pygtk/pyQT**: cross platform interfaces with GTK or QT respectively.
- **wxPython**: native UI elements on OSX/Linux/Windows
- **pillow**: image manipulation
- **pygame**: :)