In [1]:
# https://www.kaggle.com/uciml/glass
import pyforest
import numpy as np
from sklearn.model_selection import GridSearchCV
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier

print('Done')

ModuleNotFoundError: No module named 'PIL'

# The Purpose
The goal is to predict rainy days based on changes in atmospheric pressure.
The data covers the year 2021 and comes from this website: http://meteo.ftj.agh.edu.pl/archivalData

# Reading Input Data


In [None]:
# Load the semicolon-delimited raw export and print its column/dtype summary.
DATA = pd.read_csv("input/data2.csv", delimiter=';')
DATA.info()


# Exploring the Data, planning the preprocessing


In [None]:
# Preview the first rows of the raw data.
DATA.head()

In [None]:
# Parse the 'time' strings (date, time and fractional seconds) into datetime values,
# then preview the converted frame.
DATA = DATA.assign(
    time=pd.to_datetime(DATA['time'], format='%Y-%m-%d %H:%M:%S.%f')
)
DATA.head()

## Columns


In [None]:
# Project-local plotting helper (libs/simpleplotter); presumably draws an
# overview of every column in the frame — confirm against the helper's source.
from libs.simpleplotter import simple_features_overview
simple_features_overview(DATA)

# Correlations between data

In [None]:
# Heatmap of the pairwise correlations between the numeric columns of the raw data.
# The numeric (and boolean) columns are selected explicitly: older pandas dropped
# non-numeric columns from corr() silently, but pandas >= 2.0 no longer does, and
# DATA contains the parsed 'time' column, which is not a numeric feature.
data_correlations = DATA.select_dtypes(include=['number', 'bool']).corr()
fig, ax = plt.subplots(figsize=(22, 20))
# Draw on the axes created above instead of relying on the implicit "current axes".
sns.heatmap(data_correlations, annot=True, ax=ax)

##

In [None]:
# from libs.simpleplotter import simple_correlations
# simple_correlations(DATA, "rainIntensity")

## Missing, Categorical & Not Useful Data


In [None]:
# Count the missing values in each column of the raw data.
DATA.isnull().sum()

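Also, text columns that contain only unique strings should be dropped, because they cannot be converted to categorical columns; text columns with few distinct values can be converted to categorical instead.
(The original note named the columns Ticket, Name, Sex and Embarked, which appear to be left over from a Titanic template; verify them against this dataset's columns.)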

In [None]:
# Summarise the object-dtype (string) columns only.
# NOTE(review): once 'time' has been parsed to datetime there may be no object
# columns left, in which case this call may fail — confirm against the data.
DATA.describe(include='object')


# Data Processing


In [None]:
# Build the model-ready frame from a deep copy, so the raw DATA frame stays untouched
# and this cell can be re-run safely.
PROCESSED = DATA.copy(deep=True)

# Fill missing records with the mean of their own column.
# The filled columns are assigned back to the frame instead of calling
# fillna(inplace=True) on a column selection: that chained-assignment form
# emits warnings in pandas 2.x and does not update the frame at all once
# Copy-on-Write is enabled.
mean_filled_columns = [
    'minPm10', 'maxPm10', 'averagePm10',
    'maxWindDirection', 'averageWindDirection', 'minWindDirection',
    'maxWindSpeed', 'minWindSpeed', 'averageWindSpeed',
]
PROCESSED[mean_filled_columns] = PROCESSED[mean_filled_columns].fillna(
    PROCESSED[mean_filled_columns].mean()
)

# Keep a copy of the time column for the predictions; it is removed from the
# frame itself so it is not used as a feature.
time = PROCESSED['time'].copy()
PROCESSED = PROCESSED.drop(columns='time')

# Classifiers need a binary target: is_rain is 1 where rainAccumulation is
# greater than zero, otherwise 0.
PROCESSED['is_rain'] = np.where(PROCESSED['rainAccumulation'] > 0, 1, 0)

# rainAccumulation defines the target, so it must not stay among the features;
# rainIntensity is removed with it (presumably because it would leak the target
# as well — confirm).
PROCESSED = PROCESSED.drop(columns=['rainAccumulation', 'rainIntensity'])

# Remaining missing values per column after the processing above.
PROCESSED.isnull().sum()

In [None]:
# Preview the first 20 rows of the processed frame.
PROCESSED.head(20)


# Post-Processing Data Analysis


In [None]:
# Correlation heatmap of the processed frame, drawn in blue shades with the
# coefficient printed in every cell.
plt.subplots(figsize=(22, 20))
data_correlations = PROCESSED.corr()
sns.heatmap(data=data_correlations, annot=True, cmap='Blues', ax=plt.gca())

# Preparing for Model Training

## Separating target from features

In [None]:
# Feature matrix: every processed column except the target.
X = PROCESSED.drop(columns=['is_rain'])
# Target vector: the binary is_rain column.
y = PROCESSED['is_rain']
X.info()

## Splitting train data into train & validation data
As we can see, the number of records in the training data is reduced, because half of the rows are held out for validation.

In [None]:
# Split the rows 50/50 into training and validation sets; the fixed
# random_state keeps the shuffle identical between runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.5,
    train_size=0.5,
    random_state=0,
)
X_train.info()

# Models Training & Hyper-params Tuning for Different Classification Models


In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
# from sklearn.svm import SVC
# svc=SVC(probability=True, kernel='linear')
# from sklearn.linear_model import LogisticRegression

# helper function to retrieve model name from model object
from libs.simple_processing import get_model_name

# Candidate estimators, each paired with the hyper-parameter grid to search.
# n_jobs is deliberately not part of any grid: it only controls parallelism
# (it cannot change the score), and GridSearchCV below already parallelises
# the search with n_jobs=-1.
params = [
    {
        'model': KNeighborsClassifier(),
        'hyperparams': {
            'n_neighbors': range(16, 20, 1)
        }
    },
    # {
    #     'model': DecisionTreeClassifier(),
    #     'hyperparams': {
    #         'max_depth': [1, 2, 3, 4, 5]
    #     }
    # },
    # {
    #     'model': RandomForestClassifier(n_jobs=4, max_features="sqrt", bootstrap=False),
    #     'hyperparams': {
    #         'criterion' :['gini', 'entropy'],
    #         'n_estimators': range(100, 300, 50),
    #         'max_depth': range(6, 9, 1)
    #     }
    # },
    {
        # max_features="sqrt" replaces the former "auto": for this classifier
        # "auto" meant sqrt(n_features), and scikit-learn deprecated the
        # "auto" spelling in 1.1 and removed it in 1.3.
        'model': GradientBoostingClassifier(random_state=0, max_features="sqrt"),
        'hyperparams': {
            'n_estimators': range(15, 25, 1),
            'learning_rate': np.arange(0.01, 0.1, 0.01),
            'max_depth': range(1, 5, 1)
        }
    }
]

# Every fitted search, keyed by model name, so the results of all candidates
# stay available after the loop (grid_model alone only holds the last one).
grid_models = {}

# also tried to use cv=4 and 6-15 but it made results worse
for p in params:
    model_name = get_model_name(p['model'])
    print('> Model:', model_name)
    grid_model = GridSearchCV(p['model'], p['hyperparams'], cv=5, n_jobs=-1, scoring='accuracy', verbose=1)
    grid_model.fit(X_train, y_train)
    grid_models[model_name] = grid_model
    print('Best params:', grid_model.best_params_)
    print('Best score:', grid_model.best_score_)


print('Done.')

# Hyper-params Tuning Summary


## The Winner

In [None]:
# Fit a decision tree limited to depth 4 on the training split, predict the
# validation split, and put true and predicted labels side by side.
model = DecisionTreeClassifier(max_depth=4).fit(X_train, y_train)
y_pred = model.predict(X_valid)
pred_results = pd.DataFrame(dict(y_valid=y_valid, y_pred=y_pred))
pred_results

In [None]:
# Refit the gradient-boosting model with the tuned hyper-parameters.
# The grid search tuned an estimator built with random_state=0 and square-root
# feature subsampling; both settings are repeated here so the final model is
# the configuration that was actually tuned and gives the same result on re-run.
model = GradientBoostingClassifier(
    learning_rate=0.09,
    max_depth=4,
    n_estimators=24,
    max_features="sqrt",
    random_state=0,
)
model.fit(X_train, y_train)
y_pred = model.predict(X_valid)
# True and predicted labels side by side for the validation split.
pred_results = pd.DataFrame({'y_valid' : y_valid, 'y_pred': y_pred})

pred_results

## Score
Accuracy score / Dokładność [(TP+TN) / (TP+TN+FP+FN)]: how many of the values were predicted correctly?
Accuracy count: number of correct predictions
Precision score / Precyzja [TP / (TP+FP)]: of the records the model predicted as rain, how many actually had rain?
Recall score / Pełność [TP / (TP+FN)]: of the records that actually had rain, how many did the model predict correctly?
(also known as true positive rate / wskaźnik skuteczności - ryzyko niepoprawnego oznaczenia ofiary wypadku)
F1 score [2(prec x rec) / (prec + rec)]: combination of the above precision and recall scores

In [None]:
# Project-local helper (libs/simple_processing); presumably prints the scores
# listed in the section above for the validation split — confirm against the helper.
from libs.simple_processing import print_scores

print_scores(y_valid, y_pred)

## Receiver Operating Characteristic (ROC)

In [None]:
# Project-local helper (libs/simpleplotter); presumably plots the ROC curve
# for the validation split.
from libs.simpleplotter import simple_roc

# NOTE(review): y_pred holds the hard class labels returned by model.predict().
# A ROC curve is normally built from scores or probabilities
# (e.g. model.predict_proba(X_valid)[:, 1]) — confirm what simple_roc expects.
simple_roc(y_valid, y_pred)

## Confusion Matrix / Macierz pomyłek

Now, let's take a look at the results

In [None]:
# Project-local helper (libs/simpleplotter); called with the true labels, the
# predicted labels and the model's class labels. Its return value is kept
# because the next cell indexes it as a 2x2 matrix.
from libs.simpleplotter import simple_confusion_matrix
# conf_matrix = confusion_matrix(y_true=y_valid, y_pred=y_pred)
# conf_matrix
conf_matrix = simple_confusion_matrix(y_valid, y_pred, model.classes_)

In [None]:
# crosstab = pd.crosstab(pred_results.y_pred, pred_results.y_valid)
# Print the four cells of the confusion matrix. The indices assume rows are the
# true labels and columns are the predicted labels (scikit-learn's layout) —
# TODO confirm against simple_confusion_matrix.
for label, row, col in (('TP:', 1, 1), ('TN:', 0, 0), ('FP:', 0, 1), ('FN:', 1, 0)):
    print(label, conf_matrix[row][col])

## Which features became decision makers for the model?

In [None]:
# Project-local helper (libs/simpleplotter); presumably plots how much each
# training feature contributes to the fitted model — confirm against the helper.
from libs.simpleplotter import feature_importance

feature_importance(model, X_train)

# Generating Model Predictions For Test Data & Saving the results for Kaggle Competition

In [None]:
# y_test = model.predict(X)
# output = pd.DataFrame({'Id': IDs, 'Species': y_test})
# output.to_csv('./submission.csv', index=False)
# SUBMISSION = pd.read_csv("./submission.csv")
# SUBMISSION