# Pittsburgh Bridges Data Set
### Preprocessing stage: Heat map visualization after having cleaned the data

- https://www.datacamp.com/community/tutorials/categorical-data
- http://cmdlinetips.com/
- https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc_crossval.html#sphx-glr-auto-examples-model-selection-plot-roc-crossval-py
- https://chrisalbon.com/

In [1]:
# === STANDARD IMPORTS ==== #
# Echo the module docstring. Under IPython this prints the auto-generated
# "Automatically created module for IPython interactive environment" banner
# captured in this cell's output.
print(__doc__)

import pandas as pd
import numpy as np

%matplotlib inline
# Matplotlib pyplot provides plotting API
import matplotlib as mpl
from matplotlib import pyplot as plt
import chart_studio.plotly.plotly as py
import seaborn as sns

Automatically created module for IPython interactive environment


In [2]:
# === UTILS IMPORTS ==== #
from utils.display_utils import display_heatmap
from utils.display_utils import show_frequency_distribution_predictors
from utils.display_utils import show_categorical_predictor_values
from utils.display_utils import  show_cum_variance_vs_components

from utils.preprocessing_utils import preprocess_categorical_variables
from utils.preprocessing_utils import  preprocessing_data_rescaling

from utils.training_utils import sgd_classifier_grid_search
from utils.training_utils import plot_roc_crossval

In [3]:
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [4]:
# === READ INPUT DATASET ==== #
import os

# The dataset folder used to be a hard-coded absolute path inside one user's
# home directory, so the notebook failed with FileNotFoundError everywhere
# else. The BRIDGES_DATASET_DIR environment variable now overrides the
# location; when it is unset the historical default is used unchanged.
dataset_path = os.environ.get(
    'BRIDGES_DATASET_DIR',
    '/home/franec94/Documents/datasets/datasets_folders/pittsburgh-bridges-data-set',
)
dataset_name = 'bridges.data.csv'

# Full header of the raw file, kept for reference:
#   ['IDENTIF', 'RIVER', 'LOCATION', 'ERECTED', 'PURPOSE', 'LENGTH', 'LANES',
#    'CLEAR-G', 'T-OR-D', 'MATERIAL', 'SPAN', 'REL-L', 'TYPE']
# 'IDENTIF' is deliberately not named below.
# NOTE(review): presumably it is the column consumed by index_col=0 -- confirm
# against the raw file.
column_names = ['RIVER', 'LOCATION', 'ERECTED', 'PURPOSE', 'LENGTH', 'LANES', 'CLEAR-G', 'T-OR-D', 'MATERIAL', 'SPAN', 'REL-L', 'TYPE']
dataset = pd.read_csv(os.path.join(dataset_path, dataset_name), names=column_names, index_col=0)

FileNotFoundError: [Errno 2] File /home/franec94/Documents/datasets/datasets_folders/pittsburgh-bridges-data-set/bridges.data.csv does not exist: '/home/franec94/Documents/datasets/datasets_folders/pittsburgh-bridges-data-set/bridges.data.csv'

In [None]:
# === SHOW SOME STANDARD DATASET INFOS ==== #
print(f'Dataset shape: {dataset.shape}')
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
dataset.info()

In [None]:
# === SHOW THE FIRST N ROWS AS THEY ARE STORED IN THE DATASET === #
# Last expression of the cell, so the notebook renders the returned frame.
dataset.head(5)

In [None]:
# === LOOK FOR NULL VALUES IN THE RAW DATASET === #
print('Before preprocessing dataset and handling null values')

# Single yes/no answer: is at least one cell missing anywhere in the frame?
result = dataset.isna().values.any()
print(f'There are any null values ? Response: {result}')

# Per-predictor breakdown of the missing-cell counts.
result = dataset.isna().sum()
print(f'Number of null values for each predictor:\n{result}')

In [None]:
# === DISCOVERING VALUES WITHIN EACH PREDICTOR DOMAIN === #
# Columns handed to the helper as its second argument.
# NOTE(review): presumably these are skipped because they are numeric-like
# rather than categorical -- confirm in utils.display_utils.
columns_2_avoid = ['ERECTED', 'LENGTH', 'LOCATION', 'LANES']
# columns_2_avoid = None
# The returned list is iterated later as column names whose '?' rows are
# dropped.
# NOTE(review): presumably it holds the predictors in which the helper found
# the '?' marker -- confirm.
list_columns_2_fix = show_categorical_predictor_values(dataset, columns_2_avoid)

In [None]:
# === DROP ROWS WHOSE CATEGORICAL VALUE IS THE '?' MISSING-VALUE MARKER === #
# Every predictor listed in list_columns_2_fix is filtered in turn, so a row
# survives only if none of those columns holds '?'.
print('Before', dataset.shape)
for predictor in list_columns_2_fix:
    keep_rows = dataset[predictor] != '?'
    dataset = dataset[keep_rows]
print('After', dataset.shape)

# Show the predictor domains again, now without the dropped rows; the return
# value is not needed here.
_ = show_categorical_predictor_values(dataset, columns_2_avoid)

In [None]:
# === INTERMEDIATE RESULT FOUND === #
# NOTE(review): the return value is discarded, so this helper presumably
# encodes the categorical columns of `dataset` in place -- confirm in
# utils.preprocessing_utils, otherwise the result is lost.
preprocess_categorical_variables(dataset, columns_2_avoid)
# DataFrame.info() prints its own report and returns None; calling it bare
# avoids the stray "None" line that print(dataset.info()) produced.
dataset.info()

In [None]:
# dataset.boxplot('RIVER','TYPE',rot = 30,figsize=(5,6))

In [None]:
# Preview the first rows after the categorical preprocessing step.
dataset.head(5)

In [None]:
# Convert the integer-valued predictors. Rows still carrying the '?'
# missing-value marker are dropped rather than imputed, then the column is
# cast to int.
print('Before', dataset.shape)
columns_2_map = ['ERECTED', 'LANES']
for predictor in columns_2_map:
    # .copy() detaches the filtered frame from its parent so the column
    # assignment below is a plain write on an independent frame instead of a
    # chained assignment (SettingWithCopyWarning).
    dataset = dataset[dataset[predictor] != '?'].copy()
    dataset[predictor] = dataset[predictor].astype(int)
print('After', dataset.shape)
# DataFrame.info() prints its own report and returns None, so it is not
# wrapped in print().
dataset.info()
print(dataset.head(5))

In [None]:
# Convert the real-valued predictors. Rows still carrying the '?'
# missing-value marker are dropped, then the column is cast to float.
print('Before', dataset.shape)
columns_2_map = ['LOCATION', 'LANES', 'LENGTH']
for predictor in columns_2_map:
    # .copy() detaches the filtered frame from its parent so the column
    # assignment below is a plain write on an independent frame instead of a
    # chained assignment (SettingWithCopyWarning).
    dataset = dataset[dataset[predictor] != '?'].copy()
    dataset[predictor] = dataset[predictor].astype(float)
print('After', dataset.shape)
# DataFrame.info() prints its own report and returns None, so it is not
# wrapped in print().
dataset.info()
print(dataset.head(5))

# Re-inspect every predictor (no column is excluded this time).
list_columns_2_fix = show_categorical_predictor_values(dataset, None)

In [None]:
# Re-run both null checks on the cleaned dataset. The matching report lines
# are switched off, so this cell only refreshes `result`.

# Is at least one cell still missing anywhere in the frame?
result = dataset.isna().values.any()
# print('After handling null values\nThere are any null values ? Response: {}'.format(result))

# Missing-cell count per predictor (this is the value `result` ends up with).
result = dataset.isna().sum()
# print('Number of null values for each predictor:\n{}'.format(result))

In [None]:
# Preview the first rows of the dataset after the numeric conversions.
dataset.head(5)

In [None]:
# Summary statistics for every column; include='all' keeps non-numeric
# columns in the report as well.
dataset.describe(include='all')

In [None]:
# Rebind the exclusion list for the frequency plots: unlike the list used
# during cleaning, 'LANES' is no longer excluded here.
columns_2_avoid = ['ERECTED', 'LENGTH', 'LOCATION']
# NOTE(review): presumably draws one frequency distribution per remaining
# predictor -- confirm in utils.display_utils.
show_frequency_distribution_predictors(dataset, columns_2_avoid)

In [None]:
# Pairwise correlation matrix of the dataset columns (pandas default method).
# NOTE(review): assumes every column is numeric by this point -- confirm,
# since recent pandas versions raise on non-numeric columns.
corr_result = dataset.corr()

In [None]:
# Render the correlation matrix with the project's heat-map helper.
display_heatmap(corr_result)

In [None]:
target_col = 'T-OR-D'
columns = dataset.columns

# Binary target: an encoded value of 1 becomes class 0, every other value
# becomes class 1.
y = np.array([0 if label == 1 else 1 for label in dataset[target_col].values])
print(dataset['T-OR-D'].value_counts())

# Feature matrix: every column except the target.
is_feature = columns != target_col
X = dataset.loc[:, is_feature]

In [None]:
# Standardizing the features
# `scaler_methods` only lists the option names; the single method actually
# applied is `scaler_method`, which is passed to the project helper below.
scaler_methods = ['minmax', 'standard', 'norm']
scaler_method = 'standard'
# NOTE(review): presumably returns the rescaled feature matrix as a 2-D
# array-like (its .shape[1] is read in the next cell) -- confirm.
rescaledX = preprocessing_data_rescaling(scaler_method, X)

In [None]:
import os

# Fit a PCA that keeps as many components as there are rescaled features, so
# the cumulative explained-variance profile covers all of them.
n_components = rescaledX.shape[1]
pca = PCA(n_components=n_components)

pca = pca.fit(rescaledX)
X_pca = pca.transform(rescaledX)

fig = show_cum_variance_vs_components(pca, n_components)

# SECURITY: a Chart Studio username and API key used to be hard-coded here.
# Secrets must never live in source: treat the previously committed key as
# compromised and regenerate it. Credentials are now read from the
# environment; when they are not set, no explicit sign-in is done and
# chart_studio falls back to whatever credentials it already has configured
# (e.g. via chart_studio.tools.set_credentials_file).
plotly_username = os.environ.get('PLOTLY_USERNAME')
plotly_api_key = os.environ.get('PLOTLY_API_KEY')
if plotly_username and plotly_api_key:
    py.sign_in(plotly_username, plotly_api_key)
py.iplot(fig, filename='selecting-principal-components {}'.format(scaler_method))

In [None]:
# Optional sweep: repeat the PCA / cumulative-variance plot for each rescaling
# method. The sweep is currently disabled because the list is empty; restore
# ['minmax', 'standard', 'norm'] to re-enable it.
scaler_methods = []
for scaler_method in scaler_methods:
    rescaledX = preprocessing_data_rescaling(scaler_method, X)

    # Keep every component so the full variance profile is plotted.
    n_components = rescaledX.shape[1]
    pca = PCA(n_components=n_components)

    pca = pca.fit(rescaledX)
    X_pca = pca.transform(rescaledX)

    # Build the figure once: the helper used to be called twice in a row with
    # the first result thrown away.
    fig = show_cum_variance_vs_components(pca, n_components)

    # Uses the Chart Studio credentials that are already configured. A second
    # API key that used to sit in a comment here was removed; never keep
    # secrets in source, even commented out.
    py.iplot(fig, filename='selecting-principal-components {}'.format(scaler_method))

In [None]:
# Report the cumulative explained variance for a few candidate numbers of
# principal components, using the most recently fitted `pca`.
principal_components = '2,5,6,7,8,9,10'.split(',')

# The cumulative sum does not depend on the loop variable, so compute it once
# instead of once per iteration.
cum_var_exp = np.cumsum(pca.explained_variance_ratio_)
for pc in principal_components:
    n_components = int(pc)

    # Index n-1 holds the variance explained by the first n components.
    cum_var_exp_up_to_n_pcs = cum_var_exp[n_components - 1]
    print(f"Cumulative varation explained up to {n_components} pcs = {cum_var_exp_up_to_n_pcs}")

In [None]:
# Hand the rescaled features and the binary target to the project helper.
# NOTE(review): presumably plots cross-validated ROC curves, as in the
# scikit-learn example linked in the notebook header -- confirm in
# utils.training_utils.
plot_roc_crossval(rescaledX, y)