In [42]:
# base 
import os 
import sys
sys.path.append('..')
import src
import random 
import json 
import numpy as np
import pandas as pd 
import pandas.api.types as types
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import missingno as msno
import pickle as pkl
from urllib.request import urlretrieve 
from typing import List, Set, Dict, Tuple
from typing import Union, Any, Optional, Iterable, Hashable

# ml preprocessing 
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler, MinMaxScaler

import xgboost as xgb
from sklearn.ensemble import AdaBoostClassifier, VotingClassifier


# validation 
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split, cross_validate, KFold, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, roc_curve, roc_auc_score, precision_score, recall_score, plot_confusion_matrix

# pipelines 
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer

In [8]:
# test
src.config.TRAIN_FILE_PATH

'/home/sanc/data/final'

In [9]:
%matplotlib inline 
sns.set_style('white')
sns.set_palette('deep')

In [10]:
# the iris dataset is used as a toy df 
from sklearn.datasets import fetch_openml

iris = fetch_openml(name="iris", version=1, as_frame=True)

df = iris['data']

In [11]:
type(df)

pandas.core.frame.DataFrame

### Statistics 

In [26]:
df_target = iris['target']

df_target.head()

0    Iris-setosa
1    Iris-setosa
2    Iris-setosa
3    Iris-setosa
4    Iris-setosa
Name: class, dtype: category
Categories (3, object): ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']

In [32]:
df_joined = pd.concat([df, df_target], axis=1)

In [33]:
df_joined

Unnamed: 0,sepallength,sepalwidth,petallength,petalwidth,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


### Data preprocessing steps
- Creating - any features to engineer? 
- Correcting - any anomalies to correct? 
- Converting - any dtypes to correct? 
- Completing - any missing values to complete? 

In [15]:
# try to use type hinting where possible so as to prevent unintended behaviour
# Example function 
def double(n: str) -> str: 
    '''
    docstrings here
    '''
    return n*2 

In [44]:
## helper functions 

def drop_duplicate_cols(df: pd.DataFrame, cols: Hashable) -> pd.DataFrame: 
    pass  


def check_columns(): pass


In [19]:
def missingness_checks(df: pd.DataFrame) -> None:
    
    print(f'NUMBER OF MISSING COLUMNS: {df.isna().sum().sum()}')
    print(f'MISSING COLUMNS (0: NO MISSING VALUES, 1: MISSING VALUES')
    print(df.isna().sum())
    print('\n')
    print('MISSINGNESS THROUGHOUT THE DATA')
    msno.matrix(df) 
    plt.show()
    print('MISSINGNESS CORRELATIONS')
    msno.heatmap(df)
    plt.show() 

def get_dtypes(df: pd.DataFrame) -> Dict:
    return df.dtypes.to_dict()

def quick_plot(df: pd.DataFrame, hue_var: str = None, diag_kind: str = 'kde') -> None: 
    sns.pairplot(df, hue=hue_var, diag_kind=diag_kind)
    plt.show() 

In [39]:
def get_numeric_columns(df: pd.DataFrame) -> List: 
    return [col for col in df.columns if types.is_numeric_dtype(df[col])]


def get_categorical_columns(df: pd.DataFrame) -> List: 
    return [col for col in df.columns if types.is_categorical_dtype(df[col])]


def convert_to_dtype(col: pd.Series, type: str = 'categorical') -> pd.Series:
    if type not in ['numeric', 'categorical']: 
        raise ValueError('Please enter a valid dtype of either: "numeric" or "categorical"') 
    elif type == 'numeric': 
        return pd.to_numeric(col, errors='raise')
    return col.astype('category')

def replace_missing_values(df: pd.DataFrame, cols: Union[str, Iterable[str], Hashable], value) -> pd.DataFrame:
    return df.fillna(value={cols: value})

def return_value_counts(df: pd.DataFrame) -> None: 
    '''
    ahtgildash fsidfh asiulfs

    '''
    for col in df.columns: 
        print(col.upper())
        print('####################################')
        print(df[col].value_counts())
        print('\n')

def set_up_fig(rows: int=1) -> None: 
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(16,9))
    for s in ['top', 'right']: 
        ax.spines[s].set_visible(False)

def quick_eda(df: pd.DataFrame) -> None: 
    print(f'DATAFRAME HAS {df.shape[0]} ROWS AND {df.shape[1]} COLS')
    print(df.info())
    display(df.describe().T)
    display(df.head(5))
## standardize columns 

def standardize_cols(col: str) -> str: 
    return col.lower().strip().replace(' ', '-') 

def visualize_cols(col: str) -> str: 
    return col.capitalize().replace('-', ' ')


In [None]:
df.sepallength = df.sepallength.astype(pd.CategoricalDtype())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.sepallength = df.sepallength.astype(pd.CategoricalDtype())


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   sepallength  150 non-null    category
 1   sepalwidth   150 non-null    float64 
 2   petallength  150 non-null    float64 
 3   petalwidth   150 non-null    float64 
dtypes: category(1), float64(3)
memory usage: 5.1 KB
