In [1]:
import math
import pandas as pd
import numpy as np

## 1.0 - Quartiles

In [4]:
# Small sample for the hand-computed quartile example; the values are listed in ascending order.
data = [150,151,152,152,153,154,155,155,155]

In [5]:
def median(data):
    """Return the median of ``data`` together with the index that locates it.

    The values are sorted before the median is taken, so the input does not
    have to be ordered.

    Parameters
    ----------
    data : sequence of numbers
        Non-empty collection of observations.

    Returns
    -------
    tuple
        ``(p, me)`` where ``me`` is the median and ``p`` is a position in the
        sorted data: the index of the middle element when the length is odd,
        or the index of the upper of the two middle elements when it is even.

    Raises
    ------
    ValueError
        If ``data`` is empty.
    """
    ordered = sorted(data)
    n = len(ordered)
    if n == 0:
        raise ValueError("median() requires at least one data point")

    p = n // 2  # for odd n this equals math.ceil(n / 2) - 1
    if n % 2 == 0:
        # Even length: average the two central values.
        me = (ordered[p - 1] + ordered[p]) / 2
    else:
        me = ordered[p]

    return p, me

In [6]:
# Q2 is the median of the whole (sorted) sample; Q1 and Q3 are the medians of
# its lower and upper halves. When the sample size is odd, the middle element
# is Q2 itself and is left out of both halves so that they have the same size.
p2, q2 = median(data)
upper_start = p2 + len(data) % 2  # skip the median element when n is odd
p1, q1 = median(data[:p2])
p3, q3 = median(data[upper_start:])

In [7]:
# Five-number summary: minimum, Q1, median (Q2), Q3, maximum.
min(data), q1, q2, q3, max(data)

(150, 151.5, 153, 155, 155)

Using libraries

In [8]:
# Rebinds `data`: from here on it is a single-column DataFrame, no longer the plain list.
data = pd.DataFrame(data)

In [9]:
# First and third quartiles of the data, then the outlier fences placed
# 1.5 interquartile ranges below Q1 and above Q3.
q1, q3 = np.quantile(data, [0.25, 0.75])
di = q3 - q1

inferior_limit, superior_limit = q1 - 1.5 * di, q3 + 1.5 * di

inferior_limit, superior_limit

(147.5, 159.5)

Any value below 147.5 or above 159.5 is therefore considered an outlier.

## 2.0 - Variance

In [10]:
# Population variance (divides by n): the displayed 3.111111 equals 28/9 for this sample.
np.var(data)

0    3.111111
dtype: float64

In [11]:
# Population variance step by step: the mean of the squared deviations.
m = data.mean()
n = len(data)
squared_deviations = (data - m) ** 2  # squaring already discards the sign
variance = squared_deviations.sum() / n
variance

0    3.111111
dtype: float64

## 2.1 - Standard deviation

In [12]:
# The standard deviation is the square root of the variance computed in the previous cell.
np.sqrt(variance)

0    1.763834
dtype: float64

Libraries

In [13]:
# NumPy equivalent; the result matches the manual population (divide-by-n) value above.
np.std(data)

0    1.763834
dtype: float64

In [15]:
# pandas' std defaults to the sample estimate (ddof=1); ddof=0 requests the population value instead.
data.std(ddof=0)

0    1.763834
dtype: float64

In [48]:
# Load the census dataset (the path is relative to the notebook's working directory).
df = pd.read_csv('../dataset/census.csv')

In [49]:
# Preview the first rows of the census data.
df.head()

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [51]:
# Sample standard deviation (pandas default, ddof=1) of 'education-num' over the first five rows only.
df['education-num'].head().std()

2.8284271247461903

## 3.0 - Classification

In [2]:
# Rebinds `df`: from here on it holds the credit dataset, not the census data.
df = pd.read_csv('../dataset/credit_data.csv')

In [4]:
# Preview the first rows of the credit data.
df.head()

Unnamed: 0,i#clientid,income,age,loan,c#default
0,1,66155.925095,59.017015,8106.532131,0
1,2,34415.153966,48.117153,6564.745018,0
2,3,57317.170063,63.108049,8020.953296,0
3,4,42709.534201,45.751972,6103.64226,0
4,5,66952.688845,18.584336,8770.099235,1


In [6]:
# Summary statistics, transposed so each column becomes a row.
# In the output, 'age' has a count of 1997 (three values missing) and a negative minimum.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
i#clientid,2000.0,1000.5,577.494589,1.0,500.75,1000.5,1500.25,2000.0
income,2000.0,45331.600018,14326.327119,20014.48947,32796.459717,45789.117313,57791.281668,69995.685578
age,1997.0,40.807559,13.624469,-52.42328,28.990415,41.317159,52.58704,63.971796
loan,2000.0,4444.369695,3045.410024,1.37763,1939.708847,3974.719419,6432.410625,13766.051239
c#default,2000.0,0.1415,0.348624,0.0,0.0,0.0,0.0,1.0


In [8]:
# Inspect the rows whose age is zero or negative.
df[df['age'] <= 0]

Unnamed: 0,i#clientid,income,age,loan,c#default
15,16,50501.726689,-28.218361,3977.287432,0
21,22,32197.620701,-52.42328,4244.057136,0
26,27,63287.038908,-36.496976,9595.286289,0


In [9]:
# Keep only rows with a strictly positive age. Rows whose age is missing (NaN)
# are removed as well, because the comparison `NaN > 0` evaluates to False.
df = df[df['age'] > 0]

In [251]:
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split

In [12]:
# Preview the frame after filtering the ages.
df.head()

Unnamed: 0,i#clientid,income,age,loan,c#default
0,1,66155.925095,59.017015,8106.532131,0
1,2,34415.153966,48.117153,6564.745018,0
2,3,57317.170063,63.108049,8020.953296,0
3,4,42709.534201,45.751972,6103.64226,0
4,5,66952.688845,18.584336,8770.099235,1


In [243]:
# Hold out 5% of the rows as a final test set; the remaining rows are used for
# model comparison. The fixed seed keeps the split reproducible across kernel
# restarts.
df_test = df.sample(frac=0.05, random_state=42)
df_train = df.drop(index=df_test.index)

# Split features and target for training and for the final evaluation.
# Positional columns 1-3 are the features (income, age, loan in the frame
# previewed above); column 4 is the target (c#default).
X_train = df_train.iloc[:, 1:4].values
y_train = df_train.iloc[:, 4].values

X_test = df_test.iloc[:, 1:4].values
y_test = df_test.iloc[:, 4].values

In [244]:
def create_list(models):
    """Build an empty score container for every model.

    Parameters
    ----------
    models : iterable
        Model objects; only their class names are used.

    Returns
    -------
    dict
        Maps each model's class name to a dict with empty 'precision',
        'recall' and 'f1' lists. Models of the same class share one entry.
    """
    return {
        type(model).__name__: {'precision': [], 'recall': [], 'f1': []}
        for model in models
    }

In [255]:
def predicting(X, y, models, fold=30):
    """Score each model over repeated stratified hold-out splits.

    For every model, ``fold`` train/validation splits are drawn (80/20,
    stratified on ``y``). The repetition number is used as the split seed, so
    every model is evaluated on the same sequence of splits. The model is
    refitted on each training part and scored on the matching validation part.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Target labels.
    models : list
        Estimator instances exposing ``fit`` and ``predict``. The instances
        themselves are fitted (no copies are made).
    fold : int, default 30
        Number of repeated splits per model.

    Returns
    -------
    dict
        Maps each model's class name to a dict of 'precision', 'recall' and
        'f1' lists, one value per repetition.
    """
    result_dict = create_list(models)
    metrics = (
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    )

    for model in models:
        scores = result_dict[type(model).__name__]

        for seed in range(fold):
            # splitting the data
            x_train, x_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=seed, stratify=y)

            # modeling and predicting
            model.fit(x_train, y_train)
            y_pred = model.predict(x_test)

            # evaluation results
            for metric_name, metric in metrics:
                scores[metric_name].append(metric(y_test, y_pred))

    return result_dict
   

In [256]:
# Compare the three classifiers on the training portion only (the held-out
# test rows are not touched here). The random forest is seeded so that the
# comparison table is reproducible.
results = predicting(X_train, y_train, models=[
    GaussianNB(),
    LogisticRegression(),
    RandomForestClassifier(random_state=42)
])

In [257]:
# Turn the per-model score lists into one long table with a 'model' column.
# The frames are built from whatever `results` contains, so the cell keeps
# working when the list of models or the number of repetitions changes.
# NOTE: `results` is rebound here from the dict returned by predicting() to a
# DataFrame, so re-run the predicting cell before re-running this one.
results = pd.concat(
    [
        pd.DataFrame(scores).assign(model=model_name)[
            ['model', 'precision', 'recall', 'f1']]
        for model_name, scores in results.items()
    ],
    axis=0
)
results.groupby(['model']).mean()


Unnamed: 0_level_0,precision,recall,f1
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
GaussianNB,0.838384,0.604321,0.700151
LogisticRegression,0.799538,0.574691,0.661846
RandomForestClassifier,0.968337,0.928395,0.947522


In [277]:
# Per-model mean of every metric, plus spread statistics for precision.
grouped = results.groupby(['model'])
precision_by_model = grouped['precision']

results_valid = grouped.mean()
results_valid['precision_std'] = precision_by_model.std()
results_valid['precision_median'] = precision_by_model.median()
results_valid['precision_var'] = precision_by_model.var()
results_valid

Unnamed: 0_level_0,precision,recall,f1,precision_std,precision_median,precision_var
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
GaussianNB,0.838384,0.604321,0.700151,0.054051,0.829268,0.002922
LogisticRegression,0.799538,0.574691,0.661846,0.043953,0.803226,0.001932
RandomForestClassifier,0.968337,0.928395,0.947522,0.023203,0.971402,0.000538


Results on the test dataset

In [248]:
# Refit the random forest (the best scorer in the comparison above) on the
# whole training portion and predict the held-out test rows. The seed makes
# the reported test metrics reproducible.
md = RandomForestClassifier(random_state=42)
md.fit(X_train, y_train)
y_pred = md.predict(X_test)

In [250]:
# Precision, recall and F1 on the held-out test set.
precision_score(y_test, y_pred), recall_score(y_test, y_pred), f1_score(y_test, y_pred)

(0.9333333333333333, 1.0, 0.9655172413793104)

In [252]:
# Confusion matrix on the test set: rows are the true labels, columns the predicted labels.
confusion_matrix(y_test, y_pred)

array([[85,  1],
       [ 0, 14]])