# Анализ данных в Python

In [None]:
import pandas as pd
import numpy as np
%matplotlib inline

In [None]:
def generate_data(N=1000):
    """Generate a reproducible synthetic scoring sample.

    A latent log-odds value is drawn as standard normal noise shifted by -2.
    The true default probability is the logistic transform of that value,
    while the returned score ``pd`` is the logistic transform of
    ``0.5 * logit - 1``. Default flags are Bernoulli draws from the true
    probability; the latent columns are dropped before returning.

    Note:
        Reseeds NumPy's global random generator with 42 on every call, so
        repeated calls with the same ``N`` return identical frames.

    Args:
        N: Number of rows to generate.

    Returns:
        pandas.DataFrame with columns ``pd`` (score in (0, 1)) and
        ``default_flg`` (0/1 outcome).
    """
    # Only expit is needed here; the previously imported scipy `logit` was unused.
    from scipy.special import expit

    np.random.seed(42)
    return (
        pd.DataFrame({'logit': np.random.randn(N) - 2})
        .assign(pd_true=lambda x: expit(x['logit']),
                pd=lambda x: expit(0.5 * x['logit'] - 1))
        .assign(default_flg=lambda x: np.random.binomial(1, x['pd_true']))
        .drop(columns=['pd_true', 'logit'])
    )

# Build the working sample: 10,000 rows with a score `pd` and a 0/1 `default_flg`.
df = generate_data(N=10000)
# Preview the first three rows (shown as the cell output).
df.head(3)

## Gain Chart

In [None]:
def plot_gain_chart(target, prob, num_buck=10):
    """Plot mean observed target vs. mean predicted score per score bucket.

    Rows are split into quantile buckets of `prob`. For every bucket the mean
    of `target` and the mean of `prob` are drawn side by side as a bar chart,
    ordered from the highest-score bucket to the lowest.

    Args:
        target: Observed outcomes (the notebook passes a 0/1 flag).
        prob: Predicted scores, aligned with `target`.
        num_buck: Requested number of quantile buckets. Fewer buckets are
            produced when `prob` has so many tied values that some quantile
            edges coincide.

    Returns:
        None. The chart is drawn through the pandas plotting interface.
    """
    data = pd.DataFrame({'target': target, 'predict': prob})
    # duplicates='drop' merges coinciding quantile edges instead of raising
    # ValueError on heavily tied scores.
    buckets = pd.qcut(prob, q=num_buck, duplicates='drop')
    # observed=False is the behaviour the original call relied on implicitly;
    # stating it avoids the pandas FutureWarning for categorical group keys.
    aggregated = (
        data.groupby(buckets, observed=False)
        .agg({'target': 'mean', 'predict': 'mean'})
    )
    # Highest-score bucket first; bucket labels are replaced by 0..k-1.
    aggregated = aggregated.sort_index(ascending=False).reset_index(drop=True)
    aggregated.plot(kind='bar', grid=True)
    
# Chart for the raw (uncalibrated) score: observed default rate vs. mean `pd` per bucket.
plot_gain_chart(df['default_flg'], df['pd'])

# Калибровки прогноза

In [None]:
def logit(x):
    """Return the log-odds ``log(x / (1 - x))`` of probability `x`.

    The computation is element-wise, so `x` may be a scalar, a NumPy array
    or a pandas Series (the notebook passes a Series).
    """
    odds = x / (1 - x)
    return np.log(odds)

In [None]:
# Convert the raw score to log-odds; this column is the single regressor of the calibration model below.
df['logit'] = logit(df['pd'])

In [None]:
from sklearn.linear_model import LogisticRegression

# Calibrate the raw score: fit a logistic regression of the default flag on
# the score's log-odds (one feature, default estimator settings).
classifier = LogisticRegression()
classifier.fit(df[['logit']], df['default_flg'])
# Column 1 of predict_proba is the probability of the second class in
# classifier.classes_ (the flag value 1 for a 0/1 target).
df['pd_calibrated'] = classifier.predict_proba(df[['logit']])[:, 1]

In [None]:
# Same chart for the calibrated score, for comparison with the raw-score chart.
plot_gain_chart(df['default_flg'], df['pd_calibrated'])

In [None]:
from sklearn.metrics import roc_auc_score, log_loss, mean_squared_error

# Comparison table: one row per score column, one column per metric.
metrics = pd.DataFrame()

# Evaluate the raw and the calibrated score against the observed default flag.
for predict in ['pd', 'pd_calibrated']:
    metrics.loc[predict, 'MSE'] = mean_squared_error(df['default_flg'], df[predict])
    metrics.loc[predict, 'AUC'] = roc_auc_score(df['default_flg'], df[predict])
    metrics.loc[predict, 'log_loss'] = log_loss(df['default_flg'], df[predict])

# Display the table as the cell output.
metrics

In [None]:
from sklearn.linear_model import LinearRegression

# Alternative calibration: regress the default flag linearly on the raw score
# and store the fitted values as `pd_linear`.
# NOTE(review): a linear fit is not constrained to [0, 1], and `pd_linear` is
# not used again in the visible part of the notebook — confirm intent.
regressor = LinearRegression()
regressor.fit(df[['pd']], df['default_flg'])
df['pd_linear'] = regressor.predict(df[['pd']])

## Information Value

$$\hat{IV} = \sum_i
 \left(     \frac{bad_i}{bad_{total}} -     \frac{good_i}{good_{total}} \right)
 \left( \log\frac{bad_i}{bad_{total}} - \log\frac{good_i}{good_{total}} \right)
$$

In [None]:
def information_value(target, feature, num_buck=10):
    """Estimate the information value (IV) of `feature` for a 0/1 `target`.

    `feature` is split into quantile buckets. For each bucket the share of
    all ``target == 0`` rows and the share of all ``target == 1`` rows that
    fall into it are computed, and the IV is the sum over buckets of
    ``(share_0 - share_1) * (log(share_0) - log(share_1))``.

    Args:
        target: Binary outcomes coded as 0 and 1; both values must occur.
        feature: Numeric values to evaluate, aligned with `target`.
        num_buck: Requested number of quantile buckets. Fewer buckets are
            used when tied values make some quantile edges coincide.

    Returns:
        The estimated information value as a float (always >= 0).
    """
    # duplicates='drop' merges coinciding quantile edges instead of raising
    # ValueError on features with many tied values.
    bucket = pd.qcut(feature, q=num_buck, duplicates='drop')
    data = pd.DataFrame({'target': target, 'bucket': bucket})
    # observed=False keeps every bucket in the table (count 0 where a class is
    # absent) and avoids the pandas FutureWarning for categorical group keys.
    counts = (
        data.groupby(['bucket', 'target'], observed=False)
        .size()
        .unstack()
        .fillna(0)
    )
    # Normalise each class column so it sums to 1 across buckets.
    shares = counts / counts.sum()
    share_0, share_1 = shares[0], shares[1]
    # Shares are clipped to [0.001, 1] inside the logarithms only, so that an
    # empty bucket/class cell gives a large finite term rather than +/-inf.
    log_0 = np.log(np.clip(share_0, 0.001, 1))
    log_1 = np.log(np.clip(share_1, 0.001, 1))
    return ((share_0 - share_1) * (log_0 - log_1)).sum()

In [None]:
# Information value of the raw score with respect to the default flag (default 10 buckets).
information_value(df['default_flg'], df['pd'])

In [None]:
def generate_pair(iv, n=1000):
    """Draw a random binary target and a feature whose spread depends on `iv`.

    The target is drawn uniformly from {0, 1}. The feature is normal noise
    scaled by ``sqrt(iv)`` and shifted by ``+iv / 2`` for rows with target 0
    and by ``-iv / 2`` for rows with target 1. Uses NumPy's global random
    generator.

    Args:
        iv: Controls both the class shift (``iv / 2``) and the noise variance.
        n: Number of rows to generate.

    Returns:
        Tuple ``(target, feature)`` of NumPy arrays of length `n`.
    """
    target = np.random.randint(0, 2, size=n)
    zeros_mask = target == 0
    ones_mask = target == 1
    shift = iv / 2
    scale = np.sqrt(iv)

    feature = np.zeros(n)
    # Noise for the target-0 rows is drawn first, then for the target-1 rows.
    feature[zeros_mask] = shift + scale * np.random.randn(np.sum(zeros_mask))
    feature[ones_mask] = -shift + scale * np.random.randn(np.sum(ones_mask))
    return target, feature

# Ссылки
* [10 Minutes to pandas](http://pandas.pydata.org/pandas-docs/stable/10min.html)
* [Python Data Science Handbook](https://github.com/jakevdp/PythonDataScienceHandbook) - книга на GitHub по анализу данных в Python.