In [3]:
#!/usr/bin/env python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import ggplot

from sklearn import preprocessing
from sklearn.metrics import roc_curve, auc
from IPython.display import display   

%matplotlib inline

AttributeError: module 'pandas' has no attribute 'tslib'

In [None]:
# Location of the adult dataset. The DataFrame is read straight from this
# address; to work offline, download the files first and point URL at the
# local copy instead:
#   curl https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data > adult.data
#   curl https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names > adult.names
URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"

# Headers are supplied manually (header=None below means no line of the file
# is interpreted as a header row).
ADULT_COLUMNS = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education-num",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
    "native-country",
    "wage",
]

# Comma-separated values; spaces following each separator are dropped.
adult = pd.read_csv(
    URL,
    encoding="utf-8",
    skipinitialspace=True,
    index_col=None,
    header=None,
    names=ADULT_COLUMNS,
)

# Keep the rendered preview short: at most 10 rows are shown.
pd.set_option("display.max_rows", 10)
display(adult)

In [2]:
# Column dtypes and non-null counts for the whole dataset.
adult.info()

# Descriptive statistics of each described column, computed per wage class.
# The result has two-level columns: (original column, statistic).
groupby_wage = adult.groupby("wage").describe()
for column in groupby_wage.columns.levels[0]:
    # Label the row axis on the very frame that is displayed. rename_axis
    # returns a relabelled frame, so this neither depends on a temporary
    # selection sharing its index object with groupby_wage nor leaves a
    # stale name on groupby_wage itself.
    display(groupby_wage[column].rename_axis("wage / " + str(column)))

# The distinct labels of the target variable.
print(adult["wage"].unique())

NameError: name 'adult' is not defined

In [None]:
# 8.3.3. Problem complexity with categorical explanatory variables
print(adult["race"].unique())
print(adult["race"][0:5])
print(adult["sex"].unique())
print(adult["sex"][0:5])

# Request integer 0/1 indicator columns explicitly. The default indicator
# dtype of get_dummies differs between pandas versions (boolean in recent
# releases); a design matrix mixing boolean and integer columns is not a
# purely numeric array, which the model fit on this matrix needs.
# drop_first=True drops the first level of each variable (the baseline).
design_matrix_race = pd.get_dummies(adult["race"], drop_first=True, dtype=int)
design_matrix_sex = pd.get_dummies(adult["sex"], drop_first=True, dtype=int)

# Constant column of ones, indexed 0..n-1 so it lines up with `adult`
# (assumes `adult` has the default integer index, as it does when loaded
# with index_col=None).
intercept = pd.DataFrame(1, index=np.arange(adult.shape[0]), columns=["(Intercept)"])
example_design_matrix = pd.concat([intercept, design_matrix_race, design_matrix_sex, adult["age"]], axis=1)

# original matrix
display(adult[["race", "sex", "age"]])
# example design matrix
display(example_design_matrix)

# convert each categorical feature using one-hot encoding
# (the target column "wage" is excluded from the explanatory variables)
obj_df = adult.select_dtypes(include=["object"]).drop("wage", axis=1)
int_df = adult.select_dtypes(include=["int64"])

design_matrix_objs = pd.get_dummies(obj_df, drop_first=True, dtype=int)
design_matrix_adult = pd.concat([intercept, design_matrix_objs, int_df], axis=1)
display(design_matrix_adult)

In [None]:
# 8.4. Splitting into training, validation, and test sets
np.random.seed(1709)

# Option 1 (not executed): the easy way, with scikit-learn.
#
#   from sklearn.model_selection import train_test_split
#   training, test = train_test_split(adult, test_size=0.2)
#   training, validation = train_test_split(training, test_size=0.25)
#   print(training.shape)
#   print(validation.shape)
#   print(test.shape)

# Option 2: split manually. Shuffle the row labels 0..n-1, then cut the
# shuffled array into 60% training, 20% validation, and the remainder as test.
n = adult.shape[0]
idx = np.arange(n)
np.random.shuffle(idx)

training_size, validate_size = int(n*0.6), int(n*0.2)
validate_end = training_size + validate_size

training_idx, validate_idx, test_idx = (
    idx[:training_size],
    idx[training_size:validate_end],
    idx[validate_end:],
)

# Select the rows of each subset by label.
training, validation, test = (
    adult.loc[labels] for labels in (training_idx, validate_idx, test_idx)
)

for subset in (training, validation, test):
    print(subset.shape)

In [None]:
# 8.5. Visualization
fig1 = plt.figure(figsize=(5, 5))
sns.set_style("dark", {'axes.grid' : True})

# Figure 1: density of age in the training set, one curve per wage class.
ax1 = fig1.add_subplot(111)
ax1.set_xlabel("age")
ax1.set_ylabel("density")
ax1.set_ylim(0, 0.04)

df1 = training[training["wage"] == "<=50K"]
df2 = training[training["wage"] == ">50K"]

sns.distplot(df1["age"], ax=ax1, hist=False,
             kde_kws={"alpha": .3, "color": "g",
                      "shade": True, "label": "<=50K"})
sns.distplot(df2["age"], ax=ax1, hist=False,
             kde_kws={"alpha": .3, "color": "b",
                      "shade": True, "label": ">50K"})

# Facet grid: the same age densities, broken down by race (rows, restricted
# to "White" and "Black") and sex (columns).
df3 = training.loc[(training["race"] == "White") |
                   (training["race"] == "Black")]

# hue_order pins which wage class gets which colour, so the facets use the
# same colours as the first figure (green for <=50K, blue for >50K) instead
# of depending on the order in which the classes happen to appear.
g = sns.FacetGrid(df3, row="race", col="sex", hue="wage",
                  hue_order=["<=50K", ">50K"],
                  hue_kws={"color": ["g", "b"]})
g.set(ylim=(0, .05))
# Map only "age". A second column name here would be handed to distplot as
# its second positional parameter (the histogram bins), which is not a place
# for the wage labels; the wage split is already handled by `hue`.
g.map(sns.distplot, "age", hist=False,
      kde_kws={"alpha": .3, "shade": True})
# Without a legend the two hue colours cannot be told apart by a reader.
g.add_legend()

# Figure 2: counts of each education-num value, split by wage class.
fig2 = plt.figure(figsize=(10, 10))
ax2 = fig2.add_subplot(211)
sns.countplot(x="education-num", hue="wage", data=training, ax=ax2)

plt.show()

In [None]:
# 8.6. Logistic regression

# glms cannot interpret strings.
# We have to assign each label a numeric id.
le = preprocessing.LabelEncoder()
le.fit(["<=50K", ">50K"])
y = le.transform(training["wage"])
X = design_matrix_adult.loc[training_idx]

# Binomial-family GLM fitted on the training rows of the design matrix.
# NOTE(review): X already carries an "(Intercept)" column of ones, so
# add_constant is presumably a no-op here -- confirm against statsmodels.
lm = sm.GLM(y, sm.add_constant(X), family=sm.families.Binomial())
res = lm.fit()

resid_deviance = res.resid_deviance

# The values are passed in the same order as the Min / Median / Max
# placeholders. The numpy reductions accept both arrays and Series.
print("Deviance Residuals:", "\nMin: {} \nMedian: {} \nMax: {}".format(
       np.min(resid_deviance), np.median(resid_deviance), np.max(resid_deviance)))

display(res.summary())

# Sample prediction: rows 1..5 of the full design matrix (not the test split).
test_X = design_matrix_adult[1:6]
res.predict(test_X)

In [None]:
# 8.6.4. Prediction accuracy metrics
fig = plt.figure(figsize=(10, 10))
ax1 = fig.add_subplot(121)

# Observed labels and the model's predictions for the validation rows.
y_obs = le.transform(validation["wage"])
yhat_lm = res.predict(design_matrix_adult.loc[validate_idx])
df = pd.DataFrame({"x": y_obs, "y": yhat_lm})

# Left panel: distribution of the predictions for each observed class.
sns.boxplot(x="x", y="y", data=df, ax=ax1)

# Labels are set after plotting so they replace the "x"/"y" column names.
ax1.set_xlabel("y_obs")
ax1.set_ylabel("yhat_lm")

# Right panel: density of the predictions, one curve per observed class.
ax2 = fig.add_subplot(122)
ax2.set_ylim(0, 8)

sns.distplot(df[df["x"] == 1]["y"], ax=ax2, hist=False,
             kde_kws={"alpha": .3, "color": "b",
                      "shade": True, "label": "1"})
sns.distplot(df[df["x"] == 0]["y"], ax=ax2, hist=False,
             kde_kws={"alpha": .3, "color": "g",
                      "shade": True, "label": "0"})

# Deviance of the fitted model (as reported by the fit result `res`).
print(res.deviance)

plt.show()

In [None]:
# ROC curve of the GLM predictions (yhat_lm) against the observed labels
# (y_obs), followed by the area under that curve.
fpr, tpr, _ = roc_curve(y_obs, yhat_lm)

fig = plt.figure(figsize=(10, 5))
ax = fig.add_subplot(111)
ax.set(
    title="ROC curve for GLM",
    xlabel="False positive rate",
    ylabel="True positive rate",
)

ax.plot(fpr, tpr)
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
ax.plot([0, 1], [0, 1], color='navy', linestyle='--')

plt.show()

print(auc(fpr, tpr))