In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt 
plt.rc("font", size=14)
from sklearn.linear_model import LogisticRegression
#from sklearn.cross_validation import train_test_split
from sklearn.model_selection import train_test_split
import seaborn as sns
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)
%matplotlib inline
data = pd.read_csv("adult.csv")

In [2]:
#Tamaño del conjunto de datos
data.shape  #tenemos 32561 filas y 15 columnas

(32561, 15)

In [3]:
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


## Para los valores perdidos utilizaremos la moda para reemplazarlos
Realizamos un conteo de los atributo

In [4]:
data.count()

age               32561
workclass         32561
fnlwgt            32561
education         32561
education.num     32561
marital.status    32561
occupation        32561
relationship      32561
race              32561
sex               32561
capital.gain      32561
capital.loss      32561
hours.per.week    32561
native.country    32561
income            32561
dtype: int64

# Limpieza de datos inmcopletos
- Calculamos la moda
- El valor de la moda se reemplaza los valores faltantes

In [5]:
data_temp=data
#Reemplazamos en workclass
workclass_moda=data_temp["workclass"].mode()[0]
print("La moda del atributo workclass es: ", workclass_moda)
data_temp.loc[data_temp["workclass"]=="?","workclass"] = "Private"
#Reemplazamos en occupation
occupation_moda=data_temp["occupation"].mode()[0]
print("La moda del atributo occupation es: ", occupation_moda)
data_temp.loc[data_temp["occupation"]=="?","occupation"] = "Private"
#Reemplazamos en native.country
native_country_moda=data_temp["native.country"].mode()[0]
print("La moda del atributo native.country es: ", native_country_moda)
data_temp.loc[data_temp["native.country"]=="?","native.country"] = "United-States"
data_temp.head()

La moda del atributo workclass es:  Private
La moda del atributo occupation es:  Prof-specialty
La moda del atributo native.country es:  United-States


Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,Private,77053,HS-grad,9,Widowed,Private,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,Private,186061,Some-college,10,Widowed,Private,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [None]:
data_temp = data[data != '?']
data_temp.head()

In [None]:
workclass_moda=data_temp["workclass"].mode()[0]
#data_limpia.loc[data_limpia["marital.status"]=="Widowed", "marital.status"]="Married"
data_temp.loc[data_temp["workclass"==NaN],"workclass"] = workclass_moda

In [None]:
data_limpia.shape  #Cuantos quedan luego de eliminar

Se reduce la categoria en el atributo marital.status, para un mejor modelo se va agrupar en solteros y casados, Married y Never-Married

In [None]:
data["marital.status"].unique()

In [None]:
data_limpia.loc[data_limpia["marital.status"]=="Widowed", "marital.status"]="Married"
data_limpia.loc[data_limpia["marital.status"]=="Divorced","marital.status"]="Married"
data_limpia.loc[data_limpia["marital.status"]=="Separated","marital.status"]="Married"
data_limpia.loc[data_limpia["marital.status"]=="Married-civ-spouse","marital.status"]="Married"
data_limpia.loc[data_limpia["marital.status"]=="Married-spouse-absent","marital.status"]="Married"
data_limpia.loc[data_limpia["marital.status"]=="Married-AF-spouse","marital.status"]="Married"
data_limpia.head()

In [None]:
data_limpia["marital.status"].unique()

# Exploracion de datos
### Analizamos la clase income

In [None]:
#Mostramos la clase income
data_limpia["income"].value_counts()

In [None]:
sns.countplot(x=data_limpia["income"] )
plt.show()

In [None]:
#Calculo de la media de income con los atributos numericos
data_limpia.groupby(data_limpia["income"]).mean()
#se observa que capital.gain y capital.loss tienen mucha diferencia, en menor medida la age(edad) y horas
#por semana (hours.per.week)

Calculamos los atributos categoricos, respecto a los numericos, se puede realizar uno por uno

In [None]:
data_limpia.groupby("workclass").mean()

In [None]:
data_limpia.groupby("education").mean()

In [None]:
data_limpia.groupby("marital.status").mean()

In [None]:
data_limpia.groupby("occupation").mean()

In [None]:
data_limpia.groupby("relationship").mean()

In [None]:
data_limpia.groupby("race").mean()

In [None]:
data_limpia.groupby("sex").mean()

In [None]:
data_limpia.groupby("native.country").mean()

In [None]:
#calculamos la autocorrelacion
data_limpia.corr()
#Sin considerar a income, solo los datos numericos
#Se observa que age, fnlwgt, education.num, capital.gain, capital.loss y hours.per.week tienen baja correlacion.

# Visualizaciones

In [None]:
#Educacion vs income para ver cual afecta mas, al ser muy alta puede ser un buen predictor
pd.crosstab(data_limpia.education, data_limpia.income).plot(kind="bar")
plt.title("Education - Income")
plt.xlabel("Education")
plt.ylabel("Income")
                                                            

In [None]:
#Marital.statuts vs income para ver cual afecta mas
pd.crosstab(data_limpia["marital.status"] , data_limpia.income).plot(kind="bar")
plt.title("Marital.Status - Income")
plt.xlabel("Marital.Status")
plt.ylabel("Income")

In [None]:
#Occupation vs income para ver cual afecta mas, al ser muy alta puede ser un buen predictor
pd.crosstab(data_limpia["occupation"] , data_limpia.income).plot(kind="bar")
plt.title("occupation - Income")
plt.xlabel("occupation")
plt.ylabel("Income")

In [None]:
#relationship vs income para ver cual afecta mas, al ser muy alta puede ser un buen predictor
pd.crosstab(data_limpia["relationship"] , data_limpia.income).plot(kind="bar")
plt.title("relationship - Income")
plt.xlabel("relationship")
plt.ylabel("Income")

In [None]:
#race vs income para ver cual afecta mas
pd.crosstab(data_limpia["race"] , data_limpia.income).plot(kind="bar")
plt.title("race - Income")
plt.xlabel("racec")
plt.ylabel("Income")

In [None]:
#sex vs income para ver cual afecta mas
pd.crosstab(data_limpia["sex"] , data_limpia.income).plot(kind="bar")
plt.title("sex - Income")
plt.xlabel("sex")
plt.ylabel("Income")

In [None]:
#Nativecountry vs income para ver cual afecta mas
pd.crosstab(data_limpia["native.country"] , data_limpia.income).plot(kind="bar")
plt.title("native.country - Income")
plt.xlabel("native.country")
plt.ylabel("Income")

## Nuevo dataset data_impia2 , sin las columnas duplicadas

In [None]:
#Creo un nueveo dataset, no utilizo education-num, relationship, cappital.gain,capital.loss
data_limpia2=data_limpia[["age", "workclass", "fnlwgt", "education", "marital.status", "occupation", "race", "sex", "hours.per.week","native.country", "income" ]]
data_limpia2.head()

# Aplico Dummies
## Las variables categoricas las convierto en zeros y unos

In [None]:
data_dummy=pd.get_dummies(data_limpia2)
data_dummy.head()