In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

plt.rc("font", size=14)
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)
%matplotlib inline
# Load the dataset from "adult.csv" (path is relative to the working directory).
data = pd.read_csv("adult.csv")

In [None]:
# Size of the dataset as (rows, columns).
data.shape  # original author's note: 32561 rows and 15 columns

In [None]:
# Preview the first rows of the raw frame.
data.head()

## Para los valores perdidos utilizaremos la moda para reemplazarlos
Realizamos un conteo de los valores de cada atributo.

In [None]:
# Per-column count of non-null entries.
data.count()

# Limpieza de datos incompletos
- Calculamos la moda de cada atributo
- Los valores faltantes ("?") se reemplazan por el valor de la moda

In [None]:
# Work on a copy so the raw `data` frame is not mutated and this cell can be
# re-run safely.
data_temp = data.copy()


def _impute_with_mode(frame, column, missing_marker="?"):
    """Replace `missing_marker` in `frame[column]` (in place) with the column mode.

    The mode is computed over the known values only, so the placeholder itself
    can never be chosen as the replacement. Returns the mode that was used.
    """
    is_missing = frame[column] == missing_marker
    column_mode = frame.loc[~is_missing, column].mode()[0]
    frame.loc[is_missing, column] = column_mode
    return column_mode


# Replace the "?" placeholder in workclass
workclass_moda = _impute_with_mode(data_temp, "workclass")
print("La moda del atributo workclass es: ", workclass_moda)
# Replace the "?" placeholder in occupation
occupation_moda = _impute_with_mode(data_temp, "occupation")
print("La moda del atributo occupation es: ", occupation_moda)
# Replace the "?" placeholder in native.country
native_country_moda = _impute_with_mode(data_temp, "native.country")
print("La moda del atributo native.country es: ", native_country_moda)
data_temp.head()

In [None]:
# Dimensions of the frame after the "?" placeholders were replaced.
data_temp.shape

In [None]:
# Take an independent copy: the regrouping steps below mutate data_limpia and
# must not silently change data_temp through a shared reference.
data_limpia = data_temp.copy()

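Se reducen las categorías del atributo marital.status para obtener un mejor modelo: todos los estados distintos de Never-married (casados, viudos, divorciados y separados) se agrupan en Married, de modo que solo quedan dos grupos: Married y Never-married.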

In [None]:
# Inspect the marital.status categories on the frame that is about to be
# regrouped (data_limpia), rather than on the raw `data` frame.
data_limpia["marital.status"].unique()

In [None]:
# Collapse every status listed below into the single "Married" label; rows with
# any other status are left unchanged.
statuses_to_merge = [
    "Widowed",
    "Divorced",
    "Separated",
    "Married-civ-spouse",
    "Married-spouse-absent",
    "Married-AF-spouse",
]
merge_mask = data_limpia["marital.status"].isin(statuses_to_merge)
data_limpia.loc[merge_mask, "marital.status"] = "Married"
data_limpia.head()

In [None]:
# Check which marital.status categories remain after the regrouping.
data_limpia["marital.status"].unique()

# Exploracion de datos
### Analizamos la clase income

In [None]:
# Class balance of the target: number of rows per income value.
data_limpia.income.value_counts()

In [None]:
# Bar chart of the number of rows in each income class.
sns.countplot(data=data_limpia, x="income")
plt.show()

In [None]:
# Mean of each numeric attribute per income class.
# numeric_only=True is required: the frame also holds string columns, and
# recent pandas versions raise instead of silently dropping them.
data_limpia.groupby("income").mean(numeric_only=True)
# Author's observation: capital.gain and capital.loss differ a lot between
# classes; age and hours.per.week differ to a lesser extent.

Calculamos la media de los atributos numéricos para cada atributo categórico; se realiza uno por uno.

In [None]:
# Mean of the numeric attributes per workclass (string columns are skipped).
data_limpia.groupby("workclass").mean(numeric_only=True)

In [None]:
# Mean of the numeric attributes per education level (string columns are skipped).
data_limpia.groupby("education").mean(numeric_only=True)

In [None]:
# Mean of the numeric attributes per marital status (string columns are skipped).
data_limpia.groupby("marital.status").mean(numeric_only=True)

In [None]:
# Mean of the numeric attributes per occupation (string columns are skipped).
data_limpia.groupby("occupation").mean(numeric_only=True)

In [None]:
# Mean of the numeric attributes per relationship (string columns are skipped).
data_limpia.groupby("relationship").mean(numeric_only=True)

In [None]:
# Mean of the numeric attributes per race (string columns are skipped).
data_limpia.groupby("race").mean(numeric_only=True)

In [None]:
# Mean of the numeric attributes per sex (string columns are skipped).
data_limpia.groupby("sex").mean(numeric_only=True)

In [None]:
# Mean of the numeric attributes per native country (string columns are skipped).
data_limpia.groupby("native.country").mean(numeric_only=True)

In [None]:
# Pairwise correlation between the numeric attributes only (income and the
# other string columns are excluded explicitly, since corr() cannot handle them).
data_limpia.select_dtypes(include="number").corr()
# Author's observation: age, fnlwgt, education.num, capital.gain, capital.loss
# and hours.per.week show low correlation with each other.

# Visualizaciones

In [None]:
# Education vs. income: a strong difference between the income classes across
# education levels would suggest a good predictor.
ax = pd.crosstab(data_limpia["education"], data_limpia["income"]).plot(kind="bar")
# The bars are record counts per income class, so the y axis is labelled as a count.
ax.set(title="Education - Income", xlabel="Education", ylabel="Count")
plt.show()
                                                            

In [None]:
# Marital status vs. income, to see how much it affects the target.
ax = pd.crosstab(data_limpia["marital.status"], data_limpia["income"]).plot(kind="bar")
# The bars are record counts per income class, so the y axis is labelled as a count.
ax.set(title="Marital.Status - Income", xlabel="Marital.Status", ylabel="Count")
plt.show()

In [None]:
# Occupation vs. income: a strong difference between the income classes across
# occupations would suggest a good predictor.
ax = pd.crosstab(data_limpia["occupation"], data_limpia["income"]).plot(kind="bar")
# The bars are record counts per income class, so the y axis is labelled as a count.
ax.set(title="occupation - Income", xlabel="occupation", ylabel="Count")
plt.show()

In [None]:
# Relationship vs. income: a strong difference between the income classes across
# relationship values would suggest a good predictor.
ax = pd.crosstab(data_limpia["relationship"], data_limpia["income"]).plot(kind="bar")
# The bars are record counts per income class, so the y axis is labelled as a count.
ax.set(title="relationship - Income", xlabel="relationship", ylabel="Count")
plt.show()

In [None]:
# Race vs. income, to see how much it affects the target.
ax = pd.crosstab(data_limpia["race"], data_limpia["income"]).plot(kind="bar")
# x label typo ("racec") fixed; the bars are record counts, so the y axis is a count.
ax.set(title="race - Income", xlabel="race", ylabel="Count")
plt.show()

In [None]:
# Sex vs. income, to see how much it affects the target.
ax = pd.crosstab(data_limpia["sex"], data_limpia["income"]).plot(kind="bar")
# The bars are record counts per income class, so the y axis is labelled as a count.
ax.set(title="sex - Income", xlabel="sex", ylabel="Count")
plt.show()

In [None]:
# Native country vs. income, to see how much it affects the target.
ax = pd.crosstab(data_limpia["native.country"], data_limpia["income"]).plot(kind="bar")
# The bars are record counts per income class, so the y axis is labelled as a count.
ax.set(title="native.country - Income", xlabel="native.country", ylabel="Count")
plt.show()

## Nuevo dataset data_limpia2, sin las columnas duplicadas
income se convierte en 0 para ingresos <=50K y en 1 para ingresos >50K

In [None]:
# Build the modelling dataset. education.num, relationship, capital.gain and
# capital.loss are intentionally left out.
selected_columns = [
    "age", "workclass", "fnlwgt", "education", "marital.status", "occupation",
    "race", "sex", "hours.per.week", "native.country", "income",
]
# .copy() makes data_limpia2 an independent frame; assigning into a bare column
# selection would trigger SettingWithCopyWarning and may not stick.
data_limpia2 = data_limpia[selected_columns].copy()
# Encode the target as the string labels "0" (<=50K) and "1" (>50K); any other
# value is left untouched.
data_limpia2["income"] = data_limpia2["income"].replace({"<=50K": "0", ">50K": "1"})
data_limpia2.head()

Aplico dummy, a todos los atributos menos a income

In [None]:
# Feature matrix: every column of data_limpia2 except the target, with the
# categorical columns expanded into dummy (one-hot) columns.
data_x = data_limpia2.drop(columns=["income"])
data_x_dum = pd.get_dummies(data_x)
data_x_dum.head()

# Divido en Train y test

In [None]:
# X is the dummy-encoded feature frame (everything except income);
# y is the income column encoded as "0"/"1".
X = data_x_dum
y = data_limpia2["income"]

# 80/20 split, stratified on y so both parts keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
    stratify=y,
)
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

# Aplico modelo de Regresion Logistica

- Solo uso X_train y y_train
- Las variables dummy ya fueron aplicadas a X antes de la división

### Aplico regresion logistica

In [None]:
# Standardize the features before the logistic regression: the raw columns have
# very different scales (e.g. fnlwgt vs. the 0/1 dummies), which keeps the
# default solver from converging. Wrapping both steps in a pipeline means the
# scaler is fitted only on the training data passed to fit(), also inside
# cross-validation. The pipeline exposes the same fit/predict/predict_proba/score
# interface as the bare estimator.
logreg = make_pipeline(
    preprocessing.StandardScaler(),
    LogisticRegression(max_iter=1000),
)
logreg.fit(X_train, y_train)

In [None]:
# Predict the held-out set and report the fraction of correct predictions.
y_pred = logreg.predict(X_test)
metrics.accuracy_score(y_test, y_pred)


# Cross validation
### Para evitar overfitting


cross_val_score nos proporciona la exactitud (accuracy) que obtuvo el modelo logreg en cada pliegue.


In [None]:
# 10-fold cross-validation on the training split, using all available cores.
# No `scoring` is passed, so the estimator's default score is reported per fold.
precision = cross_val_score(estimator=logreg, X=X_train, y=y_train, cv=10, n_jobs=-1)

fold_mean = np.mean(precision)
fold_std = np.std(precision)
print(f'precisiones: {precision}')
print(f'Precision promedio: {fold_mean: .3f} +/- {fold_std: .3f}')

# Matriz de confusion

In [None]:
# Call the function through the `metrics` module: assigning the result to the
# name `confusion_matrix` shadows the imported function, so calling it by its
# bare name would fail with TypeError the second time this cell is run.
confusion_matrix = metrics.confusion_matrix(y_test, y_pred)

In [None]:
# Display the confusion matrix computed in the previous cell.
confusion_matrix

## Calculo de la precision, recall, f-1 y support

In [None]:
# classification_report returns a pre-formatted multi-line string; print it so
# the table is rendered instead of a one-line repr with escaped "\n".
print(classification_report(y_test, y_pred))

# curva roc

In [None]:
# Use the predicted probability of the class stored at index 1 of
# logreg.classes_ (the second column of predict_proba) as the ranking score.
probs = logreg.predict_proba(X_test)
preds = probs[:, 1]
# pos_label must be a label that actually occurs in y_test. Tie it to the same
# class whose probability column is used above, instead of the hard-coded 2,
# which is not one of the income labels.
fpr, tpr, threshold = metrics.roc_curve(y_test, preds, pos_label=logreg.classes_[1])
roc_auc = metrics.auc(fpr, tpr)

In [None]:
# Draw the ROC curve with the AUC in the legend, plus the diagonal that
# corresponds to a random classifier.
fig, ax = plt.subplots()
ax.set_title('Receiver Operating Characteristic')
ax.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
ax.legend(loc='lower right')
ax.plot([0, 1], [0, 1], 'r--')
ax.set_xlim([0, 1])
ax.set_ylim([0, 1])
ax.set_ylabel('True Positive Rate')
ax.set_xlabel('False Positive Rate')
plt.show()