In [2]:
import pandas as pd
import numpy as np
import scipy.io as io
from sklearn.model_selection import train_test_split

In [3]:
# Column labels for the header-less adult.csv file; the trailing "target"
# label is appended when the file is read.
features_names = [
    "age", "workclass", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week", "native-country",
]
data1 = pd.read_csv(
    r"adult.csv",
    names=features_names + ["target"],
    header=None,
)

In [4]:
# Remove the "fnlwgt" and "education" columns from data1 in place.
data1.drop(["fnlwgt", "education"], axis=1, inplace=True)

In [5]:
def columnToNumber(item, values=None):
    """Return the position of ``item`` within an array of distinct values.

    Parameters
    ----------
    item : scalar
        Value to look up.
    values : array-like, optional
        Distinct values to search. When omitted, the module-level ``unique``
        array is used (looked up at call time), so it must already be set.

    Returns
    -------
    int
        Index of the first element equal to ``item``, or 0 when no element
        matches.
    """
    if values is None:
        values = unique
    # np.where returns a 1-tuple of index arrays. The tuple itself is always
    # truthy, so the "found anything?" check must be made on the array inside.
    matches = np.where(np.asarray(values) == item)[0]
    if matches.size:
        return matches[0]
    return 0

In [6]:
# Replace every column (the target included) with integer codes numbered by
# order of first appearance.
# NOTE(review): numeric columns such as "age" are re-coded as well, which
# discards their magnitude and ordering -- confirm this is intended.
for feature in data1.columns:
    # pd.factorize returns (codes, uniques); the codes follow first-appearance
    # order, and missing values (if any) are coded -1.
    data1[feature] = pd.factorize(data1[feature])[0]

In [7]:
# Preview the first five rows of the encoded table.
data1.head(5)

Unnamed: 0,age,workclass,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,target
0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,1,0,1,1,1,0,0,1,0,1,0,0
2,2,2,1,2,2,0,0,0,1,0,0,0,0
3,3,2,2,1,2,1,1,0,1,0,0,0,0
4,4,2,0,1,3,2,1,1,1,0,0,1,0


In [8]:
# Persist the encoded table as bare comma-separated values: no index column
# and no header row.
data1.to_csv(
    'datoslimpios.csv',
    header=False,
    index=False,
    encoding='utf-8',
    sep=',',
)

In [9]:
# Features are every column except the last one; "target" is the label.
X = data1.iloc[:, :-1]
y = data1["target"]

# 60% of the rows go to training; the remaining 40% is halved into validation
# and test sets. Fixed random_state values keep both splits reproducible.
X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_holdout, y_holdout, test_size=0.5, random_state=43)

# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(X_train.shape)
print(y_train.shape)

print(X_test.shape)
print(y_test.shape)

print(X_val.shape)
print(y_val.shape)

(19536, 12)
(19536,)
(6513, 12)
(6513,)
(6512, 12)
(6512,)


In [10]:
# Write every split to its own CSV (values only: no index, no header row),
# in the order train, test, validation.
for split_data, csv_name in [
    (X_train, 'xtrain.csv'), (y_train, 'ytrain.csv'),
    (X_test, 'xtest.csv'), (y_test, 'ytest.csv'),
    (X_val, 'xval.csv'), (y_val, 'yval.csv'),
]:
    split_data.to_csv(csv_name, sep=',', encoding='utf-8', index=False, header=False)

<h3>Datos agrupados</h3>

In [28]:
# Read the raw file again: the cells above replaced data1's columns with
# integer codes, and this section starts over from the original values.
features_names = [
    "age", "workclass", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week", "native-country",
]
data1 = pd.read_csv(
    r"adult.csv",
    names=features_names + ["target"],
    header=None,
)

In [29]:
def continents(country):
    """Map a ``native-country`` value to an integer region code.

    Surrounding whitespace is ignored before the lookup.

    Returns
    -------
    int
        0 -- the group the notebook labels "South America" (it also holds
             Central American and Caribbean names),
        1 -- Asia,
        2 -- Europe,
        3 -- Oceania,
        4 -- North America, and every value not listed in any group.
    """
    # (code, member names), checked in this order. The names carry no leading
    # space: the input is stripped before the lookup, so an entry such as
    # " Germany" could never match.
    regions = (
        (0, ("Puerto-Rico", "Cuba", "Honduras", "Jamaica", "Mexico",
             "Dominican-Republic", "Ecuador", "Haiti", "Guatemala",
             "Nicaragua", "El-Salvador", "Trinadad&Tobago", "Peru")),
        (1, ("Cambodia", "India", "Japan", "China", "Iran", "Philippines",
             "Vietnam", "Laos", "Taiwan", "Thailand", "Hong")),
        (2, ("England", "Germany", "Greece", "Italy", "Poland", "Portugal",
             "Ireland", "France", "Hungary", "Scotland", "Yugoslavia",
             "Holand-Netherlands")),
        # NOTE(review): "South" is coded as Oceania; confirm which country
        # this dataset value stands for.
        (3, ("South",)),
        (4, ("United-States", "Canada", "Outlying-US", "Columbia")),
    )

    name = country.strip()
    for code, members in regions:
        if name in members:
            return code
    # Anything unrecognised shares the North America code.
    return 4

# Add the derived region code as a new column positioned just before the last
# column of data1.
data1.insert(
    data1.shape[1] - 1,
    column='native-continent',
    value=data1["native-country"].map(continents),
)

In [30]:
def targetToNumber(salary):
    """Encode the income label as 0 (``<=50K``) or 1 (``>50K``).

    Surrounding whitespace is ignored, so the label is recognised whether or
    not it carries the leading space found in the raw file.

    Returns
    -------
    int or None
        The numeric code, or None when the label is neither of the two known
        values.
    """
    codes = {"<=50K": 0, ">50K": 1}
    return codes.get(salary.strip())
    
# Overwrite the textual salary labels with their numeric codes.
data1["target"] = data1["target"].apply(targetToNumber)

In [31]:
def raceToNumber(race):
    """Encode a ``race`` value as an integer code.

    Surrounding whitespace is ignored before the lookup.

    Returns
    -------
    int
        0 White, 1 Black, 2 Asian-Pac-Islander, 3 Amer-Indian-Eskimo,
        4 Other (also returned for any unrecognised value).
    """
    codes = {
        "White": 0,
        "Black": 1,
        # Previously misspelled "Asisn-...", so this category never matched
        # and was silently coded as 4.
        "Asian-Pac-Islander": 2,
        "Amer-Indian-Eskimo": 3,
        "Other": 4,
    }
    return codes.get(race.strip(), 4)

# Overwrite the race labels with their numeric codes.
data1["race"] = data1["race"].apply(raceToNumber)

In [32]:
def workclassToNumber(occupation):
    """Encode a ``workclass`` value as an integer group code.

    The parameter is named ``occupation`` but receives workclass values (the
    function is mapped over the "workclass" column). Surrounding whitespace
    is ignored before the lookup.

    Returns
    -------
    int
        0 government (State/Local/Federal), 1 Without-pay or Never-worked,
        2 Self-emp-not-inc, 3 Self-emp-inc, 4 Private. Unrecognised values
        are coded 0.
    """
    codes = {
        "State-gov": 0, "Local-gov": 0, "Federal-gov": 0,
        "Without-pay": 1, "Never-worked": 1,
        "Self-emp-not-inc": 2,
        "Self-emp-inc": 3,
        "Private": 4,
    }
    # Unknown values share code 0 with the government group.
    return codes.get(occupation.strip(), 0)

# Overwrite the workclass labels with their numeric group codes.
data1["workclass"] = data1["workclass"].apply(workclassToNumber)

In [33]:
def relationShipToNumber(relationship):
    """Encode a ``relationship`` value as an integer group code.

    Surrounding whitespace is ignored before the lookup.

    Returns
    -------
    int
        0 Not-in-family or Unmarried, 1 Husband or Wife, 2 Own-child,
        3 Other-relative (also returned for any unrecognised value).
    """
    codes = {
        "Not-in-family": 0, "Unmarried": 0,
        "Husband": 1, "Wife": 1,
        "Own-child": 2,
        "Other-relative": 3,
    }
    return codes.get(relationship.strip(), 3)
    
# Overwrite the relationship labels with their numeric group codes.
data1["relationship"] = data1["relationship"].apply(relationShipToNumber)

In [34]:
def maritalSatusToNumber(maritalstatus):
    """Encode a ``marital-status`` value as a binary flag.

    Surrounding whitespace is ignored before the lookup.

    Returns
    -------
    int
        1 for Married-civ-spouse or Married-AF-spouse; 0 for every other
        value (Never-married, Divorced, Married-spouse-absent, Separated,
        Widowed, and anything unrecognised).
    """
    living_with_spouse = ("Married-civ-spouse", "Married-AF-spouse")
    if maritalstatus.strip() in living_with_spouse:
        return 1
    return 0
    
# Overwrite the marital-status labels with their binary codes.
data1["marital-status"] = data1["marital-status"].apply(maritalSatusToNumber)

In [35]:
def sexToNumber(sex):
    """Encode a ``sex`` value as 0 (Male) or 1 (Female).

    Surrounding whitespace is ignored before the comparison. Any value other
    than "Female" -- including unrecognised ones -- is coded 0.
    """
    if sex.strip() == "Female":
        return 1
    return 0
    
# Overwrite the sex labels with their numeric codes.
data1["sex"] = data1["sex"].apply(sexToNumber)

In [36]:
def occupationToNumber(occupation, categories=None):
    """Return the index of ``occupation`` among the distinct occupation values.

    Parameters
    ----------
    occupation : scalar
        Value to look up.
    categories : array-like, optional
        Distinct values to search. When omitted they are recomputed from
        ``data1["occupation"].unique()`` on every call; pass a precomputed
        array to avoid rescanning the column for each value.

    Returns
    -------
    int
        Position of the first element of ``categories`` equal to
        ``occupation``.

    Raises
    ------
    IndexError
        If ``occupation`` is not present in ``categories``.
    """
    if categories is None:
        categories = data1["occupation"].unique()
    return np.where(np.asarray(categories) == occupation)[0][0]

# Look every distinct occupation up once and map the column through the
# resulting table. Calling occupationToNumber per row would rescan the whole
# column for each of them.
data1["occupation"] = data1["occupation"].map(
    {value: occupationToNumber(value) for value in data1["occupation"].unique()}
)

In [37]:
# Remove, in place, the columns that are not used as features:
#   fnlwgt         -- only weights, not useful for classification (to be used later)
#   native-country -- native-continent is used instead
#   education
data1.drop(["fnlwgt", "native-country", "education"], axis=1, inplace=True)
data1.head(5)

In [38]:
# Features are every column except the last one; "target" is the label.
X = data1.iloc[:, :-1]
y = data1["target"]

# 60% of the rows go to training; the remaining 40% is halved into validation
# and test sets. Fixed random_state values keep both splits reproducible.
X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_holdout, y_holdout, test_size=0.5, random_state=43)

# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(X_train.shape)
print(y_train.shape)

print(X_test.shape)
print(y_test.shape)

print(X_val.shape)
print(y_val.shape)

(19536, 12)
(19536,)
(6513, 12)
(6513,)
(6512, 12)
(6512,)


In [39]:
# Save the validation split as bare values (no index column, no header row).
# NOTE(review): only xval.csv / yval.csv are rewritten from this table; the
# xtrain/xtest/ytrain/ytest files written earlier in the notebook come from
# the differently encoded table -- confirm that mix is intended.
X_val.to_csv('xval.csv', header=False, index=False, encoding='utf-8', sep=',')
y_val.to_csv('yval.csv', header=False, index=False, encoding='utf-8', sep=',')

In [40]:
# Preview the first five rows of the grouped table.
data1.head(5)


Unnamed: 0,age,workclass,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-continent,target
0,39,0,13,0,0,0,0,0,2174,0,40,4,0
1,50,2,13,1,1,1,0,0,0,0,13,4,0
2,38,4,9,0,2,0,0,0,0,0,40,4,0
3,53,4,7,1,2,1,1,0,0,0,40,4,0
4,28,4,13,1,3,1,1,1,0,0,40,0,0
