In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
import warnings
warnings.filterwarnings('ignore') # Para evitar los molestos avisos.
%matplotlib inline


In [4]:
# Asigno los datos a un dataframe:

df_train = pd.read_csv('trainhouses.csv')
df_train.head(10)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
5,6,50,RL,85.0,14115,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,Shed,700,10,2009,WD,Normal,143000
6,7,20,RL,75.0,10084,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,307000
7,8,60,RL,,10382,Pave,,IR1,Lvl,AllPub,...,0,,,Shed,350,11,2009,WD,Normal,200000
8,9,50,RM,51.0,6120,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2008,WD,Abnorml,129900
9,10,190,RL,50.0,7420,Pave,,Reg,Lvl,AllPub,...,0,,,,0,1,2008,WD,Normal,118000


In [7]:
#Suma los null y ordena descendente de mayor a menor
total = df_train.isnull().sum().sort_values(ascending = False)
#Es el porcentaje de los datos en null.
porcentaje = (df_train.isnull().sum() / df_train.isnull().count()).sort_values(ascending = False)
#Creamos una nueva tabla con el total y el porcentaje
missing_data = pd.concat([total, porcentaje], axis = 1, keys = ['Total', 'Porcentaje'])
#mostramos la nueva tabla
missing_data.head(20)

Unnamed: 0,Total,Porcentaje
PoolQC,1453,0.995205
MiscFeature,1406,0.963014
Alley,1369,0.937671
Fence,1179,0.807534
FireplaceQu,690,0.472603
LotFrontage,259,0.177397
GarageCond,81,0.055479
GarageType,81,0.055479
GarageYrBlt,81,0.055479
GarageFinish,81,0.055479


In [6]:
print(df_train.columns)
print(df_train.shape)

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [8]:
#eliminamos las columnas q tienen datos null
df_train = df_train.drop((missing_data[missing_data['Total'] > 1]).index,1)

In [9]:
#Ha elimando 81-63 = 18 columnas
print(df_train.shape)
print(df_train.columns)

(1460, 63)
Index(['Id', 'MSSubClass', 'MSZoning', 'LotArea', 'Street', 'LotShape',
       'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood',
       'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMatl',
       'Exterior1st', 'Exterior2nd', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'GarageCars', 'GarageArea',
       'PavedDrive', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
       'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold', 'SaleType',
       'SaleCondition', 'SalePrice'],
      dtype='object')


In [10]:
#existe 1 dato null en Electrical
#Con iloc lo buscamos y lo eliminamos
df_train = df_train.drop(df_train.loc[df_train['Electrical'].isnull()].index)

In [11]:
#Su tamanho ha disminuido.
print(df_train.shape)
print(df_train.columns)

(1459, 63)
Index(['Id', 'MSSubClass', 'MSZoning', 'LotArea', 'Street', 'LotShape',
       'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood',
       'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMatl',
       'Exterior1st', 'Exterior2nd', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'GarageCars', 'GarageArea',
       'PavedDrive', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
       'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold', 'SaleType',
       'SaleCondition', 'SalePrice'],
      dtype='object')


In [12]:
#Verificamos si hay algun dato null
df_train.isnull().sum().max() # Para comprobar que no hay más datos desaparecidos.

0

In [13]:
df_train.isna().sum().max()

0

In [14]:
df_train['SalePrice'].describe()

count      1459.000000
mean     180930.394791
std       79468.964025
min       34900.000000
25%      129950.000000
50%      163000.000000
75%      214000.000000
max      755000.000000
Name: SalePrice, dtype: float64

In [15]:
df_train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,2,2008,WD,Normal,208500
1,2,20,RL,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,5,2007,WD,Normal,181500
2,3,60,RL,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,9,2008,WD,Normal,223500
3,4,70,RL,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,272,0,0,0,0,2,2006,WD,Abnorml,140000
4,5,60,RL,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,12,2008,WD,Normal,250000


In [16]:
print(df_train.MSZoning.unique())
print(df_train.Street.unique())
print(df_train.LotShape.unique())
print(df_train.LandContour.unique())
print(df_train.Utilities.unique())
print(df_train.LotConfig.unique())
print(df_train.LandSlope.unique())
print(df_train.SaleType.unique())
print(df_train.SaleCondition.unique())
#print(df_train.Neighborhood.unique())
print(df_train.Condition1.unique())
print(df_train.Condition2.unique())
print(df_train.BldgType.unique())
print(df_train.HouseStyle.unique())
print(df_train.RoofStyle.unique())
#print(df_train.RoofMatl.unique())
#print(df_train.Exterior1st.unique())
#print(df_train.Exterior2nd.unique())
print(df_train.ExterQual.unique())
print(df_train.ExterCond.unique())
print(df_train.Foundation.unique())
#print(df_train.BsmtFinSF1.unique())
print(df_train.Heating.unique())
print(df_train.HeatingQC.unique())
print(df_train.CentralAir.unique())
print(df_train.Electrical.unique())
#print(df_train.GrLivArea.unique())
print(df_train.Functional.unique())
print(df_train.PavedDrive.unique())
#print(df_train.WoodDeckSF.unique())
print(df_train.SaleType.unique())
print(df_train.SaleCondition.unique())

['RL' 'RM' 'C (all)' 'FV' 'RH']
['Pave' 'Grvl']
['Reg' 'IR1' 'IR2' 'IR3']
['Lvl' 'Bnk' 'Low' 'HLS']
['AllPub' 'NoSeWa']
['Inside' 'FR2' 'Corner' 'CulDSac' 'FR3']
['Gtl' 'Mod' 'Sev']
['WD' 'New' 'COD' 'ConLD' 'ConLI' 'CWD' 'ConLw' 'Con' 'Oth']
['Normal' 'Abnorml' 'Partial' 'AdjLand' 'Alloca' 'Family']
['Norm' 'Feedr' 'PosN' 'Artery' 'RRAe' 'RRNn' 'RRAn' 'PosA' 'RRNe']
['Norm' 'Artery' 'RRNn' 'Feedr' 'PosN' 'PosA' 'RRAn' 'RRAe']
['1Fam' '2fmCon' 'Duplex' 'TwnhsE' 'Twnhs']
['2Story' '1Story' '1.5Fin' '1.5Unf' 'SFoyer' 'SLvl' '2.5Unf' '2.5Fin']
['Gable' 'Hip' 'Gambrel' 'Mansard' 'Flat' 'Shed']
['Gd' 'TA' 'Ex' 'Fa']
['TA' 'Gd' 'Fa' 'Po' 'Ex']
['PConc' 'CBlock' 'BrkTil' 'Wood' 'Slab' 'Stone']
['GasA' 'GasW' 'Grav' 'Wall' 'OthW' 'Floor']
['Ex' 'Gd' 'TA' 'Fa' 'Po']
['Y' 'N']
['SBrkr' 'FuseF' 'FuseA' 'FuseP' 'Mix']
['Typ' 'Min1' 'Maj1' 'Min2' 'Mod' 'Maj2' 'Sev']
['Y' 'N' 'P']
['WD' 'New' 'COD' 'ConLD' 'ConLI' 'CWD' 'ConLw' 'Con' 'Oth']
['Normal' 'Abnorml' 'Partial' 'AdjLand' 'Alloca' 'Family']


In [14]:
#Eliminamos aquellos datos que tienen texto pero son muchos
df_train.drop(['Neighborhood'], axis=1, inplace=True)
df_train.drop(['RoofMatl'], axis=1, inplace=True)
df_train.drop(['Exterior1st'], axis=1, inplace=True)
df_train.drop(['Exterior2nd'], axis=1, inplace=True)
df_train.drop(['BsmtFinSF1'], axis=1, inplace=True)
df_train.drop(['GrLivArea'], axis=1, inplace=True)
df_train.drop(['WoodDeckSF'], axis=1, inplace=True)

In [15]:
print(df_train.columns)
print(df_train.shape)

Index(['Id', 'MSSubClass', 'MSZoning', 'LotArea', 'Street', 'LotShape',
       'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Condition1',
       'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'ExterQual', 'ExterCond',
       'Foundation', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'GarageCars', 'GarageArea', 'PavedDrive',
       'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
       'MiscVal', 'MoSold', 'YrSold', 'SaleType', 'SaleCondition',
       'SalePrice'],
      dtype='object')
(1459, 56)


In [17]:
df_train.KitchenQual.unique()
df_train['KitchenQual'].replace(['Gd', 'TA', 'Ex', 'Fa'],[1,2,3,4], inplace=True)

In [18]:
df_train['MSZoning'].replace(['RL', 'RM', 'C (all)', 'FV', 'RH'],[1,2,3,4,5],inplace=True)


In [19]:
df_train['Street'].replace(['Pave', 'Grvl'],[1,0],inplace=True)


In [20]:
df_train['LotShape'].replace(['Reg', 'IR1', 'IR2', 'IR3'],[1,2,3,4],inplace=True)


In [21]:
df_train['LandContour'].replace(['Lvl', 'Bnk', 'Low', 'HLS'],[1,2,3,4],inplace=True)


In [22]:
df_train['Utilities'].replace(['AllPub', 'NoSeWa'],[1,0],inplace=True)


In [23]:
df_train['LotConfig'].replace(['Inside', 'FR2', 'Corner', 'CulDSac', 'FR3'],[1,2,3,4,5],inplace=True)


In [24]:
df_train['LandSlope'].replace(['Gtl', 'Mod', 'Sev'],[1,2,3],inplace=True)


In [25]:
df_train['Condition1'].replace(['Norm', 'Feedr', 'PosN', 'Artery', 'RRAe', 'RRNn', 'RRAn', 'PosA', 'RRNe'],[1,2,3,4,5,6,7,8,9],inplace=True)


In [26]:
df_train.Condition2.unique()
df_train['Condition2'].replace(['Norm', 'Artery', 'RRNn', 'Feedr', 'PosN', 'PosA', 'RRAn', 'RRAe'],[1,2,3,4,5,6,7,8],inplace=True)


In [27]:
df_train['BldgType'].replace(['1Fam', '2fmCon', 'Duplex', 'TwnhsE', 'Twnhs'],[1,2,3,4,5],inplace=True)


In [28]:
df_train['HouseStyle'].replace(['2Story', '1Story', '1.5Fin', '1.5Unf', 'SFoyer', 'SLvl', '2.5Unf', '2.5Fin'],[1,2,3,4,5,6,7,8],inplace=True)


In [29]:
df_train['RoofStyle'].replace(['Gable', 'Hip', 'Gambrel', 'Mansard', 'Flat', 'Shed'],[1,2,3,4,5,6],inplace=True)


In [30]:
df_train['ExterQual'].replace(['Gd', 'TA', 'Ex', 'Fa'],[1,2,3,4],inplace=True)


In [31]:
df_train['ExterCond'].replace(['TA', 'Gd', 'Fa', 'Po', 'Ex'],[1,2,3,4,5],inplace=True)


In [32]:
df_train['Foundation'].replace(['PConc', 'CBlock', 'BrkTil', 'Wood', 'Slab', 'Stone'],[1,2,3,4,5,6],inplace=True)


In [33]:
df_train['Heating'].replace(['GasA', 'GasW', 'Grav', 'Wall', 'OthW', 'Floor'],[1,2,3,4,5,6],inplace=True)


In [34]:
df_train['HeatingQC'].replace(['Ex', 'Gd', 'TA', 'Fa', 'Po'],[1,2,3,4,5],inplace=True)


In [35]:
df_train['CentralAir'].replace(['Y', 'N'],[1,0],inplace=True)


In [36]:
df_train['Electrical'].replace(['SBrkr', 'FuseF', 'FuseA' ,'FuseP', 'Mix'],[1,2,3,4,5],inplace=True)


In [37]:
df_train['Functional'].replace(['Typ', 'Min1', 'Maj1', 'Min2', 'Mod', 'Maj2', 'Sev'],[1,2,3,4,5,6,7],inplace=True)


In [38]:
df_train['PavedDrive'].replace(['Y', 'N', 'P'],[1,2,3],inplace=True)


In [39]:
df_train['SaleType'].replace(['WD', 'New', 'COD', 'ConLD', 'ConLI', 'CWD', 'ConLw', 'Con', 'Oth'],[1,2,3,4,5,6,7,8,9],inplace=True)

In [40]:
df_train['SaleCondition'].replace(['Normal', 'Abnorml', 'Partial', 'AdjLand', 'Alloca', 'Family'],[1,2,3,4,5,6],inplace=True)

In [41]:
print(df_train.shape)

(1459, 63)


In [45]:
df_train.to_excel('/home/terceros/Diplomado_Ciencia_Datos/Practica_2/house.xlsx', index = None, header=True)

In [60]:
print(df_train.info())
print(df_train.dtypes)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1459 entries, 0 to 1459
Data columns (total 56 columns):
Id               1459 non-null int64
MSSubClass       1459 non-null int64
MSZoning         1459 non-null int64
LotArea          1459 non-null int64
Street           1459 non-null int64
LotShape         1459 non-null int64
LandContour      1459 non-null int64
Utilities        1459 non-null int64
LotConfig        1459 non-null int64
LandSlope        1459 non-null int64
Condition1       1459 non-null int64
Condition2       1459 non-null int64
BldgType         1459 non-null int64
HouseStyle       1459 non-null int64
OverallQual      1459 non-null int64
OverallCond      1459 non-null int64
YearBuilt        1459 non-null int64
YearRemodAdd     1459 non-null int64
RoofStyle        1459 non-null int64
ExterQual        1459 non-null int64
ExterCond        1459 non-null int64
Foundation       1459 non-null int64
BsmtFinSF2       1459 non-null int64
BsmtUnfSF        1459 non-null int64
Total

## Verificando el codigo de skearn.linear

Seleccionar las columnas para para crear el test 

In [44]:
house_data = df_train.drop(['Id','SalePrice'],axis=1)
house_data.head(3)

Unnamed: 0,MSSubClass,MSZoning,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Condition1,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,1,8450,1,1,1,1,1,1,1,...,61,0,0,0,0,0,2,2008,1,1
1,20,1,9600,1,1,1,1,2,1,2,...,0,0,0,0,0,0,5,2007,1,1
2,60,1,11250,1,2,1,1,1,1,1,...,42,0,0,0,0,0,9,2008,1,1


Obtenemos el train de los datos

In [45]:
df_train.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotArea', 'Street', 'LotShape',
       'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Condition1',
       'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'ExterQual', 'ExterCond',
       'Foundation', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'GarageCars', 'GarageArea', 'PavedDrive',
       'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
       'MiscVal', 'MoSold', 'YrSold', 'SaleType', 'SaleCondition',
       'SalePrice'],
      dtype='object')

In [46]:
house_target = df_train.SalePrice
house_target.head()

0    208500
1    181500
2    223500
3    140000
4    250000
Name: SalePrice, dtype: int64

In [47]:
df_train.shape

(1459, 56)

In [48]:
from sklearn.model_selection import train_test_split

## Crear el train y test de los datos de la casa

In [49]:
#test_size es el porcentaje del conjunto de datos desea dividir 
#random_state es número entero, que actuará como semilla para el generador de números aleatorios durante la división.
#   Si se barajan o no los datos antes de dividirse
#test_size=0.33 33 para test y 67 para entrenar, Esta variable depende de la cantidad de datos si son pocos datos 
#puede sera 20 o 30%
#shuffle si es False no se mezclan los datos
#shuffle si es True se mezclan los datos
#Xtrain conjunto de entranamiento
#X_test conjunt de pruebas
#y_train la etiqueta del conjunto de entramietno
#y_etiqueta del conjunto de prueba
X_train, X_test, y_train, y_test = train_test_split(house_data, house_target,
                                                    test_size=0.20,
                                                    random_state=192,
                                                    shuffle =True)

print('Conjunto de datos para Entrenamiento :',len(X_train))
print('Conjunto de datos para Test :',len(X_test))
print('Total es igual a total de columnas de la muestra:',len(X_test)+len(X_train))
print('Tamaño del conjunto de pruebas:',X_test.shape)
print('Tamaño del conjunto de entrenados ',y_test.shape)

Conjunto de datos para Entrenamiento : 1167
Conjunto de datos para Test : 292
Total es igual a total de columnas de la muestra: 1459
Tamaño del conjunto de pruebas: (292, 54)
Tamaño del conjunto de entrenados  (292,)


In [68]:
X_train.head(3)

Unnamed: 0,MSSubClass,MSZoning,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Condition1,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
1068,160,2,3964,1,1,1,1,1,1,1,...,20,0,0,0,0,0,6,2006,1,1
544,60,1,17104,1,2,1,1,1,1,1,...,24,0,0,0,0,0,9,2006,2,3
1430,60,1,21930,1,4,1,1,1,1,7,...,40,0,0,0,0,0,7,2006,1,1


In [54]:
y_train.head(5)

1068    151400
544     179665
1430    192140
1134    169000
885     328900
Name: SalePrice, dtype: int64

## Crear el model de regresion

In [56]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()

In [62]:
result = model.fit(X_train, y_train)#debo ajustar el model

In [63]:
#He puesto 0.20 en el conjunto de datos test_size=0.20
from sklearn import metrics
prediction_test = model.predict(X_test)
# Print the prediction accuracy
print (metrics.accuracy_score(y_test, prediction_test))

0.003424657534246575


In [60]:
# He puesto 0.33 en el conjunto de datos test_size=0.33
from sklearn import metrics
prediction_test = model.predict(X_test)
# Print the prediction accuracy
print (metrics.accuracy_score(y_test, prediction_test))

0.006224066390041493


## Realizar grafico

In [2]:

total_precio = df_train.pivot_table('SalePrice', index='salario', columns="SalePrice", aggfunc = sum)


NameError: name 'df_train' is not defined

In [None]:
# Visualize results - to help with deciding which n_neigbors yields the best results (n_neighbors=6, in this case)
neighbors_settings = range(1,10)
plt.plot(neighbors_settings, , label='Precion del conjunto de datos origenes entrenados', marker='o')
plt.plot(neighbors_settings, , label='Precision del conjunto de datos a testear ', marker='o')
plt.ylabel('Precision')
plt.xlabel('Numero de vecinos')
plt.legend()

## Aplicando el Algoritmo del Arbol de decisiones

In [50]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
#Es un arbol de deciones 
# Leer esta pagina para entender
#https://www.aprendemachinelearning.com/arbol-de-decision-en-python-clasificacion-y-prediccion/
#max_depth q tenga una profundidad de 4 
arbol_house = DecisionTreeClassifier(max_depth=None, criterion='entropy', random_state=454)

In [52]:
warnings.filterwarnings('ignore')
#La forma más sencilla de utilizar la validación cruzada es llamar a la función auxiliar cross_val_score 
#en el estimador y el conjunto de datos.
# obtiene un puntaje del arbol_cancer con el metodo cross_val_score()
cross_val_score(arbol_house, house_data, house_target)
#Luego entrenamos nuestro clasificador usando el método de ajuste fit
#Entrena los datos X_train para predecir y_train
arbol_cancer.fit(X_train,y_train)

print ("Puntuación de los datos origen X_test y los datos entrenados y_test son :",arbol_cancer.score(X_test,y_test))
print ("Puntuacion con respecto al origen :",arbol_cancer.score(X_train,y_train))

Puntuación de los datos origen X_test y los datos entrenados y_test son : 0.00684931506849315
Puntuacion con respecto al origen : 0.9991431019708654
