In [25]:
import os
import pandas as pd
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import numpy as np

base_path = './'
train_filepath = base_path + 'train.csv'
test_filepath = base_path + 'test.csv'

if os.path.exists(train_filepath):
    print('Loading data....')
    train_data = pd.read_csv(train_filepath)
    test_data = pd.read_csv(test_filepath)
    print("\n\nShow head data")
    print(train_data.head())
    print("\n\nDescribe data")
    print(train_data.describe())
    
    cols_with_missing = ['{} {} nullos de {}'.format(col,train_data[col].isnull().sum(),train_data[col].isnull().count())
                         for col in train_data.columns
                         if train_data[col].isnull().any()]
    print("\n\n")
    print(cols_with_missing)
    
    print('\n\nSetup and Data Loaded Complete')
else:
    print('Files not exists')

Loading data....


Show head data
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            3

## Column Selection
I delete cabin column because has many null values

In [99]:
X_data = train_data[['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']]
y_data = train_data['Survived']

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X_data,y_data,test_size=1/4,random_state=0)

print('\nX data')
print(X_data[:10])


print('\n\nY data')
print(y_data[:10])

print('\nFinished loaded...')


X data
   Pclass     Sex   Age  SibSp  Parch     Fare Embarked
0       3    male  22.0      1      0   7.2500        S
1       1  female  38.0      1      0  71.2833        C
2       3  female  26.0      0      0   7.9250        S
3       1  female  35.0      1      0  53.1000        S
4       3    male  35.0      0      0   8.0500        S
5       3    male   NaN      0      0   8.4583        Q
6       1    male  54.0      0      0  51.8625        S
7       3    male   2.0      3      1  21.0750        S
8       3  female  27.0      0      2  11.1333        S
9       2  female  14.0      1      0  30.0708        C


Y data
0    0
1    1
2    1
3    1
4    0
5    0
6    0
7    0
8    1
9    1
Name: Survived, dtype: int64

Finished loaded...


## Column Nomalization
 I fill age with average

In [149]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan,strategy='mean')
imputer = imputer.fit(X_train['Age'].values.reshape(-1, 1))

X_train_norm = X_train.copy()
X_test_norm = X_test.copy()

X_train_norm.loc[:,'Age'] = imputer.transform(X_train['Age'].values.reshape(-1, 1))
X_test_norm.loc[:,'Age'] = imputer.transform(X_test['Age'].values.reshape(-1, 1))

I delete 2 rows of embarked with null values


In [150]:
mask = ~pd.isnull(X_train).any(axis=1)
X_train_norm = X_train[mask]
y_train = y_train[mask]

mask = ~pd.isnull(X_test).any(axis=1)
X_test_norm = X_test[mask]
y_test = y_test[mask] 


I applied one hot encoder to sex and embarked

In [151]:
from sklearn.preprocessing import OneHotEncoder

# Apply one-hot encoder to each column with categorical data
object_cols = ['Sex','Embarked']
columns = ['']
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train_norm[object_cols]))
OH_cols_test = pd.DataFrame(OH_encoder.transform(X_test_norm[object_cols]))


# One-hot encoding removed index; put it back
OH_cols_train.index = X_train_norm.index
OH_cols_test.index = X_test_norm.index

columns = []
column = ''
columns_to_drop = []
for encoder in OH_encoder.categories_:
    for column in encoder:
        columns.append('is_{}'.format(column))
    columns_to_drop.append('is_{}'.format(column))
    
OH_cols_train.columns = columns
OH_cols_test.columns = columns

OH_cols_train = OH_cols_train.drop(columns_to_drop,axis=1)
OH_cols_test =OH_cols_test.drop(columns_to_drop,axis=1)

# Remove categorical columns (will replace with one-hot encoding)
num_X_train_norm = X_train_norm.drop(object_cols, axis=1)
num_X_test_norm = X_test_norm.drop(object_cols, axis=1)

# Add one-hot encoded columns to numerical features
OH_X_train = pd.concat([num_X_train_norm, OH_cols_train], axis=1)
OH_X_test = pd.concat([num_X_test_norm, OH_cols_test], axis=1)

print(OH_X_train)


     Pclass   Age  SibSp  Parch     Fare  is_female  is_C  is_Q
105       3  28.0      0      0   7.8958        0.0   0.0   0.0
68        3  17.0      4      2   7.9250        1.0   0.0   0.0
253       3  30.0      1      0  16.1000        0.0   0.0   0.0
320       3  22.0      0      0   7.2500        0.0   0.0   0.0
706       2  45.0      0      0  13.5000        1.0   0.0   0.0
..      ...   ...    ...    ...      ...        ...   ...   ...
835       1  39.0      1      1  83.1583        1.0   1.0   0.0
192       3  19.0      1      0   7.8542        1.0   0.0   0.0
629       3  29.9      0      0   7.7333        0.0   0.0   1.0
559       3  36.0      1      0  17.4000        1.0   0.0   0.0
684       2  60.0      1      1  39.0000        0.0   0.0   0.0

[666 rows x 8 columns]


In [169]:
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_classification

clf = make_pipeline(StandardScaler(),
                     LinearSVC(random_state=0,max_iter=10000,dual=False, tol=1e-5))
clf.fit(OH_X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('linearsvc',
                 LinearSVC(dual=False, max_iter=10000, random_state=0,
                           tol=1e-05))])

In [174]:
y_predict = clf.predict(OH_X_train)
y_predict_test = clf.predict(OH_X_test)

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test,y_predict_test)

print(cm)

#Applying k-Flod Cross Validation
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator=clf, X=OH_X_train, y=y_train, cv=10)

print(accuracies.mean())
#Indica la varianza
print(accuracies.std())


[[116  23]
 [ 24  60]]
0.7958842152872003
0.030346067981925667


In [167]:
from sklearn.model_selection import validation_curve
train_scores,test_scores = validation_curve(LinearSVC(random_state=0,max_iter=10000,dual=False, tol=1e-5),X_train,y_train,cv=5)
np.mean(train_scores,axis=1)
#Gráfico error entrenamiento vs error testeo
plt.plot(np.mean(train_scores,axis=1))
plt.plot(np.mean(test_scores,axis=1))
plt.xticks(np.arange(24),n)

TypeError: validation_curve() missing 2 required keyword-only arguments: 'param_name' and 'param_range'