# Confronto tra modelli

In [2]:
import graphlab

In [4]:
dataset = graphlab.SFrame.read_csv('Classification/data/train.csv', verbose=False)

In [5]:
dataset = dataset['Pclass','Sex','Age','SibSp','Parch','Fare','Cabin','Embarked','Survived']

In [6]:
dataset.head()

Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Survived
3,male,22.0,1,0,7.25,,S,0
1,female,38.0,1,0,71.2833,C85,C,1
3,female,26.0,0,0,7.925,,S,1
1,female,35.0,1,0,53.1,C123,S,1
3,male,35.0,0,0,8.05,,S,0
3,male,,0,0,8.4583,,Q,0
1,male,54.0,0,0,51.8625,E46,S,0
3,male,2.0,3,1,21.075,,S,0
3,female,27.0,0,2,11.1333,,S,1
2,female,14.0,1,0,30.0708,,C,1


In [7]:
dataset['Cabin'] = dataset['Cabin'].apply(lambda x : 0 if x =='' else 1)

In [8]:
dataset = graphlab.SFrame.dropna(dataset)

In [9]:
dataset['Sex'] = dataset['Sex'].apply(lambda x : 1 if x=='male' else 0) 

In [10]:
dataset['Age'] = dataset['Age'].apply(lambda x : 1 if x > 16 else 0)

In [11]:
dataset['Survived'] = dataset['Survived'].apply(lambda x : -1 if x == 0 else x )

In [12]:
features = ['Pclass','Sex','Age','SibSp','Parch','Fare','Cabin','Embarked']
target = 'Survived'

In [13]:
dataset.head()

Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Survived
3,1,1,1,0,7.25,0,S,-1
1,0,1,1,0,71.2833,1,C,1
3,0,1,0,0,7.925,0,S,1
1,0,1,1,0,53.1,1,S,1
3,1,1,0,0,8.05,0,S,-1
1,1,1,0,0,51.8625,1,S,-1
3,1,0,3,1,21.075,0,S,-1
3,0,1,0,2,11.1333,0,S,1
2,0,0,1,0,30.0708,0,C,1
3,0,0,1,1,16.7,1,S,1


In [14]:
data, data_test = dataset.random_split(.8, seed = 0)

In [15]:
folds = graphlab.toolkits.cross_validation.KFold(data, 10)

CLASSIFICATION

In [16]:
max_accuracy = 0
best_l2 = 0
best_model = None

for l2_value in [1e-25, 1e-10, 1e-6, 1e-4, 1e-3, 1e2]:
    for train, valid in folds:
        model = graphlab.logistic_classifier.create(train, target, features, 
                                                    l2_penalty = l2_value, validation_set=None, verbose=False)
        accuratezza =  model.evaluate(valid)['accuracy']
        if accuratezza > max_accuracy:
            max_accuracy = accuratezza
            best_l2 = l2_value
            best_model = model
    
print'BEST l2: ', best_l2, ' with accuracy: ', max_accuracy
print best_model.coefficients

BEST l2:  1e-25  with accuracy:  0.839285714286
+-------------+-------+-------+------------------+------------------+
|     name    | index | class |      value       |      stderr      |
+-------------+-------+-------+------------------+------------------+
| (intercept) |  None |   1   |  4.40747815708   |  0.779557769799  |
|    Pclass   |  None |   1   | -0.788024969686  |  0.215484134364  |
|     Sex     |  None |   1   |  -2.54873011845  |  0.259613600295  |
|     Age     |  None |   1   |  -1.83372574395  |  0.400412322002  |
|    SibSp    |  None |   1   | -0.496708041748  |  0.153850010753  |
|    Parch    |  None |   1   | -0.132391564656  |  0.15814266064   |
|     Fare    |  None |   1   | 0.00342427853902 | 0.00308062737772 |
|    Cabin    |  None |   1   |  0.423089307188  |  0.364177608479  |
|   Embarked  |   C   |   1   |  0.517651342683  |  0.329558877408  |
|   Embarked  |   Q   |   1   | -0.0878555554041 |  0.620837525339  |
+-------------+-------+-------+-----------

REGRESSION

In [17]:
max_accuracy = 0
best_l2 = 0
best_model = None

for l2_value in [1e-25, 1e-10, 1e-6, 1e-4, 1e-3, 1e2]:
    for train, valid in folds:
        model = graphlab.linear_regression.create(train, target, features, 
                                                  l2_penalty = l2_value, validation_set=None, verbose=False)
        accuratezza =  model.evaluate(valid)['rmse']
        if accuratezza > max_accuracy:
            max_accuracy = accuratezza
            best_l2 = l2_value
            best_model = model
    
print'BEST l2: ', best_l2, ' with accuracy: ', max_accuracy
print best_model.coefficients

BEST l2:  1e-25  with accuracy:  0.880180694817
+-------------+-------+------------------+------------------+
|     name    | index |      value       |      stderr      |
+-------------+-------+------------------+------------------+
| (intercept) |  None |  1.38295859384   |  0.21116074043   |
|    Pclass   |  None | -0.238194833375  | 0.0620116009043  |
|     Sex     |  None | -0.920324148204  | 0.0738409960645  |
|     Age     |  None | -0.599369657608  |  0.111064710384  |
|    SibSp    |  None | -0.165924217223  | 0.0416692023016  |
|    Parch    |  None | -0.0453779156403 | 0.0469563379764  |
|     Fare    |  None | 0.00109918615458 | 0.00085643750389 |
|    Cabin    |  None |  0.277737921561  |  0.113951108751  |
|   Embarked  |   C   |  0.211852705118  | 0.0949075452597  |
|   Embarked  |   Q   | -0.101213821079  |  0.178521932438  |
+-------------+-------+------------------+------------------+
[11 rows x 4 columns]
Note: Only the head of the SFrame is printed.
You can use prin

DECISION TREE

In [18]:
max_accuracy = 0
best_model = None

for train, valid in folds:
        model = graphlab.decision_tree_classifier.create(train, target, features, 
                                                         validation_set=None, verbose=False)
        accuratezza =  model.evaluate(valid)['accuracy']
        if accuratezza > max_accuracy:
            max_accuracy = accuratezza
            best_model = model
    
print'BEST with accuracy: ', max_accuracy
print best_model

BEST with accuracy:  0.875
Class                          : DecisionTreeClassifier

Schema
------
Number of examples             : 511
Number of feature columns      : 8
Number of unpacked features    : 8
Number of classes              : 2

Settings
--------
Number of trees                : 1
Max tree depth                 : 6
Training time (sec)            : 0.0075
Training accuracy              : 0.865
Validation accuracy            : None
Training log_loss              : 0.5573
Validation log_loss            : None



BOOSTED

In [19]:
max_accuracy = 0
best_model = None

for train, valid in folds:
        model = graphlab.boosted_trees_classifier.create(train, target, features, 
                                                         validation_set=None, verbose=False)
        accuratezza =  model.evaluate(valid)['accuracy']
        if accuratezza > max_accuracy:
            max_accuracy = accuratezza
            best_model = model
    
print'BEST with accuracy: ', max_accuracy
print best_model

BEST with accuracy:  0.894736842105
Class                          : BoostedTreesClassifier

Schema
------
Number of examples             : 510
Number of feature columns      : 8
Number of unpacked features    : 8
Number of classes              : 2

Settings
--------
Number of trees                : 10
Max tree depth                 : 6
Training time (sec)            : 0.0567
Training accuracy              : 0.898
Validation accuracy            : None
Training log_loss              : 0.2985
Validation log_loss            : None



SVM

In [20]:
max_accuracy = 0
best_model = None

for train, valid in folds:
        model = graphlab.svm_classifier.create(train, target, features, 
                                        validation_set=None, max_iterations=500, verbose=False)
        accuratezza =  model.evaluate(valid)['accuracy']
        if accuratezza > max_accuracy:
            max_accuracy = accuratezza
            best_model = model
    
print'BEST with accuracy: ', max_accuracy
print best_model

BEST with accuracy:  0.803571428571
Class                          : SVMClassifier

Schema
------
Number of coefficients         : 11
Number of examples             : 511
Number of classes              : 2
Number of feature columns      : 8
Number of unpacked features    : 8

Hyperparameters
---------------
Mis-classification penalty     : 1.0

Training Summary
----------------
Solver                         : lbfgs
Solver iterations              : 72
Solver status                  : SUCCESS: Optimal solution found.
Training time (sec)            : 0.0589

Settings
--------
Train Loss                     : 239.1864

Highest Positive Coefficients
-----------------------------
(intercept)                    : 1.6113
Embarked[C]                    : 0.0825
Cabin                          : 0.038
Embarked[]                     : 0.01
Fare                           : 0.0037

Lowest Negative Coefficients
----------------------------
Sex                            : -2.0204
Age                