In [2]:
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import mean_squared_error

casualty =  pd.read_csv("/Users/swimmingcircle/cs156_code/session03/casualty_train.csv")

## Classification Model 

In [26]:
casualty.head()

Unnamed: 0,casualty_class,gender,age,severe,pedestrian_location,pedestrian_movement,travel,year
0,passenger,female,33,False,,,motorbike,2007
1,passenger,female,20,False,,,car,2005
2,passenger,male,52,False,,,car,2006
3,passenger,female,17,False,,,pedestrian,2012
4,passenger,female,20,False,,,motorbike,2010


In [29]:
def unique_col(df): 
    cols= df.columns 
    for col in cols:
        print(f'type of {col} class:{df[col].unique()}')

unique_col(casualty)

type of casualty_class class:['passenger' 'pedestrian' 'driver']
type of gender class:['female' 'male']
type of age class:['33' '20' '52' '17' '40' '23' '45' '8' '7' '43' '34' '32' '31' '29' '50'
 '4' '12' '25' '27' '38' '39' '51' '37' '86' '10' '46' '36' '28' '60' '57'
 '65' '21' '68' '3' '42' '18' '26' '24' '44' '61' '48' '5' '11' '35' '58'
 '13' '30' '56' '22' '84' '1' '15' '49' '63' '53' '41' '74' '73' '77' '19'
 '71' '75' '16' '47' '14' '64' '76' '2' '81' '79' '54' '67' '9' '83' '66'
 '87' '78' '80' '69' '59' '93' '70' '72' '88' '6' '85' '55' '82' '62' '89'
 '92' '0' '90' '98' '95' '91' '96' '97' '94' '99']
type of severe class:[False  True]
type of pedestrian_location class:[nan 'In road' 'Close to Crossing' 'Not on Crossing' 'Unknown'
 'Pedestrian Crossing' 'Footpath' 'Zig-Zag']
type of pedestrian_movement class:[nan 'In Rd Not Crossing' 'Drivers N/Side' 'Unknown Or Other'
 'Drivers O/Side' 'Back To Traffic' 'Drivers N/Side Msk'
 'Drivers O/Side Msk' 'Facing Traffic' 'In Rd Not 

In [6]:
casualty = casualty[casualty.age != 'Unknown']

In [8]:
y = casualty['severe']
X = casualty.loc[:,  casualty.columns!= 'severe']

In [9]:
def label_encoding(df, cols):
    le = LabelEncoder()
    for col in cols:
        df.loc[:,col] = le.fit_transform(df.loc[:,col])
    return df
cols = ['casualty_class', 'gender','pedestrian_location',
       'pedestrian_movement', 'travel', 'year']
X = label_encoding(X, cols)
X

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


Unnamed: 0,casualty_class,gender,age,pedestrian_location,pedestrian_movement,travel,year
0,1,0,33,7,9,3,2
1,1,0,20,7,9,2,0
2,1,1,52,7,9,2,1
3,1,0,17,7,9,5,7
4,1,0,20,7,9,3,5
...,...,...,...,...,...,...,...
232830,1,0,40,7,9,2,9
232831,1,0,41,7,9,5,8
232832,2,0,45,0,1,0,5
232833,2,1,10,0,2,0,4


In [10]:
X.astype(int)

Unnamed: 0,casualty_class,gender,age,pedestrian_location,pedestrian_movement,travel,year
0,1,0,33,7,9,3,2
1,1,0,20,7,9,2,0
2,1,1,52,7,9,2,1
3,1,0,17,7,9,5,7
4,1,0,20,7,9,3,5
...,...,...,...,...,...,...,...
232830,1,0,40,7,9,2,9
232831,1,0,41,7,9,5,8
232832,2,0,45,0,1,0,5
232833,2,1,10,0,2,0,4


In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

### Choice of classification model: Logistic vs KNN 

- Logistic regression: parametric model 
- KNN:  non parametric model 

If we need examine the parameters e.g. have interpretable meaning for ceofs, run some tests to obtain p-value and discard non significant paramters. We will need to use parametric. 

Since we need to interpret the parameters, we need to choose parametric model. I choose logistic regression. 

In [33]:
from sklearn.linear_model import LogisticRegression
log_model = LogisticRegression(random_state=0)
log_model.fit(X_train, y_train)
y_pred = log_model.predict(X_test)

In [44]:
#calculate the probability
probs = log_model.predict_proba(X_test)
probs

array([[0.93602597, 0.06397403],
       [0.94343572, 0.05656428],
       [0.93458453, 0.06541547],
       ...,
       [0.94219315, 0.05780685],
       [0.89800218, 0.10199782],
       [0.88803041, 0.11196959]])

The left column is the probability of the data point = 1, the right column is the probability of the data point = 0.

In [45]:
y

0         True
1        False
2        False
3        False
4        False
         ...  
58203     True
58204     True
58205     True
58206    False
58207    False
Name: severe, Length: 54277, dtype: bool

In [13]:
coeff_parameter = pd.DataFrame(log_model.coef_.reshape(-1,1), X.columns,columns=['Coefficient'])
coeff_parameter.loc['intercept', :] = log_model.intercept_
coeff_parameter

Unnamed: 0,Coefficient
casualty_class,0.260269
gender,-0.273917
age,0.00746
pedestrian_location,-0.122312
pedestrian_movement,-0.074118
travel,0.129585
year,-0.072317
intercept,-1.257919


### Model Evaluation

#### Confusion matrix

In [14]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[38608,     0],
       [ 4714,     0]])

In [15]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
print('accuracy score is', accuracy_score(y_test, y_pred))
print('precision score is', precision_score(y_test, y_pred, average=None))
print('recall score is', recall_score(y_test, y_pred, average=None))

accuracy score is 0.891186925811366
precision score is [0.89118693 0.        ]
recall score is [1. 0.]


  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
from sklearn import metrics
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       False       0.89      1.00      0.94     38608
        True       0.00      0.00      0.00      4714

    accuracy                           0.89     43322
   macro avg       0.45      0.50      0.47     43322
weighted avg       0.79      0.89      0.84     43322



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Interpret the classification report table

Precision = Accuracy of positive predictions = $\frac{TP}{(TP + FP)}$

Recall = Fraction of positives that were correctly identified.
Recall = $\frac{TP}{(TP + FN)}$

F1 Score = $\frac{2*(Recall * Precision)}{(Recall + Precision)}$


From the table, we see that 
- False: the non severe group 
  - precision: 0.89 = 89% = Among all the non severe we classify, we have 11% of severe being classfied wrongly in the non severe group.
  - recall: 1 = 100% = we successfully capture 100% of non severe cases.  
  - f1-score:0.94 = We have high accruacy for non severe cases when considering both recall and precision. 
  - support: 38608 data points
- True: the severe group 
  - precision: 0
  - recall: 0
  - f1-score:0
  - support: 4714 data points

From the confusion matrix, since severe cases is 0, so all the scores from the severe group are 0s.

### ROC curve

In [41]:
probs

array([[0.93602597, 0.06397403],
       [0.94343572, 0.05656428],
       [0.93458453, 0.06541547],
       ...,
       [0.94219315, 0.05780685],
       [0.89800218, 0.10199782],
       [0.88803041, 0.11196959]])

In [39]:
# calculate roc curve
from sklearn.metrics import roc_curve, auc, roc_auc_score
fpr, tpr, thresholds = roc_curve(y_test, probs)
# auc = roc_auc_score(y_test, probs)
# print('AUC: %.3f' % auc)
print(fpr, tpr, thresholds)

ValueError: y should be a 1d array, got an array of shape (43322, 2) instead.

### Question
- How to interpret the parameter in logistic regression 

In [17]:
casualty_test =  pd.read_csv("/Users/swimmingcircle/cs156_code/session03/casualty_test.csv")