# Gaussian Naive Bayes Classification using mpg data 

In [1]:
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in 0.20;
# `train_test_split` now lives in `sklearn.model_selection`. The fallback keeps the
# notebook importable on very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
import pandas



### Read in mpg dataset from seaborn

In [2]:
import os

# Location of the mpg CSV. The default is the original machine-specific absolute
# path; set the MPG_CSV environment variable to point at your own copy of the file
# so the notebook can run on another machine without editing this cell.
MPG_CSV = os.environ.get(
    "MPG_CSV",
    "/users/danielcorcoran/desktop/github_repos/python_nb_visualization/seaborn_official_datasets/mpg.csv",
)
mpg = pandas.read_csv(MPG_CSV)

In [3]:
# Preview the first five rows of the raw dataset.
mpg.head(n=5)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino


### For this example we want to predict how many cylinders a car has based on its other metrics
- All cylinder counts are in one column; these can be split into indicator columns using pandas `get_dummies()` or vectorization in sklearn

In [4]:
# Count how many cars fall into each cylinder class.
mpg.loc[:, "cylinders"].value_counts()

4    204
8    103
6     84
3      4
5      3
Name: cylinders, dtype: int64

### Create a dataframe splitting the cylinders column from mpg into one indicator column per value

In [5]:
# One-hot encode the cylinder count. The resulting columns are named
# CYLINDERS__<n> (double underscore), which later cells rely on.
cylinders_split = pandas.get_dummies(
    mpg["cylinders"],
    prefix="CYLINDERS",
    prefix_sep="__",
)

In [6]:
# Preview the indicator columns produced by get_dummies.
cylinders_split.head(n=5)

Unnamed: 0,CYLINDERS__3,CYLINDERS__4,CYLINDERS__5,CYLINDERS__6,CYLINDERS__8
0,0,0,0,0,1
1,0,0,0,0,1
2,0,0,0,0,1
3,0,0,0,0,1
4,0,0,0,0,1


### Merge the cylinders_split dataframe into the original

In [7]:
# Append the indicator columns to the original frame, aligning rows on the index.
mpg_expanded = pandas.concat(
    objs=[mpg, cylinders_split],
    axis="columns",
)

In [8]:
# Preview the combined frame: original columns followed by the CYLINDERS__* indicators.
mpg_expanded.head(n=5)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name,CYLINDERS__3,CYLINDERS__4,CYLINDERS__5,CYLINDERS__6,CYLINDERS__8
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu,0,0,0,0,1
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320,0,0,0,0,1
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite,0,0,0,0,1
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst,0,0,0,0,1
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino,0,0,0,0,1


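### Extract the feature matrix (X) and target vector (y)
- The target is the `CYLINDERS__4` indicator column, so the model predicts whether or not a car has 4 cylinders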

In [9]:
# Target: the CYLINDERS__4 indicator (1 when the car has 4 cylinders, 0 otherwise).
y = mpg_expanded["CYLINDERS__4"]

# Features: numeric measurements of each car. `.copy()` makes X an independent
# frame instead of a slice of mpg_expanded, so modifying X later does not raise
# SettingWithCopyWarning or depend on pandas' view-versus-copy behaviour.
feature_columns = ["displacement", "horsepower", "weight", "acceleration"]
X = mpg_expanded[feature_columns].copy()

In [10]:
# Preview the first five target values.
y.head(n=5)

0    0
1    0
2    0
3    0
4    0
Name: CYLINDERS__4, dtype: uint8

In [11]:
# Preview the first five rows of the feature matrix.
X.head(n=5)

Unnamed: 0,displacement,horsepower,weight,acceleration
0,307.0,130.0,3504,12.0
1,350.0,165.0,3693,11.5
2,318.0,150.0,3436,11.0
3,304.0,150.0,3433,12.0
4,302.0,140.0,3449,10.5


### The feature matrix cannot be used in the model until all nulls are handled
- We can either drop the affected rows or fill in new values using imputation

In [12]:
# Show dtypes and non-null counts per column to find features with missing values.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 4 columns):
displacement    398 non-null float64
horsepower      392 non-null float64
weight          398 non-null int64
acceleration    398 non-null float64
dtypes: float64(3), int64(1)
memory usage: 12.5 KB


In [13]:
# Impute missing horsepower values with the column mean. The result is assigned
# back to X rather than calling fillna(inplace=True) on a selected column: that
# chained form raises SettingWithCopyWarning and is not guaranteed to update X.
X = X.fillna({"horsepower": X["horsepower"].mean()})


In [14]:
# Re-check non-null counts after the horsepower imputation; every column should now be complete.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 4 columns):
displacement    398 non-null float64
horsepower      398 non-null float64
weight          398 non-null int64
acceleration    398 non-null float64
dtypes: float64(3), int64(1)
memory usage: 12.5 KB


### Split data into training and test portions, default test size is 0.25

In [15]:
# Hold out the default test share of the rows; the fixed random_state keeps the
# split reproducible between runs.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X,
    y,
    random_state=1,
)

In [16]:
from sklearn.naive_bayes import GaussianNB

### Create model

In [17]:
# Gaussian Naive Bayes with no user-supplied class priors (the default).
model = GaussianNB(priors=None)

### Fit model to training data

In [18]:
# Fit the classifier on the training portion only.
model.fit(X=Xtrain, y=ytrain)

GaussianNB(priors=None)

### Make prediction using model and Xtest

In [19]:
# Predict the target for the held-out test rows.
y_model = model.predict(X=Xtest)

### Accuracy of model using Xtest data

In [20]:
from sklearn.metrics import accuracy_score

# Fraction of test rows whose predicted label matches the true label.
accuracy_score(y_true=ytest, y_pred=y_model)

0.97