## Naive Bayse Classifiers
Naive Bayes classifiers are a collection of classification algorithms based on Bayes’ Theorem. It is not a single algorithm but a family of algorithms where all of them share a common principle, i.e. every pair of features being classified is independent of each other. To start with, let us consider a dataset.<br>
<img src="D:\\Bays.png" width="300" height="300" align="center"/><br>
where,
<ul>
<li>P(A) and P(B) are the probabilities of events A and B also P(B) is never equal to zero.</li>
<li>P(A|B) is the probability of event A when event B happens</li>
<li>P(B|A) is the probability of event B when A happens</li>
</ul>

In [3]:
from sklearn.datasets import load_iris
iris = load_iris() # store the feature matrix (X) and response vector (y)
X = iris.data
y = iris.target # splitting X and y into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=1)
from sklearn.naive_bayes import GaussianNB
gnb  = GaussianNB()
gnb.fit(X_train, y_train)
# making predictions on the testing set
y_pred = gnb.predict(X_test)
# comparing actual response values (y_test) with predicted response values 
from sklearn import metrics
print('Gassian Naive Bayes model accuracy(in %):',metrics.accuracy_score(y_test, y_pred)*100)


Gassian Naive Bayes model accuracy(in %): 95.0


In [4]:
import pandas as pd

Series - single Dimension<br>
DataFrame - More then 1 Dimension

In [6]:
a = range(0,100,5)
pd.Series(a)

0      0
1      5
2     10
3     15
4     20
5     25
6     30
7     35
8     40
9     45
10    50
11    55
12    60
13    65
14    70
15    75
16    80
17    85
18    90
19    95
dtype: int64

In [7]:
a = "hi","how","are","you"
pd.Series(a)

0     hi
1    how
2    are
3    you
dtype: object

In [8]:
a = [1,2,3,"india",34.47]
pd.Series(a)

0        1
1        2
2        3
3    india
4    34.47
dtype: object

In [9]:
marks = [34,45,56,67,76,78]
sub = ["tam","eng","mat","sci","soc","pt"]
df = pd.Series(marks,index=sub)

In [10]:
df

tam    34
eng    45
mat    56
sci    67
soc    76
pt     78
dtype: int64

In [11]:
df.index

Index(['tam', 'eng', 'mat', 'sci', 'soc', 'pt'], dtype='object')

In [12]:
df.values

array([34, 45, 56, 67, 76, 78], dtype=int64)

In [13]:
marks = {"tam":34,"eng":45,"mat":56}
pd.Series(marks)

tam    34
eng    45
mat    56
dtype: int64

In [14]:
subjects = {'Maths','Science','Art and Craft','Social Science'}

data = {'Maths':60,'Science':89,'English':76,'Social Science':86}

marks_series = pd.Series(data, index=subjects)

print(marks_series)

Science           89.0
Maths             60.0
Social Science    86.0
Art and Craft      NaN
dtype: float64


In [15]:
marks_series.isnull()

Science           False
Maths             False
Social Science    False
Art and Craft      True
dtype: bool

df[ condition or filter or selection ]

In [17]:
marks_series[marks_series.isnull()]

Art and Craft   NaN
dtype: float64

In [18]:
marks_series[marks_series.notnull()]

Science           89.0
Maths             60.0
Social Science    86.0
dtype: float64

In [19]:
marks_series.notnull()

Science            True
Maths              True
Social Science     True
Art and Craft     False
dtype: bool

In [20]:
df

tam    34
eng    45
mat    56
sci    67
soc    76
pt     78
dtype: int64

In [21]:
df >= 60

tam    False
eng    False
mat    False
sci     True
soc     True
pt      True
dtype: bool

In [22]:
df[df >= 60]

sci    67
soc    76
pt     78
dtype: int64

In [23]:
df

tam    34
eng    45
mat    56
sci    67
soc    76
pt     78
dtype: int64

In [24]:
marks = [34,45,56,67,76,78]
sub = ['tam','eng','mat','sci','soc','pt']
df = pd.Series(marks,index=sub)

In [25]:
df[ (df > 40) & (df < 60)]

eng    45
mat    56
dtype: int64

In [26]:
df[ (df > 40) | (df < 60)]

tam    34
eng    45
mat    56
sci    67
soc    76
pt     78
dtype: int64

In [27]:
df

tam    34
eng    45
mat    56
sci    67
soc    76
pt     78
dtype: int64

In [28]:
df[0:]

tam    34
eng    45
mat    56
sci    67
soc    76
pt     78
dtype: int64

In [29]:
df[0::2]

tam    34
mat    56
soc    76
dtype: int64

In [30]:
df.iloc[-1] = 100

In [31]:
pd.Series(df)

tam     34
eng     45
mat     56
sci     67
soc     76
pt     100
dtype: int64

In [32]:
df.loc["pt"]

100

In [33]:
df["chem"] = 91

In [34]:
df

tam      34
eng      45
mat      56
sci      67
soc      76
pt      100
chem     91
dtype: int64

In [35]:
df[[1,-3,-1]] = 99

  df[[1,-3,-1]] = 99


In [36]:
df

tam      34
eng      99
mat      56
sci      67
soc      99
pt      100
chem     99
dtype: int64

In [37]:
df.loc[['eng','pt']]

eng     99
pt     100
dtype: int64

In [38]:
df[['eng','pt']]

eng     99
pt     100
dtype: int64

In [39]:
df.sort_values(ascending=False)

pt      100
eng      99
soc      99
chem     99
sci      67
mat      56
tam      34
dtype: int64

In [40]:
df.sort_index(ascending=False)

tam      34
soc      99
sci      67
pt      100
mat      56
eng      99
chem     99
dtype: int64

In [41]:
df.index

Index(['tam', 'eng', 'mat', 'sci', 'soc', 'pt', 'chem'], dtype='object')

In [42]:
type(df.index)

pandas.core.indexes.base.Index

In [43]:
df.sort_values(ascending=False).iloc[0:3]

pt     100
eng     99
soc     99
dtype: int64

In [44]:
df.rank(ascending=False)

tam     7.0
eng     3.0
mat     6.0
sci     5.0
soc     3.0
pt      1.0
chem    3.0
dtype: float64

In [45]:
df = pd.read_csv(r"D:\car_prices.csv")

In [46]:
df.head()

Unnamed: 0,make,model,year,mileage,price,color,state
0,Chevrolet,Cruze,2011,99157,37488,white,TX
1,Toyota,Cruze,2017,19882,34176,red,IL
2,Chevrolet,Altima,2011,77718,40297,blue,CA
3,Nissan,Fusion,2012,83613,43492,silver,TX
4,Ford,Cruze,2016,95349,26655,red,NY


In [47]:
df[['make','model']]

Unnamed: 0,make,model
0,Chevrolet,Cruze
1,Toyota,Cruze
2,Chevrolet,Altima
3,Nissan,Fusion
4,Ford,Cruze
...,...,...
995,Honda,Altima
996,Chevrolet,Accord
997,Nissan,Cruze
998,Honda,Accord


In [48]:
df.columns

Index(['make', 'model', 'year', 'mileage', 'price', 'color', 'state'], dtype='object')

In [49]:
df['country'] = 'India'

In [50]:
df

Unnamed: 0,make,model,year,mileage,price,color,state,country
0,Chevrolet,Cruze,2011,99157,37488,white,TX,India
1,Toyota,Cruze,2017,19882,34176,red,IL,India
2,Chevrolet,Altima,2011,77718,40297,blue,CA,India
3,Nissan,Fusion,2012,83613,43492,silver,TX,India
4,Ford,Cruze,2016,95349,26655,red,NY,India
...,...,...,...,...,...,...,...,...
995,Honda,Altima,2017,25154,43139,red,FL,India
996,Chevrolet,Accord,2017,68264,15632,white,TX,India
997,Nissan,Cruze,2016,17417,30832,silver,IL,India
998,Honda,Accord,2011,68878,10801,blue,TX,India


In [51]:
set(df['year'])

{2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018}

In [52]:
# 2014 < Low Model
# 2016 > High Model
# else - Average Model
df["year_status"] = ["low_model" if i < 2014 else "high model" if i > 2016 else "average_model" for i in df['year']]

In [53]:
df['status'] = [ 'LM' if year < 2014 else 'hm' for year in df['year']]

In [54]:
df

Unnamed: 0,make,model,year,mileage,price,color,state,country,year_status,status
0,Chevrolet,Cruze,2011,99157,37488,white,TX,India,low_model,LM
1,Toyota,Cruze,2017,19882,34176,red,IL,India,high model,hm
2,Chevrolet,Altima,2011,77718,40297,blue,CA,India,low_model,LM
3,Nissan,Fusion,2012,83613,43492,silver,TX,India,low_model,LM
4,Ford,Cruze,2016,95349,26655,red,NY,India,average_model,hm
...,...,...,...,...,...,...,...,...,...,...
995,Honda,Altima,2017,25154,43139,red,FL,India,high model,hm
996,Chevrolet,Accord,2017,68264,15632,white,TX,India,high model,hm
997,Nissan,Cruze,2016,17417,30832,silver,IL,India,average_model,hm
998,Honda,Accord,2011,68878,10801,blue,TX,India,low_model,LM


In [55]:
a = []
for i in df['year']:
    if i < 2013:
        a.append('low model')
    elif i > 2016:
        a.append('high model')
    else:
        a.append('average model')

In [56]:
for i in df['year']:
    if i < 2014:
        print('LM')
    elif i < 2016:
        print('AM')
    else:
        print('HM')

LM
HM
LM
LM
HM
HM
LM
LM
HM
AM
LM
LM
LM
AM
LM
AM
AM
HM
LM
AM
LM
LM
LM
HM
LM
LM
AM
HM
LM
LM
HM
LM
LM
AM
LM
AM
HM
LM
AM
AM
AM
LM
AM
LM
LM
LM
LM
AM
HM
LM
LM
HM
LM
AM
HM
HM
AM
LM
AM
AM
HM
LM
AM
LM
LM
AM
LM
LM
HM
AM
AM
LM
LM
LM
LM
AM
HM
LM
LM
HM
LM
LM
AM
LM
LM
AM
AM
HM
HM
AM
LM
LM
LM
LM
LM
HM
LM
AM
LM
LM
LM
LM
HM
HM
LM
AM
HM
LM
HM
LM
LM
AM
LM
HM
HM
HM
AM
LM
AM
LM
AM
LM
LM
AM
AM
LM
LM
LM
AM
LM
LM
LM
AM
HM
HM
HM
LM
LM
AM
LM
LM
LM
AM
HM
LM
LM
LM
AM
HM
LM
HM
HM
AM
HM
LM
AM
HM
HM
LM
LM
HM
LM
LM
LM
LM
HM
LM
AM
HM
HM
LM
AM
HM
AM
AM
HM
AM
HM
LM
LM
AM
HM
LM
AM
LM
HM
AM
HM
HM
AM
AM
LM
LM
AM
LM
LM
AM
HM
HM
AM
AM
AM
LM
HM
HM
HM
HM
LM
HM
HM
LM
LM
LM
HM
LM
LM
LM
HM
LM
LM
HM
LM
HM
HM
HM
LM
LM
HM
AM
AM
AM
HM
HM
AM
AM
LM
LM
HM
LM
AM
AM
AM
AM
LM
HM
LM
LM
LM
LM
AM
LM
LM
HM
LM
HM
LM
HM
AM
AM
AM
LM
LM
AM
LM
LM
AM
HM
HM
AM
AM
HM
AM
LM
AM
HM
AM
HM
HM
AM
HM
LM
LM
HM
AM
HM
LM
AM
HM
HM
LM
HM
AM
LM
HM
AM
LM
LM
AM
HM
HM
LM
HM
LM
AM
LM
HM
LM
HM
AM
AM
AM
AM
LM
HM
AM
LM
AM
HM
LM
LM
LM
LM
LM
HM
HM
AM
AM
LM
LM
AM
LM
LM
HM
H

In [57]:
def status(x):
    if x < 2013:
        return 'low model'
    elif x > 2016:
        return 'high model'
    else:
        return 'average model'

status(2016)

'average model'

In [58]:
df['def_year'] = df['year'].apply(status)
df

Unnamed: 0,make,model,year,mileage,price,color,state,country,year_status,status,def_year
0,Chevrolet,Cruze,2011,99157,37488,white,TX,India,low_model,LM,low model
1,Toyota,Cruze,2017,19882,34176,red,IL,India,high model,hm,high model
2,Chevrolet,Altima,2011,77718,40297,blue,CA,India,low_model,LM,low model
3,Nissan,Fusion,2012,83613,43492,silver,TX,India,low_model,LM,low model
4,Ford,Cruze,2016,95349,26655,red,NY,India,average_model,hm,average model
...,...,...,...,...,...,...,...,...,...,...,...
995,Honda,Altima,2017,25154,43139,red,FL,India,high model,hm,high model
996,Chevrolet,Accord,2017,68264,15632,white,TX,India,high model,hm,high model
997,Nissan,Cruze,2016,17417,30832,silver,IL,India,average_model,hm,average model
998,Honda,Accord,2011,68878,10801,blue,TX,India,low_model,LM,low model


In [59]:
df['year'].map(status)

0          low model
1         high model
2          low model
3          low model
4      average model
           ...      
995       high model
996       high model
997    average model
998        low model
999        low model
Name: year, Length: 1000, dtype: object

In [60]:
df

Unnamed: 0,make,model,year,mileage,price,color,state,country,year_status,status,def_year
0,Chevrolet,Cruze,2011,99157,37488,white,TX,India,low_model,LM,low model
1,Toyota,Cruze,2017,19882,34176,red,IL,India,high model,hm,high model
2,Chevrolet,Altima,2011,77718,40297,blue,CA,India,low_model,LM,low model
3,Nissan,Fusion,2012,83613,43492,silver,TX,India,low_model,LM,low model
4,Ford,Cruze,2016,95349,26655,red,NY,India,average_model,hm,average model
...,...,...,...,...,...,...,...,...,...,...,...
995,Honda,Altima,2017,25154,43139,red,FL,India,high model,hm,high model
996,Chevrolet,Accord,2017,68264,15632,white,TX,India,high model,hm,high model
997,Nissan,Cruze,2016,17417,30832,silver,IL,India,average_model,hm,average model
998,Honda,Accord,2011,68878,10801,blue,TX,India,low_model,LM,low model


In [61]:
x = lambda a : a**2

In [62]:
a = [1,2,3,4,5,77,8]
list(map(x,a))

[1, 4, 9, 16, 25, 5929, 64]

In [63]:
import seaborn as sns
df = sns.load_dataset('tips')

In [64]:
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [65]:
df.dtypes

total_bill     float64
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
dtype: object

In [66]:
df[df['sex']=='Male']

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
5,25.29,4.71,Male,No,Sun,Dinner,4
6,8.77,2.00,Male,No,Sun,Dinner,2
...,...,...,...,...,...,...,...
236,12.60,1.00,Male,Yes,Sat,Dinner,2
237,32.83,1.17,Male,Yes,Sat,Dinner,2
239,29.03,5.92,Male,No,Sat,Dinner,3
241,22.67,2.00,Male,Yes,Sat,Dinner,2


In [67]:
df[(df['sex']=='Male') | (df['sex']=='Dinner')]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
5,25.29,4.71,Male,No,Sun,Dinner,4
6,8.77,2.00,Male,No,Sun,Dinner,2
...,...,...,...,...,...,...,...
236,12.60,1.00,Male,Yes,Sat,Dinner,2
237,32.83,1.17,Male,Yes,Sat,Dinner,2
239,29.03,5.92,Male,No,Sat,Dinner,3
241,22.67,2.00,Male,Yes,Sat,Dinner,2


In [68]:
df.head(20)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
5,25.29,4.71,Male,No,Sun,Dinner,4
6,8.77,2.0,Male,No,Sun,Dinner,2
7,26.88,3.12,Male,No,Sun,Dinner,4
8,15.04,1.96,Male,No,Sun,Dinner,2
9,14.78,3.23,Male,No,Sun,Dinner,2


In [69]:
df.sort_values("total_bill",ascending=False).head(10)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
170,50.81,10.0,Male,Yes,Sat,Dinner,3
212,48.33,9.0,Male,No,Sat,Dinner,4
59,48.27,6.73,Male,No,Sat,Dinner,4
156,48.17,5.0,Male,No,Sun,Dinner,6
182,45.35,3.5,Male,Yes,Sun,Dinner,3
102,44.3,2.5,Female,Yes,Sat,Dinner,3
197,43.11,5.0,Female,Yes,Thur,Lunch,4
142,41.19,5.0,Male,No,Thur,Lunch,5
184,40.55,3.0,Male,Yes,Sun,Dinner,2
95,40.17,4.73,Male,Yes,Fri,Dinner,4


In [70]:
df.sort_values('day',ascending=True)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
121,13.42,1.68,Female,No,Thur,Lunch,2
133,12.26,2.00,Female,No,Thur,Lunch,2
132,11.17,1.50,Female,No,Thur,Lunch,2
131,20.27,2.83,Female,No,Thur,Lunch,2
130,19.08,1.50,Male,No,Thur,Lunch,2
...,...,...,...,...,...,...,...
158,13.39,2.61,Female,No,Sun,Dinner,2
157,25.00,3.75,Female,No,Sun,Dinner,4
156,48.17,5.00,Male,No,Sun,Dinner,6
186,20.90,3.50,Female,Yes,Sun,Dinner,3


In [71]:
df.sort_values(['day','total_bill'],ascending=[True,False])

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
197,43.11,5.00,Female,Yes,Thur,Lunch,4
142,41.19,5.00,Male,No,Thur,Lunch,5
85,34.83,5.17,Female,No,Thur,Lunch,4
141,34.30,6.70,Male,No,Thur,Lunch,6
83,32.68,5.00,Male,Yes,Thur,Lunch,2
...,...,...,...,...,...,...,...
53,9.94,1.56,Male,No,Sun,Dinner,2
43,9.68,1.32,Male,No,Sun,Dinner,2
178,9.60,4.00,Female,Yes,Sun,Dinner,2
6,8.77,2.00,Male,No,Sun,Dinner,2


df.rename('col': 'new col',axis='columns')

df.drop(['cost'],axis=1)

df(['state','year'],axis='columns')

df.drop([998,999,1000],axis='rows')

df.drop(range(900,1001),axis='rows')

In [77]:
df[0:10]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
5,25.29,4.71,Male,No,Sun,Dinner,4
6,8.77,2.0,Male,No,Sun,Dinner,2
7,26.88,3.12,Male,No,Sun,Dinner,4
8,15.04,1.96,Male,No,Sun,Dinner,2
9,14.78,3.23,Male,No,Sun,Dinner,2


In [147]:
df = pd.read_csv(r"D:\car_prices.csv")

In [149]:
df.groupby(['make'],as_index=False)['price'].sum()

Unnamed: 0,make,price
0,Chevrolet,5993337
1,Ford,5596750
2,Honda,4991672
3,Nissan,6625961
4,Toyota,6168502


In [153]:
df.groupby(['make'],as_index=False)['price'].mean()

Unnamed: 0,make,price
0,Chevrolet,28676.253589
1,Ford,29929.144385
2,Honda,28687.770115
3,Nissan,29580.183036
4,Toyota,29944.184466


In [155]:
df.groupby(['make'],as_index=False)['price'].max()

Unnamed: 0,make,price
0,Chevrolet,49998
1,Ford,49907
2,Honda,49971
3,Nissan,49989
4,Toyota,49871


In [157]:
df.groupby(['make'],as_index=False)['price'].min()

Unnamed: 0,make,price
0,Chevrolet,10037
1,Ford,10539
2,Honda,10032
3,Nissan,10091
4,Toyota,10022


In [159]:
df.groupby(['make'],as_index=False)['price'].count()

Unnamed: 0,make,price
0,Chevrolet,209
1,Ford,187
2,Honda,174
3,Nissan,224
4,Toyota,206


In [161]:
df.columns

Index(['make', 'model', 'year', 'mileage', 'price', 'color', 'state'], dtype='object')

In [167]:
df.groupby(['make'],as_index=False)[['price','mileage']].sum()

Unnamed: 0,make,price,mileage
0,Chevrolet,5993337,11785774
1,Ford,5596750,11117284
2,Honda,4991672,9365561
3,Nissan,6625961,12224548
4,Toyota,6168502,11304892


In [169]:
df.groupby(['make','model'],as_index=False)[['price','mileage']].sum()

Unnamed: 0,make,model,price,mileage
0,Chevrolet,Accord,1645972,2937453
1,Chevrolet,Altima,977067,1934418
2,Chevrolet,Camry,918270,1944557
3,Chevrolet,Cruze,1173161,2443224
4,Chevrolet,Fusion,1278867,2526122
5,Ford,Accord,1187503,2574035
6,Ford,Altima,1179400,2364989
7,Ford,Camry,1004501,1744651
8,Ford,Cruze,1173994,2445811
9,Ford,Fusion,1051352,1987798
