# Python Programming for Machine learning

## Code In AI - NumPy, Matplotlib, and Pandas Package

## NumPy Package

See the document 
- https://docs.scipy.org/doc/numpy/reference/index.html

### Check versions of installed packages

In [1]:
import numpy as np
import pandas as pd
print("Numpy version ", np.__version__)
print("Pandas version {}".format(pd.__version__))

Numpy version  1.16.3
Pandas version 0.24.2


### NumPy array

In [2]:
import numpy as np

cvalues = [20.1, 20.8, 21.9, 22.5, 22.7, 
           22.3, 21.8, 21.2, 20.9, 20.1]

C = np.array(cvalues)
print(C)

[20.1 20.8 21.9 22.5 22.7 22.3 21.8 21.2 20.9 20.1]


### 2-D & multidimensional NumPy array

In [3]:
A = np.array([[3.4,8.7,9.9],
              [1.1,-7.8,-0.7],
              [4.1,12.3,4.8]])
print(A)
print(A.ndim)

[[ 3.4  8.7  9.9]
 [ 1.1 -7.8 -0.7]
 [ 4.1 12.3  4.8]]
2


In [4]:
B = np.array([[[111,112],[121,122]],
              [[211,212],[221,222]],
              [[311,312],[321,322]]])
print(B)
print(B.ndim)

[[[111 112]
  [121 122]]

 [[211 212]
  [221 222]]

 [[311 312]
  [321 322]]]
3


### Array Shape 

In [5]:
x = np.array([[67, 63, 87],
              [77, 69, 59],
              [85, 87, 99],
              [79, 72, 71],
              [63, 89, 93],
              [68, 92, 78]])

print(np.shape(x))

(6, 3)


In [6]:
print(x.shape)

(6, 3)


### Indexing

In [7]:
A = np.array([
    [11,12,13,14,15],
    [21,22,23,24,25],
    [31,32,33,34,35],
    [41,42,43,44,45],
    [51,52,53,54,55]
])
print(A[:3,2:])

[[13 14 15]
 [23 24 25]
 [33 34 35]]


### Numerical operations on NumPy array

In [8]:
lst = [1,2,5,10]
v = np.array(lst)
v = v + 2
print(v)

[ 3  4  7 12]


In [9]:
print(v * 2.2)

[ 6.6  8.8 15.4 26.4]


In [10]:
print(v - 1.38)

[ 1.62  2.62  5.62 10.62]


In [11]:
print(v ** 2)

[  9  16  49 144]


### Arithmetic operations with two arrays

In [12]:
A = np.array([[11, 12, 13],
              [21, 22, 23],
              [31, 32, 33]])
B = np.ones((3,3))
print("Adding to arrays:")
print(A + B)

Adding to arrays:
[[12. 13. 14.]
 [22. 23. 24.]
 [32. 33. 34.]]


In [13]:
print("\nMultiplying two arrays:")
print(A * (B + 1))


Multiplying two arrays:
[[22. 24. 26.]
 [42. 44. 46.]
 [62. 64. 66.]]


### Matrix multiplication

In [14]:
np.dot(A,B)

array([[36., 36., 36.],
       [66., 66., 66.],
       [96., 96., 96.]])

### Definition of dot product

In [15]:
# For 1-D arrays, dot product is computed as vector dot product
x = np.array([3, -2])
y = np.array([-4, 1])
print(np.dot(x,y))

-14


In [16]:
# For 2-D arrays, dot product is computed as matrix multiplication
A = np.array([[1, 2, 3],
              [3, 2, 1]])
B = np.array([[2, 3],
              [1, -1],
              [1, 2]])
print(np.dot(A,B))

[[7 7]
 [9 9]]


Dot product is basically a projection/elemination of one dimension out.

### Comparison operator

In [17]:
A = np.array([[11, 12, 13],
              [21, 22, 23],
              [31, 32, 33]])
B = np.array([[11, 102, 13],
              [201, 22, 203],
              [31, 32, 303]])
A == B

array([[ True, False,  True],
       [False,  True, False],
       [ True,  True, False]])

### Compare two arrays

In [18]:
print(np.array_equal(A, B))

False


In [19]:
print(np.array_equal(A, A))

True


### Broadcasting

In [20]:
B = np.array([1, 2, 3])
print("Multiplication with broadcasting: ")
print(A*B)

Multiplication with broadcasting: 
[[11 24 39]
 [21 44 69]
 [31 64 99]]


In [21]:
print("... and now addition with broadcasting: ")
print(A+B)

... and now addition with broadcasting: 
[[12 14 16]
 [22 24 26]
 [32 34 36]]


### Concatenating arrays

In [22]:
x = np.array([11,22])
y = np.array([18,7,6])
z = np.array([1,3,5])
c = np.concatenate((x,y,z))
print(c)

[11 22 18  7  6  1  3  5]


### Vector stacking

In [23]:
A = np.array([3, 4, 5])
B = np.array([1, 9, 0])
print(np.row_stack((A, B)))

[[3 4 5]
 [1 9 0]]


In [24]:
print(np.column_stack((A, B)))

[[3 1]
 [4 9]
 [5 0]]


In [25]:
np.shape(A)

(3,)

## Pandas Package
See the document
- http://pandas.pydata.org/pandas-docs/stable/

In [26]:
import pandas as pd

In [27]:
s = pd.Series(np.random.randn(5), index=['a','b','c','d','e'])
print(s)
print(type(s))

a    0.396535
b    0.567806
c   -0.308046
d   -0.109099
e   -0.115823
dtype: float64
<class 'pandas.core.series.Series'>


In [28]:
d = {'one': [1,2,3,4],
     'two':[5,6,7,8]}
df=pd.DataFrame(d)
print(df)

   one  two
0    1    5
1    2    6
2    3    7
3    4    8


In [36]:
df.iloc[0:2,1]

0    5
1    6
Name: two, dtype: int64

### Creating DataFrame from NumPy array

In [29]:
data = np.array([['','Col1','Col2'],
                 ['Row1',1,2],
                 ['Row2',3,4]])
df = pd.DataFrame(data=data[1:,1:],
                  index=data[1:,0],
                  columns=data[0,1:])
print(df)

     Col1 Col2
Row1    1    2
Row2    3    4


### Creating DataFrame from Dictionary

In [30]:
my_dict = {'First': ['1','3'],
           'Second': ['1','2'],
           'Third': ['2','4']}
df = pd.DataFrame(my_dict)
print(df)

  First Second Third
0     1      1     2
1     3      2     4


### Creating DataFrame from files

In [31]:
bankData = pd.read_csv("bank-data.csv",sep=";")
print(bankData.shape)
bankData.info()

(45211, 17)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
age          45211 non-null int64
job          45211 non-null object
marital      45211 non-null object
education    45211 non-null object
default      45211 non-null object
balance      45211 non-null int64
housing      45211 non-null object
loan         45211 non-null object
contact      45211 non-null object
day          45211 non-null int64
month        45211 non-null object
duration     45211 non-null int64
campaign     45211 non-null int64
pdays        45211 non-null int64
previous     45211 non-null int64
poutcome     45211 non-null object
y            45211 non-null object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB


### DataFrame dimension

In [32]:
df = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6]]))

In [33]:
# Use the shape property
print(df.shape)

(2, 3)


In [34]:
# or use the len() function with the index property
print(len(df.index))

2


### Viewing DataFrame: head()

In [35]:
bankData.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


### Viewing DataFrame: tail()

In [36]:
bankData.tail()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
45206,51,technician,married,tertiary,no,825,no,no,cellular,17,nov,977,3,-1,0,unknown,yes
45207,71,retired,divorced,primary,no,1729,no,no,cellular,17,nov,456,2,-1,0,unknown,yes
45208,72,retired,married,secondary,no,5715,no,no,cellular,17,nov,1127,5,184,3,success,yes
45209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,nov,508,4,-1,0,unknown,no
45210,37,entrepreneur,married,secondary,no,2971,no,no,cellular,17,nov,361,2,188,11,other,no


### Viewing DataFrame: columns

In [37]:
bankData.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'y'],
      dtype='object')

### Viewing DataFrame: describe()

In [38]:
bankData.describe()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
count,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0
mean,40.93621,1362.272058,15.806419,258.16308,2.763841,40.197828,0.580323
std,10.618762,3044.765829,8.322476,257.527812,3.098021,100.128746,2.303441
min,18.0,-8019.0,1.0,0.0,1.0,-1.0,0.0
25%,33.0,72.0,8.0,103.0,1.0,-1.0,0.0
50%,39.0,448.0,16.0,180.0,2.0,-1.0,0.0
75%,48.0,1428.0,21.0,319.0,3.0,-1.0,0.0
max,95.0,102127.0,31.0,4918.0,63.0,871.0,275.0


### Sorting data by column name

In [41]:
bankData.sort_values(by='age').head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
42954,18,student,single,unknown,no,108,no,no,cellular,9,feb,92,1,183,1,success,yes
41273,18,student,single,unknown,no,3,no,no,cellular,25,aug,130,2,-1,0,unknown,yes
42146,18,student,single,secondary,no,156,no,no,cellular,4,nov,298,2,82,4,other,no
43637,18,student,single,unknown,no,348,no,no,cellular,5,may,443,4,-1,0,unknown,yes
40736,18,student,single,primary,no,1944,no,no,telephone,10,aug,122,3,-1,0,unknown,no


### Select a column

In [42]:
bankData['job'].head()

0      management
1      technician
2    entrepreneur
3     blue-collar
4         unknown
Name: job, dtype: object

In [43]:
bankData.job.head()

0      management
1      technician
2    entrepreneur
3     blue-collar
4         unknown
Name: job, dtype: object

### Select multiple columns: [['c1','c2']]

In [44]:
bankData[['job','age']].head()

Unnamed: 0,job,age
0,management,58
1,technician,44
2,entrepreneur,33
3,blue-collar,47
4,unknown,33


### Indexing (aka filter rows): [Start:End] 

In [45]:
bankData.loc[0:5]

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no
5,35,management,married,tertiary,no,231,yes,no,unknown,5,may,139,1,-1,0,unknown,no


In [46]:
bankData[0:5]

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


### More complex selection - .loc and .iloc
- Filter both rows and columns using .loc and .iloc

In [54]:
bankData.loc[1:5]

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no
5,35,management,married,tertiary,no,231,yes,no,unknown,5,may,139,1,-1,0,unknown,no


In [47]:
bankData.loc[1:5,['job','age','education']]

Unnamed: 0,job,age,education
1,technician,44,secondary
2,entrepreneur,33,secondary
3,blue-collar,47,unknown
4,unknown,33,unknown
5,management,35,tertiary


In [55]:
bankData.iloc[:5,:3]

Unnamed: 0,age,job,marital
0,58,management,married
1,44,technician,single
2,33,entrepreneur,married
3,47,blue-collar,married
4,33,unknown,single


### Boolean indexing by isin

In [57]:
bankData[bankData.job.isin(['management'])].head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
5,35,management,married,tertiary,no,231,yes,no,unknown,5,may,139,1,-1,0,unknown,no
6,28,management,single,tertiary,no,447,yes,yes,unknown,5,may,217,1,-1,0,unknown,no
21,56,management,married,tertiary,no,779,yes,no,unknown,5,may,164,1,-1,0,unknown,no
26,39,management,single,tertiary,no,255,yes,no,unknown,5,may,296,1,-1,0,unknown,no


### Boolean indexing by condition

In [58]:
bankData[bankData.age > 30].head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


### Add column

In [60]:
oneColumn = np.ones(len(bankData))
bankData['one'] = oneColumn
bankData.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y,one
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no,1.0
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no,1.0
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no,1.0
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no,1.0
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no,1.0


### Delete column using del

In [61]:
del bankData['one']
bankData.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


### Delete column using .drop()
- .drop() will return the dataframe that the specify column had been remove.

In [62]:
bankData['one'] = oneColumn #add column
bankData2 = bankData.drop('one',axis=1)
bankData2.head()


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


### Missing data

In [63]:
# Read flith data
flightData = pd.read_csv('flights.csv')

In [64]:
# Show rows with missing data
flightData[flightData.dep_delay.isnull()].head()

Unnamed: 0.1,Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
838,839,2013,1,1,,1630,,,1815,,EV,4308,N18120,EWR,RDU,,416,16,30,2013-01-01 16:00:00
839,840,2013,1,1,,1935,,,2240,,AA,791,N3EHAA,LGA,DFW,,1389,19,35,2013-01-01 19:00:00
840,841,2013,1,1,,1500,,,1825,,AA,1925,N3EVAA,LGA,MIA,,1096,15,0,2013-01-01 15:00:00
841,842,2013,1,1,,600,,,901,,B6,125,N618JB,JFK,FLL,,1069,6,0,2013-01-01 06:00:00
1777,1778,2013,1,2,,1540,,,1747,,EV,4352,N10575,EWR,CVG,,569,15,40,2013-01-02 15:00:00


### Remove missing data rows

In [65]:
flightData.shape

(120780, 20)

In [66]:
flightData.dropna(how='any').shape

(117422, 20)

In [67]:
flightData.dropna(how='any').head()

Unnamed: 0.1,Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
0,1,2013,1,1,517.0,515,2.0,830.0,819,11.0,UA,1545,N14228,EWR,IAH,227.0,1400,5,15,2013-01-01 05:00:00
1,2,2013,1,1,533.0,529,4.0,850.0,830,20.0,UA,1714,N24211,LGA,IAH,227.0,1416,5,29,2013-01-01 05:00:00
2,3,2013,1,1,542.0,540,2.0,923.0,850,33.0,AA,1141,N619AA,JFK,MIA,160.0,1089,5,40,2013-01-01 05:00:00
3,4,2013,1,1,544.0,545,-1.0,1004.0,1022,-18.0,B6,725,N804JB,JFK,BQN,183.0,1576,5,45,2013-01-01 05:00:00
4,5,2013,1,1,554.0,600,-6.0,812.0,837,-25.0,DL,461,N668DN,LGA,ATL,116.0,762,6,0,2013-01-01 06:00:00


### Filling missing data

In [68]:
x = np.mean(flightData.dep_delay)
print("%1.1f"%x)

9.7


In [69]:
flightData.fillna(value={'dep_delay':x}).loc[835:840]

Unnamed: 0.1,Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
835,836,2013,1,1,2353.0,2359,-6.0,425.0,445,-20.0,B6,739,N591JB,JFK,PSE,195.0,1617,23,59,2013-01-01 23:00:00
836,837,2013,1,1,2353.0,2359,-6.0,418.0,442,-24.0,B6,707,N794JB,JFK,SJU,185.0,1598,23,59,2013-01-01 23:00:00
837,838,2013,1,1,2356.0,2359,-3.0,425.0,437,-12.0,B6,727,N588JB,JFK,BQN,186.0,1576,23,59,2013-01-01 23:00:00
838,839,2013,1,1,,1630,9.715987,,1815,,EV,4308,N18120,EWR,RDU,,416,16,30,2013-01-01 16:00:00
839,840,2013,1,1,,1935,9.715987,,2240,,AA,791,N3EHAA,LGA,DFW,,1389,19,35,2013-01-01 19:00:00
840,841,2013,1,1,,1500,9.715987,,1825,,AA,1925,N3EVAA,LGA,MIA,,1096,15,0,2013-01-01 15:00:00


### Statistical operations

In [70]:
bankData.mean()

age           40.936210
balance     1362.272058
day           15.806419
duration     258.163080
campaign       2.763841
pdays         40.197828
previous       0.580323
one            1.000000
dtype: float64

In [71]:
bankData.std()

age           10.618762
balance     3044.765829
day            8.322476
duration     257.527812
campaign       3.098021
pdays        100.128746
previous       2.303441
one            0.000000
dtype: float64

In [72]:
bankData.median()

age          39.0
balance     448.0
day          16.0
duration    180.0
campaign      2.0
pdays        -1.0
previous      0.0
one           1.0
dtype: float64

### Dummy variables
- https://pandas.pydata.org/pandas-docs/stable/generated/pandas.get_dummies.html

In [77]:
df = pd.DataFrame({'key':list('bbacab'),'data1':range(6)})
print(df)

  key  data1
0   b      0
1   b      1
2   a      2
3   c      3
4   a      4
5   b      5


In [78]:
pd.get_dummies(df['key'])

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [85]:
bankData.education.value_counts()

secondary    23202
tertiary     13301
primary       6851
unknown       1857
Name: education, dtype: int64

In [79]:
pd.get_dummies(bankData, columns=['education']).head()

Unnamed: 0,age,job,marital,default,balance,housing,loan,contact,day,month,...,campaign,pdays,previous,poutcome,y,one,education_primary,education_secondary,education_tertiary,education_unknown
0,58,management,married,no,2143,yes,no,unknown,5,may,...,1,-1,0,unknown,no,1.0,0,0,1,0
1,44,technician,single,no,29,yes,no,unknown,5,may,...,1,-1,0,unknown,no,1.0,0,1,0,0
2,33,entrepreneur,married,no,2,yes,yes,unknown,5,may,...,1,-1,0,unknown,no,1.0,0,1,0,0
3,47,blue-collar,married,no,1506,yes,no,unknown,5,may,...,1,-1,0,unknown,no,1.0,0,0,0,1
4,33,unknown,single,no,1,no,no,unknown,5,may,...,1,-1,0,unknown,no,1.0,0,0,0,1
