# Mastering Python for Data Science

## Chapter 1: Getting Started with Raw Data

Data comes in various forms, such as Excel, CSV, JSON, databases, and so on. In this chapter, we will introduce useful packages for handling data in these different forms.

After loading the data, we then need to clean it using other tools.

In this chapter we will cover the following topics:

- Exploring arrays with NumPy
- Handling data with pandas
- Reading and writing data from various formats
- Handling missing data
- Manipulating data

### NumPy

In [30]:
import numpy as np

# Build a two-dimensional array from nested lists: one inner list per row.
n_array = np.array([
    [0, 1, 2, 3],
    [4, 5, 6, 7],
    [8, 9, 10, 11],
])

In [2]:
# Number of dimensions (axes) of the array
n_array.ndim

2

In [3]:
# Length of the array along each dimension: (rows, columns)
n_array.shape

(3, 4)

In [4]:
# Total number of elements in the array
n_array.size

12

In [5]:
# Name of the data type shared by all elements of the array
n_array.dtype.name

'int64'

In [6]:
# Arithmetic operators work element by element on arrays of the same shape.
a = np.array([11, 12, 13, 14])
b = np.array([1, 2, 3, 4])
c = a - b

In [7]:
c

array([10, 10, 10, 10])

In [8]:
# Element-wise power: every entry of `a` is squared
a**2

array([121, 144, 169, 196])

In [9]:
# NumPy math functions are applied to each element (angles are in radians)
np.cos(b)

array([ 0.54030231, -0.41614684, -0.9899925 , -0.65364362])

In [10]:
# Comparisons are element-wise too and produce a boolean array
b < 2

array([ True, False, False, False], dtype=bool)

In [14]:
# Two 2x2 matrices used below to contrast element-wise and matrix products.
A1 = np.array([
    [1, 1],
    [0, 1],
])
A2 = np.array([
    [2, 0],
    [3, 4],
])

In [15]:
A1

array([[1, 1],
       [0, 1]])

In [16]:
A2

array([[2, 0],
       [3, 4]])

In [18]:
# Element-wise product: `*` multiplies matching entries and is NOT matrix
# multiplication (use np.dot for the matrix product)
A1 * A2

array([[2, 0],
       [0, 4]])

In [19]:
# Matrix product of A1 and A2 (rows of A1 times columns of A2)
np.dot(A1, A2)

array([[5, 4],
       [3, 4]])

In [31]:
# Show the whole array for reference
print(n_array)
# Indexing with NumPy: [row, column] -> the element in row 0, column 1
n_array[0,1]

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]


1

In [32]:
# Slicing with NumPy: take row 0, columns 0 up to (but not including) 3
n_array[0, :3]

array([0, 1, 2])

In [33]:
# Flatten the array into one dimension, reading it row by row
n_array.ravel()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

In [34]:
# Change the shape to 6 rows x 2 columns (the element count, 12, must stay the same).
# reshape() is used instead of assigning to `n_array.shape`, which the NumPy
# documentation discourages; the name is rebound to the reshaped array.
n_array = n_array.reshape(6, 2)
n_array

array([[ 0,  1],
       [ 2,  3],
       [ 4,  5],
       [ 6,  7],
       [ 8,  9],
       [10, 11]])

In [37]:
# Transpose: swap the two axes so that rows become columns
n_array = n_array.T
n_array

array([[ 0,  2,  4,  6,  8, 10],
       [ 1,  3,  5,  7,  9, 11]])

### Pandas

In [38]:
import pandas as pd
# Series of 5 random floats; with no index given, labels default to 0..4
pd.Series(np.random.rand(5))

0    0.166590
1    0.350844
2    0.232682
3    0.908839
4    0.104816
dtype: float64

In [39]:
# Series of 5 random floats with explicit string labels as the index
pd.Series(np.random.rand(5), index=['a', 'b', 'c', 'd', 'e'])

a    0.319026
b    0.796522
c    0.756913
d    0.018102
e    0.799039
dtype: float64

In [40]:
# Create a Series from a dict: keys become the index labels, values the data.
d = {
    'A': 10,
    'B': 20,
    'C': 30,
}
pd.Series(d)

A    10
B    20
C    30
dtype: int64

In [41]:
# Create a DataFrame from a dict of Series: each key becomes a column.
# The two Series have different lengths (3 vs. 4), so they are aligned on
# their index and 'c1' has no value for index 3.
d = {
    'c1': pd.Series(['A', 'B', 'C']),
    'c2': pd.Series([1, 2., 3., 4.]),
}
df = pd.DataFrame(d)
df

    c1   c2
0    A  1.0
1    B  2.0
2    C  3.0
3  NaN  4.0


In [42]:
# Create a DataFrame from a dict of equal-length lists: one column per key.
d = {
    'c1': ['A', 'B', 'C', 'D'],
    'c2': [1, 2.0, 3.0, 4.0],
}
df = pd.DataFrame(d)
print(df)

  c1   c2
0  A  1.0
1  B  2.0
2  C  3.0
3  D  4.0


In [43]:
# Build a 3-D Panel from a dict of DataFrames; the dict keys become the items axis.
# NOTE(review): pd.Panel was deprecated in pandas 0.20 and removed in 0.25, so this
# cell only runs on older pandas versions -- confirm the target version.
d = {'Item1': pd.DataFrame(np.random.randn(4, 3)), 
     'Item2': pd.DataFrame(np.random.randn(4, 2))}
pd.Panel(d)

<class 'pandas.core.panel.Panel'>
Dimensions: 2 (items) x 4 (major_axis) x 3 (minor_axis)
Items axis: Item1 to Item2
Major_axis axis: 0 to 3
Minor_axis axis: 0 to 2