# 0. Package Import

In [2]:
from os.path import join
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# 1. Pandas

## Data import witn Pandas

In [4]:
## Data import
df = pd.read_csv(join('data','bikeshare.csv'))
df.head(5)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


## Data Control with Pandas



In [5]:
df.shape # 10886 x 12 즉 총 10886개의 데이터에 각각 12가지의 feature가 있다.

(10886, 12)

In [6]:
print(df.shape[0])
print(df.shape[1])

10886
12


In [8]:
# Column 명을 이용한 data slicing 가능(pandas에서만)
print(df['temp'])

0         9.84
1         9.02
2         9.02
3         9.84
4         9.84
         ...  
10881    15.58
10882    14.76
10883    13.94
10884    13.94
10885    13.12
Name: temp, Length: 10886, dtype: float64
0    9.84
1    9.02
2    9.02
3    9.84
4    9.84
Name: temp, dtype: float64


In [9]:
print(df['temp'][0:5])

0    9.84
1    9.02
2    9.02
3    9.84
4    9.84
Name: temp, dtype: float64


In [10]:
temp = df['temp']
print(temp[0:5])

0    9.84
1    9.02
2    9.02
3    9.84
4    9.84
Name: temp, dtype: float64


In [11]:
# index를 이용한 data slicing
print(df.iloc[0:3,0:7])

              datetime  season  holiday  workingday  weather  temp   atemp
0  2011-01-01 00:00:00       1        0           0        1  9.84  14.395
1  2011-01-01 01:00:00       1        0           0        1  9.02  13.635
2  2011-01-01 02:00:00       1        0           0        1  9.02  13.635


In [13]:
temp = df.iloc[:,6] # 6은 6번째(1부터 시작) temp에 해당 
print(temp[0:5])

0    14.395
1    13.635
2    13.635
3    14.395
4    14.395
Name: atemp, dtype: float64


In [17]:
df.columns

Index(['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp',
       'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count'],
      dtype='object')

In [18]:
# temp는 Series 왜냐하면 단일 칼럼이므로
print(type(temp))

<class 'pandas.core.series.Series'>


In [19]:
# 각 columns별로 datatype을 써놓음
print(df.dtypes)

datetime       object
season          int64
holiday         int64
workingday      int64
weather         int64
temp          float64
atemp         float64
humidity        int64
windspeed     float64
casual          int64
registered      int64
count           int64
dtype: object


In [20]:
# temp column의 dtype
print(temp.dtype)

float64


## Table Join with Pandas  (like DB)


In [22]:
new_data = pd.DataFrame({'datetime': ['2011-01-01 00:00:00',
                                      '2011-01-01 01:00:00',
                                      '2011-01-01 02:00:00'], 'biketype': 
                          [1,2,3]})

In [31]:
new_data.head()
# datatime이 key값

Unnamed: 0,datetime,biketype
0,2011-01-01 00:00:00,1
1,2011-01-01 01:00:00,2
2,2011-01-01 02:00:00,3


In [27]:
join_data = pd.merge(df,new_data,how='inner')
join_data.head(5)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,biketype
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16,1
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40,2
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32,3


In [28]:
join_data.columns

Index(['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp',
       'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count',
       'biketype'],
      dtype='object')

In [29]:
df.columns

Index(['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp',
       'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count'],
      dtype='object')

In [30]:
df.head(5)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


## 2. Numpy 

### Data Control with Numpy



In [33]:
# pandas data to numpy array
np_data = np.array(df)
print(np_data)
print(type(np_data)) 

[['2011-01-01 00:00:00' 1 0 ... 3 13 16]
 ['2011-01-01 01:00:00' 1 0 ... 8 32 40]
 ['2011-01-01 02:00:00' 1 0 ... 5 27 32]
 ...
 ['2012-12-19 21:00:00' 4 0 ... 4 164 168]
 ['2012-12-19 22:00:00' 4 0 ... 12 117 129]
 ['2012-12-19 23:00:00' 4 0 ... 4 84 88]]
<class 'numpy.ndarray'>


In [34]:
# numpy array shape
np_data.shape

(10886, 12)

In [35]:
# numpy는 index(정수형)을 이용한 slicing만 가능
print(np_data[:,1])
print(np_data[0:5,0:5])

[1 1 1 ... 4 4 4]
[['2011-01-01 00:00:00' 1 0 0 1]
 ['2011-01-01 01:00:00' 1 0 0 1]
 ['2011-01-01 02:00:00' 1 0 0 1]
 ['2011-01-01 03:00:00' 1 0 0 1]
 ['2011-01-01 04:00:00' 1 0 0 1]]


In [40]:
# temp column만 추출
temp = np_data[:,5]
print(temp)

[9.84 9.02 9.02 ... 13.94 13.94 13.12]
[[9.84]
 [9.02]
 [9.02]
 ...
 [13.94]
 [13.94]
 [13.12]]


In [46]:
# Row vector -> column vector로 전환
# 1. Transpose
temp = np.array([1,2,3,4,5]).reshape(1,5) # row vector
print(temp)
col_temp = temp.T
print(col_temp)

[[1 2 3 4 5]]
[[1]
 [2]
 [3]
 [4]
 [5]]


In [48]:
# 2. reshape 
# 여기서 -1은 자동 추론
reshape_temp = temp.reshape(-1,1)
print(reshape_temp.shape)
print(reshape_temp)

(5, 1)
[[1]
 [2]
 [3]
 [4]
 [5]]


## Random Functions


In [54]:
# random하게 shuffle
# seed를 설정해주지 않으면 매번 shuffle할때마다 결과가 다름.
temp = np.array([1,2,3,4,5])
rand_temp = np.random.permutation(temp)
print(temp)
print(rand_temp)
print(np.random.permutation(temp))

[1 2 3 4 5]
[4 5 1 2 3]
[3 5 2 4 1]


In [56]:
# normal distribution generation
# loc = Mean (“centre”) of the distribution.
# scale = Standard deviation (spread or “width”) of the distribution. 
# Must be non-negative.
# size = shape 
a = np.random.normal(loc=1,scale=1,size=(10,2))
print(a)

[[ 0.52195173  0.5446029 ]
 [ 0.68729489 -1.18480445]
 [ 0.05024488  2.09297693]
 [ 1.90687703  0.89007623]
 [ 1.1197378  -1.34000018]
 [ 0.99497482 -0.29593317]
 [ 0.7333929   0.98562891]
 [-0.29973644 -0.96174915]
 [ 2.18554624  0.6368561 ]
 [ 0.7183961  -0.2015181 ]]
