# Data Munging

## Data loading and preprocessing with pandas

### Fast and easy data loading

In [1]:
import pandas as pd
# 给文件命名名称
iris_filename = 'datasets-uci-iris.csv'
# pd.read_csv读入的是csv这个格式的数据
# names表示读入的是这几列数据
iris = pd.read_csv(iris_filename, sep=',', decimal='.', header=None,
names= ['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
'target'])

In [2]:
# If the dataset is available online, 
# and you can follow these steps to
# download it from the Internet:

# 输入一个第三方库
import urllib
# 写上我们需要下载的地址
url = "http://aima.cs.berkeley.edu/data/iris.csv"
# 获取这个地址
set1 = urllib.request.Request(url)
# 打开这个地址里边的内容
iris_p = urllib.request.urlopen(set1)
# 把地址里边的文档进行读入
iris_other = pd.read_csv(iris_p, sep=',', decimal='.',
header=None, names= ['sepal_length', 'sepal_width',
'petal_length', 'petal_width', 'target'])
iris_other.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,target
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [3]:
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,target
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [4]:
iris.tail()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,target
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica
149,5.9,3.0,5.1,1.8,Iris-virginica


In [5]:
iris.head(2)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,target
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa


In [6]:
iris.tail(2)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,target
148,6.2,3.4,5.4,2.3,Iris-virginica
149,5.9,3.0,5.1,1.8,Iris-virginica


In [7]:
# 输出文件的列名称
iris.columns

Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'target'], dtype='object')

In [8]:
# 将target中的数据按列全部输出
Y = iris['target']
Y

0         Iris-setosa
1         Iris-setosa
2         Iris-setosa
3         Iris-setosa
4         Iris-setosa
            ...      
145    Iris-virginica
146    Iris-virginica
147    Iris-virginica
148    Iris-virginica
149    Iris-virginica
Name: target, Length: 150, dtype: object

In [14]:
# 想输出两个就将两个括在一起
X = iris[['sepal_length', 'sepal_width']]
X

Unnamed: 0,sepal_length,sepal_width
0,5.1,3.5
1,4.9,3.0
2,4.7,3.2
3,4.6,3.1
4,5.0,3.6
...,...,...
145,6.7,3.0
146,6.3,2.5
147,6.5,3.0
148,6.2,3.4


In [15]:
# 打印一下X的格式
print (X.shape)

(150, 2)


In [16]:
print (Y.shape)

(150,)


### Dealing with problematic data

In [1]:
import pandas as pd
fake_dataset = pd.read_csv('a_loading_example_1.csv', sep=',')
fake_dataset

Unnamed: 0,Date,Temperature_city_1,Temperature_city_2,Temperature_city_3,Which_destination
0,20140910,80.0,32.0,40,1
1,20140911,100.0,50.0,36,2
2,20140912,102.0,55.0,46,1
3,20140913,60.0,20.0,35,3
4,20140914,60.0,,32,3
5,20140915,,57.0,42,2


In [24]:
# 将第0列以日期的格式输出
fake_dataset = pd.read_csv('a_loading_example_1.csv',parse_dates=[0])
fake_dataset

Unnamed: 0,Date,Temperature_city_1,Temperature_city_2,Temperature_city_3,Which_destination
0,2014-09-10,80.0,32.0,40,1
1,2014-09-11,100.0,50.0,36,2
2,2014-09-12,102.0,55.0,46,1
3,2014-09-13,60.0,20.0,35,3
4,2014-09-14,60.0,,32,3
5,2014-09-15,,57.0,42,2


In [2]:
# 用数值50取代NaN
fake_dataset.fillna(50)

Unnamed: 0,Date,Temperature_city_1,Temperature_city_2,Temperature_city_3,Which_destination
0,20140910,80.0,32.0,40,1
1,20140911,100.0,50.0,36,2
2,20140912,102.0,55.0,46,1
3,20140913,60.0,20.0,35,3
4,20140914,60.0,50.0,32,3
5,20140915,50.0,57.0,42,2


In [3]:
fake_dataset.fillna(-1)

Unnamed: 0,Date,Temperature_city_1,Temperature_city_2,Temperature_city_3,Which_destination
0,20140910,80.0,32.0,40,1
1,20140911,100.0,50.0,36,2
2,20140912,102.0,55.0,46,1
3,20140913,60.0,20.0,35,3
4,20140914,60.0,-1.0,32,3
5,20140915,-1.0,57.0,42,2


In [4]:
# mean就是均值的意思
# 用列的的均值axis=0
fake_dataset.fillna(fake_dataset.mean(axis=0))

Unnamed: 0,Date,Temperature_city_1,Temperature_city_2,Temperature_city_3,Which_destination
0,20140910,80.0,32.0,40,1
1,20140911,100.0,50.0,36,2
2,20140912,102.0,55.0,46,1
3,20140913,60.0,20.0,35,3
4,20140914,60.0,42.8,32,3
5,20140915,80.4,57.0,42,2


In [5]:
# error_bad_liness=False将有问题的行删去
bad_dataset = pd.read_csv('a_loading_example_2.csv',error_bad_lines=False)

b'Skipping line 4: expected 3 fields, saw 4\n'


In [6]:
bad_dataset

Unnamed: 0,Val1,Val2,Val3
0,0,0,0
1,1,1,1
2,3,3,3


### Dealing with big datasets

In [1]:
import pandas as pd
# 命名c1--c5五列
#  chunksize=10区块大小为10行数又这个参数决定 
iris_chunks = pd.read_csv(iris_filename, header=None,names=['C1', 'C2', 'C3', 'C4', 'C5'], chunksize=10)
for chunk in iris_chunks:
    print ('Shape:', chunk.shape)
    print (chunk,'\n')

NameError: name 'iris_filename' is not defined

In [11]:
iris_iterator = pd.read_csv(iris_filename, header=None,
names=['C1', 'C2', 'C3', 'C4', 'C5'], iterator=True)

NameError: name 'iris_filename' is not defined

In [13]:
# 打印一下数据
print (iris_iterator.get_chunk(10))

NameError: name 'iris_iterator' is not defined

In [32]:
# 查看十行区块的形状
# 每次数据都是往后走的，下次迭代处理的是下一批数据
print (iris_iterator.get_chunk(10).shape)

(10, 5)


In [33]:
print (iris_iterator.get_chunk(20).shape)

(20, 5)


In [34]:
piece = iris_iterator.get_chunk(2)
piece

Unnamed: 0,C1,C2,C3,C4,C5
30,4.8,3.1,1.6,0.2,Iris-setosa
31,5.4,3.4,1.5,0.4,Iris-setosa


In [35]:
# open函数在csv库中
import csv
# open也可以处理数据
# rt是以文本形式进行操作
with open(iris_filename, 'rt') as data_stream:
    # 'rt' mode
#     输出n和row
#  enumerate用于产生新的索引序列
    for n, row in enumerate(csv.DictReader(data_stream,
        fieldnames = ['sepal_length', 'sepal_width',
        'petal_length', 'petal_width', 'target'],
        dialect='excel')):
            if n== 0:
                print (n,row)
            else:
                break

0 OrderedDict([('sepal_length', '5.1'), ('sepal_width', '3.5'), ('petal_length', '1.4'), ('petal_width', '0.2'), ('target', 'Iris-setosa')])


In [36]:
with open(iris_filename, 'rt') as data_stream:
#     输出第一行的结果
    for n, row in enumerate(csv.reader(data_stream,
        dialect='excel')):
            if n==0:
                print (row)
            else:
                break

['5.1', '3.5', '1.4', '0.2', 'Iris-setosa']


In [37]:
# 将数据以5个p次处理的方式进行读入
def batch_read(filename, batch=5):
    # open the data stream
    with open(filename, 'rt') as data_stream:
        # reset the batch
        batch_output = list()
        # iterate over the file
        for n, row in enumerate(csv.reader(data_stream, dialect='excel')):
            # if the batch is of the right size
            if n > 0 and n % batch == 0:
                # yield back the batch as an ndarray
#                 将输出写成数组形式
                yield(np.array(batch_output))
                # reset the batch and restart
                batch_output = list()
            # otherwise add the row to the batch
            batch_output.append(row)
        # when the loop is over, yield what's left
        yield(np.array(batch_output))

In [38]:
import numpy as np
for batch_input in batch_read(iris_filename, batch=3):
    print (batch_input)
    break

[['5.1' '3.5' '1.4' '0.2' 'Iris-setosa']
 ['4.9' '3.0' '1.4' '0.2' 'Iris-setosa']
 ['4.7' '3.2' '1.3' '0.2' 'Iris-setosa']]


### Accessing other data formats

In [9]:
import pandas as pd
my_own_dataset = pd.DataFrame({'Col1': range(5), 'Col2':
[1.0]*5, 'Col3': 1.0, 'Col4': 'Hello World!'})
my_own_dataset

Unnamed: 0,Col1,Col2,Col3,Col4
0,0,1.0,1.0,Hello World!
1,1,1.0,1.0,Hello World!
2,2,1.0,1.0,Hello World!
3,3,1.0,1.0,Hello World!
4,4,1.0,1.0,Hello World!


In [7]:
range(5)
for i in range(5):
    print(i)

0
1
2
3
4


In [8]:
a = [1.0]*5
print(a)

[1.0, 1.0, 1.0, 1.0, 1.0]


In [40]:
my_wrong_own_dataset = pd.DataFrame({'Col1': range(5), 'Col2':
'string', 'Col3': range(2)})

ValueError: arrays must all be same length

In [41]:
range(5)
for i in range(5):
     print(i)

0
1
2
3
4


In [10]:
my_own_dataset.dtypes
# 输出刚刚数组的数据类型

Col1      int64
Col2    float64
Col3    float64
Col4     object
dtype: object

In [43]:
a=[1.0]*5
print(a)

[1.0, 1.0, 1.0, 1.0, 1.0]


In [11]:
my_own_dataset['Col1'] = my_own_dataset['Col1'].astype(float)
my_own_dataset.dtypes
# 将第一列变成浮点型，然后再判断数据类型

Col1    float64
Col2    float64
Col3    float64
Col4     object
dtype: object

### Data preprocessing

In [16]:
# 做数据预处理
mask_feature = iris['sepal_length'] > 6.0
mask_feature
# 判断大于6的有哪些，mask_feature相当于眼膜来判断

0      False
1      False
2      False
3      False
4      False
       ...  
145     True
146     True
147     True
148     True
149    False
Name: sepal_length, Length: 150, dtype: bool

In [17]:
# 下边两步是用New label代替Iris-virginica
mask_target = iris['target'] == 'Iris-virginica'
mask_target

0      False
1      False
2      False
3      False
4      False
       ...  
145     True
146     True
147     True
148     True
149     True
Name: target, Length: 150, dtype: bool

In [18]:
iris.loc[mask_target, 'target'] = 'New label'
iris['target']

0      Iris-setosa
1      Iris-setosa
2      Iris-setosa
3      Iris-setosa
4      Iris-setosa
          ...     
145      New label
146      New label
147      New label
148      New label
149      New label
Name: target, Length: 150, dtype: object

In [48]:
# 判断这个类别中有哪几个标签
iris['target'].unique()

array(['Iris-setosa', 'Iris-versicolor', 'New label'], dtype=object)

In [19]:
grouped_targets_mean = iris.groupby(['target']).mean()
grouped_targets_mean
# 用groupby可以做分组操作   
# .mean（）取均值

Unnamed: 0_level_0,sepal_length,sepal_width,petal_length,petal_width
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Iris-setosa,5.006,3.418,1.464,0.244
Iris-versicolor,5.936,2.77,4.26,1.326
New label,6.588,2.974,5.552,2.026


In [50]:
grouped_targets_var = iris.groupby(['target']).var()
grouped_targets_var
# .var()取方差

Unnamed: 0_level_0,sepal_length,sepal_width,petal_length,petal_width
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Iris-setosa,0.124249,0.14518,0.030106,0.011494
Iris-versicolor,0.266433,0.098469,0.220816,0.039106
New label,0.404343,0.104004,0.304588,0.075433


In [52]:
iris.sort_values(by='sepal_length').head()
# 按sepal_length进行排序输出前五行

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,target
13,4.3,3.0,1.1,0.1,Iris-setosa
42,4.4,3.2,1.3,0.2,Iris-setosa
38,4.4,3.0,1.3,0.2,Iris-setosa
8,4.4,2.9,1.4,0.2,Iris-setosa
41,4.5,2.3,1.3,0.3,Iris-setosa


In [38]:
# This is just an example, with no time_series data
# smooth_time_series = pd.rolling_mean(time_series, 5)

In [39]:
# This is just an example, with no time_series data
# median_time_series = pd.rolling_median(time_series, 5)

In [53]:
iris.apply(np.count_nonzero, axis=1).head(10)
# axis=0是列方向，axis=1是按行方向
# 计算每一行上非零数据

0    5
1    5
2    5
3    5
4    5
5    5
6    5
7    5
8    5
9    5
dtype: int64

In [41]:
iris.apply(np.count_nonzero, axis=0)

sepal_length    150
sepal_width     150
petal_length    150
petal_width     150
target          150
dtype: int64

In [20]:
iris.applymap(lambda el:len(str(el))).head()
# lambda是一个循环函数
# 将el中的函数转换成字符串的长度

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,target
0,3,3,3,3,11
1,3,3,3,3,11
2,3,3,3,3,11
3,3,3,3,3,11
4,3,3,3,3,11


### Data selection

In [1]:
import pandas as pd
dataset = pd.read_csv('a_selection_example_1.csv')
dataset

Unnamed: 0,n,val1,val2,val3
0,100,10,10,C
1,101,10,20,C
2,102,10,30,B
3,103,10,40,B
4,104,10,50,A


In [3]:
# 让第0列作为索引列
dataset = pd.read_csv('a_selection_example_1.csv', index_col=0)
dataset

Unnamed: 0_level_0,val1,val2,val3
n,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
100,10,10,C
101,10,20,C
102,10,30,B
103,10,40,B
104,10,50,A


In [4]:
dataset['val3'][104]

'A'

In [46]:
# loction
dataset.loc[104, 'val3']

'A'

In [5]:
# index
dataset.ix[104, 'val3']

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  


'A'

In [48]:
dataset.ix[104, 2]

'A'

In [6]:
dataset.iloc[4, 2]

'A'

In [50]:
dataset[['val3', 'val2']][0:2]

Unnamed: 0_level_0,val3,val2
n,Unnamed: 1_level_1,Unnamed: 2_level_1
100,C,10
101,C,20


In [51]:
dataset.loc[range(100, 102), ['val3', 'val2']]

Unnamed: 0_level_0,val3,val2
n,Unnamed: 1_level_1,Unnamed: 2_level_1
100,C,10
101,C,20


In [1]:
dataset.ix[range(100, 102), ['val3', 'val2']]

SyntaxError: invalid syntax (<ipython-input-1-3dd4beafc84a>, line 1)

In [53]:
dataset.ix[range(100, 102), [2, 1]]

Unnamed: 0_level_0,val3,val2
n,Unnamed: 1_level_1,Unnamed: 2_level_1
100,C,10
101,C,20


In [54]:
dataset.iloc[range(2), [2,1]]

Unnamed: 0_level_0,val3,val2
n,Unnamed: 1_level_1,Unnamed: 2_level_1
100,C,10
101,C,20


# Working with categorical and textual data

In [None]:
# 数值数据具有统计属性
# 分类数据表示一种可以测量的属性

In [24]:
import pandas as pd
# 用五个类属特征组成一位数列
categorical_feature = pd.Series(['sunny', 'cloudy', 'snowy',
'rainy', 'foggy'])
# get_dummies是虚拟编码，根据布尔数据进行二型编码（0，1）
mapping = pd.get_dummies(categorical_feature)
mapping


Unnamed: 0,cloudy,foggy,rainy,snowy,sunny
0,0,0,0,0,1
1,1,0,0,0,0
2,0,0,0,1,0
3,0,0,1,0,0
4,0,1,0,0,0


In [25]:
mapping['sunny']

0    1
1    0
2    0
3    0
4    0
Name: sunny, dtype: uint8

In [26]:
mapping['cloudy']

0    0
1    1
2    0
3    0
4    0
Name: cloudy, dtype: uint8

In [None]:
# Scikit-Learn是python专门针对机器学习应用发展的一款开源开源框架
# 功能：分类，回归，聚类，数据降维，模型选择，数据预处理等

In [28]:
# OneHotEncoder 独热编码（一位编码）N个寄存器对N个状态进行编码
from sklearn.preprocessing import OneHotEncoder
# 类别编码
from sklearn.preprocessing import LabelEncoder
#获取一个 LabelEncoder标签
le = LabelEncoder() 
#对五个特征进行独热编码
ohe = OneHotEncoder() 
# levels五个类属特征
levels = ['sunny', 'cloudy', 'snowy', 'rainy', 'foggy']
# fit 训练 LabelEncoder，transform 使用训练好的LabelEncoder对原数据进行编码
fit_levs = le.fit_transform(levels)
ohe.fit([[fit_levs[0]], [fit_levs[1]], [fit_levs[2]], [fit_levs[3]],
[fit_levs[4]]])
print (ohe.transform([le.transform(['sunny'])]).toarray())
print (ohe.transform([le.transform(['cloudy'])]).toarray())

[[0. 0. 0. 0. 1.]]
[[1. 0. 0. 0. 0.]]


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


## A special type of data: text

In [None]:
# 处理文本最常用的方法就是使用词袋——bag of words
# 把文本拆分成单词，每个单词都变成了特征，

In [1]:
# 从sklearn.datasets下载一个20个数字集
from sklearn.datasets import fetch_20newsgroups
# 探索医学和空间方面的数据
categories = ['sci.med', 'sci.space']
twenty_sci_news = fetch_20newsgroups(categories=categories)

In [2]:
print(twenty_sci_news.data[0])

From: flb@flb.optiplan.fi ("F.Baube[tm]")
Subject: Vandalizing the sky
X-Added: Forwarded by Space Digest
Organization: [via International Space University]
Original-Sender: isu@VACATION.VENARI.CS.CMU.EDU
Distribution: sci
Lines: 12

From: "Phil G. Fraering" <pgf@srl03.cacs.usl.edu>
> 
> Finally: this isn't the Bronze Age, [..]
> please try to remember that there are more human activities than
> those practiced by the Warrior Caste, the Farming Caste, and the
> Priesthood.

Right, the Profiting Caste is blessed by God, and may 
 freely blare its presence in the evening twilight ..

-- 
* Fred Baube (tm)



In [3]:
# 查看文件地址
twenty_sci_news.filenames

array(['C:\\Users\\84146\\scikit_learn_data\\20news_home\\20news-bydate-train\\sci.space\\61116',
       'C:\\Users\\84146\\scikit_learn_data\\20news_home\\20news-bydate-train\\sci.med\\58122',
       'C:\\Users\\84146\\scikit_learn_data\\20news_home\\20news-bydate-train\\sci.med\\58903',
       ...,
       'C:\\Users\\84146\\scikit_learn_data\\20news_home\\20news-bydate-train\\sci.space\\60774',
       'C:\\Users\\84146\\scikit_learn_data\\20news_home\\20news-bydate-train\\sci.space\\60954',
       'C:\\Users\\84146\\scikit_learn_data\\20news_home\\20news-bydate-train\\sci.med\\58911'],
      dtype='<U95')

In [4]:
# 输出第一个文件的标签
# target查看文件的标签
print (twenty_sci_news.target[0])
print (twenty_sci_news.target_names[twenty_sci_news.target[0]])

1
sci.space


In [5]:
# 查看第一个文本内容
print(twenty_sci_news.data[0])

From: flb@flb.optiplan.fi ("F.Baube[tm]")
Subject: Vandalizing the sky
X-Added: Forwarded by Space Digest
Organization: [via International Space University]
Original-Sender: isu@VACATION.VENARI.CS.CMU.EDU
Distribution: sci
Lines: 12

From: "Phil G. Fraering" <pgf@srl03.cacs.usl.edu>
> 
> Finally: this isn't the Bronze Age, [..]
> please try to remember that there are more human activities than
> those practiced by the Warrior Caste, the Farming Caste, and the
> Priesthood.

Right, the Profiting Caste is blessed by God, and may 
 freely blare its presence in the evening twilight ..

-- 
* Fred Baube (tm)



In [7]:
from sklearn.feature_extraction.text import CountVectorizer
# CountVectorizer统计词出现的次数
count_vect = CountVectorizer()
word_count = count_vect.fit_transform(twenty_sci_news.data)
# 输出文档数和特征数（单词数）
word_count.shape

(1187, 25638)

In [8]:
print (word_count[0])

  (0, 10827)	2
  (0, 10501)	2
  (0, 17170)	1
  (0, 10341)	1
  (0, 4762)	2
  (0, 23381)	2
  (0, 22345)	1
  (0, 24461)	1
  (0, 23137)	7
  (0, 21382)	1
  (0, 3233)	1
  (0, 10713)	1
  (0, 5650)	3
  (0, 21686)	2
  (0, 8438)	1
  (0, 17217)	1
  (0, 24626)	1
  (0, 13158)	1
  (0, 24151)	1
  (0, 17235)	1
  (0, 20909)	1
  (0, 13359)	1
  (0, 24415)	1
  (0, 24547)	1
  (0, 7646)	1
  :	:
  (0, 15968)	1
  (0, 12362)	1
  (0, 3191)	1
  (0, 23129)	1
  (0, 23241)	1
  (0, 18474)	1
  (0, 24930)	1
  (0, 6017)	3
  (0, 10188)	1
  (0, 3808)	2
  (0, 18642)	1
  (0, 20110)	1
  (0, 18744)	1
  (0, 13318)	1
  (0, 5148)	1
  (0, 11330)	1
  (0, 15246)	1
  (0, 10785)	1
  (0, 5134)	1
  (0, 13384)	1
  (0, 18586)	1
  (0, 12716)	1
  (0, 9796)	1
  (0, 23849)	1
  (0, 10778)	1


In [9]:
word_list = count_vect.get_feature_names()
for n in word_count[2].indices:
    print ('Word "%s" appears %i times' % (word_list[n], word_count[0, n]))

Word "from" appears 2 times
Word "subject" appears 1 times
Word "the" appears 7 times
Word "organization" appears 1 times
Word "lines" appears 1 times
Word "this" appears 1 times
Word "to" appears 1 times
Word "that" appears 1 times
Word "and" appears 2 times
Word "is" appears 1 times
Word "reply" appears 0 times
Word "of" appears 0 times
Word "like" appears 0 times
Word "any" appears 0 times
Word "an" appears 0 times
Word "as" appears 0 times
Word "does" appears 0 times
Word "anyone" appears 0 times
Word "know" appears 0 times
Word "it" appears 0 times
Word "called" appears 0 times
Word "soon" appears 0 times
Word "ab961" appears 0 times
Word "freenet" appears 0 times
Word "carleton" appears 0 times
Word "ca" appears 0 times
Word "robert" appears 0 times
Word "allison" appears 0 times
Word "frequent" appears 0 times
Word "nosebleeds" appears 0 times
Word "national" appears 0 times
Word "capital" appears 0 times
Word "18" appears 0 times
Word "have" appears 0 times
Word "between" appea

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
# 不计算逆向文本率所以idf是False norm = l1 是数据归一化的参数默认l2
tf_vect = TfidfVectorizer(use_idf=False, norm='l1')
word_freq = tf_vect.fit_transform(twenty_sci_news.data)
# get_feature_names()获取词单
word_list = tf_vect.get_feature_names()
for n in word_freq[0].indices:
    print ('Word "%s" has frequency %0.3f' % (word_list[n], word_freq[0, n]))
#     每个词出现的频率，所以词相加等于一

Word "from" has frequency 0.022
Word "flb" has frequency 0.022
Word "optiplan" has frequency 0.011
Word "fi" has frequency 0.011
Word "baube" has frequency 0.022
Word "tm" has frequency 0.022
Word "subject" has frequency 0.011
Word "vandalizing" has frequency 0.011
Word "the" has frequency 0.077
Word "sky" has frequency 0.011
Word "added" has frequency 0.011
Word "forwarded" has frequency 0.011
Word "by" has frequency 0.033
Word "space" has frequency 0.022
Word "digest" has frequency 0.011
Word "organization" has frequency 0.011
Word "via" has frequency 0.011
Word "international" has frequency 0.011
Word "university" has frequency 0.011
Word "original" has frequency 0.011
Word "sender" has frequency 0.011
Word "isu" has frequency 0.011
Word "vacation" has frequency 0.011
Word "venari" has frequency 0.011
Word "cs" has frequency 0.011
Word "cmu" has frequency 0.011
Word "edu" has frequency 0.022
Word "distribution" has frequency 0.011
Word "sci" has frequency 0.011
Word "lines" has freq

In [11]:
# 词频逆向文本率 数字大的在其他文本中出现的频率比较低
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vect = TfidfVectorizer() # Default: use_idf=True
word_tfidf = tfidf_vect.fit_transform(twenty_sci_news.data)
word_list = tfidf_vect.get_feature_names()
for n in word_tfidf[0].indices:
    print ('Word "%s" has tf-idf %0.3f' % (word_list[n], word_tfidf[0, n]))

Word "fred" has tf-idf 0.089
Word "twilight" has tf-idf 0.139
Word "evening" has tf-idf 0.113
Word "in" has tf-idf 0.024
Word "presence" has tf-idf 0.119
Word "its" has tf-idf 0.061
Word "blare" has tf-idf 0.150
Word "freely" has tf-idf 0.119
Word "may" has tf-idf 0.054
Word "god" has tf-idf 0.119
Word "blessed" has tf-idf 0.150
Word "is" has tf-idf 0.026
Word "profiting" has tf-idf 0.150
Word "right" has tf-idf 0.068
Word "priesthood" has tf-idf 0.144
Word "and" has tf-idf 0.049
Word "farming" has tf-idf 0.144
Word "caste" has tf-idf 0.433
Word "warrior" has tf-idf 0.144
Word "practiced" has tf-idf 0.132
Word "those" has tf-idf 0.060
Word "than" has tf-idf 0.052
Word "activities" has tf-idf 0.091
Word "human" has tf-idf 0.084
Word "more" has tf-idf 0.046
Word "are" has tf-idf 0.035
Word "there" has tf-idf 0.039
Word "that" has tf-idf 0.027
Word "remember" has tf-idf 0.077
Word "to" has tf-idf 0.023
Word "try" has tf-idf 0.073
Word "please" has tf-idf 0.071
Word "age" has tf-idf 0.092


In [12]:
text_1 = 'we love data science'
text_2 = 'data science is hard'
documents = [text_1, text_2]
documents

['we love data science', 'data science is hard']

In [14]:
# That is what we say above, the default one
# 用一元语法
count_vect_1_grams = CountVectorizer(ngram_range=(1, 1),
stop_words=[], min_df=1)
word_count = count_vect_1_grams.fit_transform(documents)
word_list = count_vect_1_grams.get_feature_names()
print ("Word list = ", word_list)
# 统计第一个文本中的词
print ("text_1 is described with", [word_list[n] + "(" +
str(word_count[0, n]) + ")" for n in word_count[0].indices])

Word list =  ['data', 'hard', 'is', 'love', 'science', 'we']
text_1 is described with ['data(1)', 'science(1)', 'is(0)', 'hard(0)']


In [15]:
# Now a bi-gram count vectorizer
count_vect_1_grams = CountVectorizer(ngram_range=(2, 2))
word_count = count_vect_1_grams.fit_transform(documents)
word_list = count_vect_1_grams.get_feature_names()
print ("Word list = ", word_list)
print ("text_1 is described with", [word_list[n] + "(" +
str(word_count[0, n]) + ")" for n in word_count[0].indices])

Word list =  ['data science', 'is hard', 'love data', 'science is', 'we love']
text_1 is described with ['we love(1)', 'love data(1)', 'data science(1)']


In [71]:
# Now a uni- and bi-gram count vectorizer
count_vect_1_grams = CountVectorizer(ngram_range=(1, 2))
word_count = count_vect_1_grams.fit_transform(documents)
word_list = count_vect_1_grams.get_feature_names()
print ("Word list = ", word_list)
print ("text_1 is described with", [word_list[n] + "(" +
str(word_count[0, n]) + ")" for n in word_count[0].indices])

Word list =  ['data', 'data science', 'hard', 'is', 'is hard', 'love', 'love data', 'science', 'science is', 'we', 'we love']
text_1 is described with ['we(1)', 'love(1)', 'data(1)', 'science(1)', 'we love(1)', 'love data(1)', 'data science(1)']


In [72]:
from sklearn.feature_extraction.text import HashingVectorizer
# 构造一个容量为1000的词桶
hash_vect = HashingVectorizer(n_features=1000)
word_hashed = hash_vect.fit_transform(twenty_sci_news.data)
word_hashed.shape

(1187, 1000)

## Scraping the Web

In [73]:
import urllib.request

url = 'https://en.wikipedia.org/wiki/William_Shakespeare'
request = urllib.request.Request(url)
response = urllib.request.urlopen(request)

In [74]:
from bs4 import BeautifulSoup

soup = BeautifulSoup(response, 'html.parser')

In [75]:
soup.title

<title>William Shakespeare - Wikipedia, the free encyclopedia</title>

In [76]:
section = soup.find_all(id='mw-normal-catlinks')[0]
for catlink in section.find_all("a")[1:]:
    print(catlink.get("title"), "->", catlink.get("href"))

Category:William Shakespeare -> /wiki/Category:William_Shakespeare
Category:1564 births -> /wiki/Category:1564_births
Category:1616 deaths -> /wiki/Category:1616_deaths
Category:16th-century English male actors -> /wiki/Category:16th-century_English_male_actors
Category:English male stage actors -> /wiki/Category:English_male_stage_actors
Category:16th-century English writers -> /wiki/Category:16th-century_English_writers
Category:17th-century English writers -> /wiki/Category:17th-century_English_writers
Category:16th-century dramatists and playwrights -> /wiki/Category:16th-century_dramatists_and_playwrights
Category:17th-century dramatists and playwrights -> /wiki/Category:17th-century_dramatists_and_playwrights
Category:16th-century English poets -> /wiki/Category:16th-century_English_poets
Category:Burials in Warwickshire -> /wiki/Category:Burials_in_Warwickshire
Category:English dramatists and playwrights -> /wiki/Category:English_dramatists_and_playwrights
Category:17th-century 

## Creating NumPy arrays

### From lists to unidimensional arrays

In [6]:
import numpy as np
# Transform a list into a uni-dimensional array
# 定义一个列表
list_of_ints = [1,2,3]
# 将列表转换成数组
Array_1 = np.array(list_of_ints)
Array_1

array([1, 2, 3])

In [7]:
Array_1[1] # let's output the second value

2

In [8]:
# 查看数据结构的对象类型
type(Array_1)

numpy.ndarray

In [9]:
# 查看数组中元素类型
Array_1.dtype # Note: The default dtype depends on the system you're operating.

dtype('int32')

### Controlling the memory size

In [10]:
import numpy as np
# 看一下内存占用情况
Array_1.nbytes # Please note that on 64bit platforms the result will be 24.

12

In [15]:
# 指定一个更适合的类型,减少使用的内存
Array_1 = np.array(list_of_ints, dtype= 'int8')
Array_1.nbytes

3

In [17]:
# 改变数据类型将int变成float
Array_1b = Array_1.astype('float32')
Array_1b
# Array_1b.nbytes

array([1., 2., 3.], dtype=float32)

### Heterogeneous lists

In [18]:
import numpy as np
# 混合数组，有整型，浮点型，字符串，+是拼接的意思
complex_list = [1,2,3] + [1.,2.,3.] + ['a','b','c']
complex_list

[1, 2, 3, 1.0, 2.0, 3.0, 'a', 'b', 'c']

In [21]:
complex_list[:3]

[1, 2, 3]

In [23]:
Array_2 = np.array(complex_list[:3]) # at first the input list is just ints
print ('complex_list[:3]', Array_2.dtype)

complex_list[:3] int32


In [26]:
complex_list[:6]

[1, 2, 3, 1.0, 2.0, 3.0]

In [24]:
Array_2 = np.array(complex_list[:6]) # then it is ints and floats
print ('complex_list[:6]', Array_2.dtype)

complex_list[:6] float64


In [25]:
Array_2 = np.array(complex_list) # finally we add strings
print ('complex_list[:] ',Array_2.dtype)
# <u32表示不多于32个字符组成的编码

complex_list[:]  <U32


In [27]:
# Check if a NumPy array is of the desired numeric type
print (isinstance(Array_2[0],np.number))

False


### From lists to multidimensional arrays

In [30]:
import numpy as np
# Transform a list into a bidimensional array
a_list_of_lists = [[1,2,3],[4,5,6],[7,8,9]]
a_list_of_lists

[[1, 2, 3], [4, 5, 6], [7, 8, 9]]

In [31]:
# 将列表直接转换成数组
Array_2D = np.array(a_list_of_lists )
Array_2D

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [32]:
# 找5所在的位置，第一个表示行
Array_2D[1,1]

5

In [34]:
# Transform a list into a multi-dimensional array
a_list_of_lists_of_lists = [[[1,2],[3,4],[5,6]],
[[7,8],[9,10],[11,12]]]
a_list_of_lists_of_lists

[[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10], [11, 12]]]

In [35]:
# 两个矩阵变成三位数组
Array_3D = np.array(a_list_of_lists_of_lists)
Array_3D

array([[[ 1,  2],
        [ 3,  4],
        [ 5,  6]],

       [[ 7,  8],
        [ 9, 10],
        [11, 12]]])

In [37]:
# 第一个判断是第几个矩阵，第二个是当前矩阵所在的行，第三个是当前矩阵所在的列
Array_3D[1,1,1] # Accessing the 5th element

10

In [39]:
{1:2,3:4,5:6}

{1: 2, 3: 4, 5: 6}

In [40]:
# .item()函数能将字典转化成二位数组
np.array(list({1:2,3:4,5:6}.items()))
# http://stackoverflow.com/questions/17695456/why-python-3-needs-wrap-dict-items-with-list
# https://docs.python.org/3.3/library/stdtypes.html#dict-views

array([[1, 2],
       [3, 4],
       [5, 6]])

### Resizing arrays

In [91]:
import numpy as np
# Restructuring a NumPy array shape
original_array = np.array([1, 2, 3, 4, 5, 6, 7, 8])
Array_a = original_array.reshape(4,2)
Array_b = original_array.reshape(4,2).copy()
Array_c = original_array.reshape(2,2,2)
# Attention because reshape creates just views, not copies
original_array[0] = -1

In [92]:
Array_a

array([[-1,  2],
       [ 3,  4],
       [ 5,  6],
       [ 7,  8]])

In [93]:
Array_c

array([[[-1,  2],
        [ 3,  4]],

       [[ 5,  6],
        [ 7,  8]]])

In [94]:
Array_b

array([[1, 2],
       [3, 4],
       [5, 6],
       [7, 8]])

In [95]:
original_array.resize(4,2)
original_array

array([[-1,  2],
       [ 3,  4],
       [ 5,  6],
       [ 7,  8]])

In [96]:
original_array.shape = (4,2)

In [97]:
original_array

array([[-1,  2],
       [ 3,  4],
       [ 5,  6],
       [ 7,  8]])

### Arrays derived from NumPy functions

In [98]:
import numpy as np
ordinal_values = np.arange(9).reshape(3,3)
ordinal_values

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [99]:
np.arange(9)[::-1]

array([8, 7, 6, 5, 4, 3, 2, 1, 0])

In [100]:
np.random.randint(low=1,high=10,size=(3,3)).reshape(3,3)
# randomly generated, you can obtain a different array!

array([[6, 8, 7],
       [5, 1, 6],
       [9, 7, 3]])

In [101]:
np.zeros((3,3))

array([[ 0.,  0.,  0.],
       [ 0.,  0.,  0.],
       [ 0.,  0.,  0.]])

In [102]:
np.ones((3,3))

array([[ 1.,  1.,  1.],
       [ 1.,  1.,  1.],
       [ 1.,  1.,  1.]])

In [103]:
np.eye(3)

array([[ 1.,  0.,  0.],
       [ 0.,  1.,  0.],
       [ 0.,  0.,  1.]])

In [104]:
fractions = np.linspace(start=0, stop=1, num=10)
fractions

array([ 0.        ,  0.11111111,  0.22222222,  0.33333333,  0.44444444,
        0.55555556,  0.66666667,  0.77777778,  0.88888889,  1.        ])

In [105]:
growth = np.logspace(start=0, stop=1, num=10, base=10.0)
growth

array([  1.        ,   1.29154967,   1.66810054,   2.15443469,
         2.7825594 ,   3.59381366,   4.64158883,   5.9948425 ,
         7.74263683,  10.        ])

In [106]:
std_gaussian = np.random.normal(size=(3,3))
std_gaussian
# randomly generated, you can obtain a different array!

array([[-1.18035881, -0.37632937,  0.00800547],
       [ 1.24898242, -0.92410685,  0.03292396],
       [-0.17619901,  1.10657852, -0.07862104]])

In [107]:
gaussian = np.random.normal(loc=1.0, scale= 3.0, size=(3,3))
gaussian
# randomly generated, you can obtain a different array!

array([[-0.37532441,  4.00680118,  3.86295364],
       [ 6.57138005, -0.41579563,  1.95635974],
       [ 2.80182265, -2.15492916, -0.73519315]])

In [108]:
rand = np.random.uniform(low=0.0, high=1.0, size=(3,3))
# randomly generated, you can obtain a different array!

### Getting an array directly from a file

In [109]:
import numpy as np
housing = np.loadtxt('regression-datasets-housing.csv',delimiter=',', dtype=float)

In [110]:
np.loadtxt('datasets-uci-iris.csv',delimiter=',',dtype=float)

ValueError: could not convert string to float: b'Iris-setosa'

### Extracting data from pandas

In [111]:
import pandas as pd
import numpy as np
housing_filename = 'regression-datasets-housing.csv'
housing = pd.read_csv(housing_filename, header=None)

In [112]:
housing_array = housing.values
housing_array.dtype

dtype('float64')

In [113]:
housing.dtypes

0     float64
1       int64
2     float64
3       int64
4     float64
5     float64
6     float64
7     float64
8       int64
9       int64
10      int64
11    float64
12    float64
13    float64
dtype: object

## NumPy fast operation and computations

In [114]:
import numpy as np
a = np.arange(5).reshape(1,5)
a += 1
a*a

array([[ 1,  4,  9, 16, 25]])

In [115]:
a = np.arange(5).reshape(1,5) + 1
b = np.arange(5).reshape(5,1) + 1
a * b

array([[ 1,  2,  3,  4,  5],
       [ 2,  4,  6,  8, 10],
       [ 3,  6,  9, 12, 15],
       [ 4,  8, 12, 16, 20],
       [ 5, 10, 15, 20, 25]])

In [116]:
a2 = np.array([1,2,3,4,5] * 5).reshape(5,5)
b2 = a2.T
a2 * b2

array([[ 1,  2,  3,  4,  5],
       [ 2,  4,  6,  8, 10],
       [ 3,  6,  9, 12, 15],
       [ 4,  8, 12, 16, 20],
       [ 5, 10, 15, 20, 25]])

In [117]:
print (a2)

[[1 2 3 4 5]
 [1 2 3 4 5]
 [1 2 3 4 5]
 [1 2 3 4 5]
 [1 2 3 4 5]]


In [118]:
np.sum(a2, axis=0)

array([ 5, 10, 15, 20, 25])

In [119]:
np.sum(a2, axis=1)

array([15, 15, 15, 15, 15])

In [120]:
%timeit -n 1 -r 3 [i+1.0 for i in range(10**6)]
%timeit -n 1 -r 3 np.arange(10**6)+1.0

1 loop, best of 3: 146 ms per loop
1 loop, best of 3: 5.79 ms per loop


In [121]:
import math
%timeit -n 1 -r 3 [math.sqrt(i) for i in range(10**6)]

1 loop, best of 3: 221 ms per loop


In [122]:
%timeit -n 1 -r 3 np.sqrt(np.arange(10**6))

1 loop, best of 3: 8.35 ms per loop


### Matrix operations

In [123]:
import numpy as np
M = np.arange(5*5, dtype=float).reshape(5,5)
M

array([[  0.,   1.,   2.,   3.,   4.],
       [  5.,   6.,   7.,   8.,   9.],
       [ 10.,  11.,  12.,  13.,  14.],
       [ 15.,  16.,  17.,  18.,  19.],
       [ 20.,  21.,  22.,  23.,  24.]])

In [124]:
coefs = np.array([1., 0.5, 0.5, 0.5, 0.5])
coefs_matrix = np.column_stack((coefs,coefs[::-1]))
print (coefs_matrix)

[[ 1.   0.5]
 [ 0.5  0.5]
 [ 0.5  0.5]
 [ 0.5  0.5]
 [ 0.5  1. ]]


In [125]:
np.dot(M,coefs)

array([  5.,  20.,  35.,  50.,  65.])

In [126]:
np.dot(coefs,M)

array([ 25.,  28.,  31.,  34.,  37.])

In [127]:
np.dot(M,coefs_matrix)

array([[  5.,   7.],
       [ 20.,  22.],
       [ 35.,  37.],
       [ 50.,  52.],
       [ 65.,  67.]])

## Slicing and indexing with NumPy arrays

In [128]:
import numpy as np
M = np.arange(10*10, dtype=int).reshape(10,10)

In [129]:
M[2:9:2,:]

array([[20, 21, 22, 23, 24, 25, 26, 27, 28, 29],
       [40, 41, 42, 43, 44, 45, 46, 47, 48, 49],
       [60, 61, 62, 63, 64, 65, 66, 67, 68, 69],
       [80, 81, 82, 83, 84, 85, 86, 87, 88, 89]])

In [130]:
M[2:9:2,5:]

array([[25, 26, 27, 28, 29],
       [45, 46, 47, 48, 49],
       [65, 66, 67, 68, 69],
       [85, 86, 87, 88, 89]])

In [131]:
M[2:9:2,5::-1]

array([[25, 24, 23, 22, 21, 20],
       [45, 44, 43, 42, 41, 40],
       [65, 64, 63, 62, 61, 60],
       [85, 84, 83, 82, 81, 80]])

In [132]:
row_index = (M[:,0]>=20) & (M[:,0]<=80)
col_index = M[0,:]>=5
M[row_index,:][:,col_index]

array([[25, 26, 27, 28, 29],
       [35, 36, 37, 38, 39],
       [45, 46, 47, 48, 49],
       [55, 56, 57, 58, 59],
       [65, 66, 67, 68, 69],
       [75, 76, 77, 78, 79],
       [85, 86, 87, 88, 89]])

In [133]:
mask = (M>=20) & (M<=90) & ((M / 10.) % 1 >= 0.5)
M[mask]

array([25, 26, 27, 28, 29, 35, 36, 37, 38, 39, 45, 46, 47, 48, 49, 55, 56,
       57, 58, 59, 65, 66, 67, 68, 69, 75, 76, 77, 78, 79, 85, 86, 87, 88,
       89])

In [134]:
row_index = [1,1,2,7]
col_index = [0,2,4,8]

In [135]:
M[row_index,col_index]

array([10, 12, 24, 78])

In [136]:
M[row_index,:][:,col_index]

array([[10, 12, 14, 18],
       [10, 12, 14, 18],
       [20, 22, 24, 28],
       [70, 72, 74, 78]])

In [137]:
N = M[2:9:2,5:].copy()

### Stacking NumPy arrays

In [138]:
import numpy as np
dataset = np.arange(10*5).reshape(10,5)

In [139]:
single_line = np.arange(1*5).reshape(1,5)
a_few_lines = np.arange(3*5).reshape(3,5)

In [140]:
np.vstack((dataset,single_line))

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14],
       [15, 16, 17, 18, 19],
       [20, 21, 22, 23, 24],
       [25, 26, 27, 28, 29],
       [30, 31, 32, 33, 34],
       [35, 36, 37, 38, 39],
       [40, 41, 42, 43, 44],
       [45, 46, 47, 48, 49],
       [ 0,  1,  2,  3,  4]])

In [141]:
np.vstack((dataset,a_few_lines))

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14],
       [15, 16, 17, 18, 19],
       [20, 21, 22, 23, 24],
       [25, 26, 27, 28, 29],
       [30, 31, 32, 33, 34],
       [35, 36, 37, 38, 39],
       [40, 41, 42, 43, 44],
       [45, 46, 47, 48, 49],
       [ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14]])

In [142]:
np.vstack((dataset,single_line,single_line))

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14],
       [15, 16, 17, 18, 19],
       [20, 21, 22, 23, 24],
       [25, 26, 27, 28, 29],
       [30, 31, 32, 33, 34],
       [35, 36, 37, 38, 39],
       [40, 41, 42, 43, 44],
       [45, 46, 47, 48, 49],
       [ 0,  1,  2,  3,  4],
       [ 0,  1,  2,  3,  4]])

In [143]:
bias = np.ones(10).reshape(10,1)
np.hstack((dataset,bias))

array([[  0.,   1.,   2.,   3.,   4.,   1.],
       [  5.,   6.,   7.,   8.,   9.,   1.],
       [ 10.,  11.,  12.,  13.,  14.,   1.],
       [ 15.,  16.,  17.,  18.,  19.,   1.],
       [ 20.,  21.,  22.,  23.,  24.,   1.],
       [ 25.,  26.,  27.,  28.,  29.,   1.],
       [ 30.,  31.,  32.,  33.,  34.,   1.],
       [ 35.,  36.,  37.,  38.,  39.,   1.],
       [ 40.,  41.,  42.,  43.,  44.,   1.],
       [ 45.,  46.,  47.,  48.,  49.,   1.]])

In [144]:
bias = np.ones(10)
np.column_stack((dataset,bias))

array([[  0.,   1.,   2.,   3.,   4.,   1.],
       [  5.,   6.,   7.,   8.,   9.,   1.],
       [ 10.,  11.,  12.,  13.,  14.,   1.],
       [ 15.,  16.,  17.,  18.,  19.,   1.],
       [ 20.,  21.,  22.,  23.,  24.,   1.],
       [ 25.,  26.,  27.,  28.,  29.,   1.],
       [ 30.,  31.,  32.,  33.,  34.,   1.],
       [ 35.,  36.,  37.,  38.,  39.,   1.],
       [ 40.,  41.,  42.,  43.,  44.,   1.],
       [ 45.,  46.,  47.,  48.,  49.,   1.]])

In [145]:
np.dstack((dataset*1,dataset*2,dataset*3))

array([[[  0,   0,   0],
        [  1,   2,   3],
        [  2,   4,   6],
        [  3,   6,   9],
        [  4,   8,  12]],

       [[  5,  10,  15],
        [  6,  12,  18],
        [  7,  14,  21],
        [  8,  16,  24],
        [  9,  18,  27]],

       [[ 10,  20,  30],
        [ 11,  22,  33],
        [ 12,  24,  36],
        [ 13,  26,  39],
        [ 14,  28,  42]],

       [[ 15,  30,  45],
        [ 16,  32,  48],
        [ 17,  34,  51],
        [ 18,  36,  54],
        [ 19,  38,  57]],

       [[ 20,  40,  60],
        [ 21,  42,  63],
        [ 22,  44,  66],
        [ 23,  46,  69],
        [ 24,  48,  72]],

       [[ 25,  50,  75],
        [ 26,  52,  78],
        [ 27,  54,  81],
        [ 28,  56,  84],
        [ 29,  58,  87]],

       [[ 30,  60,  90],
        [ 31,  62,  93],
        [ 32,  64,  96],
        [ 33,  66,  99],
        [ 34,  68, 102]],

       [[ 35,  70, 105],
        [ 36,  72, 108],
        [ 37,  74, 111],
        [ 38,  76, 114],
        [ 3

In [146]:
np.insert(dataset, 3, bias, axis=1)

array([[ 0,  1,  2,  1,  3,  4],
       [ 5,  6,  7,  1,  8,  9],
       [10, 11, 12,  1, 13, 14],
       [15, 16, 17,  1, 18, 19],
       [20, 21, 22,  1, 23, 24],
       [25, 26, 27,  1, 28, 29],
       [30, 31, 32,  1, 33, 34],
       [35, 36, 37,  1, 38, 39],
       [40, 41, 42,  1, 43, 44],
       [45, 46, 47,  1, 48, 49]])

In [147]:
np.insert(dataset, 3, dataset.T, axis=1)

array([[ 0,  1,  2,  0,  1,  2,  3,  4,  3,  4],
       [ 5,  6,  7,  5,  6,  7,  8,  9,  8,  9],
       [10, 11, 12, 10, 11, 12, 13, 14, 13, 14],
       [15, 16, 17, 15, 16, 17, 18, 19, 18, 19],
       [20, 21, 22, 20, 21, 22, 23, 24, 23, 24],
       [25, 26, 27, 25, 26, 27, 28, 29, 28, 29],
       [30, 31, 32, 30, 31, 32, 33, 34, 33, 34],
       [35, 36, 37, 35, 36, 37, 38, 39, 38, 39],
       [40, 41, 42, 40, 41, 42, 43, 44, 43, 44],
       [45, 46, 47, 45, 46, 47, 48, 49, 48, 49]])

In [148]:
np.insert(dataset, 3, np.ones(5), axis=0)

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14],
       [ 1,  1,  1,  1,  1],
       [15, 16, 17, 18, 19],
       [20, 21, 22, 23, 24],
       [25, 26, 27, 28, 29],
       [30, 31, 32, 33, 34],
       [35, 36, 37, 38, 39],
       [40, 41, 42, 43, 44],
       [45, 46, 47, 48, 49]])