In [18]:
import datascience as ds
import numpy as np
import pandas as pd
import string

# Creating a Table from scratch

In order to create a Table object, you need two things: a list of data columns and a set of labels for those columns.


In [151]:
nfeats = 10
data = list(np.random.randn(nfeats, 100))
labels = [string.ascii_lowercase[i] for i in range(nfeats)]
table = ds.Table(data, labels=labels)

You can also supply a dictionary where keys are labels and values are the column data

In [24]:
data_dict = {i: dat for i, dat in zip(labels, data)}
table = ds.Table(data_dict)

If you want to create a table by passing rows instead of columns, use the `from_rows` method

In [143]:
data_rows = list(np.random.randn(100, nfeats))
table = ds.Table.from_rows(data_rows, column_labels=labels)

# Viewing Data

In a jupyter notebook, simply run a cell with a table in it, and it will display in nice HTML. However, it will cut off rows that are too long to be displayed

In [33]:
table

a,b,c,d,e,f,g,h,i,j
0.406769,1.55196,-0.0931095,-0.529943,1.78406,1.68528,1.84819,-1.9656,-0.596316,0.0727995
1.94665,-0.879404,-0.313696,-1.29964,0.473706,-0.41833,0.980287,0.726492,1.14417,2.3054
-0.229707,0.187612,-2.22901,-0.43925,-0.752164,-2.24453,-0.743171,0.0202779,-0.443438,-0.431859
0.797457,-0.871438,1.51265,0.566548,-0.130461,-1.24734,-0.151947,1.21087,1.90886,-0.362355
0.140173,0.295714,1.3674,1.96185,0.0973633,-0.0300836,0.960287,1.04089,0.153897,-1.4037
1.19691,0.115993,0.327456,-0.0467045,0.091842,0.980452,-1.69135,-0.57797,-0.124096,-0.245333
0.506299,-0.0383577,0.141138,1.03598,0.295396,-0.48896,-0.902124,-0.565999,1.87611,-0.0758977
-0.652332,-0.1099,-1.23547,-0.402245,1.48215,-1.46238,0.529661,0.0996701,-1.06593,-0.324503
-0.515006,-0.223921,1.24077,-0.466819,1.05303,-2.17738,-0.132804,0.0923499,-1.93556,-1.10255
0.995502,1.8292,-0.574803,-1.28468,1.15869,-1.0832,1.21659,2.68339,0.187712,0.22677


In [34]:
# Printing does the same, but with text output
print(table)

a         | b          | c          | d          | e         | f          | g         | h         | i         | j
0.406769  | 1.55196    | -0.0931095 | -0.529943  | 1.78406   | 1.68528    | 1.84819   | -1.9656   | -0.596316 | 0.0727995
1.94665   | -0.879404  | -0.313696  | -1.29964   | 0.473706  | -0.41833   | 0.980287  | 0.726492  | 1.14417   | 2.3054
-0.229707 | 0.187612   | -2.22901   | -0.43925   | -0.752164 | -2.24453   | -0.743171 | 0.0202779 | -0.443438 | -0.431859
0.797457  | -0.871438  | 1.51265    | 0.566548   | -0.130461 | -1.24734   | -0.151947 | 1.21087   | 1.90886   | -0.362355
0.140173  | 0.295714   | 1.3674     | 1.96185    | 0.0973633 | -0.0300836 | 0.960287  | 1.04089   | 0.153897  | -1.4037
1.19691   | 0.115993   | 0.327456   | -0.0467045 | 0.091842  | 0.980452   | -1.69135  | -0.57797  | -0.124096 | -0.245333
0.506299  | -0.0383577 | 0.141138   | 1.03598    | 0.295396  | -0.48896   | -0.902124 | -0.565999 | 1.87611   | -0.0758977
-0.652332 | -0.1099    | -1.23547   

In [44]:
# You can return a table as text
print('TEXT:\n', table.as_text()[:1000])

# Or as html
print('\n\nHTML:\n', table.as_html()[:1000])

TEXT:
 a           | b            | c          | d          | e           | f          | g          | h          | i          | j
0.406769    | 1.55196      | -0.0931095 | -0.529943  | 1.78406     | 1.68528    | 1.84819    | -1.9656    | -0.596316  | 0.0727995
1.94665     | -0.879404    | -0.313696  | -1.29964   | 0.473706    | -0.41833   | 0.980287   | 0.726492   | 1.14417    | 2.3054
-0.229707   | 0.187612     | -2.22901   | -0.43925   | -0.752164   | -2.24453   | -0.743171  | 0.0202779  | -0.443438  | -0.431859
0.797457    | -0.871438    | 1.51265    | 0.566548   | -0.130461   | -1.24734   | -0.151947  | 1.21087    | 1.90886    | -0.362355
0.140173    | 0.295714     | 1.3674     | 1.96185    | 0.0973633   | -0.0300836 | 0.960287   | 1.04089    | 0.153897   | -1.4037
1.19691     | 0.115993     | 0.327456   | -0.0467045 | 0.091842    | 0.980452   | -1.69135   | -0.57797   | -0.124096  | -0.245333
0.506299    | -0.0383577   | 0.141138   | 1.03598    | 0.295396    | -0.48896   | -0.9021

You can also just access the column labels

In [49]:
cols = table.column_labels
print(cols)

('a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j')


Transposing your data will not change your column labels. Instead, it will transpose the data __within__ each column.

In [52]:
table.T

a,b,c,d,e,f,g,h,i,j
[ 0.40676911 1.946647 -0.22970666 0.79745695 0.1401 ...,[ 1.55195890e+00 -8.79403941e-01 1.87611501e-01 -8. ...,[-0.09310951 -0.31369648 -2.22901218 1.5126497 1.3673 ...,[-0.52994258 -1.29964241 -0.43925049 0.56654829 1.9618 ...,[ 1.78405945 0.47370562 -0.75216434 -0.1304611 0.0973 ...,[ 1.68528427 -0.41832967 -2.24452539 -1.24733877 -0.0300 ...,[ 1.84819055 0.98028668 -0.74317109 -0.15194665 0.9602 ...,[-1.96559779 0.72649162 0.02027786 1.21087172 1.0408 ...,[-0.59631562 1.1441665 -0.4434379 1.90886405 0.1538 ...,[ 0.07279954 2.30539862 -0.43185873 -0.36235468 -1.4037 ...


You can also sort a table by the values in some column.

In [56]:
table.sort('b')

a,b,c,d,e,f,g,h,i,j
-0.381304,-2.05797,1.48785,0.0392615,1.35183,1.3498,0.496602,0.900159,-1.04252,-0.479272
1.74163,-1.82917,1.19606,0.944638,-1.83992,0.885643,-0.539069,0.780056,-0.533799,0.920045
1.5121,-1.82572,0.862555,0.896797,1.1553,-0.42885,-0.147458,-2.05771,-0.274241,-1.28548
0.633111,-1.50467,-0.864715,0.419723,-0.561586,0.682939,2.11355,-0.661091,-0.0430856,-0.0154582
2.12299,-1.47532,-0.0156141,0.246847,0.183992,-1.72979,0.519849,0.0692791,0.465791,-0.481382
-0.59403,-1.35258,0.942918,0.619401,-0.911106,-0.676922,0.651861,0.688236,-0.974254,-2.23986
0.0525554,-1.29529,-0.830958,-2.93982,0.593177,1.06012,-1.04354,-1.74783,-1.33295,1.11401
-0.250409,-1.26208,0.138758,0.109763,0.219729,-1.06541,0.319839,0.308164,0.372572,-0.671437
0.281824,-1.2529,-0.353788,0.29045,0.623082,-0.837187,0.072082,-0.986154,0.352775,1.50166
-1.00494,-1.22793,-0.0923397,0.593574,-0.57466,-2.27588,1.46305,-0.411265,-1.39012,1.74693


# Selecting data
The simplest way of selecting data is by pulling out individual columns. This can be done by putting the column name in brackets (similar to how dictionaries work). Note that you must supply a single column name.

In [64]:
print(table['a'][:5])

[ 0.40676911  1.946647   -0.22970666  0.79745695  0.14017344]


Alternatively, one can select a subset of columns for viewing with the `.select` method. This lets you give multiple column labels:

In [67]:
table.select(['a', 'b'])

a,b
0.406769,1.55196
1.94665,-0.879404
-0.229707,0.187612
0.797457,-0.871438
0.140173,0.295714
1.19691,0.115993
0.506299,-0.0383577
-0.652332,-0.1099
-0.515006,-0.223921
0.995502,1.8292


Finally, it is also possible to take specific rows from a table with the `.take` method. Note that this uses a list of row indices.

In [71]:
ix_rows = range(10, 20)
table.take(ix_rows)

a,b,c,d,e,f,g,h,i,j
0.662009,1.3396,0.100194,0.112067,1.01076,1.37659,0.270544,-1.95514,0.476391,-1.15728
-0.339129,0.104073,-1.05487,-1.6283,0.127132,1.14525,-0.240215,1.43093,-3.38099,0.623656
-0.0927025,-0.835413,-0.678441,-0.902698,-0.137255,-0.827414,1.21246,-0.968445,1.1132,-2.30774
0.69249,0.574802,1.65091,0.862108,-0.952391,-0.381625,-0.752288,0.972626,1.40986,-0.674008
1.24252,-0.865907,0.796575,-0.627811,0.581739,-0.977999,-0.444162,0.510538,0.539005,-0.717448
0.698097,1.62111,-0.434359,-0.1534,0.671726,-1.2035,-2.56773,1.20782,-0.764151,-0.81407
-1.49591,-1.18452,0.703795,-1.10636,-0.20109,1.03348,-1.84136,3.18499,0.743293,-0.495953
-0.0936518,-0.827222,0.67008,0.066464,0.0681737,0.444703,-1.09244,0.877795,-0.219102,0.272328
0.0607656,-0.104638,0.768964,0.867504,-0.780542,-0.538346,0.937276,-1.64203,2.25342,-1.05416
1.09633,0.193629,0.57007,0.954129,-0.137401,-0.0730438,-0.522157,-1.47996,0.149595,1.32965


Finally, if you want to forget about the column labels and just treat it as a numpy array, you can use the `.columns` attribute. This will return a list of your column values, which is easily converted into a numpy array (but note that this will also transpose the data)

In [112]:
table_array = table.columns
table_array = np.array(table_array)
print(table_array[:5, :5])
print(type(table_array))

[[ 0.40676911  1.946647   -0.22970666  0.79745695  0.14017344]
 [ 1.5519589  -0.87940394  0.1876115  -0.87143836  0.29571392]
 [-0.09310951 -0.31369648 -2.22901218  1.5126497   1.36739613]
 [-0.52994258 -1.29964241 -0.43925049  0.56654829  1.96185154]
 [ 1.78405945  0.47370562 -0.75216434 -0.1304611   0.09736333]]
<class 'numpy.ndarray'>


## Boolean indexing

You can't use boolean operators directly with a Table, but it is quite easy to accomplish the same thing by selecting a subset of the data to create boolean masks, then by using the `.where` method.

In [113]:
table > 1

TypeError: unorderable types: Table() > int()

In [115]:
# We'll convert the matrix to an array
msk = np.array(table.columns) > 1
msk[:5, :5]

array([[False,  True, False, False, False],
       [ True, False, False, False, False],
       [False, False, False,  True,  True],
       [False, False, False, False,  True],
       [ True, False, False, False, False]], dtype=bool)

Now we can use the `.where` method, which takes an array with the length == the number of rows in our Table, and returns the rows where the array is == "value" (which defaults to the value `True`)

In [118]:
table.where(msk[0])

a,b,c,d,e,f,g,h,i,j
1.94665,-0.879404,-0.313696,-1.29964,0.473706,-0.41833,0.980287,0.726492,1.14417,2.3054
1.19691,0.115993,0.327456,-0.0467045,0.091842,0.980452,-1.69135,-0.57797,-0.124096,-0.245333
1.24252,-0.865907,0.796575,-0.627811,0.581739,-0.977999,-0.444162,0.510538,0.539005,-0.717448
1.09633,0.193629,0.57007,0.954129,-0.137401,-0.0730438,-0.522157,-1.47996,0.149595,1.32965
2.02402,0.103886,2.03244,0.232978,-0.133859,1.26937,2.08116,-1.13326,0.134415,-0.152267
1.44412,1.40337,-0.428268,0.0168891,-0.209338,-0.812116,0.686712,0.39729,-1.71596,-0.553601
1.28586,-0.225623,-0.816275,0.503434,0.0173762,1.61857,-0.18299,-0.354406,-2.32813,0.712599
1.41758,0.426221,-1.31336,0.917362,1.03428,0.539896,-1.06334,-0.778038,2.31976,0.525964
2.12299,-1.47532,-0.0156141,0.246847,0.183992,-1.72979,0.519849,0.0692791,0.465791,-0.481382
2.71917,0.385974,0.00784666,0.354093,0.943637,0.170245,-0.814775,0.116205,0.104104,-1.35504


In [129]:
msk_letter = msk.copy().astype(str)
msk_letter[msk] = 'a'
msk_letter[~msk] = 'b'
print(msk_letter[:5, :5])

[['b' 'a' 'b' 'b' 'b']
 ['a' 'b' 'b' 'b' 'b']
 ['b' 'b' 'b' 'a' 'a']
 ['b' 'b' 'b' 'b' 'a']
 ['a' 'b' 'b' 'b' 'b']]


In [128]:
table.where(msk_letter[0], 'b')

a,b,c,d,e,f,g,h,i,j
0.406769,1.55196,-0.0931095,-0.529943,1.78406,1.68528,1.84819,-1.9656,-0.596316,0.0727995
-0.229707,0.187612,-2.22901,-0.43925,-0.752164,-2.24453,-0.743171,0.0202779,-0.443438,-0.431859
0.797457,-0.871438,1.51265,0.566548,-0.130461,-1.24734,-0.151947,1.21087,1.90886,-0.362355
0.140173,0.295714,1.3674,1.96185,0.0973633,-0.0300836,0.960287,1.04089,0.153897,-1.4037
0.506299,-0.0383577,0.141138,1.03598,0.295396,-0.48896,-0.902124,-0.565999,1.87611,-0.0758977
-0.652332,-0.1099,-1.23547,-0.402245,1.48215,-1.46238,0.529661,0.0996701,-1.06593,-0.324503
-0.515006,-0.223921,1.24077,-0.466819,1.05303,-2.17738,-0.132804,0.0923499,-1.93556,-1.10255
0.995502,1.8292,-0.574803,-1.28468,1.15869,-1.0832,1.21659,2.68339,0.187712,0.22677
0.662009,1.3396,0.100194,0.112067,1.01076,1.37659,0.270544,-1.95514,0.476391,-1.15728
-0.339129,0.104073,-1.05487,-1.6283,0.127132,1.14525,-0.240215,1.43093,-3.38099,0.623656


# Adding new data
It is possible to add a new column directly to our Table object. This is done by passing values that have the same number of rows, along with a column name

In [152]:
table['custom_col'] = table['a'] + 100
table['custom_col']

array([ 100.71203456,  101.02437246,  101.42301705,  100.07765536,
        100.7550972 ,   98.9727203 ,   98.66095711,  100.13710886,
        100.33388808,  100.4211912 ,   98.5679226 ,  100.56498883,
        100.07779845,   98.71167394,  101.10158842,  100.71620285,
         99.17222072,   98.38321062,  100.550448  ,   99.2507432 ,
        100.62281287,  101.93299852,  100.38797408,   99.34322741,
         98.53224889,  100.01835533,   99.57473656,   98.23392258,
        101.6291059 ,   99.11885732,   98.77564795,  100.85666664,
        100.18165585,   99.20889181,  100.11799917,  100.70664996,
        101.01061018,   99.12809131,  100.47232331,  101.88621109,
         98.75524666,  100.61839017,  100.13342417,  100.49287363,
         99.08813087,  101.15083087,   99.57929517,   99.74852665,
         99.15111061,  101.49228669,   99.49050557,   99.60957761,
        101.26757898,   99.79825213,  101.95194058,  101.36793204,
        100.82254048,  100.7016186 ,  101.21617388,   99.79459

In [156]:
table['not_a'] = table['a'] < 0
table

a,b,c,d,e,f,g,h,i,j,custom_col,not_a
0.712035,0.321652,-0.177017,-0.276744,0.442014,-0.151805,1.26032,-0.459668,0.858446,1.02393,100.712,False
1.02437,0.171368,0.225927,1.97335,0.301138,-1.30359,0.49244,0.882077,0.320426,0.103817,101.024,False
1.42302,-0.97197,0.531661,0.680842,0.62295,0.539337,-1.026,-1.88412,0.735877,-0.182456,101.423,False
0.0776554,0.335543,0.956517,-0.148156,1.39035,0.231305,0.618361,-0.0933092,-0.0440198,-0.101198,100.078,False
0.755097,0.978997,0.270038,0.352872,0.140897,0.0524315,-0.422537,-1.2683,0.695731,-0.0541399,100.755,False
-1.02728,0.467634,-1.42592,-1.61731,-0.389991,-0.766169,2.03004,-0.812493,-0.867925,-1.86148,98.9727,True
-1.33904,0.273429,0.410062,0.671098,-1.18111,-1.36299,0.0788214,-1.40356,0.765777,-2.18249,98.661,True
0.137109,0.00246671,0.931103,0.164772,-1.0574,-0.780646,0.0208116,0.360339,0.211063,1.23743,100.137,False
0.333888,-0.188111,-2.17362,0.408387,-0.602258,-0.650551,0.137784,-1.23441,0.568559,0.67111,100.334,False
0.421191,-1.91241,2.1556,0.472956,-0.506368,2.20083,-1.64457,-0.835216,-1.56042,1.41558,100.421,False


# Operations

Tables aren't meant to be operated on directly. Instead, you should operate on their data, e.g.:

In [159]:
table + 2

TypeError: unsupported operand type(s) for +: 'Table' and 'int'

In [161]:
table['a+2'] = table['a'] + 2

It is possible to do some simple statistics on the table object as well.

In [None]:
np.mean(table)

Or you can apply an arbitrary function to a specific column:

In [180]:
def turn_into_chris(i):
    return 'chris'

# *each row* of the column will be passed to our function
table['a_chris'] = table.apply(turn_into_chris, 'a')
table['a_chris'][:5]

array(['chris', 'chris', 'chris', 'chris', 'chris'], 
      dtype='<U5')

# Merging / joining / etc

# Grouping

# Plotting
There are a few plotting methods associated with a Table object. These can do some quick visualizations of your data.

In [182]:
table.plot('c')

ValueError: could not convert string to float: 'chris'

# Pivot tables

# IO