# Using Pandas

**pandas** is a data analytics framework written in Python that provides a fast and flexible way to express tabular (relational) data that would normally be done using a Spreadsheet application.

It aims to be a high-level building block for performing data analysis in Python.

**pandas** can read the following data formats:
  - Tabular data (RDBMS table,  Excel spreadsheet, CSV file, Tabbed text file, HTML tables)
  - Ordered/unordered time series data (Logs which are properly formatted)
  - Numpy arrays and similar array-like objects (2-D matrix)


### Pandas fundamental data structures
- Series (1-dimensional) 
- DataFrame (2-dimensional - made up of Series)

Series objects are array-like and can be used in environments that expect numpy arrays

In [2]:
%pylab
%matplotlib inline
import pandas as pd

Using matplotlib backend: MacOSX
Populating the interactive namespace from numpy and matplotlib


## Loading data into pandas

In [4]:
df = pd.read_csv("scores.csv")
#df = pd.read_excel("scores.xlsx", "sheet1")
#df  = pd.read_html("html_tables.html")
df = pd.DataFrame()

ValueError: Cannot describe a DataFrame without columns

In [19]:
data = [
 {'name': 'john',   'physics': 66, 'maths': 78, 'compsci': 89 },
 {'name': 'adrian', 'physics': 82, 'maths': 73, 'compsci': 91 },
 {'name': 'bourne', 'physics': 73, 'maths': 94, 'compsci': 86 },
 {'name': 'smith',  'physics': 62, 'maths': 65, 'compsci': 68 },
 {'name': 'jane',   'physics': 79, 'maths': 84, 'compsci': 89 },
 {'name': 'david',  'physics': 84, 'maths': 87, 'compsci': 82 },
 {'name': 'emily',  'physics': 80, 'maths': 93, 'compsci': 84 },
 {'name': 'floyd',  'physics': 87, 'maths': 86, 'compsci': 92 },
 {'name': 'gary',   'physics': 78, 'maths': 72, 'compsci': 84 }]

df = pd.DataFrame(data)
df

Unnamed: 0,compsci,maths,name,physics
0,89,78,john,66
1,91,73,adrian,82
2,86,94,bourne,73
3,68,65,smith,62
4,89,84,jane,79
5,82,87,david,84
6,84,93,emily,80
7,92,86,floyd,87
8,84,72,gary,78


In [5]:
from collections import OrderedDict

data = [
 OrderedDict([('name', 'john'),
              ('physics', '66'),
              ('maths', '78'),
              ('compsci', '89')]),
 OrderedDict([('name', 'adrian'),
              ('physics', '82'),
              ('maths', '73'),
              ('compsci', '91')]),
 OrderedDict([('name', 'bourne'),
              ('physics', '73'),
              ('maths', '94'),
              ('compsci', '86')]),
 OrderedDict([('name', 'smith'),
              ('physics', '62'),
              ('maths', '65'),
              ('compsci', '68')]),
 OrderedDict([('name', 'jane'),
              ('physics', '79'),
              ('maths', '84'),
              ('compsci', '89')]),
 OrderedDict([('name', 'david'),
              ('physics', '84'),
              ('maths', '87'),
              ('compsci', '82')]),
 OrderedDict([('name', 'emily'),
              ('physics', '80'),
              ('maths', '93'),
              ('compsci', '84')]),
 OrderedDict([('name', 'floyd'),
              ('physics', '87'),
              ('maths', '86'),
              ('compsci', '92')]),
 OrderedDict([('name', 'gary'),
              ('physics', '78'),
              ('maths', '72'),
              ('compsci', '84')])]

df = pd.DataFrame(data)
df

Unnamed: 0,name,physics,maths,compsci
0,john,66,78,89
1,adrian,82,73,91
2,bourne,73,94,86
3,smith,62,65,68
4,jane,79,84,89
5,david,84,87,82
6,emily,80,93,84
7,floyd,87,86,92
8,gary,78,72,84


In [75]:
data = {
    'name': ['john', 'adrian', 'bourne', 'smith', 'jane', 'david', 'emily', 'floyd', 'gary'],
    'physics': [66, 82, 73, 62, 79, 84, 80, 87, 78],
    'maths': [78, 73, 94, 65, 84, 87, 93, 86, 72],
    'compsci': [89, 91, 86, 68, 89, 82, 84, 92, 84]
}

df = pd.DataFrame(data)
df
 


Unnamed: 0,compsci,maths,name,physics
0,89,78,john,66
1,91,73,adrian,82
2,86,94,bourne,73
3,68,65,smith,62
4,89,84,jane,79
5,82,87,david,84
6,84,93,emily,80
7,92,86,floyd,87
8,84,72,gary,78


In [25]:
data = {
    'name': ['john', 'adrian', 'bourne', 'smith', 'jane', 'david', 'emily', 'floyd', 'gary'],
    'physics': [66, 82, 73, 62, 79, 84, 80, 87, 78],
    'maths': [78, 73, 94, 65, 84, 87, 93, 86, 72],
    'compsci': [89, 91, 86, 68, 89, 82, 84, 92, 84]
}

df = pd.DataFrame(data)
df.to_excel("scores.xlsx")
 


In [2]:
import pandas as pd

In [3]:
data = {
    'name': ['john', 'adrian', 'bourne', 'smith', 'jane', 'david', 'emily', 'floyd', 'gary'],
    'physics': [66, 82, 73, 62, 79, 84, 80, 87, 78],
    'maths': [78, 73, 94, 65, 84, 87, 93, 86, 72],
    'compsci': [89, 91, 86, 68, 89, 82, 84, 92, 84]
}

df = pd.DataFrame(data)
df["name"] = df["name"].apply(lambda x: x.upper())
df

Unnamed: 0,compsci,maths,name,physics
0,89,78,JOHN,66
1,91,73,ADRIAN,82
2,86,94,BOURNE,73
3,68,65,SMITH,62
4,89,84,JANE,79
5,82,87,DAVID,84
6,84,93,EMILY,80
7,92,86,FLOYD,87
8,84,72,GARY,78


In [33]:
data = {
    'name': ['john', 'adrian', 'bourne', 'smith', 'jane', 'david', 'emily', 'floyd', 'gary'],
    'physics': [66, 82, 73, 62, 79, 84, 80, 87, 78],
    'maths': [78, 73, 94, 65, 84, 87, 93, 86, 72],
    'compsci': [89, 91, 86, 68, 89, 82, 84, 92, 84]
}

df = pd.DataFrame(data)
df.plot?


In [87]:
df.index

RangeIndex(start=0, stop=9, step=1)

In [76]:
df[:4][["name", "physics"]]

Unnamed: 0,name,physics
0,john,66
1,adrian,82
2,bourne,73
3,smith,62


In [83]:
df["physics"].max()

87

In [39]:
df[["name", "maths"]]

Unnamed: 0,name,maths
0,john,78
1,adrian,73
2,bourne,94
3,smith,65
4,jane,84
5,david,87
6,emily,93
7,floyd,86
8,gary,72


In [16]:
df[:4]["name"]

0      john
1    adrian
2    bourne
3     smith
Name: name, dtype: object

In [17]:
df.maths

0    78
1    73
2    94
3    65
4    84
5    87
6    93
7    86
8    72
Name: maths, dtype: object

In [20]:
df.columns

Index(['compsci', 'maths', 'name', 'physics'], dtype='object')

In [21]:
df.index

RangeIndex(start=0, stop=9, step=1)

In [84]:
df.maths.max()

94

In [85]:
df[df["maths"] == df["maths"].max()]

Unnamed: 0,compsci,maths,name,physics
2,86,94,bourne,73


In [24]:
df["name"][df["maths"] == df["maths"].max()]

2    bourne
Name: name, dtype: object

In [25]:
df["name"]

0      john
1    adrian
2    bourne
3     smith
4      jane
5     david
6     emily
7     floyd
8      gary
Name: name, dtype: object

In [98]:
#df.set_index("name", inplace=True)
df.reset_index()

Unnamed: 0,name,compsci,maths,physics
0,john,89,78,66
1,adrian,91,73,82
2,bourne,86,94,73
3,smith,68,65,62
4,jane,89,84,79
5,david,82,87,84
6,emily,84,93,80
7,floyd,92,86,87
8,gary,84,72,78


In [32]:
df.index[:1]

Index(['john'], dtype='object', name='name')

In [33]:
df.reset_index()

Unnamed: 0,name,compsci,maths,physics
0,john,89,78,66
1,adrian,91,73,82
2,bourne,86,94,73
3,smith,68,65,62
4,jane,89,84,79
5,david,82,87,84
6,emily,84,93,80
7,floyd,92,86,87
8,gary,84,72,78


In [44]:
df["name"][df["compsci"] > 85].count()

5

In [51]:
pt = df.pivot_table(columns=["name"])
pt

name,adrian,bourne,david,emily,floyd,gary,jane,john,smith
compsci,91,86,82,84,92,84,89,89,68
maths,73,94,87,93,86,72,84,78,65
physics,82,73,84,80,87,78,79,66,62


In [52]:
df = pd.DataFrame({"A": ["foo", "foo", "foo", "foo", "foo",
                          "bar", "bar", "bar", "bar"],
                    "B": ["one", "one", "one", "two", "two",
                          "one", "one", "two", "two"],
                    "C": ["small", "large", "large", "small",
                          "small", "large", "small", "small",
                          "large"],
                    "D": [1, 2, 2, 3, 3, 4, 5, 6, 7]})

df

Unnamed: 0,A,B,C,D
0,foo,one,small,1
1,foo,one,large,2
2,foo,one,large,2
3,foo,two,small,3
4,foo,two,small,3
5,bar,one,large,4
6,bar,one,small,5
7,bar,two,small,6
8,bar,two,large,7


In [56]:
table = df.pivot_table(values='D', index=['A', 'B'],  columns=['C'], aggfunc=np.sum)
table

Unnamed: 0_level_0,C,large,small
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,4.0,5.0
bar,two,7.0,6.0
foo,one,4.0,1.0
foo,two,,6.0


In [123]:
import re

line1 = '            bash-2888  [002] ....   214.858474: kmem_cache_free: call_site=c132e814 ptr=d78dd0a0\n'
line2 = '            bash-2833  [001] ....   214.858888: kmalloc: call_site=c13a0a0d ptr=d5265780 bytes_req=188 bytes_alloc=192 gfp_flags=GFP_KERNEL_ACCOUNT|__GFP_ZERO\n'

generic_regex = r"""
    (?P<program>\S+)
    -
    (?P<pid>\d+)
    \s+\[
    (?P<cpu>\d+)
    \]\s+
    (?P<irqs_disabled>.)
    (?P<need_resched>.)
    (?P<irq_context>.)
    (?P<preempt_count>.)
    \s+
    (?P<timestamp>[\d\.]+)
    :\s+
    (?P<function>\w+)
    :\s+.+
"""

alloc_regex = r"""
    bytes_req=(?P<bytes_req>\d+)
    \s+
    bytes_alloc=(?P<bytes_alloc>\d+)
"""
gpattern = re.compile(generic_regex, re.VERBOSE)
alloc_pattern = re.compile(alloc_regex, re.VERBOSE)

from collections import OrderedDict


def parse_trace_log(filename):
    with open(filename) as tracefile:
        for line in tracefile:
            generic = gpattern.search(line)
            if not generic: continue
            rec = OrderedDict(generic.groupdict())
            alloc = alloc_pattern.search(line)
            if alloc:
                rec.update(alloc.groupdict())
            print(rec)


{'program': 'bash', 'pid': '2888', 'cpu': '002', 'irqs_disabled': '.', 'need_resched': '.', 'irq_context': '.', 'preempt_count': '.', 'timestamp': '214.858474', 'function': 'kmem_cache_free'}


In [125]:
tf = pd.read_csv("../Samples/analytics/kmem-trace.csv", chunksize=1024*25)

total_size = 0
for df in tf:
    total_size += df.bytes_alloc.sum()
    
print(total_size)


46668392.0


In [1]:
tf = pd.read_csv("../Samples/analytics/kmem-trace.csv", chunksize=1024*25)

df = tf.get_chunk()

df.groupby("cpu").sum()["bytes_alloc"]

NameError: name 'pd' is not defined

In [163]:
tf = pd.read_csv("../Samples/analytics/kmem-trace.csv", chunksize=1024*25)

bytes_alloc = {0: 0, 1: 0, 2: 0, 3: 0}
for df in tf:
    row = df.groupby("cpu").sum().bytes_alloc
    bytes_alloc[0] += row.get(0, 0)
    bytes_alloc[1] += row.get(1, 0)
    bytes_alloc[2] += row.get(2, 0)
    bytes_alloc[3] += row.get(3, 0)
    
print(bytes_alloc)

{0: 22065744.0, 1: 15694768.0, 2: 4239080.0, 3: 4668800.0}


In [146]:
tf = pd.read_csv("../Samples/analytics/kmem-trace.csv", chunksize=1024*25)

df = tf.get_chunk()

df[df["irqs_disabled"] == 'd']["bytes_alloc"].count()

31

In [147]:
tf = pd.read_csv("../Samples/analytics/kmem-trace.csv", chunksize=1024*25)

df = tf.get_chunk()

df["bytes_alloc"].max()

6976.0

In [161]:
tf = pd.read_csv("../Samples/analytics/kmem-trace.csv", chunksize=1024*25)

df = tf.get_chunk()

df.groupby("program").sum().sort_values("bytes_alloc", ascending=False)[]

Unnamed: 0_level_0,pid,cpu,timestamp,bytes_req,bytes_alloc
program,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
bash,39993516,21220,3080940.0,1717576.0,1748312.0
git,22624616,7138,1727341.0,1384830.0,1389880.0
ls,5323613,5395,410135.5,616958.0,617872.0
dbus-daemon,357372,0,35915.32,102872.0,103488.0
cat,1434928,1442,108388.8,40559.0,40992.0
vminfo,735900,0,66509.81,32730.0,35456.0
kworker/u8:5,23120,60,4392.939,980.0,1120.0
kworker/u8:7,1744,2,223.805,192.0,192.0
<idle>,0,866,148060.4,0.0,0.0
automount,76167,0,6829.351,0.0,0.0


In [164]:
df.groupby("program").sum().sort_values

<bound method DataFrame.sort_values of               pid    cpu     timestamp  bytes_req  bytes_alloc
program                                                       
cat      58526835  19941  5.326524e+06   188320.0     188392.0
screen       8385      0  8.016506e+02        0.0          0.0
vmstats      9824      4  1.068349e+03        0.0          0.0>

In [166]:
pd.__version__

'0.22.0'