In [1]:
import os

from IPython.display import Image
# Work inside the data directory so the commands in this notebook can use bare file names.
# The location can be overridden with the DATA_DIR environment variable; the default is
# the original /home/data.
os.chdir(os.environ.get("DATA_DIR", "/home/data"))

In [2]:
# Count the lines of every CSV file in the data directory.
# A shell glob is used instead of `ls | grep csv`: the substring match also counted the
# helper script get-csvs.sh, which inflated the total.
!wc -l *.csv

   7453216 flights.csv
        10 flights_01.csv
        10 flights_02.csv
      1000 flights_1k.csv
  10000001 fromPandas.csv
  17454237 total


# csvtk

- A cross-platform, efficient, practical and pretty CSV/TSV toolkit in Golang
- [docs](http://bioinf.shenwei.me/csvtk/)
- [Usage](http://bioinf.shenwei.me/csvtk/usage/)
- [Tutorial](http://bioinf.shenwei.me/csvtk/tutorial/)

In [3]:
# Running csvtk without a subcommand prints its version, usage notes and command list.
!csvtk

A cross-platform, efficient and practical CSV/TSV toolkit

Version: 0.13.0

Author: Wei Shen <shenwei356@gmail.com>

Documents  : http://shenwei356.github.io/csvtk
Source code: https://github.com/shenwei356/csvtk

Attention:

    1. The CSV parser requires all the lines have same number of fields/columns.
       Even lines with spaces will cause error.
    2. By default, csvtk thinks your files have header row, if not, switch flag "-H" on.
    3. Column names better be unique.
    4. By default, lines starting with "#" will be ignored, if the header row
       starts with "#", please assign flag "-C" another rare symbol, e.g. '$'.
    5. By default, csvtk handles CSV files, use flag "-t" for tab-delimited files.
    6. If " exists in tab-delimited files, use flag "-l".

Environment variables for frequently used global flags

    - "CSVTK_T" for flag "-t/--tabs"
    - "CSVTK_H" for flag "-H/--no-header-row"

Usage:
  csvtk [command]

Available Commands:
  collapse        collapse one fi

## See column names

In [4]:
# List the column names of fromPandas.csv together with their 1-based positions.
!csvtk headers fromPandas.csv

# fromPandas.csv
1	C00
2	A01
3	A02
4	A03
5	C04
6	D05
7	B06
8	C07
9	C08
10	A09


## See first few rows

In [5]:
# Print the header row followed by the first 5 data rows.
!csvtk head -n 5 fromPandas.csv

C00,A01,A02,A03,C04,D05,B06,C07,C08,A09
PO,Critical,0.31,-0.02,0.15,0.52,-1.24,-1.12,-1.68,-0.7
AR,Critical,1.33,-0.65,0.29,-1.31,0.32,-1.61,1.27,0.34
AR,Critical,-2.4,-0.23,0.28,0.95,0.82,-0.18,-1.73,-1.44
PO,Critical,0.16,-0.01,0.46,0.09,-0.43,-0.79,-1.5,0.87
PO,Alert,-0.34,-0.37,0.17,0.62,-1.19,1.81,0.66,0.1


## `pretty`

In [None]:
# Built-in help for the `pretty` subcommand.
!csvtk pretty --help

In [None]:
# Render the first rows as an aligned table
# (-r presumably right-aligns the cells; see `csvtk pretty -h` to confirm).
!csvtk head fromPandas.csv | csvtk pretty -r

## `sample`

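- the `-H` switch tells csvtk that the input has no header row, so the first line is treated as ordinary data instead of always being printed; `-p` specifies the proportion of rows to sample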

In [None]:
# Sample roughly 1% of the lines and count them. With -H the first line is treated as
# data, so no header line is automatically added to the count.
!csvtk sample -H -p 0.01 fromPandas.csv | wc -l

In [None]:
# Sample roughly 0.1% of the rows and show only the first lines of the result.
!csvtk sample -p 0.001 fromPandas.csv | head

## `stats` 

In [None]:
# Print csvtk's summary of the file as a whole.
!csvtk stats fromPandas.csv

## `cut`

In [None]:
# Built-in help for the `cut` subcommand.
!csvtk cut --help

In [None]:
# by position, ranges: keep columns 2, 3 and 5 through 7
!head fromPandas.csv | csvtk cut -f 2,3,5-7 | csvtk pretty -r

In [None]:
# by exact name; the names must be ones that `csvtk headers fromPandas.csv` lists
# for this file (C00, A01, A02, A03, C04, D05, B06, C07, C08, A09)
!head fromPandas.csv | csvtk cut -f D05,C07,C00,C04 | csvtk pretty -r

In [None]:
# by fuzzy matching: every column whose name matches A0*, plus the column D05
!head fromPandas.csv | csvtk cut -F -f "A0*,D05" | csvtk pretty -r

In [None]:
# ignoring columns by position, ranges
# csvtk cut -f -3--1 for discarding column 1,2,3
# here: discard columns 2-5 and 9-10, i.e. keep columns 1, 6, 7 and 8 of the 10 columns
!head fromPandas.csv | csvtk cut -f -5--2,-10--9 | csvtk pretty -r

## `uniq`

In [None]:
# Built-in help for the `uniq` subcommand.
!csvtk uniq --help

In [None]:
# Keeps only the row of the first occurrence of each distinct value in column C00.
# The file is handed to csvtk directly rather than streamed through `cat`.
!csvtk uniq -f C00 fromPandas.csv

In [None]:
# Distinct values of the text column A01: select that column, then de-duplicate on
# the only remaining field.
!csvtk cut -f A01 fromPandas.csv | csvtk uniq -f 1

## `freq`

In [None]:
# Built-in help for the `freq` subcommand.
!csvtk freq --help

In [None]:
# Frequency table of the values in column C00 (file passed directly, no `cat`).
!csvtk freq -f C00 fromPandas.csv

In [None]:
# Frequency table of C00, ordered by the key (-k), shown as an aligned table.
!csvtk freq -f C00 -k fromPandas.csv | csvtk pretty

In [None]:
# Frequency table of C00, ordered by count (-n) in descending order (-r).
!csvtk freq -f C00 -n -r fromPandas.csv | csvtk pretty

In [None]:
# combination of two variables: counts for every (C00, A01) pair, largest first
!csvtk freq -f C00,A01 -n -r fromPandas.csv | csvtk pretty

---

## `plot`

In [None]:
# Built-in help for the `plot` subcommand.
!csvtk plot --help

### Histogram

In [None]:
# Histogram of the numeric column C04, drawn from a 1% sample of the rows and
# written to hist.png.
!csvtk sample -p 0.01 fromPandas.csv \
| csvtk plot hist -f "C04" -o hist.png

In [None]:
# Display hist.png, the file written by the preceding plot command.
# `Image` is the IPython.display class and must be imported before this cell runs.
Image('hist.png', width=400)

### Boxplots

In [None]:
# Box plot of the numeric column C04, one box per value of C00, drawn from a 1%
# sample of the rows and written to box.png.
!csvtk sample -p 0.01 fromPandas.csv \
| csvtk plot box -g "C00" -f "C04" -o box.png

In [None]:
# Display box.png, the file written by the preceding plot command.
Image('box.png', width=400)

In [None]:
# Same box plot of C04 grouped by C00, drawn horizontally (--horiz) and written to
# box2.png.
!csvtk sample -p 0.01 fromPandas.csv \
| csvtk plot box -g "C00" -f "C04" --horiz -o box2.png

In [None]:
# Display box2.png, the file written by the preceding plot command.
Image('box2.png', width=400)

In [None]:
import pandas as pd
import numpy as np

%pylab inline

df = (pd.DataFrame({'x': range(5000)})
 .assign(Y = lambda df: np.random.randn(5000).round(2))
 .assign(Z = lambda df: 2 * df['x'] + 5)
 .assign(Grp = pd.Series(list('ABCD')).sample(5000, replace=True).values))

df.to_csv('line.csv', index=False)

In [None]:
# Line plot of column Y against column x from line.csv, written to lineplot.png.
!csvtk plot line line.csv -x x -y Y -o lineplot.png

In [None]:
# Display lineplot.png, the file written by the preceding plot command.
Image('lineplot.png')

### ScatterPlot

In [None]:
# Scatter plot (--scatter) of Y against x, one series per value of Grp, written to
# scatter.png. The file is passed as an argument instead of being piped through `cat`.
!csvtk plot line line.csv -x x -y Y -g Grp --scatter -o scatter.png

In [None]:
# Display scatter.png, the file written by the preceding plot command.
Image('scatter.png', width=400)

---

## `grep`

In [None]:
# Built-in help for the `grep` subcommand.
!csvtk grep --help

In [None]:
# Rows whose C00 field matches the pattern EN; only the first rows are shown, as a table.
!csvtk grep -f C00 -p EN fromPandas.csv | csvtk head | csvtk pretty -r

In [None]:
# Remove rows containing missing data: match an empty value (regex ^$) in any column and
# invert the match with -v, so only complete rows are kept.
# The input file is named explicitly (without it csvtk waits on stdin), and the
# surviving rows are counted instead of being printed in full.
!csvtk grep -F -f "*" -r -p "^$" -v fromPandas.csv | wc -l

---

## `filter`

In [None]:
# Built-in help for the `filter` subcommand.
!csvtk filter --help

In [None]:
# Count the lines of output (header included) for rows whose numeric column C04 exceeds 3.00.
!csvtk filter -f "C04>3.00" fromPandas.csv | wc -l

In [None]:
# Rows where the columns selected by the fuzzy pattern A* are greater than 1.
# NOTE(review): A* also matches A01, which holds text (Critical/Alert) in the rows shown
# by `csvtk head`; confirm the numeric comparison behaves as intended for that column.
!csvtk filter -F -f "A*>1" fromPandas.csv | csvtk head | csvtk pretty

---

## `filter2`

In [None]:
# Built-in help for the `filter2` subcommand.
!csvtk filter2 --help

In [None]:
# Rows where the numeric column D05 is greater than 1 and C00 equals "ES".
# The expression stays in single quotes so the shell leaves $D05 / $C00 untouched.
!csvtk filter2 -f '$D05>1 && $C00=="ES"' fromPandas.csv | csvtk head | csvtk pretty -r

---

## `rename`

In [None]:
# Built-in help for the `rename` subcommand.
!csvtk rename --help

In [None]:
# Rename columns 1 and 2 to Lang and Msg, then preview the first rows as a table.
!csvtk rename -f 1,2 -n Lang,Msg fromPandas.csv | csvtk head | csvtk pretty -r

In [None]:
# Built-in help for the `rename2` subcommand (regex-based renaming).
!csvtk rename2 --help

In [None]:
# Take the header plus 4 rows, discard columns 1 and 2 (-f -2--1), then rename every
# remaining column: the pattern (.*) captures the whole name and the replacement puts
# the prefix Num_ in front of it.
!head -5 fromPandas.csv \
| csvtk cut -f -2--1 \
| csvtk rename2 -F -f "*" -p "(.*)" -r 'Num_${1}' \
| csvtk pretty -r 

---

## `stats2`

In [None]:
# Built-in help for the `stats2` subcommand.
!csvtk stats2 --help

In [None]:
# Per-column statistics for the columns selected by the fuzzy pattern A*.
# NOTE(review): A* also matches A01, which holds text (Critical/Alert) in the rows shown
# by `csvtk head`; confirm stats2 accepts that column or narrow the selection.
!csvtk stats2 -F -f 'A*' fromPandas.csv

---

## `mutate`

In [None]:
# Built-in help for the `mutate` subcommand.
!csvtk mutate --help

In [None]:
# Add a new column named C00_copy derived from column C00
# (with no pattern given this is presumably a plain copy; see `csvtk mutate -h`).
!head fromPandas.csv | csvtk mutate -f C00 -n C00_copy

## `sort`

In [None]:
# Built-in help for the `sort` subcommand.
!csvtk sort --help