# CSV files 1: reading a CSV file

## What is a CSV file?

CSV stands for comma-separated values.
- It uses a comma (or another delimiter) to separate the values in each row
- It is a plain-text file where data is stored in rows

In [2]:
import polars as pl

In [3]:
# Relative path to the Titanic CSV that the examples in this notebook read
csv_file = "data/titanic.csv"

In [4]:
# Read the CSV into a DataFrame with default settings, then show the first three rows
df = pl.read_csv(source=csv_file)
df.head(n=3)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
3,1,3,"""Heikkinen, Miss. Laina""","""female""",26.0,0,0,"""STON/O2. 3101282""",7.925,,"""S"""


## Header and column names
By default Polars takes the first row of a CSV as the header to set the column names.

### No header
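If the first row is not a header we can set `has_header=False`; the columns are then named `column_1`, `column_2` and so on. Our file does have a header, so in the example below the header row is read as data and every column is parsed as a string.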

In [6]:
# Treat the first line as data rather than as column names; preview two rows
pl.read_csv(source=csv_file, has_header=False).head(n=2)

column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9,column_10,column_11,column_12
str,str,str,str,str,str,str,str,str,str,str,str
"""PassengerId""","""Survived""","""Pclass""","""Name""","""Sex""","""Age""","""SibSp""","""Parch""","""Ticket""","""Fare""","""Cabin""","""Embarked"""
"""1""","""0""","""3""","""Braund, Mr. Owen Harris""","""male""","""22""","""1""","""0""","""A/5 21171""","""7.25""",,"""S"""


### Rename columns
We can rename columns as soon as the CSV is parsed with `new_columns`. If the list is shorter than the number of columns, only the leading columns are renamed.

In [7]:
# Only one new name is supplied, so only the first column is renamed
pl.read_csv(source=csv_file, new_columns=["passengerid"]).head(n=2)

passengerid,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""


### Skip rows after the header

In [8]:
# Keep the header line but drop the first data row that follows it
pl.read_csv(source=csv_file, skip_rows_after_header=1).head(n=2)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
3,1,3,"""Heikkinen, Miss. Laina""","""female""",26.0,0,0,"""STON/O2. 3101282""",7.925,,"""S"""


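### Skip header
`skip_rows=1` skips the first line of the file. Because `has_header` is still `True`, the next line (here the first data row) is then used for the column names; add `has_header=False` if you want default column names instead.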

In [9]:
# Skip the first line of the file (the original header). has_header keeps its
# default, so the following line is taken as the column names.
pl.read_csv(source=csv_file, skip_rows=1).head(n=2)

1,0,3,"Braund, Mr. Owen Harris",male,22,1_duplicated_0,0_duplicated_0,A/5 21171,7.25,Unnamed: 10_level_0,S
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
3,1,3,"""Heikkinen, Miss. Laina""","""female""",26.0,0,0,"""STON/O2. 3101282""",7.925,,"""S"""


## Parsing CSV

In [12]:
CSV_string = b"A,B,C\n0,1,2\n" # the b prefix makes this a bytes literal holding the CSV text

In [11]:
# Parse CSV content held in memory as bytes instead of reading it from a path
pl.read_csv(source=CSV_string)

A,B,C
i64,i64,i64
0,1,2


### Delimiter
Polars assumes the delimiter is a `,`, but this can be changed with the `separator` argument.

In [13]:
# Same small table, but with tab characters between the values
tab_CSV_string = b"A\tB\tC\n0\t1\t2\n"

pl.read_csv(source=tab_CSV_string, separator="\t")

A,B,C
i64,i64,i64
0,1,2


### Comment lines

Lines in the CSV that start with a given prefix are treated as comments and skipped when we set `comment_prefix`.

In [14]:
# The second line starts with "#" and must not become a data row
comment_CSV_string = b"a,b,c\n#Comment\n0,1,2\n"

pl.read_csv(source=comment_CSV_string, comment_prefix="#")

a,b,c
i64,i64,i64
0,1,2


### Quotes
The character used to quote fields in the CSV is set with `quote_char`; a delimiter inside a quoted field is kept as part of the value.

In [16]:
# The name field contains a comma, so it is wrapped in double quotes
quote_CSV_string = b'name,age\n"Bili, Bili",39\n'

pl.read_csv(source=quote_CSV_string, quote_char='"')

name,age
str,i64
"""Bili, Bili""",39


### Choosing columns
We can restrict which columns are in the `DataFrame` with the `columns` argument

In [17]:
CSV_string = b"A,B,C\n0,1,2\n"

# Keep columns A and C only; B is left out of the result
pl.read_csv(source=CSV_string, columns=["A", "C"])

A,C
i64,i64
0,2


## Inferring the dtypes
A CSV file does not store the dtype of its columns.

Therefore Polars infers the dtype of each column in the CSV. 

Steps:
- Reads the first 100 rows (the default `infer_schema_length`)
- Sets the dtype of each column from those rows; if a later value does not fit the inferred dtype, Polars raises an exception

### Number of rows to infer the dtypes
We can adjust the number of lines used for type inference.

In [18]:
# Infer the column dtypes from the first 58 rows of the file
pl.read_csv(source=csv_file, infer_schema_length=58).head(n=2)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""


In [None]:
# Deliberate failure: with only 55 rows used for inference, Polars types the
# `Age` column as i64 and then cannot parse a later fractional value.
# The error is caught and printed so that "Restart & Run All" can continue
# past this demonstration cell.
try:
    pl.read_csv(
        csv_file,
        infer_schema_length=55,
    ).head(2)
except pl.exceptions.ComputeError as err:
    print(f"ComputeError: {err}")

ComputeError: could not parse `28.5` as dtype `i64` at column 'Age' (column number 6)

The current offset in the file is 4055 bytes.

You might want to try:
- increasing `infer_schema_length` (e.g. `infer_schema_length=10000`),
- specifying correct dtype with the `schema_overrides` argument
- setting `ignore_errors` to `True`,
- adding `28.5` to the `null_values` list.

Original error: ```invalid primitive value found during CSV parsing```

### Setting the schema
We can also define the full schema explicitly with a `dict` when reading the CSV

In [20]:
# Declare the dtype of every column, in file order, with 32-bit numeric types
pl.read_csv(
    source=csv_file,
    schema=dict(
        PassengerId=pl.Int32,
        Survived=pl.Int32,
        Pclass=pl.Int32,
        Name=pl.String,
        Sex=pl.String,
        Age=pl.Float32,
        SibSp=pl.Int32,
        Parch=pl.Int32,
        Ticket=pl.String,
        Fare=pl.Float32,
        Cabin=pl.String,
        Embarked=pl.String,
    ),
).head(n=2)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i32,i32,i32,str,str,f32,i32,i32,str,f32,str,str
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.283302,"""C85""","""C"""


Passing the schema for all columns is tedious.

Use `schema_overrides` to override the inferred schema for specific columns

In [21]:
# Override the dtype of the first three columns only; the rest stay inferred
pl.read_csv(
    source=csv_file,
    schema_overrides=dict.fromkeys(["PassengerId", "Survived", "Pclass"], pl.Int32),
).head(n=2)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i32,i32,i32,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""


## Handling mixed types and exceptions

In [22]:
# Column A holds both a number ("1.0") and a letter ("a"); column B holds only integers
mixed_CSV_string = b"A,B\n1.0,1\na,1"

In [23]:
# No dtype hints are given, so the reader has to pick a dtype for the mixed column
pl.read_csv(source=mixed_CSV_string)

A,B
str,i64
"""1.0""",1
"""a""",1


Polars casts column `A` to string automatically, because its values cannot all be parsed as numbers.

### Ignore errors

In [24]:
CSV_string = b"A,B\nTrue,1\n0,1\n"

# Force column A to Boolean; with ignore_errors=True a value that cannot be
# parsed as that dtype ("0" here) becomes null instead of raising
pl.read_csv(
    source=CSV_string,
    schema_overrides={"A": pl.Boolean},
    ignore_errors=True,
)

A,B
bool,i64
True,1
,1


## Set values to `null`

In [None]:
CSV_string = b"A,B\nTrue,1\nFalse,NaN\n"

# Treat the literal text "NaN" as missing; null_values can also be given as a list
pl.read_csv(source=CSV_string, null_values="NaN")

A,B
bool,i64
True,1
False,


## Performance of CSV parsing
### Number of threads
The CSV parser in Polars is multithreaded and by default uses as many threads as there are cores on the machine.

We can vary the number of threads with the `n_threads` argument.

### Memory usage
We can potentially reduce memory usage when reading a large CSV with `low_memory = True`

We can also reduce the size of each batch read by the parallel CSV reader with `batch_size`

In [26]:
# Ask the reader to favour lower memory use and set the batch size explicitly
pl.read_csv(source=csv_file, low_memory=True, batch_size=10_000)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
3,1,3,"""Heikkinen, Miss. Laina""","""female""",26.0,0,0,"""STON/O2. 3101282""",7.925,,"""S"""
4,1,1,"""Futrelle, Mrs. Jacques Heath (…","""female""",35.0,1,0,"""113803""",53.1,"""C123""","""S"""
5,0,3,"""Allen, Mr. William Henry""","""male""",35.0,0,0,"""373450""",8.05,,"""S"""
…,…,…,…,…,…,…,…,…,…,…,…
887,0,2,"""Montvila, Rev. Juozas""","""male""",27.0,0,0,"""211536""",13.0,,"""S"""
888,1,1,"""Graham, Miss. Margaret Edith""","""female""",19.0,0,0,"""112053""",30.0,"""B42""","""S"""
889,0,3,"""Johnston, Miss. Catherine Hele…","""female""",,1,2,"""W./C. 6607""",23.45,,"""S"""
890,1,1,"""Behr, Mr. Karl Howell""","""male""",26.0,0,0,"""111369""",30.0,"""C148""","""C"""


## Exercises

### Exercise 1

In [27]:
# Reference DataFrame for this exercise: three integer columns with two rows each
target = pl.DataFrame(
    {
        "a": [1, 2],
        "b": [3, 4],
        "c": [5, 6],
    }
)
target

a,b,c
i64,i64,i64
1,3,5
2,4,6


Parse the CSV strings in the following cells so that each result matches `target`.

In [29]:
# The first line is a free-text note, so skip it before the header is read
CSV_string = b"Data passed quality control 2020-01-01\na,b,c\n1,3,5\n2,4,6\n"
pl.read_csv(source=CSV_string, skip_rows=1)

a,b,c
i64,i64,i64
1,3,5
2,4,6


In [30]:
# Rename columns: replace the upper-case header names with lower-case ones
CSV_string = b"A,B,C\n1,3,5\n2,4,6\n"
pl.read_csv(source=CSV_string, new_columns=["a", "b", "c"])

a,b,c
i64,i64,i64
1,3,5
2,4,6


In [31]:
# Whitespace delimiter: values are separated by a single space, not a comma
CSV_string = b"a b c\n1 3 5\n2 4 6\n"
pl.read_csv(source=CSV_string, separator=" ")

a,b,c
i64,i64,i64
1,3,5
2,4,6


In [32]:
# Comment line: the note after the header starts with "#" and must be skipped
CSV_string = b"a,b,c\n#Data passed quality control 2020-01-01\n1,3,5\n2,4,6\n"
pl.read_csv(source=CSV_string, comment_prefix="#")

a,b,c
i64,i64,i64
1,3,5
2,4,6


This time parse the CSV to produce a `DataFrame` with all columns as 64-bit floats

In [34]:
CSV_string = b"a,b,c\n#Data passed quality control 2020-01-01\n1,3,5\n2,4,6\n"
# Skip the "#" note and read each of the three columns as a 64-bit float
pl.read_csv(
    source=CSV_string,
    comment_prefix="#",
    schema_overrides={name: pl.Float64 for name in ("a", "b", "c")},
)

a,b,c
f64,f64,f64
1.0,3.0,5.0
2.0,4.0,6.0


Find missing data in the CSV and replace with `null`

In [35]:
# Missing entries are written as "NA" or "na"; read both spellings as null
CSV_string = b"a,b,c\n1,3,5\nNA,4,na\n"
pl.read_csv(source=CSV_string, null_values=["NA", "na"])

a,b,c
i64,i64,i64
1,3,5
,4,


### Exercise 2
Parse the NYC taxi CSV with:
- the default number of threads,
- one thread, and
- 40 threads

to see whether the thread count affects performance.

In [36]:
# Relative path to the NYC taxi trip CSV read by the timing cells that follow
nyccsv_file = "data/nyc_trip_data_1k.csv"

In [38]:
%%timeit -n1 -r3
pl.read_csv(
    nyccsv_file
)

20.5 ms ± 7.97 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)


In [39]:
%%timeit -n1 -r3
pl.read_csv(
    nyccsv_file,
    n_threads=1
)

35.7 ms ± 14.1 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)


In [43]:
%%timeit -n1 -r3
pl.read_csv(
    nyccsv_file,
    n_threads=40
)

39.6 ms ± 18 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)
