Link to Medium blog post: https://towardsdatascience.com/pandas-dataframe-playing-with-csv-files-944225d19ff

# Persisting the DataFrame to CSV file

In [2]:
import pandas as pd

In [5]:
# A pandas DataFrame is an in-memory, spreadsheet-like table:
# named columns of equal length, addressed by a row index.

# Dictionary of lists: column name -> one value per row.
my_dict = dict(
    name=list("abcdefg"),
    age=[20, 27, 35, 55, 18, 21, 35],
    designation=["VP", "CEO", "CFO", "VP", "VP", "CEO", "MD"],
)

# Build the DataFrame; the dictionary keys become the column labels.
df = pd.DataFrame(data=my_dict)

# A bare expression at the end of a notebook cell displays the DataFrame.
df

Unnamed: 0,name,age,designation
0,a,20,VP
1,b,27,CEO
2,c,35,CFO
3,d,55,VP
4,e,18,VP
5,f,21,CEO
6,g,35,MD


In [6]:
# The data of this DataFrame can be stored in CSV format with the to_csv(...) method.
# Called like this, the row index is written to the file along with the columns.

df.to_csv('csv_example.csv')

In [7]:
# The previous to_csv(...) call also wrote the row index into the CSV file.
# That is unnecessary, because a DataFrame generates an index anyway when the file is loaded.
# Passing index=False to to_csv(...) keeps the index out of the file.

df.to_csv('csv_example.csv', index=False)

In [9]:
# Read the file back; `df` is rebound to the DataFrame loaded from disk.

df = pd.read_csv('csv_example.csv')

# Display the loaded DataFrame.
df

Unnamed: 0,name,age,designation
0,a,20,VP
1,b,27,CEO
2,c,35,CFO
3,d,55,VP
4,e,18,VP
5,f,21,CEO
6,g,35,MD


# Playing with Column Header

In [14]:
# The header parameter of read_csv(...) selects which row(s) of the file supply the column labels:
# a single row number, or a list of row numbers for a header made of several rows.
# header=0 uses the top row of the file as the header, which gives the same result
# as the plain read_csv(...) call without the parameter.

df = pd.read_csv('csv_example.csv', header=0)

df

Unnamed: 0,name,age,designation
0,a,20,VP
1,b,27,CEO
2,c,35,CFO
3,d,55,VP
4,e,18,VP
5,f,21,CEO
6,g,35,MD


In [15]:
# More than one row as column headers: rows 0, 1 and 2 of the file become
# three stacked header levels, and the data starts with the row after them.

df_csv = pd.read_csv('csv_example.csv', header=[0,1,2])

df_csv

Unnamed: 0_level_0,name,age,designation
Unnamed: 0_level_1,a,20,VP
Unnamed: 0_level_2,b,27,CEO
0,c,35,CFO
1,d,55,VP
2,e,18,VP
3,f,21,CEO
4,g,35,MD


In [16]:
# The header does not have to be the top row: header=5 uses row 5 of the file
# (counting from 0) as the header and skips every row before it.

df_csv = pd.read_csv('csv_example.csv', header=5)

df_csv

Unnamed: 0,e,18,VP
0,f,21,CEO
1,g,35,MD


In [17]:
# With several header rows, the DataFrame data starts only after the last header row;
# rows that are not listed but lie before it (here rows 0, 3 and 4) are dropped.

df_csv = pd.read_csv('csv_example.csv', header=[1,2,5])

# The resulting DataFrame starts from row 6 of the file.
df_csv

Unnamed: 0_level_0,a,20,VP
Unnamed: 0_level_1,b,27,CEO
Unnamed: 0_level_2,e,18,VP
0,f,21,CEO
1,g,35,MD


# Customizing Column Names

In [20]:
# Custom column names can be supplied through the names parameter of read_csv(...).

df_csv = pd.read_csv('csv_example.csv', names=['a', 'b', 'c'])

# With names alone, the file's own header line is not treated as a header:
# it shows up as the first data row, which is not desired.
df_csv

Unnamed: 0,a,b,c
0,name,age,designation
1,a,20,VP
2,b,27,CEO
3,c,35,CFO
4,d,55,VP
5,e,18,VP
6,f,21,CEO
7,g,35,MD


In [21]:
# The header parameter in read_csv(…) can be used to skip the row depicting the header

df_csv = pd.read_csv('csv_example.csv', names=['a', 'b', 'c'], header=1)

df_csv

Unnamed: 0,a,b,c
0,b,27,CEO
1,c,35,CFO
2,d,55,VP
3,e,18,VP
4,f,21,CEO
5,g,35,MD


In [25]:
# Another way of doing the same is to leave the header out when writing the CSV file
# (header=False), so that the file contains data rows only.

df.to_csv('csv_example.csv', index=False, header=False)

# The in-memory DataFrame is unchanged and still carries its column names.
df

Unnamed: 0,name,age,designation
0,a,20,VP
1,b,27,CEO
2,c,35,CFO
3,d,55,VP
4,e,18,VP
5,f,21,CEO
6,g,35,MD


In [29]:
# And while reading ,we can read without skipping the header as

df_csv = pd.read_csv('csv_example.csv', names=['AGE', 'DESIGNATION', 'NAME'])

df_csv

Unnamed: 0,AGE,DESIGNATION,NAME
0,a,20,VP
1,b,27,CEO
2,c,35,CFO
3,d,55,VP
4,e,18,VP
5,f,21,CEO
6,g,35,MD


# CSV to (Anything) Separated Value

In [46]:
# Other separators work the same way; the only difference is that the separator has to be
# passed explicitly through sep, whereas a comma is assumed by default.
# First create a file in which a colon (':') instead of a comma (',') is used as the separator ...

df.to_csv('csv_example.csv', index=False, sep=':')

# ... then pass the same separator when reading the file back.
df_csv = pd.read_csv('csv_example.csv', sep=':')

df_csv

Unnamed: 0,name,age,designation
0,a,20,VP
1,b,27,CEO
2,c,35,CFO
3,d,55,VP
4,e,18,VP
5,f,21,CEO
6,g,35,MD


# Setting the Row Index

In [48]:
# By default a DataFrame gets an automatically generated row index.
# set_index(...) makes an existing column the index instead; with inplace=True
# df_csv itself is modified, so the result does not need to be assigned.

df_csv.set_index('name', inplace=True)

df_csv

Unnamed: 0_level_0,age,designation
name,Unnamed: 1_level_1,Unnamed: 2_level_1
a,20,VP
b,27,CEO
c,35,CFO
d,55,VP
e,18,VP
f,21,CEO
g,35,MD


In [50]:
# The same can be done while loading the CSV file: index_col gives the position of the
# column to use as the row index. index_col=1 picks the second column ('age').

df_csv = pd.read_csv('csv_example.csv', sep=":", index_col=1)

df_csv

Unnamed: 0_level_0,name,designation
age,Unnamed: 1_level_1,Unnamed: 2_level_1
20,a,VP
27,b,CEO
35,c,CFO
55,d,VP
18,e,VP
21,f,CEO
35,g,MD


In [52]:
# index_col also accepts a list of column positions; the listed columns
# (here 'name' and 'age') together form the row index.

df_csv = pd.read_csv('csv_example.csv', sep=":", index_col=[0,1])

df_csv

Unnamed: 0_level_0,Unnamed: 1_level_0,designation
name,age,Unnamed: 2_level_1
a,20,VP
b,27,CEO
c,35,CFO
d,55,VP
e,18,VP
f,21,CEO
g,35,MD


# If all rows are not required… Don’t load them

In [53]:
# Limit the number of rows that are loaded by passing nrows to read_csv(...);
# nrows=3 loads the first three data rows only.

df_csv = pd.read_csv('csv_example.csv', sep=":", nrows=3)

df_csv

Unnamed: 0,name,age,designation
0,a,20,VP
1,b,27,CEO
2,c,35,CFO


In [55]:
# By default read_csv(...) skips blank lines, i.e. it ignores them while loading the file
# and constructing the DataFrame.
# To load blank lines as well (for example, to count empty records), pass skip_blank_lines=False.
# NOTE: the file written earlier contains no blank lines, so the output here looks the same as before.

df_csv = pd.read_csv('csv_example.csv', sep=":", skip_blank_lines=False)

df_csv

Unnamed: 0,name,age,designation
0,a,20,VP
1,b,27,CEO
2,c,35,CFO
3,d,55,VP
4,e,18,VP
5,f,21,CEO
6,g,35,MD
