# Writing a data file

# Document

<table align="left">
    <tr>
        <th style="text-align:left">Title</th>
        <td style="text-align:left">Saving a data file</td>
    </tr>
    <tr>
        <th style="text-align:left">Last modified</th>
        <td style="text-align:left">2020-12-23</td>
    </tr>
    <tr>
        <th style="text-align:left">Author</th>
        <td style="text-align:left">Gilles Pilon &lt;gillespilon13@gmail.com&gt;</td>
    </tr>
    <tr>
        <th style="text-align:left">Status</th>
        <td style="text-align:left">Active</td>
    </tr>
    <tr>
        <th style="text-align:left">Type</th>
        <td style="text-align:left">Jupyter notebook</td>
    </tr>
    <tr>
        <th style="text-align:left">Created</th>
        <td style="text-align:left">2018-12-21</td>
    </tr>
    <tr>
        <th style="text-align:left">File name</th>
        <td style="text-align:left">02_data_file_write.ipynb</td>
    </tr>
    <tr>
        <th style="text-align:left">Other files required</th>
        <td style="text-align:left">data/cloquet_two_weeks_60_min.csv<br />
                                    data/cloquet_two_weeks_30_min.csv<br />
                                    data/cloquet_two_weeks_15_min.csv
        </td>
    </tr>
</table>

# Introduction

- Write a dataframe to a csv file
- Write a dataframe to an Excel file with one worksheet
- Write several dataframes to an Excel file with multiple worksheets

# Import libraries

In [1]:
from openpyxl.utils.dataframe import dataframe_to_rows
from openpyxl import load_workbook
from openpyxl import Workbook
import datasense as ds
import pandas as pd

# Create three dataframes

In [2]:
# Value passed as `size` to ds.create_dataframe in the next cell.
size = 4

In [3]:
# Build three sample dataframes, each from its own create_dataframe call
# with the same `size` setting.
df1, df2, df3 = (ds.create_dataframe(size=size) for _ in range(3))

In [4]:
# Preview the first two rows of the first dataframe.
df1.head(2)

Unnamed: 0,a,b,c,d,i,r,s,t,u,x,y,z
0,73.275838,False,small,0 days,19.243484,0,male,2020-12-23 13:14:00,2020-12-23 13:14:00,0.007606,32,0.727518
1,65.844691,False,large,0 days,81.355234,1,female,2020-12-24 13:14:00,2020-12-24 13:14:00,0.049797,27,0.41078


In [5]:
# Show the original dtypes; the round trips below are compared against these.
df1.dtypes

a            float64
b            boolean
c           category
d    timedelta64[ns]
i            float64
r             object
s             object
t     datetime64[ns]
u     datetime64[ns]
x            float64
y              Int64
z            float64
dtype: object

# Write a dataframe to a csv file

In [6]:
# Write the first dataframe to a csv file.
ds.save_file(df=df1, file_name='data/just_a_test.csv')

In [7]:
# Read the csv back without any dtype hints to see what a plain round trip
# returns.
just_a_test = ds.read_file(file_name='data/just_a_test.csv')

In [8]:
# Preview the first two rows read back from the csv file.
just_a_test.head(2)

Unnamed: 0,a,b,c,d,i,r,s,t,u,x,y,z
0,73.275838,False,small,0 days,19.243484,0,male,2020-12-23 13:14:00,2020-12-23 13:14:00,0.007606,32,0.727518
1,65.844691,False,large,0 days,81.355234,1,female,2020-12-24 13:14:00,2020-12-24 13:14:00,0.049797,27,0.41078


In [9]:
# Check which dtypes survived the plain csv round trip (compare with
# df1.dtypes above).
just_a_test.dtypes

a    float64
b       bool
c     object
d     object
i    float64
r      int64
s     object
t     object
u     object
x    float64
y      int64
z    float64
dtype: object

## Fix dtypes on import

In [10]:
# Describe the columns that need help on import: explicit dtypes, columns to
# parse as datetimes, and columns to convert to timedeltas.
time_delta_columns = ['d']
parse_dates = ['t', 'u']
convert_dict = dict(b='boolean', c='category', r='str', y='Int64')

# Read the same csv again, this time passing the hints to the reader.
just_a_test = ds.read_file(
    file_name='data/just_a_test.csv',
    dtype=convert_dict,
    parse_dates=parse_dates,
    time_delta_columns=time_delta_columns,
)

In [11]:
# Preview the first two rows of the csv read with dtype hints.
just_a_test.head(2)

Unnamed: 0,a,b,c,d,i,r,s,t,u,x,y,z
0,73.275838,False,small,0 days,19.243484,0,male,2020-12-23 13:14:00,2020-12-23 13:14:00,0.007606,32,0.727518
1,65.844691,False,large,0 days,81.355234,1,female,2020-12-24 13:14:00,2020-12-24 13:14:00,0.049797,27,0.41078


In [12]:
# Confirm the dtypes now match the original dataframe's dtypes.
just_a_test.dtypes

a            float64
b            boolean
c           category
d    timedelta64[ns]
i            float64
r             object
s             object
t     datetime64[ns]
u     datetime64[ns]
x            float64
y              Int64
z            float64
dtype: object

## Write dataframes to an Excel workbook

In [13]:
# Write the three dataframes to one workbook, one worksheet per dataframe.
path = 'data/even_another_file.xlsx'
engine = 'openpyxl'
sheet_frames = {
    'sheet_one': df1,
    'sheet_two': df2,
    'sheet_three': df3,
}
# The context manager saves and closes the workbook when the block exits, so
# no explicit writer.save() call is needed (that method was deprecated in
# pandas 1.5 and removed in pandas 2.0).
with pd.ExcelWriter(path=path, engine=engine) as writer:
    for sheet_name, frame in sheet_frames.items():
        frame.to_excel(
            excel_writer=writer,
            sheet_name=sheet_name,
            index=False
        )

In [14]:
# Re-open the workbook written above (same `path`, so the two cannot drift
# apart) and list its worksheet names.
wb2 = load_workbook(filename=path)
wb2.sheetnames

['sheet_one', 'sheet_two', 'sheet_three']

## Read worksheets from an Excel workbook with openpyxl

In [15]:
# Load the first worksheet into a dataframe: the first row of cell values
# supplies the column names, the remaining rows supply the data.
ws1 = wb2['sheet_one']
data = ws1.values
cols = next(data)
df_one = pd.DataFrame(list(data), columns=cols)

In [16]:
# Preview the first two rows read from 'sheet_one'.
df_one.head(2)

Unnamed: 0,a,b,c,d,i,r,s,t,u,x,y,z
0,73.275838,False,small,0,19.243484,0,male,2020-12-23 13:14:00,2020-12-23 13:14:00,0.007606,32,0.727518
1,65.844691,False,large,0,81.355234,1,female,2020-12-24 13:14:00,2020-12-24 13:14:00,0.049797,27,0.41078


In [17]:
# Check the dtypes as read from the worksheet, before any conversion.
df_one.dtypes

a           float64
b              bool
c            object
d             int64
i           float64
r            object
s            object
t    datetime64[ns]
u    datetime64[ns]
x           float64
y             int64
z           float64
dtype: object

In [18]:
# Conversions applied to each worksheet read through openpyxl: a dtype mapping
# for astype, and the columns to convert to timedeltas.
time_delta_columns = ['d']
convert_dict = dict(b='boolean', c='category', y='Int64')

In [19]:
# Apply the dtype mapping, then convert the listed columns to timedeltas.
df_one = df_one.astype(convert_dict)
df_one[time_delta_columns] = df_one[time_delta_columns].apply(pd.to_timedelta)

In [20]:
# Confirm the dtypes after conversion.
df_one.dtypes

a            float64
b            boolean
c           category
d    timedelta64[ns]
i            float64
r             object
s             object
t     datetime64[ns]
u     datetime64[ns]
x            float64
y              Int64
z            float64
dtype: object

In [21]:
# Load the second worksheet into a dataframe: the first row of cell values
# supplies the column names, the remaining rows supply the data.
ws2 = wb2['sheet_two']
data = ws2.values
cols = next(data)
df_two = pd.DataFrame(list(data), columns=cols)

In [22]:
# Check the dtypes as read from the worksheet, before any conversion.
df_two.dtypes

a           float64
b              bool
c            object
d             int64
i           float64
r            object
s            object
t    datetime64[ns]
u    datetime64[ns]
x           float64
y             int64
z           float64
dtype: object

In [23]:
# Apply the dtype mapping, then convert the listed columns to timedeltas.
df_two = df_two.astype(convert_dict)
df_two[time_delta_columns] = df_two[time_delta_columns].apply(pd.to_timedelta)

In [24]:
# Confirm the dtypes after conversion.
df_two.dtypes

a            float64
b            boolean
c           category
d    timedelta64[ns]
i            float64
r             object
s             object
t     datetime64[ns]
u     datetime64[ns]
x            float64
y              Int64
z            float64
dtype: object

In [25]:
# Load the third worksheet into a dataframe: the first row of cell values
# supplies the column names, the remaining rows supply the data.
# (This cell previously opened 'sheet_two', so df_three duplicated df_two.)
ws3 = wb2['sheet_three']
data = ws3.values
cols = next(data)
df_three = pd.DataFrame(list(data), columns=cols)

In [26]:
# Check the dtypes as read from the worksheet, before any conversion.
df_three.dtypes

a           float64
b              bool
c            object
d             int64
i           float64
r            object
s            object
t    datetime64[ns]
u    datetime64[ns]
x           float64
y             int64
z           float64
dtype: object

In [27]:
# Apply the dtype mapping, then convert the listed columns to timedeltas.
df_three = df_three.astype(convert_dict)
df_three[time_delta_columns] = df_three[time_delta_columns].apply(pd.to_timedelta)

In [28]:
# Confirm the dtypes after conversion.
df_three.dtypes

a            float64
b            boolean
c           category
d    timedelta64[ns]
i            float64
r             object
s             object
t     datetime64[ns]
u     datetime64[ns]
x            float64
y              Int64
z            float64
dtype: object

## Read worksheets from an Excel workbook with pandas

In [29]:
# Read every worksheet in one call; with sheet_name=None the result is a dict
# of dataframes keyed by worksheet name.
df_workbook = pd.read_excel(io=path, sheet_name=None, engine='openpyxl')

In [30]:
# Display the whole dict of worksheet dataframes.
df_workbook

{'sheet_one':            a      b       c  d          i  r       s                   t  \
 0  73.275838  False   small  0  19.243484  0    male 2020-12-23 13:14:00   
 1  65.844691  False   large  0  81.355234  1  female 2020-12-24 13:14:00   
 2  35.343751   True   large  0  68.873317  1    male 2020-12-25 13:14:00   
 3  47.497957  False  medium  0  66.646702  0  female 2020-12-26 13:14:00   
 
                     u         x   y         z  
 0 2020-12-23 13:14:00  0.007606  32  0.727518  
 1 2020-12-24 13:14:00  0.049797  27  0.410780  
 2 2020-12-25 13:14:00 -0.413870  13  0.634297  
 3 2020-12-26 13:14:00  0.767391  62  0.532035  ,
 'sheet_two':            a     b       c  d          i  r       s                   t  \
 0  14.409521  True   large  0  27.309143  0  female 2020-12-23 13:14:00   
 1  36.621084  True   large  0  56.093995  1    male 2020-12-24 13:14:00   
 2  37.566962  True  medium  0  76.940037  0    male 2020-12-25 13:14:00   
 3  60.389591  True   small  0  63.09

In [31]:
# The dict keys are the worksheet names.
df_workbook.keys()

dict_keys(['sheet_one', 'sheet_two', 'sheet_three'])

In [32]:
# Pull one worksheet's dataframe out of the dict by name and preview it.
df_01 = df_workbook['sheet_one']
df_01.head(2)

Unnamed: 0,a,b,c,d,i,r,s,t,u,x,y,z
0,73.275838,False,small,0,19.243484,0,male,2020-12-23 13:14:00,2020-12-23 13:14:00,0.007606,32,0.727518
1,65.844691,False,large,0,81.355234,1,female,2020-12-24 13:14:00,2020-12-24 13:14:00,0.049797,27,0.41078


In [33]:
# Check the dtypes as read by pandas, before any conversion.
df_01.dtypes

a           float64
b              bool
c            object
d             int64
i           float64
r             int64
s            object
t    datetime64[ns]
u    datetime64[ns]
x           float64
y             int64
z           float64
dtype: object

In [34]:
# Iterating a dict yields its keys, so this prints each worksheet name.
for key in df_workbook:
    print(key)

sheet_one
sheet_two
sheet_three


In [35]:
# Select the last worksheet by name rather than through the `key` variable
# left over from the loop in the previous cell, so this cell no longer depends
# on that loop having just been run.
df_01 = df_workbook['sheet_three']
df_01.head()

Unnamed: 0,a,b,c,d,i,r,s,t,u,x,y,z
0,73.711321,True,medium,0,82.819832,1,male,2020-12-23 13:14:00,2020-12-23 13:14:00,-1.39789,36,0.618988
1,74.87077,True,large,0,26.35596,0,female,2020-12-24 13:14:00,2020-12-24 13:14:00,1.111683,37,0.766808
2,72.478425,True,large,0,30.319281,0,male,2020-12-25 13:14:00,2020-12-25 13:14:00,1.373472,16,0.190473
3,25.264526,False,large,0,21.849043,1,female,2020-12-26 13:14:00,2020-12-26 13:14:00,-0.582534,42,0.675817


In [36]:
# Check the dtypes as read by pandas, before any conversion.
df_01.dtypes

a           float64
b              bool
c            object
d             int64
i           float64
r             int64
s            object
t    datetime64[ns]
u    datetime64[ns]
x           float64
y             int64
z           float64
dtype: object

In [None]:
# Conversions for the worksheet read through pandas. Unlike the openpyxl path,
# column `r` arrives as int64 here, so it is cast to str as well.
time_delta_columns = ['d']
convert_dict = dict(b='boolean', c='category', r='str', y='Int64')

In [73]:
# Apply the dtype mapping, then convert the listed columns to timedeltas.
# NOTE(review): the saved output under the next cell still lists `r` as int64
# even though the mapping defined just above includes 'r': 'str' -- presumably
# a stale output; re-run the notebook top to bottom to confirm.
df_01 = df_01.astype(convert_dict)
df_01[time_delta_columns] = df_01[time_delta_columns].apply(pd.to_timedelta)

In [74]:
# Confirm the dtypes after conversion.
df_01.dtypes

a            float64
b            boolean
c           category
d    timedelta64[ns]
i            float64
r              int64
s             object
t     datetime64[ns]
u     datetime64[ns]
x            float64
y              Int64
z            float64
dtype: object

In [69]:
# Print every worksheet's dataframe followed by its dtypes. The dict values
# are already dataframes, so they are used directly.
for key, sheet_frame in df_workbook.items():
    print(sheet_frame)
    print(sheet_frame.dtypes)

           a      b       c  d          i  r       s                   t  \
0  73.275838  False   small  0  19.243484  0    male 2020-12-23 13:14:00   
1  65.844691  False   large  0  81.355234  1  female 2020-12-24 13:14:00   
2  35.343751   True   large  0  68.873317  1    male 2020-12-25 13:14:00   
3  47.497957  False  medium  0  66.646702  0  female 2020-12-26 13:14:00   

                    u         x   y         z  
0 2020-12-23 13:14:00  0.007606  32  0.727518  
1 2020-12-24 13:14:00  0.049797  27  0.410780  
2 2020-12-25 13:14:00 -0.413870  13  0.634297  
3 2020-12-26 13:14:00  0.767391  62  0.532035  
a           float64
b              bool
c            object
d             int64
i           float64
r             int64
s            object
t    datetime64[ns]
u    datetime64[ns]
x           float64
y             int64
z           float64
dtype: object
           a     b       c  d          i  r       s                   t  \
0  14.409521  True   large  0  27.309143  0  femal

# References

- [pandas to_excel](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_excel.html)

- [pandas to_csv](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html)

- [pandas API reference](https://pandas.pydata.org/docs/reference/index.html)

- [pandas DataFrame](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html)