<p style="font-family: Arial; font-size:2.5em;color:purple; font-weight:bold"><br>
INFS 772 Spring 2019 Week 6<br><br>Data Loading and Storage
</p><br>

<p style="font-family: Arial; font-size:1.5em;color:blue; font-weight:bold"><br>

Reading and Writing Data from/to csv/txt Files
<br><br>
How to analyze a big file in smaller chunks with pandas chunksize
<br><br>
Using defaultdict: a new dictionary-like object
<br><br>
      --  Automatically creating default values for nonexistent keys
      <br><br>
Working with JSON Data<br><br>
Working with Binary Data Formats<br><br>
      --  Using HDF5 Format<br><br>
Interacting with Databases<br><br>
      --  SQLite<br><br>
      
</p>

In [2]:
import numpy as np
import pandas as pd
# Seed NumPy's global random generator so the random data created later is reproducible.
np.random.seed(12345)
import matplotlib.pyplot as plt
plt.rc('figure', figsize=(10, 6)) # Default figure size for all plots: 10 x 6 inches.
# Print arrays with 4 decimal places and suppress scientific notation.
np.set_printoptions(precision=4, suppress=True)

## Reading and Writing Data in Text Format

In [2]:
!type examples\ex1.csv

a,b,c,d,message
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo


#### read_csv()

header : int, list of int, default ‘infer’
Row number(s) to use as the column names, and the start of the data. 

#### Default behavior is to infer the column names

In [3]:
# No header argument is given, so the first line of the file supplies the column names.
df = pd.read_csv('examples/ex1.csv')

df

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


#### sep : str, default ‘,’
Delimiter to use.

In [4]:
# read_table defaults to a tab separator, so the comma has to be passed explicitly.
pd.read_table('examples/ex1.csv', sep=',')

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [5]:
!type examples\ex2.csv

1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo


**names : List of column names to use. If file contains no header row, then you should explicitly pass header=None.**

In [3]:
# header=None: the file has no header line, so the first row is kept as data
# and the columns receive default integer labels.
pd.read_csv('examples/ex2.csv', header=None)
# Alternative: supply explicit column names instead of the integer labels.
# pd.read_csv('examples/ex2.csv', names=['aa', 'bb', 'cc', 'dd', 'message'])

Unnamed: 0,0,1,2,3,4
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [16]:
names = ['a', 'b', 'c', 'd', 'message']
# index_col='message' makes that column the row index instead of a data column.
pd.read_csv('examples/ex2.csv', names=names, index_col='message')

Unnamed: 0_level_0,a,b,c,d
message,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
hello,1,2,3,4
world,5,6,7,8
foo,9,10,11,12


In [6]:
!type examples\csv_mindex.csv
# Passing a list to index_col builds a hierarchical index from key1 and key2.
parsed = pd.read_csv('examples/csv_mindex.csv', index_col=['key1', 'key2'])
parsed

key1,key2,value1,value2
one,a,1,2
one,b,3,4
one,c,5,6
one,d,7,8
two,a,9,10
two,b,11,12
two,c,13,14
two,d,15,16


Unnamed: 0_level_0,Unnamed: 1_level_0,value1,value2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
one,a,1,2
one,b,3,4
one,c,5,6
one,d,7,8
two,a,9,10
two,b,11,12
two,c,13,14
two,d,15,16


In [18]:
parsed.index

MultiIndex(levels=[['one', 'two'], ['a', 'b', 'c', 'd']],
           labels=[[0, 0, 0, 0, 1, 1, 1, 1], [0, 1, 2, 3, 0, 1, 2, 3]],
           names=['key1', 'key2'])

In [19]:
!type examples\ex3.txt

            A         B         C
aaa -0.264438 -1.026059 -0.619500
bbb  0.927272  0.302904 -0.032399
ccc -0.264273 -0.386314 -0.217601
ddd -0.871858 -0.348382  1.100491


In [20]:
# Open the file inside a context manager so the handle is closed when the
# block ends; printing the file object shows its name, mode and encoding.
with open('examples/ex3.txt') as data:
    print(data)

<_io.TextIOWrapper name='examples/ex3.txt' mode='r' encoding='cp1252'>


In [21]:
# Iterating a text file yields one string per line (trailing newline included).
# The context manager closes the handle instead of leaking it.
with open('examples/ex3.txt') as ex3_file:
    ex3_lines = list(ex3_file)
ex3_lines

['            A         B         C\n',
 'aaa -0.264438 -1.026059 -0.619500\n',
 'bbb  0.927272  0.302904 -0.032399\n',
 'ccc -0.264273 -0.386314 -0.217601\n',
 'ddd -0.871858 -0.348382  1.100491\n']

In [22]:
# The separator is a regular expression: \s+ matches a run of one or more
# whitespace characters. It is written as a raw string because '\s' is not a
# valid Python string escape and a plain '\s+' triggers an invalid-escape
# warning on current Python versions.
# read_table's default separator is a tab (sep='\t').
result = pd.read_table('examples/ex3.txt', sep=r'\s+')
result

Unnamed: 0,A,B,C
aaa,-0.264438,-1.026059,-0.6195
bbb,0.927272,0.302904,-0.032399
ccc,-0.264273,-0.386314,-0.217601
ddd,-0.871858,-0.348382,1.100491


**skiprows : list-like or integer or callable, default None**

    Line numbers to skip (0-indexed) or number of lines to skip (int) at the start of the file.

    If callable, the callable function will be evaluated against the row indices, returning True if the row should be skipped and False otherwise. An example of a valid callable argument would be lambda x: x in [0, 2].


In [24]:
!type examples\ex4.csv
# Rows 0, 2 and 3 (0-indexed) are the '#' comment lines; skipping them leaves
# the 'a,b,c,d,message' line as the header.
pd.read_csv('examples/ex4.csv', skiprows=[0, 2, 3])

# hey!
a,b,c,d,message
# just wanted to make things more difficult for you
# who reads CSV files with computers, anyway?
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo


Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [25]:
!type examples\ex5.csv
result = pd.read_csv('examples/ex5.csv')
result
pd.isnull(result) #  pandas.isnull(obj): Detect missing values

something,a,b,c,d,message
one,1,2,3,4,NA
two,5,6,,8,world
three,9,10,11,12,foo


Unnamed: 0,something,a,b,c,d,message
0,False,False,False,False,False,True
1,False,False,False,True,False,False
2,False,False,False,False,False,False


**na_values : Additional strings to recognize as NA/NaN**

In [28]:
# na_values adds extra strings to treat as missing, on top of the defaults.
result = pd.read_csv('examples/ex5.csv', na_values=['NULL'])
result1 = pd.read_csv('examples/ex5.csv')
# Merge the two parses side by side to compare them
# (_x columns come from result, _y columns from result1).
result.merge(result1, on='something')

Unnamed: 0,something,a_x,b_x,c_x,d_x,message_x,a_y,b_y,c_y,d_y,message_y
0,one,1,2,3.0,4,,1,2,3.0,4,
1,two,5,6,,8,world,5,6,,8,world
2,three,9,10,11.0,12,foo,9,10,11.0,12,foo


In [26]:
# A dict sets the missing-value markers per column: 'foo' and 'NA' apply only
# to message, 'two' applies only to something.
sentinels = {'message': ['foo', 'NA'], 'something': ['two']}
pd.read_csv('examples/ex5.csv', na_values=sentinels)

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,,5,6,,8,world
2,three,9,10,11.0,12,


### Reading Text Files in Pieces

In [7]:
# Cap the number of rows pandas prints when displaying a frame.
pd.options.display.max_rows = 30

In [None]:
!type examples\ex6.csv

In [9]:
# Read the whole file into a single DataFrame.
result = pd.read_csv('examples/ex6.csv')
result

Unnamed: 0,one,two,three,four,key
0,0.467976,-0.038649,-0.295344,-1.824726,L
1,-0.358893,1.404453,0.704965,-0.200638,B
2,-0.501840,0.659254,-0.421691,-0.057688,G
3,0.204886,1.074134,1.388361,-0.982404,R
4,0.354628,-0.133116,0.283763,-0.837063,Q
5,1.817480,0.742273,0.419395,-2.251035,Q
6,-0.776764,0.935518,-0.332872,-1.875641,U
7,-0.913135,1.530624,-0.572657,0.477252,K
8,0.358480,-0.497572,-0.367016,0.507702,S
9,-1.740877,-1.160417,-1.637830,2.172201,G


In [30]:
# nrows=5: parse only the first five data rows of the file.
pd.read_csv('examples/ex6.csv', nrows=5)

Unnamed: 0,one,two,three,four,key
0,0.467976,-0.038649,-0.295344,-1.824726,L
1,-0.358893,1.404453,0.704965,-0.200638,B
2,-0.50184,0.659254,-0.421691,-0.057688,G
3,0.204886,1.074134,1.388361,-0.982404,R
4,0.354628,-0.133116,0.283763,-0.837063,Q


**chunksize : int, default None**

    Return TextFileReader object for iteration. 

#### By specifying a chunksize to read_csv, the return value will be an iterable object of type TextFileReader

In [10]:
# With chunksize set, read_csv returns a reader object instead of a DataFrame;
# iterating over it yields the file piece by piece.
chunker = pd.read_csv('examples/ex6.csv', chunksize=1000)
chunker

<pandas.io.parsers.TextFileReader at 0x1fd09657710>

In [16]:
chunker.chunksize

1000

### How to analyze a big file in smaller chunks with pandas chunksize?
Let us see an example of loading a big csv file in smaller chunks. We will use the gapminder data as an example with a chunk size of 500, which means that we will read 500 lines at a time.


In [18]:
# Number of rows to read per chunk.
c_size = 500
# Gapminder data as a CSV file, taken from the Software Carpentry website.
csv_url = 'http://bit.ly/2cLzoxH'

Let us use pd.read_csv to read the csv file in chunks of 500 lines with the chunksize=500 option. The code below prints the shape of each smaller chunk data frame. Note that the first three chunks each contain 500 lines. Pandas is clever enough to know that the last chunk is smaller than 500 and loads only the remaining lines into the data frame, in this case 204 lines.

In [19]:
# Stream the file chunk by chunk; each chunk arrives as an ordinary DataFrame.
chunk_reader = pd.read_csv(csv_url, chunksize=c_size)
for gm_chunk in chunk_reader:
    print(gm_chunk.shape)

(500, 6)
(500, 6)
(500, 6)
(204, 6)


### defaultdict: a new dictionary-like object

Let us read the CSV file in chunks of 500 lines and compute the number of entries (or rows) per continent in the data set.

Let us use defaultdict from collections to keep a counter of number of rows per continent.

### Why use defaultdict?

Dictionaries are a convenient way to store data for later retrieval by name (key). Keys must be unique, immutable objects, and are typically strings. The values in a dictionary can be anything. For many applications the values are simple types such as integers and strings.

It gets more interesting when the values in a dictionary are collections (lists, dicts, etc.). In this case, the value (an empty list or dict) must be initialized the first time a given key is used. While this is relatively easy to do manually, the defaultdict type automates and simplifies these kinds of operations.

In [21]:
from collections import defaultdict
# The factory int() returns 0, so a continent that has not been seen yet
# starts counting from zero on its first lookup.
continent_dict = defaultdict(int) 

Let us load the big CSV file with chunksize=500 and count the number of continent entries in each smaller chunk using the defaultdict.

In [23]:
# Tally one count per row, chunk by chunk. The counts accumulate in
# continent_dict, so re-create the dict before running this cell again.
chunk_reader = pd.read_csv(csv_url, chunksize=500)
for gm_chunk in chunk_reader:
    for continent_name in gm_chunk['continent']:
        continent_dict[continent_name] += 1

In [24]:
print(continent_dict)

defaultdict(<class 'int'>, {'Asia': 396, 'Europe': 360, 'Africa': 624, 'Americas': 300, 'Oceania': 24})


#### In the next example, we start with a list of states and cities. We want to build a dictionary where the keys are the state abbreviations and the values are lists of all cities for that state. To build this dictionary of lists, we use a defaultdict with a default factory of list. A new list is created for each new key.

In [27]:
# (state abbreviation, city) pairs in arbitrary order.
city_list = [
    ('TX', 'Austin'),
    ('TX', 'Houston'),
    ('NY', 'Albany'),
    ('NY', 'Syracuse'),
    ('NY', 'Buffalo'),
    ('NY', 'Rochester'),
    ('TX', 'Dallas'),
    ('CA', 'Sacramento'),
    ('CA', 'Palo Alto'),
    ('GA', 'Atlanta'),
]

# The list factory gives every new state an empty list on first access,
# so append() works without checking whether the key exists.
cities_by_state = defaultdict(list)
for state, city in city_list:
    cities_by_state[state].append(city)

# One line per state, cities in the order they were encountered.
for state, cities in cities_by_state.items():
    print(state, ', '.join(cities))

TX Austin, Houston, Dallas
NY Albany, Syracuse, Buffalo, Rochester
CA Sacramento, Palo Alto
GA Atlanta


In [34]:
def default_factory():
    """Return the value stored for any key that is looked up but missing."""
    return 'default value'

# Keyword arguments after the factory seed the initial contents, as with dict().
d = defaultdict(default_factory, foo='bar')
print('d:', d)
# 'foo' exists; 'no_foo' does not, so the lookup calls default_factory()
# and stores the result under that key.
for key in ('foo', 'no_foo'):
    print(key + ' =>', d[key])

d: defaultdict(<function default_factory at 0x000001FD09672950>, {'foo': 'bar'})
foo => bar
no_foo => default value


### Back to our examples/ex6.csv

In [32]:
chunker = pd.read_csv('examples/ex6.csv', chunksize=1000)

# Give the empty accumulator an explicit dtype: a dtype-less pd.Series([])
# is deprecated and its default dtype differs between pandas versions.
tot = pd.Series([], dtype='float64')
for piece in chunker:
    # Add this chunk's per-key counts to the running total; fill_value=0
    # treats a key missing on either side as 0 instead of producing NaN.
    tot = tot.add(piece['key'].value_counts(), fill_value=0)

# Most frequent keys first.
tot = tot.sort_values(ascending=False)

In [33]:
tot[:10]

E    368.0
X    364.0
L    346.0
O    343.0
Q    340.0
M    338.0
J    337.0
F    335.0
K    334.0
H    330.0
dtype: float64

In [34]:
sum(tot[:])

10000.0

### Writing Data to Text Format

In [35]:
# Reload ex5.csv, whose missing values are useful for showing how to_csv writes them.
data = pd.read_csv('examples/ex5.csv')
data

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [36]:
# By default the row index is written too, as an unnamed first column.
data.to_csv('examples/out2.csv')
!type examples\out2.csv

,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [37]:
import sys
# Writing to sys.stdout prints the text instead of creating a file;
# sep replaces the default comma delimiter.
data.to_csv(sys.stdout, sep='|')

|something|a|b|c|d|message
0|one|1|2|3.0|4|
1|two|5|6||8|world
2|three|9|10|11.0|12|foo


In [57]:
# na_rep: text written in place of missing values (they are empty fields otherwise).
data.to_csv(sys.stdout, na_rep='NULL')

,something,a,b,c,d,message
0,one,1,2,3.0,4,NULL
1,two,5,6,NULL,8,world
2,three,9,10,11.0,12,foo


In [58]:
# Write neither the row labels nor the header line.
data.to_csv(sys.stdout, index=False, header=False)

one,1,2,3.0,4,
two,5,6,,8,world
three,9,10,11.0,12,foo


In [59]:
# Write only the listed columns, in the given order.
data.to_csv(sys.stdout, index=False, columns=['a', 'b', 'c'])

a,b,c
1,2,3.0
5,6,
9,10,11.0


In [41]:
dates = pd.date_range('1/1/2000', periods=7)
ts = pd.Series(np.arange(7), index=dates)
ts.to_csv('examples/mynewtsfile.csv')
!type examples\tseries.csv

2000-01-01,0
2000-01-02,1
2000-01-03,2
2000-01-04,3
2000-01-05,4
2000-01-06,5
2000-01-07,6


### Working with Delimited Formats
#### The csv module implements classes to read and write tabular data in CSV format. It allows programmers to say, “write this data in the format preferred by Excel,” or “read data from this file which was generated by Excel,” without knowing the precise details of the CSV format used by Excel. Programmers can also describe the CSV formats understood by other applications or define their own special-purpose CSV formats.

#### The csv module’s reader and writer objects read and write sequences.

In [42]:
!type examples\ex7.csv

"a","b","c"
"1","2","3"
"1","2","3"


In [44]:
import csv
# The file is opened without a context manager because the reader is iterated
# in a later cell and reads from f at that point; close f once that is done.
f = open('examples/ex7.csv')

# csv.reader wraps the open file and yields each line as a list of strings.
reader = csv.reader(f)
type(f),type(reader)

(_io.TextIOWrapper, _csv.reader)

In [45]:
# Each iteration yields one parsed line as a list of strings, quotes removed.
for line in reader:
    print(line)
# The reader is exhausted now; close the underlying file so the handle
# opened for it is not leaked.
f.close()

['a', 'b', 'c']
['1', '2', '3']
['1', '2', '3']


In [46]:
# Read every parsed row into memory; the file is closed when the block ends.
with open('examples/ex7.csv') as f:
    lines = [row for row in csv.reader(f)]

In [47]:
# First row holds the column names, the remaining rows hold the data.
header = lines[0]
values = lines[1:]

In [48]:
# zip(*values) transposes the rows into columns (* unpacks the list of rows
# into separate arguments); pairing each column with its header builds the dict.
columns = zip(*values)
data_dict = dict(zip(header, columns))
data_dict

{'a': ('1', '1'), 'b': ('2', '2'), 'c': ('3', '3')}

class my_dialect(csv.Dialect):
    # Fields are separated by semicolons instead of commas.
    delimiter = ';'
    # Fields are wrapped in double quotes, but only when quoting is needed.
    quotechar = '"'
    quoting = csv.QUOTE_MINIMAL
    # Written rows end with a bare newline.
    lineterminator = '\n'

# NOTE(review): f must be an open file object when these lines run.
# Option 1: pass the Dialect subclass to apply all of its settings.
reader = csv.reader(f, dialect=my_dialect)

# Option 2: override a single dialect parameter with a keyword argument.
reader = csv.reader(f, delimiter='|')

# newline='' is what the csv module documentation asks for when opening the
# file: the writer then emits the dialect's own line terminator unchanged,
# instead of having the platform translate it again.
with open('mydata.csv', 'w', newline='') as f:
    writer = csv.writer(f, dialect=my_dialect)
    # writerows writes one line per tuple, in order.
    writer.writerows([
        ('one', 'two', 'three'),
        ('1', '2', '3'),
        ('4', '5', '6'),
        ('7', '8', '9'),
    ])

### JSON Data
### JSON (JavaScript Object Notation) is a lightweight data interchange format inspired by JavaScript object literal syntax

In [53]:
# A JSON document held in an ordinary Python string; nothing is parsed yet.
obj = """
{"name": "Wes",
 "places_lived": ["United States", "Spain", "Germany"],
 "pet": null,
 "siblings": [{"name": "Scott", "age": 30, "pets": ["Zeus", "Zuko"]},
              {"name": "Katie", "age": 38,
               "pets": ["Sixes", "Stache", "Cisco"]}]
}
"""

In [52]:
type(obj)

str

**Decoding JSON:**

In [55]:
import json
# json.loads parses a JSON string into Python objects: here a dict with
# nested lists and dicts, with JSON null becoming None.
result = json.loads(obj)
result

{'name': 'Wes',
 'pet': None,
 'places_lived': ['United States', 'Spain', 'Germany'],
 'siblings': [{'age': 30, 'name': 'Scott', 'pets': ['Zeus', 'Zuko']},
  {'age': 38, 'name': 'Katie', 'pets': ['Sixes', 'Stache', 'Cisco']}]}

In [57]:
type(result)

dict

**Compact encoding:**

In [58]:
asjson = json.dumps(result)
asjson

'{"name": "Wes", "places_lived": ["United States", "Spain", "Germany"], "pet": null, "siblings": [{"name": "Scott", "age": 30, "pets": ["Zeus", "Zuko"]}, {"name": "Katie", "age": 38, "pets": ["Sixes", "Stache", "Cisco"]}]}'

**Pretty printing:**

In [59]:
# sort_keys orders the keys alphabetically; indent=4 puts each item on its
# own line with four spaces per nesting level.
print(json.dumps(result, sort_keys=True, indent=4))

{
    "name": "Wes",
    "pet": null,
    "places_lived": [
        "United States",
        "Spain",
        "Germany"
    ],
    "siblings": [
        {
            "age": 30,
            "name": "Scott",
            "pets": [
                "Zeus",
                "Zuko"
            ]
        },
        {
            "age": 38,
            "name": "Katie",
            "pets": [
                "Sixes",
                "Stache",
                "Cisco"
            ]
        }
    ]
}


In [13]:
# Build a DataFrame from the list of sibling dicts, keeping only the
# 'name' and 'age' fields as columns.
siblings = pd.DataFrame(result['siblings'], columns=['name', 'age'])
siblings

Unnamed: 0,name,age
0,Scott,30
1,Katie,38


In [14]:
!type examples\example.json

[{"a": 1, "b": 2, "c": 3},
 {"a": 4, "b": 5, "c": 6},
 {"a": 7, "b": 8, "c": 9}]


In [15]:
# Each object in the JSON array becomes a row; the object keys become columns.
data = pd.read_json('examples/example.json')
data

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,9


In [23]:
# Default layout: one object per column, mapping row label to value.
print(data.to_json())
# orient='records': a list with one object per row.
print(data.to_json(orient='records'))

{"a":{"0":1,"1":4,"2":7},"b":{"0":2,"1":5,"2":8},"c":{"0":3,"1":6,"2":9}}
[{"a":1,"b":2,"c":3},{"a":4,"b":5,"c":6},{"a":7,"b":8,"c":9}]


In [24]:
type(data.to_json())

str

In [30]:
json.loads(data.to_json())

{'a': {'0': 1, '1': 4, '2': 7},
 'b': {'0': 2, '1': 5, '2': 8},
 'c': {'0': 3, '1': 6, '2': 9}}

## Binary Data Formats
#### DataFrame.to_pickle: Pickle (serialize) object to input file path.

In [32]:
frame = pd.read_csv('examples/ex1.csv')
print(frame)
# Serialize the DataFrame to disk in Python's pickle format.
frame.to_pickle('examples/frame_pickle')

   a   b   c   d message
0  1   2   3   4   hello
1  5   6   7   8   world
2  9  10  11  12     foo


In [33]:
# Only unpickle files from a trusted source: loading a pickle can run arbitrary code.
pd.read_pickle('examples/frame_pickle')

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [34]:
!del examples\frame_pickle

### Using HDF5 Format
### The Hierarchical Data Format version 5 (HDF5), is an open source file format that supports large, complex, heterogeneous data. HDF5 uses a “file directory” like structure that allows you to organize data within the file in many different structured ways, as you might do with files on your computer. The HDF5 format also allows for embedding of metadata making it self-describing.

#### HDF5 is a data model, library, and file format for storing and managing data. It supports an unlimited variety of datatypes, and is designed for flexible and efficient I/O and for high volume and complex data. HDF5 is portable and is extensible, allowing applications to evolve in their use of HDF5. The HDF5 Technology suite includes tools and applications for managing, manipulating, viewing, and analyzing data in the HDF5 format. 

In [54]:
# 100 random values in a single column named 'a'.
frame = pd.DataFrame({'a': np.random.randn(100)})
# The store is used like a dict: objects are written and read back by string key.
# Keep it open only as long as needed and call store.close() when finished.
store = pd.HDFStore('mydata.h5')
store['obj1'] = frame
store['obj1_col'] = frame['a']
store

<class 'pandas.io.pytables.HDFStore'>
File path: mydata.h5
/obj1                frame        (shape->[100,1])                                       
/obj1_col            series       (shape->[100])                                         
/obj2                frame_table  (typ->appendable,nrows->100,ncols->1,indexers->[index])
/obj3                frame_table  (typ->appendable,nrows->100,ncols->1,indexers->[index])

In [48]:
frame.head()

Unnamed: 0,a
0,1.747234
1,-1.410246
2,-0.378242
3,-0.345821
4,0.380062


In [49]:
type(store),type(store['obj1']),type(store['obj1_col'])

(pandas.io.pytables.HDFStore,
 pandas.core.frame.DataFrame,
 pandas.core.series.Series)

In [None]:
# Read the frame back from the store, then close the HDF5 file: this is the
# last use of the store, and an unclosed store keeps the file handle open.
obj1_frame = store['obj1']
store.close()
obj1_frame

 HDFStore.put(): Store object in HDFStore

### Reading Microsoft Excel Files

In [56]:
# !pip3 install xlrd
# NOTE(review): xlrd is not called directly here; presumably the import only
# checks that the Excel reader backend is installed — confirm.
import xlrd

# Open the workbook once so that sheets can be read from the same object.
xlsx = pd.ExcelFile('examples/ex1.xlsx')

In [57]:
# Read one sheet, selected by name, from the already opened workbook.
pd.read_excel(xlsx, 'Sheet1')

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [58]:
# read_excel also accepts a file path directly, without an ExcelFile object.
frame = pd.read_excel('examples/ex1.xlsx', 'Sheet1')
frame

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [59]:
# !pip3 install openpyxl

import openpyxl

# The context manager saves and closes the workbook on exit. It replaces the
# explicit writer.save() call, which newer pandas releases no longer provide.
with pd.ExcelWriter('examples/ex2.xlsx') as writer:
    frame.to_excel(writer, sheet_name='Sheet1')

In [73]:
frame.to_excel('examples/ex2.xlsx')

In [74]:
!del examples\ex2.xlsx

## Interacting with Web APIs

In [35]:
import requests
url = 'https://api.github.com/repos/pandas-dev/pandas/issues'
# Without a timeout, requests.get can block indefinitely on a stalled connection.
resp = requests.get(url, timeout=10)
# Fail here on an HTTP error (e.g. rate limiting) rather than later, when the
# error payload would be parsed as if it were the issue list.
resp.raise_for_status()
resp

<Response [200]>

In [61]:
# Decode the JSON response body into Python objects; the first element's
# 'title' field is shown as a quick check.
data = resp.json()
data[0]['title']

'BUG: fix Series constructor for scalar and Categorical dtype'

In [62]:
# Keep only the listed fields of each issue; each becomes a column.
issues = pd.DataFrame(data, columns=['number', 'title',
                                     'labels', 'state'])
issues

Unnamed: 0,number,title,labels,state
0,19717,BUG: fix Series constructor for scalar and Cat...,"[{'id': 78527356, 'url': 'https://api.github.c...",open
1,19716,Incorrectly assigned Index ops/names,"[{'id': 35818298, 'url': 'https://api.github.c...",open
2,19715,Conform Series.to_csv to DataFrame.to_csv,"[{'id': 76865106, 'url': 'https://api.github.c...",open
3,19714,BUG: Fix Series constructor for Categorical wi...,"[{'id': 76811, 'url': 'https://api.github.com/...",open
4,19713,pipe example in basics.rst broken due to SciPy...,"[{'id': 134699, 'url': 'https://api.github.com...",open
5,19712,CI: Builds timing out,"[{'id': 48070600, 'url': 'https://api.github.c...",open
6,19711,DEPR: remove pandas.core.index,"[{'id': 87485152, 'url': 'https://api.github.c...",open
7,19710,DOC: develop a set of standard example DataFra...,"[{'id': 134699, 'url': 'https://api.github.com...",open
8,19709,Implement maybe_cache for compat between immut...,"[{'id': 76811, 'url': 'https://api.github.com/...",open
9,19708,Numeric Indexes should have `diff` method,"[{'id': 76812, 'url': 'https://api.github.com/...",open


#### pandas.read_sas

pandas.read_sas()

    Read SAS files stored as either XPORT or SAS7BDAT format files.

In [69]:
# No format argument is passed; presumably the XPORT format is inferred from
# the .XPT extension — confirm against the pandas.read_sas documentation.
df = pd.read_sas('examples/demo_i.XPT')
df.head()

Unnamed: 0,SEQN,SDDSRVYR,RIDSTATR,RIAGENDR,RIDAGEYR,RIDAGEMN,RIDRETH1,RIDRETH3,RIDEXMON,RIDEXAGM,...,DMDHREDU,DMDHRMAR,DMDHSEDU,WTINT2YR,WTMEC2YR,SDMVPSU,SDMVSTRA,INDHHIN2,INDFMIN2,INDFMPIR
0,83732.0,9.0,2.0,1.0,62.0,,3.0,3.0,1.0,,...,5.0,1.0,3.0,134671.370419,135629.507405,1.0,125.0,10.0,10.0,4.39
1,83733.0,9.0,2.0,1.0,53.0,,3.0,3.0,1.0,,...,3.0,3.0,,24328.560239,25282.425927,1.0,125.0,4.0,4.0,1.32
2,83734.0,9.0,2.0,1.0,78.0,,3.0,3.0,2.0,,...,3.0,1.0,3.0,12400.008522,12575.838818,1.0,131.0,5.0,5.0,1.51
3,83735.0,9.0,2.0,2.0,56.0,,3.0,3.0,2.0,,...,5.0,6.0,,102717.995647,102078.634508,1.0,131.0,10.0,10.0,5.0
4,83736.0,9.0,2.0,2.0,42.0,,4.0,4.0,2.0,,...,4.0,3.0,,17627.674984,18234.736219,2.0,126.0,7.0,7.0,1.23


## Interacting with Databases

#### SQLite is a C library that provides a lightweight disk-based database that doesn’t require a separate server process and allows accessing the database using a nonstandard variant of the SQL query language. Some applications can use SQLite for internal data storage. It’s also possible to prototype an application using SQLite and then port the code to a larger database such as PostgreSQL or Oracle.

In [39]:
# --------- Example: dropping an SQLite table ---------

import sqlite3

# Connect to the demo database.
connection = sqlite3.connect('mynewdata.sqlite')
try:
    # Get a cursor object to run statements with.
    cursor = connection.cursor()

    # IF EXISTS makes this cell safe to run on a fresh database, where a plain
    # DROP TABLE would raise because the table has not been created yet.
    dropTableStatement = "DROP TABLE IF EXISTS test"
    cursor.execute(dropTableStatement)
    connection.commit()
finally:
    # Always release the connection, even if the statement fails.
    connection.close()

In [38]:
import sqlite3
# Definition of the demo table: two text columns, one real and one integer.
query = """
CREATE TABLE test
(a VARCHAR(20), b VARCHAR(20),
 c REAL,        d INTEGER
);"""
# This connection is kept open and reused by the following cells.
con = sqlite3.connect('mynewdata.sqlite')
con.execute(query)
con.commit()

In [2]:
# One tuple per row, in the column order of the test table (a, b, c, d).
data = [('Atlanta', 'Georgia', 1.25, 6),
        ('Tallahassee', 'Florida', 2.6, 3),
        ('Sacramento', 'California', 1.7, 5)]
# '?' placeholders let sqlite3 bind the values instead of building SQL from strings.
stmt = "INSERT INTO test VALUES(?, ?, ?, ?)"
# executemany runs the statement once for every tuple in data.
con.executemany(stmt, data)
con.commit()

In [3]:
# execute returns a cursor; fetchall() collects the result rows as a list of tuples.
cursor = con.execute('select * from test')
rows = cursor.fetchall()
rows

[('Atlanta', 'Georgia', 1.25, 6),
 ('Tallahassee', 'Florida', 2.6, 3),
 ('Sacramento', 'California', 1.7, 5)]

In [6]:
# cursor.description is read-only and describes the columns of the last
# query; the first item of each entry is the column name.
column_names = [col[0] for col in cursor.description]
pd.DataFrame(rows, columns=column_names)

Unnamed: 0,a,b,c,d
0,Atlanta,Georgia,1.25,6
1,Tallahassee,Florida,2.6,3
2,Sacramento,California,1.7,5


## sqlalchemy
The Python SQL Toolkit and Object Relational Mapper

SQLAlchemy is the Python SQL toolkit and Object Relational Mapper that gives application developers the full power and flexibility of SQL.

It provides a full suite of well known enterprise-level persistence patterns, designed for efficient and high-performing database access, adapted into a simple and Pythonic domain language.

In [7]:
#!pip3 install sqlalchemy

import sqlalchemy as sqla
# Point the engine at the database file in which the test table was created
# ('mynewdata.sqlite'); the original URL named a different file.
db = sqla.create_engine('sqlite:///mynewdata.sqlite')
# read_sql runs the query through the engine and returns the result as a DataFrame.
pd.read_sql('select * from test', db)

Unnamed: 0,a,b,c,d
0,Atlanta,Georgia,1.25,6
1,Tallahassee,Florida,2.6,3
2,Sacramento,California,1.7,5


## Conclusion