## 1.1 Loading Data into Python

### Opening and reading files

In [1]:
%cat some_file.txt

This is some file
It has a few line
This is the last line


In [2]:
fname = 'some_file.txt'

# Manual open/close: the try/finally guarantees the file handle is released
# even when read() raises.
f = open(fname, 'r')
try:
    content = f.read()
finally:
    f.close()

print(content)

This is some file
It has a few line
This is the last line



In [3]:
# Read the whole file through a context manager: the file is closed
# automatically when the `with` block exits.
fname = 'some_file.txt'
with open(fname, 'r') as text_file:
    content = text_file.read()

print(content)

This is some file
It has a few line
This is the last line



In [4]:
# readlines() returns a list with one string per line; each string keeps its
# trailing newline.
fname = 'some_file.txt'
with open(fname, 'r') as text_file:
    content = text_file.readlines()

print(content)

['This is some file\n', 'It has a few line\n', 'This is the last line\n']


In [5]:
fname = 'some_file.txt'
with open(fname, 'r') as text_file:
    # Iterating over the file object yields one line at a time. Each line keeps
    # its own newline and print() adds another, hence the blank lines in the
    # output.
    for row in text_file:
        print(row)

This is some file

It has a few line

This is the last line



In [6]:
fname = 'some_file.txt'
with open(fname, 'r') as text_file:
    # enumerate() pairs every line with a zero-based index; strip() removes
    # surrounding whitespace, including the trailing newline.
    for index, raw_line in enumerate(text_file):
        numbered = "Line {}: {}".format(index, raw_line.strip())
        print(numbered)

Line 0: This is some file
Line 1: It has a few line
Line 2: This is the last line


### JSON

JavaScript Object Notation

Good for data serialization and communication between services

In [7]:
%cat movie.json

{
    "title": "Fight Club",
    "watched": true,
    "year": 1999,
    "actors": [
        "Brad Pitt",
        "Edward Norton",
        "Helena Bonham Carter"
    ]
}

In [8]:
import json

# Two-step load: read the raw text first, then parse that string with
# json.loads().
fname = 'movie.json'
with open(fname, 'r') as json_file:
    content = json_file.read()

movie = json.loads(content)

movie

{u'actors': [u'Brad Pitt', u'Edward Norton', u'Helena Bonham Carter'],
 u'title': u'Fight Club',
 u'watched': True,
 u'year': 1999}

In [9]:
import json

# One-step load: json.load() parses directly from the open file object.
fname = 'movie.json'
with open(fname, 'r') as json_file:
    movie_alt = json.load(json_file)

In [10]:
# Check that json.loads() on the text and json.load() on the file object
# produced equal objects.
movie == movie_alt

True

In [11]:
# Serialize the object back to JSON text; indent=4 pretty-prints nested levels.
pretty_movie = json.dumps(movie, indent=4)
print(pretty_movie)

{
    "watched": true, 
    "year": 1999, 
    "actors": [
        "Brad Pitt", 
        "Edward Norton", 
        "Helena Bonham Carter"
    ], 
    "title": "Fight Club"
}


In [12]:
%cat movies-90s.jsonl

{"title": "Fight Club", "year": 1999, "actors": ["Brad Pitt", "Edward Norton", "Helena Bonham Carter"], "watched": true}
{"title": "Goodfellas", "year": 1990, "actors": ["Robert De Niro", "Ray Liotta", "Joe Pesci"], "watched": true}
{"title": "Forrest Gump", "year": 1994, "actors": ["Tom Hanks", "Robin Wright"], "watched": true}



In [13]:
import json

fname = 'movies-90s.jsonl'

# JSON Lines: every line of the file is an independent JSON document, so the
# file is parsed line by line rather than as a whole.
with open(fname, 'r') as f:
    for line_number, line in enumerate(f, 1):
        try:
            movie = json.loads(line)
            print(movie['title'])
        except (ValueError, KeyError, TypeError) as error:
            # ValueError: the line is not valid JSON.
            # KeyError:   the record has no 'title' field.
            # TypeError:  the line parsed to something that cannot be indexed
            #             by a string key (e.g. a JSON array or number).
            # Processing stays best-effort, but the skipped line is reported
            # instead of being silently swallowed by a bare `except`.
            print("Skipping line {}: {!r}".format(line_number, error))


Fight Club
Goodfellas
Forrest Gump


### CSV files

Comma Separated Values

This format is very commonly used to import and export data from spreadsheets and databases.

In [14]:
%cat data.csv

"NAME","AGE","LANGUAGE"
"Alice",30,"English"
"Bob",25,"Spanish"
"Charlie",35,"French"


In [15]:
import csv

fname = 'data.csv'

with open(fname, 'r') as csv_file:
    data_reader = csv.reader(csv_file, delimiter=',')
    # The first row holds the column names; consume it before the data rows.
    headers = next(data_reader)
    print("Headers = {}".format(headers))
    for row in data_reader:
        print(row)

Headers = ['NAME', 'AGE', 'LANGUAGE']
['Alice', '30', 'English']
['Bob', '25', 'Spanish']
['Charlie', '35', 'French']


In [16]:
fname = 'data_no_header.csv'

# Nothing is consumed up front here, so every row the reader yields is printed.
with open(fname, 'r') as csv_file:
    data_reader = csv.reader(csv_file, delimiter=',')
    for row in data_reader:
        print(row)

['Alice', '30', 'English']
['Bob', '25', 'Spanish']
['Charlie', '35', 'French']


In [17]:
fname = 'data.csv'

with open(fname, 'r') as csv_file:
    data_reader = csv.reader(csv_file, delimiter=',')
    headers = next(data_reader)
    # Build one dict per row, keyed by the header at the same column position.
    data = [
        {headers[column]: value for column, value in enumerate(row)}
        for row in data_reader
    ]

data

[{'AGE': '30', 'LANGUAGE': 'English', 'NAME': 'Alice'},
 {'AGE': '25', 'LANGUAGE': 'Spanish', 'NAME': 'Bob'},
 {'AGE': '35', 'LANGUAGE': 'French', 'NAME': 'Charlie'}]

### Pickles: Python object serialization

In [18]:
with open('movie.json', 'r') as json_file:
    content = json_file.read()

# The text is already in memory, so parsing can happen after the file closes.
data = json.loads(content)

data

{u'actors': [u'Brad Pitt', u'Edward Norton', u'Helena Bonham Carter'],
 u'title': u'Fight Club',
 u'watched': True,
 u'year': 1999}

In [19]:
# Inspect the Python type that json.loads() produced for the JSON object.
type(data)

dict

In [20]:
import pickle

# Serialize `data` to disk; the file is opened in binary write mode ('wb').
with open('data.pickle', 'wb') as pickle_file:
    pickle.dump(data, pickle_file)

In [21]:
%cat data.pickle

(dp0
Vwatched
p1
I01
sVyear
p2
I1999
sVactors
p3
(lp4
VBrad Pitt
p5
aVEdward Norton
p6
aVHelena Bonham Carter
p7
asVtitle
p8
VFight Club
p9
s.

In [22]:
# Restore the object from disk; the file is opened in binary read mode ('rb').
# NOTE: only unpickle files you trust -- pickle.load() can execute arbitrary
# code embedded in the stream.
with open('data.pickle', 'rb') as pickle_file:
    data = pickle.load(pickle_file)

data

{u'actors': [u'Brad Pitt', u'Edward Norton', u'Helena Bonham Carter'],
 u'title': u'Fight Club',
 u'watched': True,
 u'year': 1999}

In [23]:
# Inspect the Python type of the object restored by pickle.load().
type(data)

dict

### Loading JSON and CSV into pandas

In [24]:
import pandas as pd

In [25]:
%cat movie.json

{
    "title": "Fight Club",
    "watched": true,
    "year": 1999,
    "actors": [
        "Brad Pitt",
        "Edward Norton",
        "Helena Bonham Carter"
    ]
}

In [26]:
# Load the single JSON object into a DataFrame. As the output shows, there is
# one row per entry of "actors" and the scalar fields repeat on every row.
data = pd.read_json('movie.json')
data.head()

Unnamed: 0,actors,title,watched,year
0,Brad Pitt,Fight Club,True,1999
1,Edward Norton,Fight Club,True,1999
2,Helena Bonham Carter,Fight Club,True,1999


In [27]:
%cat movies-90s.jsonl

{"title": "Fight Club", "year": 1999, "actors": ["Brad Pitt", "Edward Norton", "Helena Bonham Carter"], "watched": true}
{"title": "Goodfellas", "year": 1990, "actors": ["Robert De Niro", "Ray Liotta", "Joe Pesci"], "watched": true}
{"title": "Forrest Gump", "year": 1994, "actors": ["Tom Hanks", "Robin Wright"], "watched": true}



In [28]:
# lines=True reads the file as JSON Lines: one JSON object per line, giving
# one DataFrame row per object (the "actors" lists stay as list values).
data = pd.read_json('movies-90s.jsonl', lines=True)
data.head()

Unnamed: 0,actors,title,watched,year
0,"[Brad Pitt, Edward Norton, Helena Bonham Carter]",Fight Club,True,1999
1,"[Robert De Niro, Ray Liotta, Joe Pesci]",Goodfellas,True,1990
2,"[Tom Hanks, Robin Wright]",Forrest Gump,True,1994


In [29]:
%cat data.csv

"NAME","AGE","LANGUAGE"
"Alice",30,"English"
"Bob",25,"Spanish"
"Charlie",35,"French"


In [30]:
# Load the CSV into a DataFrame; as the output shows, the first row of the
# file becomes the column names.
data = pd.read_csv('data.csv')
data.head()

Unnamed: 0,NAME,AGE,LANGUAGE
0,Alice,30,English
1,Bob,25,Spanish
2,Charlie,35,French
