In [1]:
# Addendum: Raw File I/O
#
# This section of the workshop reviews how to read a CSV file without using pandas
#
# Objectives
#    Read a file
#    Convert a file to a string
#    Use string methods to convert a comma delimited file into lists

In [2]:
# read a file directly into a variable
f = open("data/gapminder_gdp_oceania.csv")

In [3]:
# read all file data into a string variable
# note that this method requires reading the entire file at once
# if you have a very large file, you might prefer to read it line by line
with f:
    read_data = f.read()

In [4]:
type(read_data)

str

In [5]:
# all data is available as a string
read_data

'country,gdpPercap_1952,gdpPercap_1957,gdpPercap_1962,gdpPercap_1967,gdpPercap_1972,gdpPercap_1977,gdpPercap_1982,gdpPercap_1987,gdpPercap_1992,gdpPercap_1997,gdpPercap_2002,gdpPercap_2007\nAustralia,10039.59564,10949.64959,12217.22686,14526.12465,16788.62948,18334.19751,19477.00928,21888.88903,23424.76683,26997.93657,30687.75473,34435.36744\nNew Zealand,10556.57566,12247.39532,13175.678,14463.91893,16046.03728,16233.7177,17632.4104,19007.19129,18363.32494,21050.41377,23189.80135,25185.00911\n'

In [6]:
# next, we'll create a list out of each line of the CSV file
# to do this, we'll split on the newline character
# note: by default (with no argument), the split() method splits on any whitespace

In [7]:
lines = read_data.split('\n')

In [8]:
# we now have a list
type(lines)

list

In [9]:
# splitting on 3 newline characters produces 4 elements: the text before,
# between, and after the newlines
len(lines)

4

In [10]:
# view the entire list - note that there is an empty string at the end of the list
# because the file ends with a newline character (the file itself has only 3 lines)
lines

['country,gdpPercap_1952,gdpPercap_1957,gdpPercap_1962,gdpPercap_1967,gdpPercap_1972,gdpPercap_1977,gdpPercap_1982,gdpPercap_1987,gdpPercap_1992,gdpPercap_1997,gdpPercap_2002,gdpPercap_2007',
 'Australia,10039.59564,10949.64959,12217.22686,14526.12465,16788.62948,18334.19751,19477.00928,21888.88903,23424.76683,26997.93657,30687.75473,34435.36744',
 'New Zealand,10556.57566,12247.39532,13175.678,14463.91893,16046.03728,16233.7177,17632.4104,19007.19129,18363.32494,21050.41377,23189.80135,25185.00911',
 '']

In [11]:
# the first line contains headers
lines[0]

'country,gdpPercap_1952,gdpPercap_1957,gdpPercap_1962,gdpPercap_1967,gdpPercap_1972,gdpPercap_1977,gdpPercap_1982,gdpPercap_1987,gdpPercap_1992,gdpPercap_1997,gdpPercap_2002,gdpPercap_2007'

In [12]:
# the next two lines contain data for Australia and New Zealand
lines[1]

'Australia,10039.59564,10949.64959,12217.22686,14526.12465,16788.62948,18334.19751,19477.00928,21888.88903,23424.76683,26997.93657,30687.75473,34435.36744'

In [13]:
lines[2]

'New Zealand,10556.57566,12247.39532,13175.678,14463.91893,16046.03728,16233.7177,17632.4104,19007.19129,18363.32494,21050.41377,23189.80135,25185.00911'

In [14]:
# each line itself is a string
type(lines[2])

str

In [15]:
# to get the individual elements of each line, split on the comma
nz_data = lines[2].split(',')

In [16]:
type(nz_data)

list

In [17]:
nz_data[0]

'New Zealand'

In [18]:
nz_data[1]

'10556.57566'

In [19]:
# how would we sum the GDP numbers for one of the lines here?
# we can get at the numerical data with list slicing
nz_data[1:]

['10556.57566',
 '12247.39532',
 '13175.678',
 '14463.91893',
 '16046.03728',
 '16233.7177',
 '17632.4104',
 '19007.19129',
 '18363.32494',
 '21050.41377',
 '23189.80135',
 '25185.00911']

In [20]:
# you may notice that each element is still a string, though it contains a string representation of a number

In [21]:
type(nz_data[1])

str

In [22]:
# to add them, we need to convert them to numbers before taking a sum
sum(float(x) for x in nz_data[1:])

207151.47375

In [23]:
print(nz_data[0], sum(float(x) for x in nz_data[1:]))

New Zealand 207151.47375


In [24]:
# exercise - print the sum of the GDP values for each country in the list
# note - you may want to exclude the last element of the list
# as it is an empty line of text.

In [25]:
# note - lines[1:] skips the header row (the first element of the list)
# a newline char at the end of the file leaves an empty string in the list,
# so we skip blank lines rather than assuming the last element is always empty
# (this way the last country is not lost if the file has no trailing newline)

for line in lines[1:]:
    if not line.strip():
        # nothing to parse on an empty line
        continue
    gdp_data = line.split(',')
    # first field is the country name; the remaining fields are GDP values
    print(gdp_data[0], sum(float(x) for x in gdp_data[1:]))

Australia 239767.14760999999
New Zealand 207151.47375
