### Data Manipulation in Python

### Reading data from CSV files using csv package

In [1]:
import csv

In [2]:
# Portability shim so the same notebook runs on several platforms.
# When the INSTABASE_URI environment variable is set, file access is
# routed through ib.open; otherwise the built-in open stays in effect.
# NOTE(review): `ib` is not defined in this notebook -- presumably it is
# injected by the Instabase runtime; confirm.
import os
IB = 'INSTABASE_URI' in os.environ
if IB:
    open = ib.open

In [3]:
# Read Cities.csv data into default list format and print all rows
# Make sure data file is in same folder as notebook
# Note all values are read as strings
with open('Cities.csv','rU') as f:
    rows = csv.reader(f)
    for r in rows:
        print r

['city', 'country', 'latitude', 'longitude', 'temperature']
['Aalborg', 'Denmark', '57.03', '9.92', '7.52']
['Aberdeen', 'United Kingdom', '57.17', '-2.08', '8.10']
['Abisko', 'Sweden', '63.35', '18.83', '0.20']
['Adana', 'Turkey', '36.99', '35.32', '18.67']
['Albacete', 'Spain', '39.00', '-1.87', '12.62']
['Algeciras', 'Spain', '36.13', '-5.47', '17.38']
['Amiens', 'France', '49.90', '2.30', '10.17']
['Amsterdam', 'Netherlands', '52.35', '4.92', '8.93']
['Ancona', 'Italy', '43.60', '13.50', '13.52']
['Andorra', 'Andorra', '42.50', '1.52', '9.60']
['Angers', 'France', '47.48', '-0.53', '10.98']
['Ankara', 'Turkey', '39.93', '32.86', '9.86']
['Antalya', 'Turkey', '36.89', '30.70', '11.88']
['Arad', 'Romania', '46.17', '21.32', '9.32']
['Athens', 'Greece', '37.98', '23.73', '17.41']
['Augsburg', 'Germany', '48.35', '10.90', '4.54']
['Bacau', 'Romania', '46.58', '26.92', '7.51']
['Badajoz', 'Spain', '38.88', '-6.97', '15.61']
['Baia Mare', 'Romania', '47.66', '23.58', '8.87']
['Balti', 'M

In [4]:
# Same as previous except use dictionary format
with open('Cities.csv','rU') as f:
    rows = csv.DictReader(f)
    for r in rows:
        print r

{'latitude': '57.03', 'city': 'Aalborg', 'temperature': '7.52', 'longitude': '9.92', 'country': 'Denmark'}
{'latitude': '57.17', 'city': 'Aberdeen', 'temperature': '8.10', 'longitude': '-2.08', 'country': 'United Kingdom'}
{'latitude': '63.35', 'city': 'Abisko', 'temperature': '0.20', 'longitude': '18.83', 'country': 'Sweden'}
{'latitude': '36.99', 'city': 'Adana', 'temperature': '18.67', 'longitude': '35.32', 'country': 'Turkey'}
{'latitude': '39.00', 'city': 'Albacete', 'temperature': '12.62', 'longitude': '-1.87', 'country': 'Spain'}
{'latitude': '36.13', 'city': 'Algeciras', 'temperature': '17.38', 'longitude': '-5.47', 'country': 'Spain'}
{'latitude': '49.90', 'city': 'Amiens', 'temperature': '10.17', 'longitude': '2.30', 'country': 'France'}
{'latitude': '52.35', 'city': 'Amsterdam', 'temperature': '8.93', 'longitude': '4.92', 'country': 'Netherlands'}
{'latitude': '43.60', 'city': 'Ancona', 'temperature': '13.52', 'longitude': '13.50', 'country': 'Italy'}
{'latitude': '42.50', '

In [5]:
# Print the city and longitude of all cities with longitude < 0
# Use dictionary format
with open('Cities.csv','rU') as f:
    rows = csv.DictReader(f)
    for r in rows:
        if float(r['longitude']) < 0:
            print r['city'], r['longitude']
# Show what happens without float()

Aberdeen -2.08
Albacete -1.87
Algeciras -5.47
Angers -0.53
Badajoz -6.97
Belfast -5.96
Bilbao -2.93
Birmingham -1.92
Blackpool -3.05
Bordeaux -0.60
Bournemouth -1.90
Bradford -1.75
Braga -8.42
Brest -4.50
Burgos -3.68
Caen -0.35
Cartagena -0.98
Cork -8.50
Dublin -6.25
Dundee -3.00
Edinburgh -3.22
Exeter -3.53
Galway -9.05
Glasgow -4.25
Granada -3.59
Huelva -6.93
Inverness -4.23
Lisbon -9.14
Madrid -3.68
Marbella -4.88
Murcia -1.13
Oviedo -5.83
Salamanca -5.67
Santander -3.80
Swansea -3.95
Valencia -0.40
Vigo -8.73
Zaragoza -0.89


In [6]:
# Same but using list format
with open('Cities.csv','rU') as f:
    rows = csv.reader(f)
    next(rows) # discard header row
    for r in rows:
        if float(r[3]) < 0:
            print r[0], r[3]
# Show what happens without next(rows)

Aberdeen -2.08
Albacete -1.87
Algeciras -5.47
Angers -0.53
Badajoz -6.97
Belfast -5.96
Bilbao -2.93
Birmingham -1.92
Blackpool -3.05
Bordeaux -0.60
Bournemouth -1.90
Bradford -1.75
Braga -8.42
Brest -4.50
Burgos -3.68
Caen -0.35
Cartagena -0.98
Cork -8.50
Dublin -6.25
Dundee -3.00
Edinburgh -3.22
Exeter -3.53
Galway -9.05
Glasgow -4.25
Granada -3.59
Huelva -6.93
Inverness -4.23
Lisbon -9.14
Madrid -3.68
Marbella -4.88
Murcia -1.13
Oviedo -5.83
Salamanca -5.67
Santander -3.80
Swansea -3.95
Valencia -0.40
Vigo -8.73
Zaragoza -0.89


### <font color="green">Your Turn</font>

In [7]:
# Using Countries.csv and reading in dictionary format, find
# all countries that have coastline and are not in the EU.
# Print the countries and their populations.
# Hint: The copy-paste-modify approach to programming
# is highly recommended!
#
with open('Countries.csv', 'rU') as f:
    rows = csv.DictReader(f)
    for r in rows:
        if r['coastline'] == 'yes' and r['EU'] == 'no':
            print r['country'], r['population']

Albania 2.9
Bosnia and Herzegovina 3.8
Iceland 0.33
Montenegro 0.63
Norway 5.27
Turkey 79.62
Ukraine 44.62


### Reading data into Python data structures

In [8]:
# Read Cities.csv data into list of dictionaries
cities = []
with open('Cities.csv','rU') as f:
    rows = csv.DictReader(f)
    for r in rows:
        cities.append(r)
    print cities

[{'latitude': '57.03', 'city': 'Aalborg', 'temperature': '7.52', 'longitude': '9.92', 'country': 'Denmark'}, {'latitude': '57.17', 'city': 'Aberdeen', 'temperature': '8.10', 'longitude': '-2.08', 'country': 'United Kingdom'}, {'latitude': '63.35', 'city': 'Abisko', 'temperature': '0.20', 'longitude': '18.83', 'country': 'Sweden'}, {'latitude': '36.99', 'city': 'Adana', 'temperature': '18.67', 'longitude': '35.32', 'country': 'Turkey'}, {'latitude': '39.00', 'city': 'Albacete', 'temperature': '12.62', 'longitude': '-1.87', 'country': 'Spain'}, {'latitude': '36.13', 'city': 'Algeciras', 'temperature': '17.38', 'longitude': '-5.47', 'country': 'Spain'}, {'latitude': '49.90', 'city': 'Amiens', 'temperature': '10.17', 'longitude': '2.30', 'country': 'France'}, {'latitude': '52.35', 'city': 'Amsterdam', 'temperature': '8.93', 'longitude': '4.92', 'country': 'Netherlands'}, {'latitude': '43.60', 'city': 'Ancona', 'temperature': '13.52', 'longitude': '13.50', 'country': 'Italy'}, {'latitude': 

In [9]:
# Print the city and longitude of all cities with longitude < 0
for city in cities:
    if float(city['longitude']) < 0:
        print city['city'], city['longitude']

Aberdeen -2.08
Albacete -1.87
Algeciras -5.47
Angers -0.53
Badajoz -6.97
Belfast -5.96
Bilbao -2.93
Birmingham -1.92
Blackpool -3.05
Bordeaux -0.60
Bournemouth -1.90
Bradford -1.75
Braga -8.42
Brest -4.50
Burgos -3.68
Caen -0.35
Cartagena -0.98
Cork -8.50
Dublin -6.25
Dundee -3.00
Edinburgh -3.22
Exeter -3.53
Galway -9.05
Glasgow -4.25
Granada -3.59
Huelva -6.93
Inverness -4.23
Lisbon -9.14
Madrid -3.68
Marbella -4.88
Murcia -1.13
Oviedo -5.83
Salamanca -5.67
Santander -3.80
Swansea -3.95
Valencia -0.40
Vigo -8.73
Zaragoza -0.89


In [10]:
# Print each city and whether in EU
# Must join cities with countries
# First read Countries.csv data int list of dictionaries
countries = []
with open('Countries.csv','rU') as f:
    rows = csv.DictReader(f)
    for r in rows:
        countries.append(r)
print countries

[{'EU': 'no', 'country': 'Albania', 'coastline': 'yes', 'population': '2.9'}, {'EU': 'no', 'country': 'Andorra', 'coastline': 'no', 'population': '0.07'}, {'EU': 'yes', 'country': 'Austria', 'coastline': 'no', 'population': '8.57'}, {'EU': 'no', 'country': 'Belarus', 'coastline': 'no', 'population': '9.48'}, {'EU': 'yes', 'country': 'Belgium', 'coastline': 'yes', 'population': '11.37'}, {'EU': 'no', 'country': 'Bosnia and Herzegovina', 'coastline': 'yes', 'population': '3.8'}, {'EU': 'yes', 'country': 'Bulgaria', 'coastline': 'yes', 'population': '7.1'}, {'EU': 'yes', 'country': 'Croatia', 'coastline': 'yes', 'population': '4.23'}, {'EU': 'yes', 'country': 'Cyprus', 'coastline': 'yes', 'population': '1.18'}, {'EU': 'yes', 'country': 'Czech Republic', 'coastline': 'no', 'population': '10.55'}, {'EU': 'yes', 'country': 'Denmark', 'coastline': 'yes', 'population': '5.69'}, {'EU': 'yes', 'country': 'Estonia', 'coastline': 'yes', 'population': '1.31'}, {'EU': 'yes', 'country': 'Finland', 'c

In [11]:
# Now perform join
for city in cities:
    for country in countries:
        if city['country'] == country['country']:
            print city['city'], country['EU']
# add 'break' command to for-loop

Aalborg yes
Aberdeen yes
Abisko yes
Adana no
Albacete yes
Algeciras yes
Amiens yes
Amsterdam yes
Ancona yes
Andorra no
Angers yes
Ankara no
Antalya no
Arad yes
Athens yes
Augsburg yes
Bacau yes
Badajoz yes
Baia Mare yes
Balti no
Barcelona yes
Bari yes
Basel no
Batman no
Belfast yes
Belgrade no
Bergamo yes
Bergen no
Berlin yes
Bialystok yes
Bielefeld yes
Bila Tserkva no
Bilbao yes
Birmingham yes
Blackpool yes
Bodo no
Bologna yes
Bonn yes
Bordeaux yes
Botosani yes
Bournemouth yes
Bradford yes
Braga yes
Braila yes
Bratislava yes
Bremen yes
Brest yes
Brest no
Brno yes
Brugge yes
Bucharest yes
Budapest yes
Burgas yes
Burgos yes
Bursa no
Bydgoszcz yes
Bytom yes
Caen yes
Cambridge yes
Cartagena yes
Catania yes
Chemnitz yes
Cherkasy no
Chernihiv no
Chernivtsi no
Chisinau no
Constanta yes
Cork yes
Cosenza yes
Craiova yes
Daugavpils yes
Debrecen yes
Denizli no
Dijon yes
Dublin yes
Dundee yes
Edinburgh yes
Edirne no
Elbasan no
Elblag yes
Erfurt yes
Erzincan no
Erzurum no
Eskisehir no
Exeter yes
F

In [12]:
# Compute overall average city temperature
temps = [] # create list of all temperatures
with open('Cities.csv','rU') as f:
    rows = csv.DictReader(f)
    for r in rows:
        temps.append(float(r['temperature'])) 
print sum(temps)/len(temps)

9.49784037559


### Computing average directly using NumPy package

In [13]:
import numpy as np

In [14]:
# Compute overall average city temperature - now using np.average
temps = [] # create list of all temperatures
with open('Cities.csv','rU') as f:
    rows = csv.DictReader(f)
    for r in rows:
        temps.append(float(r['temperature'])) 
print np.average(temps)

9.49784037559


In [15]:
# Compute average city temperature for each country
countrylist = [] # list of countries in cities data
for city in cities:
    if city['country'] not in countrylist:
        countrylist.append(city['country'])
# print countrylist
for country in countrylist:
    temps = []
    for city in cities:
        if city['country'] == country:
            temps.append(float(city['temperature']))
    print country, np.average(temps)

Denmark 7.625
United Kingdom 8.65
Sweden 3.58666666667
Turkey 11.7266666667
Spain 14.2383333333
France 10.1511111111
Netherlands 8.75666666667
Italy 13.4746666667
Andorra 9.6
Romania 9.22444444444
Greece 16.9025
Germany 7.86928571429
Moldova 8.415
Switzerland 7.25333333333
Serbia 9.85
Norway 3.726
Poland 7.25
Ukraine 7.42
Portugal 14.47
Slovakia 8.48
Belarus 5.94666666667
Czech Republic 7.85666666667
Belgium 9.65
Hungary 9.6025
Bulgaria 10.44
Ireland 9.3
Latvia 5.27
Albania 15.18
Austria 6.144
Finland 3.4875
Lithuania 6.14333333333
Slovenia 9.27
Montenegro 9.99
Croatia 10.865
Bosnia and Herzegovina 9.6
Macedonia 9.36
Estonia 4.59


### <font color="green">Your Turn</font>

In [16]:
# Determine the average temperature for EU cities and the average
# temperature for non-EU cities, before and after "Brexit". That is,
# for one pair of averages assume the United Kingdom is in the EU,
# and for the other pair assume the United Kingdom is not in the EU.
# Print the four numbers and make sure to label which is which!
# Initial code is provided to reload cities and countries lists:
cities = []
with open('Cities.csv','rU') as f:
    rows = csv.DictReader(f)
    for r in rows:
        cities.append(r)
countries = []
with open('Countries.csv','rU') as f:
    rows = csv.DictReader(f)
    for r in rows:
        countries.append(r)
# Recommended data structures:
UK = [] # temperatures of cities in the United Kingdom
EU = [] # temperatures of cities in an EU country other than the United Kingdom
nonEU = [] # temperatures of cities in a non-EU country other than the UK
# Hint: Start with code in earlier example for joining cities and countries
# Hint: Remember you can combine two lists using "+"
# Less than 10 lines of code are needed, not counting printing
# print countries
# print cities
for country in countries:
    for city in cities:
        if city['country'] == "United Kingdom":
            UK.append(float(city['temperature']))
        elif country['EU'] == 'yes':
            EU.append(float(city['temperature']))
        elif country['EU'] == 'no':
            nonEU.append(float(city['temperature']))

print "EU average before Brexit: ", np.average(EU + UK)
print "nonEU average before brexit", np.average(nonEU)
print "EU average after brexit", np.average(EU)
print "nonEU average after brexit", np.average(nonEU + UK)

EU average before Brexit:  9.47003531786
nonEU average before brexit 9.55295
EU average after brexit 9.55295
nonEU average after brexit 9.41391708968


### Minimum and maximum

In [17]:
# Overall minimum and maximum temperatures
temps = [] # create list of all temperatures
with open('Cities.csv','rU') as f:
    rows = csv.DictReader(f)
    for r in rows:
        temps.append(float(r['temperature']))
print 'Minimum:', min(temps)
print 'Maximum:', max(temps)

Minimum: -2.2
Maximum: 18.67


In [18]:
# Alternative method
minval = 100.00 # greater than any possible minimum
maxval = -100.00 # smaller than any possible maximum
for city in cities:
    if float(city['temperature']) < minval:
        minval = float(city['temperature'])
    if float(city['temperature']) > maxval:
        maxval = float(city['temperature'])
print 'Minimum:', minval
print 'Maximum:', maxval

Minimum: -2.2
Maximum: 18.67


### <font color="green">Your Turn</font>

In [19]:
# Determine which country has the lowest average city temperature
# and which country has the highest average city temperature.
# Print the two countries and their average temperatures.
# Hint: Start with code above that computes average temperatures
# for each country, then incorporate the running min/max method.
#
# Try your hand at this yourself : Elijah

### <font color="green">Your Turn: World Cup Data</font>

In [20]:
# What player on a team with “ia” in the team name played less than
# 200 minutes and made more than 100 passes? Print the player surname.
# Note: In Python, use "'abc' in s" to check whether string s contains'abc'
# Reminder: Convert minutes and passes to integers before comparing to values
players = []
with open('Players.csv','rU') as f:
    rows = csv.DictReader(f)
    for r in rows:
        players.append(r)
teams = []
with open('Teams.csv','rU') as f:
    rows = csv.DictReader(f)
    for r in rows:
        teams.append(r)
# print players
# print teams
for team in teams:
    for player in players:
        if 'ia' in team['team'] and player['team'] == team['team'] and int(player['minutes']) < 200 and int(player['passes']) > 100:
            print "Player is:", player['surname']

Player is: Kuzmanovic


In [21]:
# What is the average number of passes made by forwards? By midfielders?
passes1 = []
passes2 = []
for player in players:
    if player['position'] == 'forward':
        passes1.append(int(player['passes']))
    elif player['position'] == 'midfielder':
        passes2.append(int(player['passes']))
print "Average passes by forwards: ", np.average(passes1)
print "Average passes by midfielders: ", np.average(passes2)

Average passes by forwards:  50.8251748252
Average passes by midfielders:  95.2719298246


In [22]:
# Which team has the highest ratio of goalsFor to goalsAgainst?
# Print the team only.
# Reminder: Use float() to make sure you're doing floating point division
# Hint: Use two variables to keep track of highest ratio seen so far
# and team with that ratio:
ratio = 0 # highest ratio seen so far
team = '' # team with highest ratio
for this_team in teams:
    new_ratio = float(this_team['goalsFor']) / float(this_team['goalsAgainst'])
    if new_ratio > ratio:
        ratio = new_ratio
        team = this_team['team']

print "Highest Ratio is:", ratio, "and Team is:", team

Highest Ratio is: 7.0 and Team is: Portugal


In [23]:
# How many players who play on a team with ranking <10 played
# more than 350 minutes?
# Reminder: Convert ranking and minutes to integers before comparing to values
# Hint: Compute join of Players and Teams, using a variable to count number of
# players satisfying requirement
count = 0
for team in teams:
    for player in players:
        if int(team['ranking']) < 10 and player['team'] == team['team'] and int(player['minutes']) > 350:
            count += 1

print "Count: ", count

Count:  54


In [25]:
# BONUS!
# Write a loop that interactively asks the user to enter a team name.
# If the team exists, print how many games the team played, how many
# yellow cards and red cards the team had, and the average number of
# minutes played by players on that team.
# If the team doesn't exist, print "Team not in 2010 World Cup".
# If 'quit' is entered, terminate the loop.
# Note: To read a string from the user instead of a number, use
# raw_input() instead of input()
team_name = "hi"
team_names = []

for team in teams:
    team_names.append(team['team'])

while team_name != "quit":
    team_name = raw_input("Enter a team name: ")
    for team in teams:
        if team_name == team['team']:
            print "Team name: ", team['team']
            print "Games played:", team['games']
            print "Yellow cards:", team['yellowCards']
            playeraverage = []
            for player in players:
                if player['team'] == team['team']:
                    playeraverage.append(int(player['minutes']))
            print "Average minutes of players on team: ", np.average(playeraverage), "\n"
    if team_name not in team_names and team_name != "quit":
        print "Team not in 2010 World Cup"

Enter a team name: Brazil
Team name:  Brazil
Games played: 5
Yellow cards: 7
Average minutes of players on team:  259.526315789 

Enter a team name: Ghana
Team name:  Ghana
Games played: 5
Yellow cards: 11
Average minutes of players on team:  295.263157895 

Enter a team name: Nigeria
Team name:  Nigeria
Games played: 3
Yellow cards: 5
Average minutes of players on team:  161.833333333 

Enter a team name: France
Team name:  France
Games played: 3
Yellow cards: 6
Average minutes of players on team:  152.894736842 

Enter a team name: Morocco
Team not in 2010 World Cup
Enter a team name: quit
