### **Data Manipulation in Python**

<font color="red">File access required:</font> In Colab this notebook requires first uploading files **Cities.csv**, **Countries.csv**, **Players.csv**, and **Teams.csv** using the *Files* feature in the left toolbar. If running the notebook on a local computer, simply ensure these files are in the same workspace as the notebook.

### Reading data from CSV files using csv package

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Set-up
import csv
import numpy as np

In [None]:
# Use csv package 'DictReader' to read Cities.csv data
# After header, data is read row-by-row into dictionary format
# Note all values are read as strings
with open('/content/drive/MyDrive/Google Colab/Basic & Advanced SQL/Cities.csv') as f:
    rows = csv.DictReader(f)
    for r in rows:
        print(dict(r))

{'city': 'Aalborg', 'country': 'Denmark', 'latitude': '57.03', 'longitude': '9.92', 'temperature': '7.52'}
{'city': 'Aberdeen', 'country': 'United Kingdom', 'latitude': '57.17', 'longitude': '-2.08', 'temperature': '8.10'}
{'city': 'Abisko', 'country': 'Sweden', 'latitude': '63.35', 'longitude': '18.83', 'temperature': '0.20'}
{'city': 'Adana', 'country': 'Turkey', 'latitude': '36.99', 'longitude': '35.32', 'temperature': '18.67'}
{'city': 'Albacete', 'country': 'Spain', 'latitude': '39.00', 'longitude': '-1.87', 'temperature': '12.62'}
{'city': 'Algeciras', 'country': 'Spain', 'latitude': '36.13', 'longitude': '-5.47', 'temperature': '17.38'}
{'city': 'Amiens', 'country': 'France', 'latitude': '49.90', 'longitude': '2.30', 'temperature': '10.17'}
{'city': 'Amsterdam', 'country': 'Netherlands', 'latitude': '52.35', 'longitude': '4.92', 'temperature': '8.93'}
{'city': 'Ancona', 'country': 'Italy', 'latitude': '43.60', 'longitude': '13.50', 'temperature': '13.52'}
{'city': 'Andorra', 'co

In [None]:
# Print the city and longitude of all cities with longitude < 0
with open('/content/drive/MyDrive/Google Colab/Basic & Advanced SQL/Cities.csv') as f:
    rows = csv.DictReader(f)
    for r in rows:
        if float(r['longitude']) < 0:
            print(r['city'], r['longitude'])
# Show what happens without float()

Aberdeen -2.08
Albacete -1.87
Algeciras -5.47
Angers -0.53
Badajoz -6.97
Belfast -5.96
Bilbao -2.93
Birmingham -1.92
Blackpool -3.05
Bordeaux -0.60
Bournemouth -1.90
Bradford -1.75
Braga -8.42
Brest -4.50
Burgos -3.68
Caen -0.35
Cartagena -0.98
Cork -8.50
Dublin -6.25
Dundee -3.00
Edinburgh -3.22
Exeter -3.53
Galway -9.05
Glasgow -4.25
Granada -3.59
Huelva -6.93
Inverness -4.23
Lisbon -9.14
Madrid -3.68
Marbella -4.88
Murcia -1.13
Oviedo -5.83
Salamanca -5.67
Santander -3.80
Swansea -3.95
Valencia -0.40
Vigo -8.73
Zaragoza -0.89


### <font color="green">**Your Turn**</font>

In [None]:
# Using csv package 'DictReader' to read Countries.csv data,
# find all countries that have coastline and are not in the EU;
# print the countries and their populations
# Note: In Python, use '==' to test equality
with open('/content/drive/MyDrive/Google Colab/Basic & Advanced SQL/Countries.csv') as f:
    rows = csv.DictReader(f)
    for r in rows:
        if r['coastline'] == 'yes' and r['EU'] == 'no':
            print(r['country'], r['population'])

Albania 2.9
Bosnia and Herzegovina 3.8
Iceland 0.33
Montenegro 0.63
Norway 5.27
Turkey 79.62
Ukraine 44.62


### Reading data into Python data structures

In [None]:
# Read Cities.csv data into list of dictionaries
cities = []
with open('/content/drive/MyDrive/Google Colab/Basic & Advanced SQL/Cities.csv') as f:
    rows = csv.DictReader(f)
    for r in rows:
        cities.append(dict(r))
cities

[{'city': 'Aalborg',
  'country': 'Denmark',
  'latitude': '57.03',
  'longitude': '9.92',
  'temperature': '7.52'},
 {'city': 'Aberdeen',
  'country': 'United Kingdom',
  'latitude': '57.17',
  'longitude': '-2.08',
  'temperature': '8.10'},
 {'city': 'Abisko',
  'country': 'Sweden',
  'latitude': '63.35',
  'longitude': '18.83',
  'temperature': '0.20'},
 {'city': 'Adana',
  'country': 'Turkey',
  'latitude': '36.99',
  'longitude': '35.32',
  'temperature': '18.67'},
 {'city': 'Albacete',
  'country': 'Spain',
  'latitude': '39.00',
  'longitude': '-1.87',
  'temperature': '12.62'},
 {'city': 'Algeciras',
  'country': 'Spain',
  'latitude': '36.13',
  'longitude': '-5.47',
  'temperature': '17.38'},
 {'city': 'Amiens',
  'country': 'France',
  'latitude': '49.90',
  'longitude': '2.30',
  'temperature': '10.17'},
 {'city': 'Amsterdam',
  'country': 'Netherlands',
  'latitude': '52.35',
  'longitude': '4.92',
  'temperature': '8.93'},
 {'city': 'Ancona',
  'country': 'Italy',
  'lati

In [None]:
# Read Countries.csv data into list of dictionaries
countries = []
with open('/content/drive/MyDrive/Google Colab/Basic & Advanced SQL/Countries.csv') as f:
    rows = csv.DictReader(f)
    for r in rows:
        countries.append(dict(r))
countries

[{'country': 'Albania', 'population': '2.9', 'EU': 'no', 'coastline': 'yes'},
 {'country': 'Andorra', 'population': '0.07', 'EU': 'no', 'coastline': 'no'},
 {'country': 'Austria', 'population': '8.57', 'EU': 'yes', 'coastline': 'no'},
 {'country': 'Belarus', 'population': '9.48', 'EU': 'no', 'coastline': 'no'},
 {'country': 'Belgium',
  'population': '11.37',
  'EU': 'yes',
  'coastline': 'yes'},
 {'country': 'Bosnia and Herzegovina',
  'population': '3.8',
  'EU': 'no',
  'coastline': 'yes'},
 {'country': 'Bulgaria', 'population': '7.1', 'EU': 'yes', 'coastline': 'yes'},
 {'country': 'Croatia', 'population': '4.23', 'EU': 'yes', 'coastline': 'yes'},
 {'country': 'Cyprus', 'population': '1.18', 'EU': 'yes', 'coastline': 'yes'},
 {'country': 'Czech Republic',
  'population': '10.55',
  'EU': 'yes',
  'coastline': 'no'},
 {'country': 'Denmark', 'population': '5.69', 'EU': 'yes', 'coastline': 'yes'},
 {'country': 'Estonia', 'population': '1.31', 'EU': 'yes', 'coastline': 'yes'},
 {'countr

In [None]:
# Print the city and longitude of all cities with longitude < 0
for city in cities:
    if float(city['longitude']) < 0:
        print(city['city'], city['longitude'])

Aberdeen -2.08
Albacete -1.87
Algeciras -5.47
Angers -0.53
Badajoz -6.97
Belfast -5.96
Bilbao -2.93
Birmingham -1.92
Blackpool -3.05
Bordeaux -0.60
Bournemouth -1.90
Bradford -1.75
Braga -8.42
Brest -4.50
Burgos -3.68
Caen -0.35
Cartagena -0.98
Cork -8.50
Dublin -6.25
Dundee -3.00
Edinburgh -3.22
Exeter -3.53
Galway -9.05
Glasgow -4.25
Granada -3.59
Huelva -6.93
Inverness -4.23
Lisbon -9.14
Madrid -3.68
Marbella -4.88
Murcia -1.13
Oviedo -5.83
Salamanca -5.67
Santander -3.80
Swansea -3.95
Valencia -0.40
Vigo -8.73
Zaragoza -0.89


In [None]:
# Print all cities that are not in the EU
# Requires joining cities and countries
for city in cities:
    for country in countries:
        if city['country'] == country['country'] and country['EU'] == 'no':
            print(city['city'], '-', city['country'])

Adana - Turkey
Andorra - Andorra
Ankara - Turkey
Antalya - Turkey
Balti - Moldova
Basel - Switzerland
Batman - Turkey
Belgrade - Serbia
Bergen - Norway
Bila Tserkva - Ukraine
Bodo - Norway
Brest - Belarus
Bursa - Turkey
Cherkasy - Ukraine
Chernihiv - Ukraine
Chernivtsi - Ukraine
Chisinau - Moldova
Denizli - Turkey
Edirne - Turkey
Elbasan - Albania
Erzincan - Turkey
Erzurum - Turkey
Eskisehir - Turkey
Gaziantep - Turkey
Geneva - Switzerland
Horlivka - Ukraine
Hrodna - Belarus
Istanbul - Turkey
Karaman - Turkey
Kayseri - Turkey
Kherson - Ukraine
Kiev - Ukraine
Kremenchuk - Ukraine
Kryvyy Rih - Ukraine
Lvov - Ukraine
Makiyivka - Ukraine
Malatya - Turkey
Manisa - Turkey
Mazyr - Belarus
Minsk - Belarus
Nis - Serbia
Novi Sad - Serbia
Ordu - Turkey
Orsha - Belarus
Oslo - Norway
Pinsk - Belarus
Podgorica - Montenegro
Rivne - Ukraine
Samsun - Turkey
Sarajevo - Bosnia and Herzegovina
Siirt - Turkey
Sivas - Turkey
Skopje - Macedonia
Stavanger - Norway
Sumy - Ukraine
Tarsus - Turkey
Tekirdag - Tur

### Aggregation

In [None]:
# Compute overall average city temperature
temps = [] # create list of all temperatures
for city in cities:
    temps.append(float(city['temperature']))
# print(temps)
print(np.average(temps))

9.497840375586854


In [None]:
# Alternative using running sum and count
sum = 0
count = 0
for city in cities:
    sum += float(city['temperature'])
    count += 1
print(sum/count)

9.497840375586858


In [None]:
# Compute average city temperature for each country
# First compute list of countries
countryList = []
for city in cities:
    if city['country'] not in countryList:
        countryList.append(city['country'])
# print(countryList)
# Then compute average temperature for each
for country in countryList:
    temps = []
    for city in cities:
        if city['country'] == country:
            temps.append(float(city['temperature']))
    print(country, np.average(temps))

Denmark 7.625
United Kingdom 8.649999999999999
Sweden 3.5866666666666673
Turkey 11.726666666666667
Spain 14.238333333333332
France 10.151111111111112
Netherlands 8.756666666666668
Italy 13.474666666666668
Andorra 9.6
Romania 9.224444444444444
Greece 16.9025
Germany 7.8692857142857155
Moldova 8.415
Switzerland 7.253333333333333
Serbia 9.85
Norway 3.7260000000000004
Poland 7.25
Ukraine 7.420000000000001
Portugal 14.469999999999999
Slovakia 8.48
Belarus 5.946666666666666
Czech Republic 7.8566666666666665
Belgium 9.65
Hungary 9.6025
Bulgaria 10.44
Ireland 9.299999999999999
Latvia 5.27
Albania 15.18
Austria 6.144
Finland 3.4875
Lithuania 6.1433333333333335
Slovenia 9.27
Montenegro 9.99
Croatia 10.865
Bosnia and Herzegovina 9.6
Macedonia 9.36
Estonia 4.59


In [None]:
# Compute overall minimum and maximum city temperatures
temps = []
for city in cities:
    temps.append(float(city['temperature']))
print('Minimum:', min(temps))
print('Maximum:', max(temps))

Minimum: -2.2
Maximum: 18.67


In [None]:
# Alternative method using running min and max
minval = float(cities[0]['temperature'])
maxval = float(cities[0]['temperature'])
for city in cities[1:]:
    if float(city['temperature']) < minval:
        minval = float(city['temperature'])
    if float(city['temperature']) > maxval:
        maxval = float(city['temperature'])
print('Minimum:', minval)
print('Maximum:', maxval)

Minimum: -2.2
Maximum: 18.67


### <font color="green">**Your Turn**</font>

In [None]:
# Find the minimum, maximum, and average temperatures of
# cities that are in the EU, and the minimum, maximum, and average
# temperatures of cities that are not in the EU
#
# Hint: You will need to "join" cities and countries using one loop inside
#   another as seen in an earlier example

EU = []
nonEU = []

for city in cities:
    for country in countries:
        if city['country'] == country['country']:
            EU.append(float(city['temperature']))
        else:
            nonEU.append(float(city['temperature']))

# # Once the lists are populated, the following code prints the results
print('EU:    ', 'minimum', min(EU), 'maximum', max(EU), 'average', np.average(EU))
print('non-EU:', 'minimum', min(nonEU), 'maximum', max(nonEU), 'average', np.average(nonEU))

EU:     minimum -2.2 maximum 18.67 average 9.497840375586854
non-EU: minimum -2.2 maximum 18.67 average 9.497840375586854


### <font color="green">**Your Turn: World Cup Data**</font>

In [6]:
# Read Players.csv and Teams.csv into lists of dictionaries
players = []
with open('/content/drive/MyDrive/Google Colab/Basic & Advanced SQL/Players.csv') as f:
    rows = csv.DictReader(f)
    for r in rows:
        players.append(dict(r))
teams = []
with open('/content/drive/MyDrive/Google Colab/Basic & Advanced SQL/Teams.csv') as f:
    rows = csv.DictReader(f)
    for r in rows:
        teams.append(dict(r))

In [None]:
# Show first two items in players list
players[:2]

[{'surname': 'Abdoun',
  'team': 'Algeria',
  'position': 'midfielder',
  'minutes': '16',
  'shots': '0',
  'passes': '6',
  'tackles': '0',
  'saves': '0'},
 {'surname': 'Belhadj',
  'team': 'Algeria',
  'position': 'defender',
  'minutes': '270',
  'shots': '1',
  'passes': '146',
  'tackles': '8',
  'saves': '0'}]

In [None]:
# Show first two items in teams list
teams[:2]

[{'team': 'Brazil',
  'ranking': '1',
  'games': '5',
  'wins': '3',
  'draws': '1',
  'losses': '1',
  'goalsFor': '9',
  'goalsAgainst': '4',
  'yellowCards': '7',
  'redCards': '2'},
 {'team': 'Spain',
  'ranking': '2',
  'games': '6',
  'wins': '5',
  'draws': '0',
  'losses': '1',
  'goalsFor': '7',
  'goalsAgainst': '2',
  'yellowCards': '3',
  'redCards': '0'}]

####*Problems*

In [None]:
# What player on a team with “ia” in the team name played less than
# 200 minutes and made more than 100 passes? Print the player surname.
# Note: In Python, use "'abc' in s" to check whether string s contains'abc'
# Reminder: Convert minutes and passes to integers before comparing to values
for player in players:
    if 'ia' in player['team'] and int(player['minutes']) < 200 and\
        int(player['passes']) > 100:
            print(player['surname'])

Kuzmanovic


In [None]:
# What is the average number of passes made by forwards? By midfielders?
# Make sure to label which is which.
forwards = []
midfielders = []
for player in players:
    if player['position'] == 'forward':
        forwards.append(int(player['passes']))
    elif player['position'] == 'midfielder':
        midfielders.append(int(player['passes']))
print('Average passes by forwards:', np.average(forwards))
print('Average passes by midfielders:', np.average(midfielders))

Average passes by forwards: 50.82517482517483
Average passes by midfielders: 95.2719298245614


In [8]:
# Which team has the highest ratio of goalsFor to goalsAgainst?
# Print the team only.
# Reminder: Use float() to make sure you're doing floating point division
# Hint: Use two variables to keep track of highest ratio seen so far
# and team with that ratio:
ratio = 0 # highest ratio seen so far
ratioteam = '' # team with highest ratio
for team in teams:
    nextratio = float(team['goalsFor'])/float(team['goalsAgainst'])
    if nextratio > ratio:
        ratio = nextratio
        ratioteam = team['team']
print(ratioteam)

Portugal


In [None]:
# How many players who play on a team with ranking <10 played
# more than 350 minutes?
# Reminder: Convert ranking and minutes to integers before comparing to values
# Hint: Compute join of players and teams, using a variable to count number of
# players satisfying requirement
for player in players:
  for team in teams:
    if player['team'] == team['team']:
      if int(team['ranking']) < 10 and int(player['minutes']) > 350:
        count += 1
print(count)

54


### <font color="green">**Your Turn Extra: Titanic Data**</font>

<font color="red">File access required:</font> In Colab these extra problems require first uploading **Titanic.csv** using the *Files* feature in the left toolbar. If running the notebook on a local computer, simply ensure this file is in the same workspace as the notebook.

In [None]:
# Read Titanic.csv into list of dictionaries
titanic = []
with open('/content/drive/MyDrive/Google Colab/Basic & Advanced SQL/Titanic.csv') as f:
    rows = csv.DictReader(f)
    for r in rows:
        titanic.append(dict(r))

In [None]:
# Show first 5 items in titanic list
titanic[:5]

[{'last': 'Abbing',
  'first': 'Mr. Anthony',
  'gender': 'M',
  'age': '42',
  'class': '3',
  'fare': '7.55',
  'embarked': 'Southampton',
  'survived': 'no'},
 {'last': 'Abbott',
  'first': 'Mrs. Stanton (Rosa Hunt)',
  'gender': 'F',
  'age': '35',
  'class': '3',
  'fare': '20.25',
  'embarked': 'Southampton',
  'survived': 'yes'},
 {'last': 'Abbott',
  'first': 'Mr. Rossmore Edward',
  'gender': 'M',
  'age': '16',
  'class': '3',
  'fare': '20.25',
  'embarked': 'Southampton',
  'survived': 'no'},
 {'last': 'Abelson',
  'first': 'Mr. Samuel',
  'gender': 'M',
  'age': '30',
  'class': '2',
  'fare': '24.00',
  'embarked': 'Cherbourg',
  'survived': 'no'},
 {'last': 'Abelson',
  'first': 'Mrs. Samuel (Hannah Wizosky)',
  'gender': 'F',
  'age': '28',
  'class': '2',
  'fare': '24.00',
  'embarked': 'Cherbourg',
  'survived': 'yes'}]

In [None]:
# How many married women over the specified age threshold embarked in Cherbourg?
# Make sure to try different thresholds.
# Note: In Python, use "'abc' in s" to check whether string s contains 'abc'
# Note: You will need to account for the fact that some ages are the empty string ''
# Reminder: Convert non-blank ages to float before comparing to a value
age_threshold = 50
count = 0
for t in titanic:
    if t['age'] != '':
        if float(t['age']) > age_threshold and 'Mrs.' in t['first'] and t['embarked'] == 'Cherbourg':
            count += 1
print(count, 'married women over age', age_threshold, 'embarked in Cherbourg')

4 married women over age 50 embarked in Cherbourg


In [None]:
# What is the average fare paid by passengers in the three classes, and the
# average age of passengers in the three classes (ignoring missing ages)?
import numpy as np
class1fares = []
class1ages = []
class2fares = []
class2ages = []
class3fares = []
class3ages = []
for t in titanic:
    if t['class'] == '1':
        class1fares.append(float(t['fare']))
        if t['age'] != '': class1ages.append(float(t['age']))
    if t['class'] == '2':
        class2fares.append(float(t['fare']))
        if t['age'] != '': class2ages.append(float(t['age']))
    if t['class'] == '3':
        class3fares.append(float(t['fare']))
        if t['age'] != '': class3ages.append(float(t['age']))
print('First class average fare is', np.average(class1fares), 'and average age is', np.average(class1ages))
print('Second class average fare is', np.average(class2fares), 'and average age is', np.average(class2ages))
print('Third class average fare is', np.average(class3fares), 'and average age is', np.average(class3ages))

First class average fare is 84.15500000000002 and average age is 38.233440860215055
Second class average fare is 20.662228260869565 and average age is 29.87763005780347
Third class average fare is 13.676863543788187 and average age is 25.14061971830986


In [None]:
# Find the survival rate for passengers in the three different classes,
# i.e., what fraction of passengers in each class survived? Also find
# the survival rate for males versus females, and for children (age < 18)
# versus adults (age >= 18) ignoring passengers whose age is missing.
class1all = 0
class1survived = 0
class2all = 0
class2survived = 0
class3all = 0
class3survived = 0
femaleall = 0
femalesurvived = 0
maleall = 0
malesurvived = 0
childall = 0
childsurvived = 0
adultall = 0
adultsurvived = 0
for t in titanic:
    if t['class'] == '1':
        class1all += 1
        if t['survived'] == 'yes': class1survived += 1
    if t['class'] == '2':
        class2all += 1
        if t['survived'] == 'yes': class2survived += 1
    if t['class'] == '3':
        class3all += 1
        if t['survived'] == 'yes': class3survived += 1
    if t['gender'] == 'F':
        femaleall += 1
        if t['survived'] == 'yes': femalesurvived += 1
    if t['gender'] == 'M':
        maleall += 1
        if t['survived'] == 'yes': malesurvived += 1
    if t['age'] != '' and float(t['age']) < 18:
        childall += 1
        if t['survived'] == 'yes': childsurvived += 1
    if t['age'] != '' and float(t['age']) >= 18:
        adultall += 1
        if t['survived'] == 'yes': adultsurvived += 1
print('Survival rate in first class:', class1survived/class1all)
print('Survival rate in second class:', class2survived/class2all)
print('Survival rate in third class:', class3survived/class3all)
print('Survival rate for females:', femalesurvived/femaleall)
print('Survival rate for males:', malesurvived/maleall)
print('Survival rate for children:', childsurvived/childall)
print('Survival rate for adults', adultsurvived/adultall)

Survival rate in first class: 0.6296296296296297
Survival rate in second class: 0.47282608695652173
Survival rate in third class: 0.24236252545824846
Survival rate for females: 0.7420382165605095
Survival rate for males: 0.18890814558058924
Survival rate for children: 0.5398230088495575
Survival rate for adults 0.3810316139767055


In [None]:
# Find pairs of passengers who are likely to be twin children: same
# last name, same age, same embarkation, and age is under 18. Print
# each pair once, including their names, age, and mbarkation city.

for t1 in titanic:
    for t2 in titanic:
        if t1['last'] == t2['last'] and t1['first'] < t2['first']\
        and t1['age'] == t2['age'] and t1['embarked'] == t2['embarked']\
        and t1['age'] != '' and float(t1['age']) < 18:
            print(t1['first'], t1['last'], 'and', t2['first'], t2['last'],\
                 'are', t1['age'], 'years old and embarked in', t1['embarked'])

Miss Eugenie Baclini and Miss Helene Barbara Baclini are 0.75 years old and embarked in Cherbourg
Mr. Jovo Calic and Mr. Petar Calic are 17 years old and embarked in Southampton
