In [26]:
import unicodecsv
import pandas as pd

# function read csv
def read_csv(filename):
    with open(filename, 'rb') as f:
        reader = unicodecsv.DictReader(f)
        return list(reader)

#load CSV as list
people_filename = './data/people.csv'
people = read_csv(people_filename)
df_people = pd.DataFrame(people)
df_people.head()

Unnamed: 0,32,32.betterworse,Age,Category,Country 1,Country 2,Gender,Job,Number of kids,Place of living,REF,Type of school,Years of studies
0,,KO,30,Adult,Australia,,F,Phd Student,0,Sydney,AUS001,"Law, Health",10
1,,KO,32,Adult,China,,F,Former Consultant,0,Paris,CHN01,Engineering,5
2,,KO,69,Adult,France,,F,Retiree,3,Cucq,FR001,,1
3,,KO,58,Adult,France,,M,Farmer,2,La Calotterie,FR002,,0
4,,KO,28,Adult,Madagascar,,F,Singer,0,Paris,MAD001,,3


In [27]:
#List unique values in Place column
places = df_people["Place of living"].unique()
print(len(places))

78


In [28]:
# Get population from Wikidata from city (Place of living attributes)
# use PY wikibot : Wiki API & wikipedia python package
# Part 1 - Function definition
import pywikibot
site = pywikibot.Site('en', 'wikipedia')  # The site we want to run our bot on
repo = site.data_repository()

# Get English label from ItemPage
def getLabel(item):
    return item.get()["labels"]["en"]

def getClaims(pageName):
    try:
        page = pywikibot.Page(site, pageName)
        item = pywikibot.ItemPage.fromPage(page)
        item_dict = item.get()
        claims = item_dict["claims"]
        #dict_keys(['aliases', 'labels', 'sitelinks', 'descriptions', 'claims'])
        #return item_dict["claims"] # Get the claim dictionary
    except:
        #print("page doesn't exist : " + pageName)
        claims = 0
        #return 0
    return claims

#get claim label by ID, ONLY FOR UNIQUE CLAIMS AS CONTINENT FOR A CITY
def getClaimLabelbyID(claim_dict,claimID):
    try:
        first_claim = claim_dict[claimID].pop()
        #get the associated value
        first_claim_value = first_claim.getTarget() 
        #print(first_claim_value)
        continent = getLabel(first_claim_value)
    except:
        #print("no ID found " + claimID)
        continent = 0
    return continent 

# Get continent name from city claim dictionnary
def getContinent(claim_dict):
    #claim_dict = getClaims(cityName)
    label = getClaimLabelbyID(claim_dict,"P30")
    return label

# Return True if city is a capital from city claim dictionnary
def isCapital(claim_dict):
    #claim_dict = getClaims(cityName)
    instances_list = claim_dict["P31"]
    isCapital = False
    for i in instances_list: 
        target = i.getTarget()
        targetLabel = getLabel(target)
        #print(targetLabel)
        if targetLabel == "capital":
            isCapital = True
        #print(isCapital)
    return isCapital

def getYear(e):
  return e['year']

# Return most recent population count from city claim dictionnary
def getMostRecentPopulation(claim_dict):
    
    try:
        #clm_dict = getClaims(cityName)
        population_list = claim_dict["P1082"]

        #date format is either +00000002012-00-00T00:00:00Z or +00000002012-01-01T00:00:00Z 
        #regular date parsing cannot handle both case

        populationTemp = []
        for pop in population_list:
            popJSON = pop.toJSON()
            dateString = popJSON["qualifiers"]["P585"][0]["datavalue"]["value"]["time"]
            yearValue = dateString[8:12]   
            populationValue = int(popJSON["mainsnak"]["datavalue"]["value"]["amount"])
            populationTemp.append({'year': yearValue, 'population': populationValue})
            #print(yearValue)
            #print(populationValue)

        # sort the array on the year attribute
        populationTemp.sort(key=getYear)
        #print(populationTemp)
        # in order to pop the most recent year
        mostRecentPopulation = populationTemp.pop()['population']
    except:
        print("ID not found : P1082 (population)")
        mostRecentPopulation = 0
    return mostRecentPopulation



In [31]:
# Get population from Wikidata from city (Place of living attributes)
# use PY wikibot : Wiki API & wikipedia python package
# Part 2 - Application

places_characteristics = []

for city in places:
    #print(city) 
    place_claim_dict = getClaims(city) # return 0 if not found
    if place_claim_dict != 0:
        #print(place_claim_dict)
        capital = isCapital(place_claim_dict)
        pop = getMostRecentPopulation(place_claim_dict)
        if pop == 0:
            pop = "Not found"
            #cityType = "Unknown"
        else:
            cityType = getCityType(pop)
        places_characteristics.append({'Place of living': city, 'capital': capital, 'population': pop})
    else:
        places_characteristics.append({'Place of living': city, 'capital': "City not found", 'population': "City not found"})

print(len(places_characteristics))
df_places = pd.DataFrame(places_characteristics)
df_places.head()

ID not found : P1082 (population)
ID not found : P1082 (population)
ID not found : P1082 (population)
ID not found : P1082 (population)
ID not found : P1082 (population)
ID not found : P1082 (population)
ID not found : P1082 (population)
ID not found : P1082 (population)
ID not found : P1082 (population)
ID not found : P1082 (population)
ID not found : P1082 (population)
ID not found : P1082 (population)
ID not found : P1082 (population)
ID not found : P1082 (population)
ID not found : P1082 (population)
ID not found : P1082 (population)
ID not found : P1082 (population)
ID not found : P1082 (population)
ID not found : P1082 (population)
ID not found : P1082 (population)
ID not found : P1082 (population)
ID not found : P1082 (population)
ID not found : P1082 (population)
ID not found : P1082 (population)
ID not found : P1082 (population)
ID not found : P1082 (population)
ID not found : P1082 (population)
78


Unnamed: 0,Place of living,capital,population
0,Sydney,True,4840600
1,Paris,True,2206488
2,Cucq,False,5123
3,La Calotterie,False,649
4,Singapore,False,5888926


In [32]:
df_cities = pd.DataFrame(places_characteristics)
df_cities.head()

print("Cities not found in Wikidata")
print(df_cities[df_cities['population'] == "City not found"])
print("\n")

print("Cities found, with no population info")
print(df_cities[df_cities['population'] == "Not found"])
print("\n")

print("Cities found with population")
print(df_cities[(df_cities['population'] != "Not found") & (df_cities['population'] != "City not found")])

df_cities.to_csv('citiesList.csv', index=False)

Cities not found in Wikidata
       Place of living         capital      population
32           Palmeiras  City not found  City not found
33         Jericocoara  City not found  City not found
34             Quindio  City not found  City not found
36               Minca  City not found  City not found
41         Forth Worth  City not found  City not found
51           Soft Gobi  City not found  City not found
52         Middle Gobi  City not found  City not found
57  Ayervatty Division  City not found  City not found
59         Countryside  City not found  City not found
62               Nimmu  City not found  City not found
73           Menagesha  City not found  City not found


Cities found, with no population info
   Place of living capital population
5          Sologne   False  Not found
6          Valence   False  Not found
7         Tel Aviv   False  Not found
13       Chantilly   False  Not found
14         Córdoba   False  Not found
15      San Martín   False  Not found
16   