In [237]:
import pandas as pd
import numpy as np
import geopandas

In [238]:
# Load the Talence IRIS polygons into a GeoDataFrame and preview the first rows.
iris_talence_latlon = geopandas.GeoDataFrame.from_file('original_data/iris/iris_talence.js')
iris_talence_latlon.head()

Unnamed: 0,DCOMIRIS,DEPCOM,IRIS,NOM_COM,NOM_IRIS,TYP_IRIS,geometry
0,335220112,33522,112,Talence,Thouars 2,H,POLYGON ((-0.587543196053298 44.78908756888363...
1,335220102,33522,102,Talence,La Fauvette,H,POLYGON ((-0.587032337621043 44.81515751477936...
2,335220108,33522,108,Talence,Peylanne-Leysotte,H,POLYGON ((-0.5828804443461369 44.8048544065047...
3,335220103,33522,103,Talence,Combattants-Bijou,H,POLYGON ((-0.590734733544139 44.78731246999818...
4,335220105,33522,105,Talence,Cauderes,H,POLYGON ((-0.587032337621043 44.81515751477936...


In [239]:
# Read the 2012 resident-activity spreadsheet and preview it. The first 5 rows
# are skipped (presumably descriptive header rows above the column titles --
# verify against the file).
activite_resident = pd.read_excel('original_data/iris/activite-residents-2012.xls', skiprows=5)
activite_resident.head()

Unnamed: 0,IRIS,REG,REG2016,DEP,UU2010,COM,LIBCOM,TRIRIS,GRD_QUART,LIBIRIS,...,P12_ACTOCC15P_ILT3,P12_ACTOCC15P_ILT4,P12_ACTOCC15P_ILT5,P12_ACTOCC15P_ILT45D,C12_ACTOCC15P,C12_ACTOCC15P_PAS,C12_ACTOCC15P_MAR,C12_ACTOCC15P_DROU,C12_ACTOCC15P_VOIT,C12_ACTOCC15P_TCOM
0,10010000,82,84,1,1000,1001,L'Abergement-Clémenciat,ZZZZZZ,100100,L'Abergement-Clémenciat (commune non irisée),...,83.676923,14.942308,0.0,,338.692308,31.876923,7.969231,3.984615,290.876923,3.984615
1,10020000,82,84,1,1000,1002,L'Abergement-de-Varey,ZZZZZZ,100200,L'Abergement-de-Varey (commune non irisée),...,28.119658,2.008547,0.0,,104.444444,0.0,0.0,0.0,96.410256,8.034188
2,10040101,82,84,1,1302,1004,Ambérieu-en-Bugey,ZZZZZZ,100401,Les Perouses-Triangle d'Activite,...,112.198631,4.828213,0.0,,715.635129,75.289455,119.660483,21.756694,407.353725,91.574773
3,10040102,82,84,1,1302,1004,Ambérieu-en-Bugey,ZZZZZZ,100401,Longeray-Gare,...,251.078104,1.056829,0.0,,1478.54037,57.685313,168.621858,26.249674,1003.722497,222.261029
4,10040201,82,84,1,1302,1004,Ambérieu-en-Bugey,ZZZZZZ,100402,Centre-St Germain-Vareilles,...,303.45485,12.28274,1.049194,,1528.972551,26.832875,139.834053,47.053809,1123.438076,191.813737


In [240]:
import unicodedata

def unicode_to_string(word):
    """Convert a unicode text value to a plain ASCII string.

    Parameters
    ----------
    word : object
        A scalar cell value or column title.

    Returns
    -------
    object
        * ``word`` unchanged when it is already a ``str``;
        * the NFKD-normalised, ASCII-encoded form (characters that cannot be
          encoded as ASCII are dropped) when it is Python 2 ``unicode`` text;
        * ``None`` when ``pd.isnull(word)`` is true (``None``, ``NaN``, ...);
        * ``word`` unchanged for any other value (e.g. a number), which
          previously raised ``TypeError`` inside ``unicodedata.normalize``.
    """
    if isinstance(word, str):
        return word
    # ``type(u'')`` is ``unicode`` on Python 2 and ``str`` on Python 3, so this
    # branch is only reachable for Python 2 unicode text.
    if isinstance(word, type(u'')):
        # NFKD decomposes accented characters; 'ignore' then drops whatever
        # still is not ASCII (e.g. the combining accents).
        return unicodedata.normalize('NFKD', word).encode('ascii', 'ignore')
    if pd.isnull(word):
        return None
    # Not text and not missing: nothing to convert.
    return word
    
def data_to_string(data):
    """Return a copy of ``data`` with unicode titles and values converted to plain strings.

    Column titles are always passed through ``unicode_to_string``. A column's
    values are converted only when its first non-null value is unicode text;
    every other column is left as it is.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to clean; it is not modified.

    Returns
    -------
    pandas.DataFrame
        The converted copy.
    """
    # ``unicode`` on Python 2, ``str`` on Python 3 -- avoids naming the
    # Python-2-only builtin directly.
    text_type = type(u'')

    new_data = data.copy()
    new_data.columns = [unicode_to_string(title)
                        for title in new_data.columns.values.tolist()]

    for col in new_data.columns:
        not_nan_value = new_data[col][new_data[col].notnull()]
        # A column without any non-null value has nothing to inspect or
        # convert; calling .iloc[0] on it would raise IndexError.
        if len(not_nan_value) == 0:
            continue
        if isinstance(not_nan_value.iloc[0], text_type):  # check the first not-NaN value
            # Built as a list (not a lazy ``map`` object) so the assignment
            # behaves the same on Python 2 and Python 3.
            new_data[col] = [unicode_to_string(value) for value in new_data[col]]

    return new_data

In [241]:
def rename_column(data):
    """Return a copy of ``data`` with four column titles renamed.

    ``IRIS`` -> ``DCOMIRIS``, ``LIBIRIS`` -> ``NOM_IRIS``, ``COM`` -> ``DEPCOM``
    and ``LIBCOM`` -> ``NOM_COM``. Any other title is kept as it is, and the
    input frame is not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Table whose columns should be renamed.

    Returns
    -------
    pandas.DataFrame
        The renamed copy.
    """
    new_titles = {
        'IRIS': 'DCOMIRIS',  # this will be the pivot column for merging
        'LIBIRIS': 'NOM_IRIS',
        'COM': 'DEPCOM',
        'LIBCOM': 'NOM_COM',
    }
    new_data = data.copy()
    # Look each title up in the mapping and fall back to the title itself;
    # this replaces the Python-2-only ``xrange`` index loop.
    new_data.columns = [new_titles.get(title, title)
                        for title in new_data.columns.tolist()]
    return new_data

In [242]:
def to_digit(string):
    """Convert a code to an integer, reading every non-digit character as ``0``.

    For example ``'2A004'`` becomes ``20004``.

    Parameters
    ----------
    string : str
        The code to convert.

    Returns
    -------
    int
        The integer value of the code after the substitution.

    Raises
    ------
    ValueError
        If ``string`` is empty (``int('')`` is not a valid literal).
    """
    # One pass with a generator replaces the Python-2-only ``xrange`` index
    # loop and the redundant ``list(string)[:]`` copy.
    return int(''.join(char if char.isdigit() else '0' for char in string))

def _code_to_int(code):
    """Convert one code value to ``int`` (non-digit characters count as ``0``)."""
    # Already an integer (e.g. the frame went through convert_code before):
    # there is no ``.isdigit`` to call, so pass the value through.
    if isinstance(code, (int, np.integer)):
        return int(code)
    return int(code) if code.isdigit() else to_digit(code)


def convert_code(data):
    """Return a copy of ``data`` with the ``DCOMIRIS`` and ``DEPCOM`` codes as integers.

    String codes are converted with ``int``; codes containing non-digit
    characters (2AXXX or 2BXXX) go through ``to_digit`` and become 20XXX.
    Codes that are already integers are kept, so applying the function twice
    gives the same result as applying it once.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding ``DCOMIRIS`` and ``DEPCOM`` columns; it is not modified.

    Returns
    -------
    pandas.DataFrame
        The converted copy.

    Raises
    ------
    KeyError
        If either code column is missing.
    """
    new_data = data.copy()
    # Same conversion for both code columns.
    for col in ('DCOMIRIS', 'DEPCOM'):
        new_data[col] = [_code_to_int(code) for code in new_data[col].tolist()]
    return new_data

In [243]:
def float_to_int(data):
    """Return a copy of ``data`` with its ``np.float64`` columns rounded to 0 decimals.

    A column is rounded when its first non-null value is an ``np.float64``.
    Despite the function name the dtype is not changed: ``Series.round`` keeps
    the values as floats. Columns with no non-null value at all are left
    untouched.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to process; it is not modified.

    Returns
    -------
    pandas.DataFrame
        The rounded copy.
    """
    new_data = data.copy()
    for col in new_data.columns:
        not_nan_value = new_data[col][new_data[col].notnull()]
        # An all-NaN (or empty) column has no first value to inspect;
        # calling .iloc[0] on it would raise IndexError.
        if len(not_nan_value) == 0:
            continue
        if type(not_nan_value.iloc[0]) == np.float64:  # check the first not-NaN value
            new_data[col] = new_data[col].round()
    return new_data

In [244]:
# Clean the geometry table: convert unicode text to plain strings, then cast the
# DCOMIRIS / DEPCOM codes to int.
# NOTE: the variable is overwritten with its own cleaned version, so re-running
# this cell applies the conversions to already-converted data.
iris_talence_latlon = data_to_string(iris_talence_latlon)
iris_talence_latlon = convert_code(iris_talence_latlon)

In [245]:
# Same text clean-up for the activity table, then rename its key columns
# (IRIS, LIBIRIS, COM, LIBCOM) to the titles used by the geometry table.
activite_resident = data_to_string(activite_resident)
activite_resident = rename_column(activite_resident)

In [246]:
# Cast the DCOMIRIS / DEPCOM codes to int (needs the renamed columns from the
# previous cell), then round the float64 columns.
activite_resident = convert_code(activite_resident)
activite_resident = float_to_int(activite_resident)

In [248]:
def extract_df(data, variable_list, variable_name):
    """Select columns from ``data`` and give them new titles.

    Parameters
    ----------
    data : pandas.DataFrame
        Source table; it is not modified.
    variable_list : list
        Titles of the columns to keep, in the desired output order.
    variable_name : list
        New titles, assigned positionally to the selected columns.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the selected columns under their new titles.
    """
    subset = data.copy().loc[:, variable_list]
    # Titles are matched by position, so both lists must be in the same order.
    subset.columns = variable_name
    return subset

In [249]:
# Source columns to keep and the readable titles they receive. extract_df
# assigns the titles by position, so both lists must stay in the same order.
variable_list = ['DCOMIRIS','P12_POP1564','C12_ACT1564','C12_ACT1564_CS1','C12_ACT1564_CS2',
                 'C12_ACT1564_CS3','C12_ACT1564_CS4','C12_ACT1564_CS5','C12_ACT1564_CS6','P12_CHOM1564','P12_RETR1564']
variable_name = ['DCOMIRIS', 'Population', 'Actifs','Agriculteurs','Artisans','Cadres',
                'Prof_intermediaires','Employes', 'Ouvriers','Chomeurs','Retraites']
act_resident = extract_df(activite_resident, variable_list, variable_name)

In [250]:
# Spot-check a single IRIS code (335220112, listed as "Thouars 2" in the geometry table).
act_resident[act_resident.DCOMIRIS == 335220112]

Unnamed: 0,DCOMIRIS,Population,Actifs,Agriculteurs,Artisans,Cadres,Prof_intermediaires,Employes,Ouvriers,Chomeurs,Retraites
16695,335220112,1276,883,0,21,161,237,253,200,214,76


In [251]:
# Attach the activity figures to each IRIS polygon. The inner join on DCOMIRIS
# keeps only the codes present in both tables.
talence_iris_data = pd.merge(iris_talence_latlon, act_resident, on="DCOMIRIS", how="inner")

In [253]:
# Display the merged table.
talence_iris_data

Unnamed: 0,DCOMIRIS,DEPCOM,IRIS,NOM_COM,NOM_IRIS,TYP_IRIS,geometry,Population,Actifs,Agriculteurs,Artisans,Cadres,Prof_intermediaires,Employes,Ouvriers,Chomeurs,Retraites
0,335220112,33522,112,Talence,Thouars 2,H,POLYGON ((-0.587543196053298 44.78908756888363...,1276,883,0,21,161,237,253,200,214,76
1,335220102,33522,102,Talence,La Fauvette,H,POLYGON ((-0.587032337621043 44.81515751477936...,2824,1860,0,69,498,625,471,177,219,126
2,335220108,33522,108,Talence,Peylanne-Leysotte,H,POLYGON ((-0.5828804443461369 44.8048544065047...,2635,1837,0,77,501,584,506,135,200,205
3,335220103,33522,103,Talence,Combattants-Bijou,H,POLYGON ((-0.590734733544139 44.78731246999818...,2655,871,5,24,171,229,276,106,243,45
4,335220105,33522,105,Talence,Cauderes,H,POLYGON ((-0.587032337621043 44.81515751477936...,2258,1578,5,79,501,544,297,148,125,147
5,335220113,33522,113,Talence,Thouars 1,H,POLYGON ((-0.590734733544139 44.78731246999818...,2190,1440,0,61,108,326,508,404,328,98
6,335220110,33522,110,Talence,Megret,H,POLYGON ((-0.589329054641736 44.80295051298793...,1381,887,0,37,150,207,290,178,181,44
7,335220109,33522,109,Talence,Plume-La-Boule,H,POLYGON ((-0.591909983144453 44.79381924440587...,2883,1890,0,65,399,603,540,263,245,235
8,335220107,33522,107,Talence,Saint-Genes,H,POLYGON ((-0.585092624083788 44.81789608008269...,1798,1217,8,49,320,329,350,136,199,90
9,335220111,33522,111,Talence,Lycee,H,POLYGON ((-0.589329054641736 44.80295051298793...,1501,1008,0,38,159,249,352,189,192,71


In [254]:
# Wrap the merged table in a GeoDataFrame and write it out as an ESRI Shapefile.
# NOTE(review): the re-wrap presumably exists because pd.merge returned a plain
# DataFrame here -- confirm.
# NOTE(review): Shapefile attribute names are limited to 10 characters, so titles
# such as 'Prof_intermediaires' will likely be truncated in the output -- confirm.
talence_iris_data = geopandas.GeoDataFrame(talence_iris_data)
talence_iris_data.to_file('result/talence_iris_data', driver="ESRI Shapefile")

