In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the three source workbooks. ``skiprows`` makes pandas ignore the first
# N rows of each sheet before it starts parsing (5 or 10 depending on the file).
salary_data, immigration_data, population_data = (
    pd.read_excel(workbook, skiprows=leading_rows)
    for workbook, leading_rows in (
        ("salaire.xls", 5),
        ("immigration.xls", 10),
        ("population.xls", 5),
    )
)

### First, as usual, we convert Unicode text to plain ASCII strings (accented characters lose their accents).

In [3]:
import unicodedata

def unicode_to_string(word):
    """Return an accent-free, ASCII-only native string for ``word``.

    The text is decomposed with Unicode NFKD normalisation and every
    character that cannot be encoded as ASCII (for instance the combining
    accents produced by the decomposition) is dropped, so ``u'Amb\\xe9rieu'``
    becomes ``'Amberieu'``.

    Parameters
    ----------
    word : text, byte string or missing value
        Value to convert. Byte strings are accepted so that converting an
        already-converted value is harmless.

    Returns
    -------
    str or None
        ``None`` when ``pd.isnull(word)`` is true, otherwise the ASCII text
        as the interpreter's native ``str`` type.

    Raises
    ------
    TypeError
        If ``word`` is neither missing nor text (e.g. a number).
    """
    if pd.isnull(word):
        return None
    # Byte strings (native ``str`` on Python 2) cannot be normalised directly;
    # decode them first so already-converted values pass through unchanged.
    if isinstance(word, bytes):
        word = word.decode('ascii', 'ignore')
    ascii_bytes = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore')
    # On Python 2 the encoded value already is a native ``str``; on Python 3
    # it is ``bytes`` and must be decoded to obtain text again.
    if isinstance(ascii_bytes, str):
        return ascii_bytes
    return ascii_bytes.decode('ascii')
    
def data_to_string(data):
    """Return a copy of ``data`` with text converted by ``unicode_to_string``.

    Every column title is passed through ``unicode_to_string``. A column's
    values are converted only when its first non-missing value is text;
    columns that hold no non-missing value at all are left untouched.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to convert; it is not modified.

    Returns
    -------
    pandas.DataFrame
        The converted copy.
    """
    text_type = type(u'')  # ``unicode`` on Python 2, ``str`` on Python 3
    new_data = data.copy()

    # A list comprehension (rather than ``map``) guarantees that a real list
    # is assigned on Python 3 as well.
    new_data.columns = [
        unicode_to_string(title) for title in new_data.columns.values.tolist()
    ]

    for col in new_data.columns:
        present_values = new_data[col].dropna()
        if present_values.empty:
            # Nothing to inspect: an all-missing column has no "first value".
            continue
        if isinstance(present_values.iloc[0], text_type):  # check the first not-NaN value
            new_data[col] = [unicode_to_string(value) for value in new_data[col]]

    return new_data

In [4]:
# Convert the text of all three frames in one pass (same order as before).
salary_data, immigration_data, population_data = (
    data_to_string(frame)
    for frame in (salary_data, immigration_data, population_data)
)

### Now we rename the column headers so that all the data frames share at least one common column.

In [5]:
def rename_column(data):
    """Return a copy of ``data`` with the INSEE headers given readable names.

    ``'CODGEO'`` becomes ``'Code Insee'`` and ``'LIBGEO'`` becomes
    ``'Libelle de la commune'``; every other column title is kept as is.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are renamed; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the renamed columns.
    """
    return data.rename(columns={
        'CODGEO': 'Code Insee',  # this will be the pivot column for merging
        'LIBGEO': 'Libelle de la commune',
    })

In [6]:
# Give all three frames the shared 'Code Insee' / 'Libelle de la commune' headers.
salary_data, immigration_data, population_data = (
    rename_column(frame)
    for frame in (salary_data, immigration_data, population_data)
)

### We still have a problem with the departmental codes of Corsica, because they are not purely numeric (2A... or 2B...).

In [7]:
def to_digit(string):
    """Turn a code into an integer, treating every non-digit character as 0.

    For example ``'2A004'`` gives ``20004`` and ``'2B033'`` gives ``20033``.

    Parameters
    ----------
    string : str
        Code to convert.

    Returns
    -------
    int
        The integer read from ``string`` once each non-digit character has
        been replaced by ``'0'``.

    Raises
    ------
    ValueError
        If ``string`` is empty (there is nothing for ``int`` to parse).
    """
    return int(''.join(char if char.isdigit() else '0' for char in string))

In [8]:
def replace_insee_code(data):
    """Return a copy of ``data`` whose 'Code Insee' column holds integers.

    Codes made only of digits are parsed directly; other text codes go
    through ``to_digit``, which replaces 2AXXX or 2BXXX by 20XXX. Codes that
    are already whole numbers (an integer, or a float such as ``1001.0``) are
    kept, so calling the function on an already-converted frame is harmless.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a 'Code Insee' column; it is not modified.

    Returns
    -------
    pandas.DataFrame
        The copy with the converted 'Code Insee' column.

    Raises
    ------
    ValueError
        If a code is neither text nor a whole number (e.g. a missing value).
    """
    import numbers  # local import: only needed to recognise numeric cells

    new_data = data.copy()
    numeric_codes = []
    for code in new_data['Code Insee'].tolist():
        if isinstance(code, numbers.Integral):
            numeric_codes.append(int(code))
        elif isinstance(code, numbers.Real):
            # Whole-valued floats are fine; NaN or fractional values are not codes.
            if not float(code).is_integer():
                raise ValueError("Invalid 'Code Insee' value: %r" % (code,))
            numeric_codes.append(int(code))
        elif code.isdigit():
            numeric_codes.append(int(code))
        else:
            numeric_codes.append(to_digit(code))
    new_data['Code Insee'] = numeric_codes
    return new_data

In [9]:
# Make the 'Code Insee' column numeric in all three frames.
salary_data, immigration_data, population_data = (
    replace_insee_code(frame)
    for frame in (salary_data, immigration_data, population_data)
)

### There is still another problem: all the numbers in our Excel sheets are read as floats. We need to convert them to integers for the immigration and population data.

In [10]:
def float_to_int(data):
    """Return a copy of ``data`` with its float columns rounded to integers.

    Each floating-point column is rounded to the nearest whole number. When
    every rounded value is finite and fits in a 64-bit integer the column is
    cast to ``int64``; otherwise (missing or infinite values present) it stays
    a float column holding whole numbers, because NaN cannot be stored in an
    integer column. Non-float columns, including all-missing object columns,
    are left untouched.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to convert; it is not modified.

    Returns
    -------
    pandas.DataFrame
        The converted copy.
    """
    new_data = data.copy()
    for col in new_data.columns:
        column = new_data[col]
        # Decide from the dtype rather than from the first value, so columns
        # that are empty or entirely missing do not raise.
        if not (isinstance(column.dtype, np.dtype) and column.dtype.kind == 'f'):
            continue
        rounded = column.round()
        values = rounded.values
        if np.isfinite(values).all() and (np.abs(values) < 2.0 ** 63).all():
            rounded = rounded.astype('int64')
        new_data[col] = rounded
    return new_data

In [11]:
# Round the numeric columns of the two count-based frames.
immigration_data, population_data = (
    float_to_int(frame) for frame in (immigration_data, population_data)
)

### In the immigration data, we add a column that shows the total number of adult immigrants in each town.

In [12]:
def add_total_immigrant_variable(data):
    """Return a copy of ``data`` with a 'Total immigrant' column added.

    The total is the row-wise sum of every column whose name contains
    ``'IMMI1'`` but not ``'AGE400'`` (the notebook treats the AGE400 bracket
    as non-adult). The new column is placed third, right after the first two
    columns. If ``data`` already has a 'Total immigrant' column it is
    recomputed and put back in the same place, so calling the function twice
    gives the same result as calling it once.

    Parameters
    ----------
    data : pandas.DataFrame
        Immigration frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        The copy with the 'Total immigrant' column.
    """
    total_label = 'Total immigrant'
    new_data = data.copy()

    # Drop a total left by a previous run; otherwise the column shuffle would
    # move an unrelated column instead of the freshly computed total.
    if total_label in new_data.columns:
        new_data = new_data.drop(total_label, axis=1)

    adult_immigrant_cols = [
        name for name in new_data.columns.tolist()
        if 'IMMI1' in name and 'AGE400' not in name
    ]
    totals = new_data[adult_immigrant_cols].sum(axis=1)

    # Insert directly at the target position (index 2, or the end of a
    # narrower frame) rather than appending and reordering.
    new_data.insert(min(2, len(new_data.columns)), total_label, totals)
    return new_data

In [13]:
# Add the 'Total immigrant' column (row-wise sum of the IMMI1 columns outside
# the AGE400 bracket) to the immigration frame.
immigration_data = add_total_immigrant_variable(immigration_data)

In [14]:
# Preview the first rows only instead of dumping the whole frame.
immigration_data.head(10)

Unnamed: 0,Code Insee,Libelle de la commune,Total immigrant,AGE400_IMMI1_SEXE1,AGE400_IMMI1_SEXE2,AGE400_IMMI2_SEXE1,AGE400_IMMI2_SEXE2,AGE415_IMMI1_SEXE1,AGE415_IMMI1_SEXE2,AGE415_IMMI2_SEXE1,AGE415_IMMI2_SEXE2,AGE425_IMMI1_SEXE1,AGE425_IMMI1_SEXE2,AGE425_IMMI2_SEXE1,AGE425_IMMI2_SEXE2,AGE455_IMMI1_SEXE1,AGE455_IMMI1_SEXE2,AGE455_IMMI2_SEXE1,AGE455_IMMI2_SEXE2
0,1001,L'Abergement-Clemenciat,25,1,3,92,83,1,0,34,32,7,9,154,137,5,3,101,116
1,1002,L'Abergement-de-Varey,4,0,1,25,18,1,0,6,11,2,1,52,47,0,0,38,32
2,1004,Amberieu-en-Bugey,1349,57,53,1386,1542,73,61,802,872,442,403,2447,2386,194,176,1454,1886
3,1005,Amberieux-en-Dombes,59,0,0,176,163,0,2,108,101,18,19,334,324,11,9,182,195
4,1006,Ambleon,8,2,1,5,4,0,0,8,3,2,3,23,19,0,3,18,21
5,1007,Ambronay,80,0,4,273,270,1,1,91,120,19,15,492,493,24,20,294,320
6,1008,Ambutrix,22,1,0,71,84,0,0,36,28,5,5,173,167,9,3,77,80
7,1009,Andert-et-Condon,16,0,0,38,27,2,0,10,16,6,6,59,52,2,0,59,60
8,1010,Anglefort,45,2,2,128,127,1,1,54,50,10,17,207,190,8,8,125,139
9,1011,Apremont,10,0,2,40,44,0,1,16,18,3,4,73,76,0,2,63,43


### Now we merge the four data frames: right_party_vote, population_data, immigration_data and salary_data

In [15]:
# Load the vote results. This frame is merged on 'Code Insee' further down,
# so it is expected to carry that column — TODO confirm against the workbook.
right_party_vote = pd.read_excel("right_party_vote.xlsx")

In [16]:
# Outer join: keep every commune found in either frame, matching on both
# the numeric code and the commune name.
immigration_salary = immigration_data.merge(
    salary_data,
    how='outer',
    on=['Code Insee', 'Libelle de la commune'],
)

In [17]:
# Outer join of the population frame (left) with the immigration/salary
# frame (right) on the same two key columns.
immigration_salary_population = population_data.merge(
    immigration_salary,
    how='outer',
    on=['Code Insee', 'Libelle de la commune'],
)

In [18]:
# Outer join of the vote results (left) with the combined commune data.
# NOTE(review): only 'Code Insee' is used as key here, so any other column
# present in both frames would come out with _x/_y suffixes — confirm that
# this is intended.
full_data_right = right_party_vote.merge(
    immigration_salary_population,
    how='outer',
    on='Code Insee',
)

### Export the full data frame

In [19]:
# Write the merged frame to a single-sheet workbook. The context manager saves
# and closes the file even if the export raises, and avoids the explicit
# ``writer.save()`` call, which no longer exists in pandas 2.0 and later.
with pd.ExcelWriter('full_data_right.xlsx') as writer:
    full_data_right.to_excel(writer, sheet_name='Sheet1')