In [2]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw workbooks.  Four of them skip 5 leading rows before the header;
# the immigration workbook skips 10.
salary_data, population_data, education_data, logement_data = (
    pd.read_excel(workbook, skiprows=5)
    for workbook in ("salaire.xls", "population.xls", "education.xls", "logement.xls")
)
immigration_data = pd.read_excel("immigration.xls", skiprows=10)

KeyboardInterrupt: 

### First, as usual, we convert unicode to string.

In [None]:
import unicodedata

def unicode_to_string(word):
    """Return ``word`` as a plain ASCII native string, or None if it is missing.

    The text is NFKD-normalised and every character that cannot be encoded
    as ASCII is dropped, so accented letters lose their accent.

    Parameters
    ----------
    word : unicode text, or a missing value (None / NaN)

    Returns
    -------
    str or None
        ``None`` when ``pd.isnull(word)`` is true, otherwise the ASCII text
        as the interpreter's native ``str`` type.
    """
    if pd.isnull(word):
        return None
    ascii_bytes = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore')
    # On Python 2 ``bytes`` is ``str`` and the encoded value is returned as is.
    # On Python 3 ``encode`` yields ``bytes``; decode it so callers get real
    # text instead of b'...' column names and cell values.
    if isinstance(ascii_bytes, str):
        return ascii_bytes
    return ascii_bytes.decode('ascii')
    
def data_to_string(data):
    """Return a copy of ``data`` with headers and text columns made ASCII.

    Every column title is passed through ``unicode_to_string``.  A column is
    treated as text when its first non-missing value is a text object; all of
    its values are then converted the same way.  Columns that contain only
    missing values are left untouched.

    Parameters
    ----------
    data : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        A new frame; ``data`` itself is not modified.
    """
    try:
        text_type = unicode  # Python 2 text type
    except NameError:
        text_type = str      # Python 3: every str is unicode text

    new_data = data.copy()

    # Build a real list: a lazy ``map`` object cannot be assigned to .columns.
    new_data.columns = [unicode_to_string(title)
                        for title in new_data.columns.values.tolist()]

    for col in new_data.columns:
        non_null = new_data[col].dropna()
        if non_null.empty:
            # All values are missing: nothing to inspect or convert.
            continue
        if isinstance(non_null.iloc[0], text_type):  # check the first not-NaN value
            new_data[col] = [unicode_to_string(value) for value in new_data[col]]

    return new_data

In [None]:
# Convert the headers and text columns of all five tables to ASCII strings.
salary_data, immigration_data, population_data, education_data, logement_data = map(
    data_to_string,
    (salary_data, immigration_data, population_data, education_data, logement_data),
)

### Now we will rename the column headers so that we have at least 1 common column for all the data frames. 

In [None]:
def rename_column(data):
    """Return a copy of ``data`` with the commune key columns renamed.

    ``'CODGEO'`` becomes ``'Code Insee'`` (the pivot column used for merging)
    and ``'LIBGEO'`` becomes ``'Libelle de la commune'``.  All other columns,
    and the column order, are unchanged.

    Parameters
    ----------
    data : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        A new frame; ``data`` itself is not modified.
    """
    return data.rename(columns={
        'CODGEO': 'Code Insee',  # this will be the pivot column for merging
        'LIBGEO': 'Libelle de la commune',
    })

In [None]:
# Give all five tables the same key column names so they can be merged later.
salary_data, immigration_data, population_data, education_data, logement_data = map(
    rename_column,
    (salary_data, immigration_data, population_data, education_data, logement_data),
)

### We still have a problem with the departmental codes of Corsica (Corse): they are not numeric (2A... or 2B...), so we replace the letter with 0.

In [None]:
def to_digit(string):
    """Convert a code string to an int, replacing each non-digit character by '0'.

    For example ``'2A004'`` becomes ``20004``.

    Parameters
    ----------
    string : str
        The code to convert.

    Returns
    -------
    int

    Raises
    ------
    ValueError
        If ``string`` is empty, because ``int('')`` is not a valid number.
    """
    digits_only = ''.join(char if char.isdigit() else '0' for char in string)
    return int(digits_only)

In [None]:
def replace_insee_code(data):  # replace 2AXXX or 2BXXX by 20XXX
    """Return a copy of ``data`` whose 'Code Insee' column holds integers.

    String codes are converted with ``to_digit``, which also maps any
    non-digit character to '0'.  Codes that are already integers are kept
    as they are, so running the function again on its own output is
    harmless.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'Code Insee' column.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``data`` itself is not modified.
    """
    new_data = data.copy()
    new_data['Code Insee'] = [
        # Integers have no .isdigit(); pass them through unchanged.
        int(code) if isinstance(code, (int, np.integer)) else to_digit(code)
        for code in new_data['Code Insee'].tolist()
    ]
    return new_data

In [None]:
# Turn the 'Code Insee' key of all five tables into integers.
salary_data, immigration_data, population_data, education_data, logement_data = map(
    replace_insee_code,
    (salary_data, immigration_data, population_data, education_data, logement_data),
)

### There is another problem: all the numbers in our Excel sheets are read as floats. The immigration and population data hold counts, so we round those values to whole numbers.

In [None]:
def float_to_int(data):
    """Return a copy of ``data`` with every float column rounded to whole numbers.

    A column is treated as float when its first non-missing value is a
    ``numpy.float64``.  Values are rounded with ``Series.round()``; the
    result is still a float column, and missing values stay missing.
    Columns that contain only missing values are left untouched.

    Parameters
    ----------
    data : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        A new frame; ``data`` itself is not modified.
    """
    new_data = data.copy()
    for col in new_data.columns:
        non_null = new_data[col].dropna()
        if non_null.empty:
            # Nothing to inspect: indexing the first value would raise IndexError.
            continue
        if isinstance(non_null.iloc[0], np.float64):  # check the first not-NaN value
            new_data[col] = new_data[col].round()
    return new_data

In [None]:
# Round the float columns of the four count tables (salary data is not rounded).
immigration_data, population_data, education_data, logement_data = map(
    float_to_int,
    (immigration_data, population_data, education_data, logement_data),
)

### In the immigration data, we add a column that shows the percentage of adult immigrants in each town.

In [None]:
def add_total_immigrant_variable(data):
    """Return a copy of ``data`` with a '% immigrant' column in third position.

    The denominator is the row-wise sum of every column whose name contains
    'IMMI' but not 'AGE400'; the numerator is the sum of those among them
    whose name also contains 'IMMI1'.  The ratio is multiplied by 100.
    NOTE(review): presumably 'IMMI1' marks immigrants and 'AGE400' the
    non-adult age band - confirm against the data dictionary.

    Rows whose denominator is zero get NaN (or inf), following pandas
    division rules.

    The new column is always placed right after the first two columns, even
    when the frame already has a '% immigrant' column (for instance when
    the cell is run twice); the old values are then simply recomputed.

    Parameters
    ----------
    data : pandas.DataFrame
        Column names must be strings.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``data`` itself is not modified.
    """
    new_data = data.copy()
    population_cols = [name for name in data.columns.tolist()
                       if 'IMMI' in name and 'AGE400' not in name]
    immigrant_cols = [name for name in population_cols if 'IMMI1' in name]

    new_data['% immigrant'] = (new_data[immigrant_cols].sum(axis=1)
                               / new_data[population_cols].sum(axis=1)) * 100

    # Place the column by name rather than by "last position": if it already
    # existed, the assignment above did not append it at the end.
    ordered = [name for name in new_data.columns.tolist() if name != '% immigrant']
    ordered.insert(2, '% immigrant')
    return new_data[ordered]

In [None]:
# Add the '% immigrant' column to the immigration table.
immigration_data = add_total_immigrant_variable(immigration_data)

In [None]:
# Preview the first rows to check the new column.
immigration_data.head()

### In the education data, we convert the counts to percentages.

In [None]:
def education_level_in_percentage(data):
    """Express each '_NSCOL15P_' column as a percentage of 'P12_NSCOL15P'.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'Code Insee', 'Libelle de la commune' and, when at
        least one '_NSCOL15P_' column is present, 'P12_NSCOL15P'.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the two identifier columns followed by the
        converted '_NSCOL15P_' columns; ``data`` itself is not modified.
    """
    id_columns = ['Code Insee', 'Libelle de la commune']
    level_columns = [name for name in data.columns.tolist() if '_NSCOL15P_' in name]

    result = data.copy()
    for level in level_columns:
        share = result[level] / result['P12_NSCOL15P']
        result[level] = share * 100
    return result[id_columns + level_columns]

In [None]:
# Keep only the identifier columns and the '_NSCOL15P_' columns, as percentages.
education_data = education_level_in_percentage(education_data)

In [None]:
# Preview the first rows of the converted education table.
education_data.head()

### In population data, we will transform the social status columns from quantity to percentage too.

In [None]:
def social_status_percentage(data):
    """Add one '% <column>' percentage column per social-status column.

    For each of 'C12_POP15P_CS1' ... 'C12_POP15P_CS8', a new column named
    '% ' + column is appended, holding the column divided by 'C12_POP15P'
    and multiplied by 100.  The original count columns are kept.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'C12_POP15P' and the eight status columns.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``data`` itself is not modified.
    """
    status_columns = (
        'C12_POP15P_CS1', 'C12_POP15P_CS2', 'C12_POP15P_CS3', 'C12_POP15P_CS4',
        'C12_POP15P_CS5', 'C12_POP15P_CS6', 'C12_POP15P_CS7', 'C12_POP15P_CS8',
    )
    result = data.copy()
    total = result['C12_POP15P']
    for status in status_columns:
        share = result[status] / total
        result['% ' + status] = share * 100
    return result

In [None]:
# Append the '% C12_POP15P_CS*' percentage columns to the population table.
population_data = social_status_percentage(population_data)

In [None]:
# Preview the first rows of the population table.
population_data.head()

### In the housing (logement) data, we compute the percentage of home owners.

In [None]:
def proprietor_percentage(data):
    """Replace 'P12_NPER_RP_PROP' by its percentage of 'P12_NPER_RP'.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the 'P12_NPER_RP_PROP' and 'P12_NPER_RP' columns.

    Returns
    -------
    pandas.DataFrame
        A new frame in which 'P12_NPER_RP_PROP' holds the ratio multiplied
        by 100; ``data`` itself is not modified.
    """
    result = data.copy()
    owner_share = result['P12_NPER_RP_PROP'] / result['P12_NPER_RP']
    result['P12_NPER_RP_PROP'] = owner_share * 100
    return result

In [None]:
# Overwrite 'P12_NPER_RP_PROP' with its percentage of 'P12_NPER_RP'.
logement_data = proprietor_percentage(logement_data)

### Now we try to merge all the data frames

In [40]:
# Load the three per-party vote tables.
right_party_vote, left_party_vote, er_party_vote = (
    pd.read_excel(workbook)
    for workbook in ("right_party_vote.xlsx", "left_party_vote.xlsx", "er_party_vote.xlsx")
)


In [42]:
# Outer join: communes present in only one of the two tables are kept.
# NOTE(review): the commune label is part of the join key, so a label spelled
# differently in the two sources would give two rows for one code - confirm.
immigration_salary = pd.merge(immigration_data, salary_data, on = ['Code Insee','Libelle de la commune'], how = 'outer')

In [43]:
# Add the population table with the same outer join on code and label.
immigration_salary_population = pd.merge(population_data, immigration_salary, on = ['Code Insee','Libelle de la commune'], how = 'outer')

In [44]:
# Add the education table with the same outer join on code and label.
immigration_salary_population_education = pd.merge(education_data, immigration_salary_population, on = ['Code Insee','Libelle de la commune'], how = 'outer')

In [45]:
# Add the logement table; the result holds all five socio-economic tables.
im_sal_pop_edu_log = pd.merge(logement_data, immigration_salary_population_education, on = ['Code Insee','Libelle de la commune'], how = 'outer')

In [46]:
# Left join on 'Code Insee': every row of each vote table is kept and the
# socio-economic columns are attached where the code matches.
full_data_right, full_data_left, full_data_er = (
    pd.merge(party_vote, im_sal_pop_edu_log, on='Code Insee', how='left')
    for party_vote in (right_party_vote, left_party_vote, er_party_vote)
)

In [47]:
# Preview the first rows of the merged right-party table.
full_data_right.head()

Unnamed: 0,Code Insee,Voix,% Voix/Ins,% Voix/Exp,% Abs/Ins,REG_x,DEP_x,Libelle de la commune,P12_LOG,P12_RP,...,SNHMHO12,SNHM1812,SNHM2612,SNHM5012,SNHMF1812,SNHMF2612,SNHMF5012,SNHMH1812,SNHMH2612,SNHMH5012
0,1001,107,18.32,33.44,44.69,82,1,L'Abergement-Clemenciat,335,301,...,,,,,,,,,,
1,1002,32,14.28,25.2,41.52,82,1,L'Abergement-de-Varey,160,101,...,,,,,,,,,,
2,1004,1096,13.45,29.76,53.34,82,1,Amberieu-en-Bugey,6629,6012,...,10.945976,9.724445,13.042454,15.461354,9.049192,11.33007,12.313855,10.194692,14.088531,17.557177
3,1005,168,14.38,31.22,52.23,82,1,Amberieux-en-Dombes,647,615,...,,,,,,,,,,
4,1006,22,20.56,42.31,49.53,82,1,Ambleon,70,52,...,,,,,,,,,,


In [3]:
# Reload a merged per-party table from disk; only the party being processed
# is enabled (presumably these are the files written by the export cells).
# NOTE(review): later cells use right_vote and er_vote, which stay undefined
# on a fresh kernel while their lines are commented out.
#right_vote = pd.read_excel("full_data_right.xlsx")
left_vote = pd.read_excel("full_data_left.xlsx")
#er_vote = pd.read_excel("full_data_er.xlsx")
#right_vote_2010 = pd.read_excel("2010_right_party_vote.xlsx")



In [4]:
# Load the matching 2010 vote table for the party being processed.
# NOTE(review): the er file name is assumed to follow the pattern of the
# other two - confirm it exists before enabling that line.
#right_vote_2010 = pd.read_excel("2010_right_party_vote.xlsx")
left_vote_2010 = pd.read_excel("2010_left_party_vote.xlsx")
#er_vote_2010 = pd.read_excel("2010_er_party_vote.xlsx")

In [7]:
# NOTE(review): right_vote is only defined if its read_excel line in the
# loading cell is enabled; with that line commented out this raises NameError.
right_vote.head()

Unnamed: 0,Code Insee,Voix,% Voix/Ins,% Voix/Exp,% Abs/Ins,REG_x,DEP_x,Libelle de la commune,P12_LOG,P12_RP,...,SNHMHO12,SNHM1812,SNHM2612,SNHM5012,SNHMF1812,SNHMF2612,SNHMF5012,SNHMH1812,SNHMH2612,SNHMH5012
0,1001,107,18.32,33.44,44.69,82,1,L'Abergement-Clemenciat,335,301,...,,,,,,,,,,
1,1002,32,14.28,25.2,41.52,82,1,L'Abergement-de-Varey,160,101,...,,,,,,,,,,
2,1004,1096,13.45,29.76,53.34,82,1,Amberieu-en-Bugey,6629,6012,...,10.945976,9.724445,13.042454,15.461354,9.049192,11.33007,12.313855,10.194692,14.088531,17.557177
3,1005,168,14.38,31.22,52.23,82,1,Amberieux-en-Dombes,647,615,...,,,,,,,,,,
4,1006,22,20.56,42.31,49.53,82,1,Ambleon,70,52,...,,,,,,,,,,


In [7]:
# Keep only the commune code and the share of expressed votes from the 2010
# table.  Enable the line of the party being processed.
#right_vote_2010 = right_vote_2010[['Code Insee',"% Voix/Exp"]]
left_vote_2010 = left_vote_2010[['Code Insee',"% Voix/Exp"]]
#er_vote_2010 = er_vote_2010[['Code Insee',"% Voix/Exp"]]

In [8]:
# Rename the vote-share column so it does not clash with the current
# '% Voix/Exp' column when the two tables are merged.
#right_vote_2010.columns = ['Code Insee','2010 Voix/Exp']
left_vote_2010.columns = ['Code Insee','2010 Voix/Exp']
#er_vote_2010.columns = ['Code Insee','2010 Voix/Exp']

In [9]:
# Preview the reduced 2010 table (code and renamed vote share).
left_vote_2010.head()

Unnamed: 0,Code Insee,2010 Voix/Exp
0,1001,32.81
1,1002,30.08
2,1004,41.11
3,1005,34.73
4,1006,34.0


In [13]:
# Attach the 2010 vote share; the left join keeps every row of right_vote.
# NOTE(review): right_vote and right_vote_2010 must have been loaded and
# reduced first - their lines are commented out in the cells above.
full_data_right = pd.merge(right_vote, right_vote_2010, on = 'Code Insee', how = 'left')

In [10]:
# Attach the 2010 vote share; the left join keeps every row of left_vote.
full_data_left = pd.merge(left_vote, left_vote_2010, on = 'Code Insee', how = 'left')

In [None]:
# Attach the 2010 vote share; the left join keeps every row of er_vote.
# NOTE(review): er_vote and er_vote_2010 must have been loaded and reduced
# first - their lines are commented out in the cells above.
full_data_er = pd.merge(er_vote, er_vote_2010, on = 'Code Insee', how = 'left')

### Export the full data frame

In [17]:
# Write the right-party table to 'Sheet1' of full_data_right.xlsx.
# Passing the path lets pandas open and close the workbook itself, so the
# file is closed even if writing fails, and it avoids ExcelWriter.save(),
# which recent pandas versions no longer provide.
full_data_right.to_excel('full_data_right.xlsx', sheet_name='Sheet1')

In [11]:
# Write the left-party table to 'Sheet1' of full_data_left.xlsx.
# Passing the path lets pandas open and close the workbook itself, so the
# file is closed even if writing fails, and it avoids ExcelWriter.save(),
# which recent pandas versions no longer provide.
full_data_left.to_excel('full_data_left.xlsx', sheet_name='Sheet1')

In [52]:
# Write the er-party table to 'Sheet1' of full_data_er.xlsx.
# Passing the path lets pandas open and close the workbook itself, so the
# file is closed even if writing fails, and it avoids ExcelWriter.save(),
# which recent pandas versions no longer provide.
full_data_er.to_excel('full_data_er.xlsx', sheet_name='Sheet1')