In [55]:
import pandas as pd
import numpy as np 

In [56]:
# Load the election results workbook (path is relative to the working directory).
# NOTE(review): the input file is named "regional2014" while the exports at the
# end of the notebook are prefixed "2010_" - confirm which election this is.
data = pd.read_excel("regional2014.xls")

In [57]:
# Separate copy of the frame as loaded; `data` itself is modified by later cells.
# NOTE(review): data2 is not referenced again in the visible part of the notebook.
data2 = data.copy()

In [58]:
# Column labels of `data` as a plain Python list.
col_title_unicode = data.columns.values.tolist()

### We see that the column titles are Unicode strings, which will cause some trouble later. We will convert them to plain strings.

In [33]:
import unicodedata

def unicode_to_string(word):
    """Transliterate unicode text into a plain ASCII byte string.

    The text is decomposed with NFKD normalization and encoded to ASCII with
    errors ignored, so characters without an ASCII form (for example the
    combining accents split off by the decomposition) are dropped.

    Parameters
    ----------
    word : object
        Value to convert.

    Returns
    -------
    ``None`` when ``pd.isnull(word)`` is true, the ASCII-encoded text when
    ``word`` is unicode text, and ``word`` itself for any other value.
    """
    if pd.isnull(word):
        return None
    # type(u'') is the unicode text type. Anything else (a value that was
    # already converted, a number, ...) cannot be normalized and used to raise
    # TypeError; passing it through makes the conversion safe to apply twice.
    if not isinstance(word, type(u'')):
        return word
    return unicodedata.normalize('NFKD', word).encode('ascii', 'ignore')

In [34]:
# Convert every column title, keeping the original order.
col_title = [unicode_to_string(title) for title in col_title_unicode]

### We replace the column headers with their new string versions

In [35]:
# Replace the column labels of `data` in place with the converted titles.
data.columns = col_title

In [36]:
data.columns.values.tolist()

['Code du departement',
 'Libelle du departement',
 'Code de la commune',
 'Libelle de la commune',
 'Inscrits',
 'Abstentions',
 '% Abs/Ins',
 'Votants',
 '% Vot/Ins',
 'Blancs et nuls',
 '% BlNuls/Ins',
 '% BlNuls/Vot',
 'Exprimes',
 '% Exp/Ins',
 '% Exp/Vot',
 'Nuance Liste',
 'Libelle Abrege Liste',
 'Libelle Etendu Liste',
 'Voix',
 '% Voix/Ins',
 '% Voix/Exp',
 'Nuance Liste.1',
 'Libelle Abrege Liste.1',
 'Libelle Etendu Liste.1',
 'Voix.1',
 '% Voix/Ins.1',
 '% Voix/Exp.1',
 'Nuance Liste.2',
 'Libelle Abrege Liste.2',
 'Libelle Etendu Liste.2',
 'Voix.2',
 '% Voix/Ins.2',
 '% Voix/Exp.2',
 'Nuance Liste.3',
 'Libelle Abrege Liste.3',
 'Libelle Etendu Liste.3',
 'Voix.3',
 '% Voix/Ins.3',
 '% Voix/Exp.3',
 'Nuance Liste.4',
 'Libelle Abrege Liste.4',
 'Libelle Etendu Liste.4',
 'Voix.4',
 '% Voix/Ins.4',
 '% Voix/Exp.4',
 'Nuance Liste.5',
 'Libelle Abrege Liste.5',
 'Libelle Etendu Liste.5',
 'Voix.5',
 '% Voix/Ins.5',
 '% Voix/Exp.5',
 'Nuance Liste.6',
 'Libelle Abrege Liste

### We see that not only the column headers but all the text cells in the dataset are Unicode. We will now convert all of them to strings.

In [26]:
def data_to_string(data):
    """Return a copy of ``data`` whose unicode text columns are converted.

    A column is treated as text when its first non-missing value is a
    ``unicode`` object; every value of such a column is then passed through
    ``unicode_to_string``. Other columns, and columns that contain only
    missing values, are left as they are.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to convert; it is not modified.

    Returns
    -------
    pandas.DataFrame
        The converted copy.
    """
    new_data = data.copy()
    for col in new_data.columns:
        not_nan_value = new_data[col].dropna()
        # An all-NaN column has no value to inspect; iloc[0] on it would
        # raise IndexError, so it is skipped.
        if not_nan_value.empty:
            continue
        if type(not_nan_value.iloc[0]) == unicode:  # check the first not-NaN value
            # Build a real list so the assignment does not depend on whether
            # map() is eager or lazy.
            new_data[col] = [unicode_to_string(value) for value in new_data[col]]
    return new_data

In [59]:
import unicodedata

def unicode_to_string(word):
    """Transliterate unicode text into a plain ASCII byte string.

    The text is decomposed with NFKD normalization and encoded to ASCII with
    errors ignored, so characters without an ASCII form (for example the
    combining accents split off by the decomposition) are dropped.

    Parameters
    ----------
    word : object
        Value to convert.

    Returns
    -------
    ``None`` when ``pd.isnull(word)`` is true, the ASCII-encoded text when
    ``word`` is unicode text, and ``word`` itself for any other value.
    """
    if pd.isnull(word):
        return None
    # type(u'') is the unicode text type. Anything else (a value that was
    # already converted, a number, ...) cannot be normalized and used to raise
    # TypeError; passing it through makes the conversion safe to apply twice.
    if not isinstance(word, type(u'')):
        return word
    return unicodedata.normalize('NFKD', word).encode('ascii', 'ignore')
    
def data_to_string(data):
    """Return a copy of ``data`` with column titles and text cells converted.

    Every column title is passed through ``unicode_to_string``. A column is
    then treated as text when its first non-missing value is a ``unicode``
    object, in which case every value of the column is converted as well.
    Other columns, and columns that contain only missing values, keep their
    values.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to convert; it is not modified.

    Returns
    -------
    pandas.DataFrame
        The converted copy.
    """
    new_data = data.copy()

    # Build a real list so the assignment does not depend on whether map()
    # is eager or lazy.
    new_data.columns = [unicode_to_string(title)
                        for title in new_data.columns.values.tolist()]

    for col in new_data.columns:
        not_nan_value = new_data[col].dropna()
        # An all-NaN column has no value to inspect; iloc[0] on it would
        # raise IndexError, so it is skipped.
        if not_nan_value.empty:
            continue
        if type(not_nan_value.iloc[0]) == unicode:  # check the first not-NaN value
            new_data[col] = [unicode_to_string(value) for value in new_data[col]]
    return new_data

In [60]:
# Rebind `data` to the converted copy returned by data_to_string.
data = data_to_string(data)

### Check if there are NaN values in Numeric columns

In [61]:
def check_nan_in_numeric(data):
    """Print the name of every non-string column that contains missing values.

    A column counts as a string column when its first non-missing value is a
    ``str``; such columns are ignored. A column made only of missing values
    has no first value, is therefore not a string column, and is reported.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect; it is only read.

    Returns
    -------
    None
        The column names are written to standard output, one per line.
    """
    for col in data.columns:
        column = data[col]
        not_nan_value = column.dropna()
        # check the first not-NaN value to eliminate all string columns;
        # guarding on .empty avoids IndexError for all-NaN columns
        if not not_nan_value.empty and type(not_nan_value.iloc[0]) is str:
            continue
        if column.isnull().any():
            print(col)

In [62]:
check_nan_in_numeric(data)

Voix.8
% Voix/Ins.8
% Voix/Exp.8
Voix.9
% Voix/Ins.9
% Voix/Exp.9
Voix.10
% Voix/Ins.10
% Voix/Exp.10
Voix.11
% Voix/Ins.11
% Voix/Exp.11
Voix.12
% Voix/Ins.12
% Voix/Exp.12


### The problem here is that the number of party lists is not the same in all communes: most of them have 8 lists, but some have more than 8.
### Therefore, before checking for NaN values, we need to reorganize the data.
### We will create a new dataframe that has only one voting result per row.
### We start by adding a new column to the original dataset: Code Insee. We will use it as our pivot column.

In [63]:
def add_insee_code(data):
    """Return a copy of ``data`` with a leading 'Code Insee' column.

    The code is ``prefix * 1000 + 'Code de la commune'`` where ``prefix`` is
    'Code du departement' when that value is below 100, and the department
    code floor-divided by 10 otherwise (e.g. 971 -> 97).

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'Code du departement' and
        'Code de la commune'; both are assumed to be numeric. The frame is
        not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with 'Code Insee' as first column. The row index is
        kept as is.
    """
    new_data = data.copy()
    department = new_data['Code du departement']
    commune = new_data['Code de la commune']

    # Computed on whole columns: this does not depend on the index being
    # 0..n-1, and // keeps the division integral on every Python version.
    prefix = department.where(department < 100, department // 10)
    new_data['Code Insee'] = prefix * 1000 + commune

    # Move the new variable to the first column. Filtering by name (instead
    # of rotating the last column) keeps the order right when the input
    # already had a 'Code Insee' column, e.g. when the cell is run twice.
    others = [col for col in new_data.columns.tolist() if col != 'Code Insee']
    new_data = new_data[['Code Insee'] + others]
    return new_data

In [64]:
# Rebind `data` to the copy that carries the 'Code Insee' column.
data = add_insee_code(data)

In [70]:
data.columns.tolist()

['Code Insee',
 'Code du departement',
 'Libelle du departement',
 'Code de la commune',
 'Libelle de la commune',
 'Inscrits',
 'Abstentions',
 '% Abs/Ins',
 'Votants',
 '% Vot/Ins',
 'Blancs et nuls',
 '% BlNuls/Ins',
 '% BlNuls/Vot',
 'Exprimes',
 '% Exp/Ins',
 '% Exp/Vot',
 'Nuance Liste',
 'Libelle Abrege Liste',
 'Libelle Etendu Liste',
 'Voix',
 '% Voix/Ins',
 '% Voix/Exp',
 'Nuance Liste.1',
 'Libelle Abrege Liste.1',
 'Libelle Etendu Liste.1',
 'Voix.1',
 '% Voix/Ins.1',
 '% Voix/Exp.1',
 'Nuance Liste.2',
 'Libelle Abrege Liste.2',
 'Libelle Etendu Liste.2',
 'Voix.2',
 '% Voix/Ins.2',
 '% Voix/Exp.2',
 'Nuance Liste.3',
 'Libelle Abrege Liste.3',
 'Libelle Etendu Liste.3',
 'Voix.3',
 '% Voix/Ins.3',
 '% Voix/Exp.3',
 'Nuance Liste.4',
 'Libelle Abrege Liste.4',
 'Libelle Etendu Liste.4',
 'Voix.4',
 '% Voix/Ins.4',
 '% Voix/Exp.4',
 'Nuance Liste.5',
 'Libelle Abrege Liste.5',
 'Libelle Etendu Liste.5',
 'Voix.5',
 '% Voix/Ins.5',
 '% Voix/Exp.5',
 'Nuance Liste.6',
 'Libel

#### (NB: we had to manually modify some department codes, such as those of Corse, Réunion and Martinique, because they were not in numerical form.)

### Now we create the new dataframe. 

In [65]:
def remove_nan(data):
    """Drop the rows in which every column except the first is missing.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter; it is not modified. The first column is never
        looked at when deciding whether a row is kept.

    Returns
    -------
    pandas.DataFrame
        New frame with the rows that have at least one non-missing value
        outside the first column. Row labels are preserved.
    """
    value_columns = data.columns.tolist()[1:]
    # Row-wise "any" computed in one vectorised call instead of a Python loop
    # over iterrows().
    has_value = pd.notnull(data[value_columns]).any(axis=1)
    # .values makes the mask positional, so the selection also works when the
    # index contains repeated labels.
    return data[has_value.values].copy()

In [82]:
def create_voting_data(data):
    """Reshape the wide results table into one row per (commune, list).

    ``data`` holds one group of columns per list: 'Nuance Liste', 'Voix',
    '% Voix/Ins' and '% Voix/Exp' for the first list, and the same names
    suffixed with '.1', '.2', ... for the following ones. Each group is
    extracted together with 'Code Insee', renamed to the unsuffixed names and
    stacked under the previous ones. Rows whose four list columns are all
    missing are removed with ``remove_nan``.

    Parameters
    ----------
    data : pandas.DataFrame
        Wide table containing 'Code Insee' and the list columns above.

    Returns
    -------
    pandas.DataFrame
        Columns 'Code Insee', 'Nuance Liste', 'Voix', '% Voix/Ins',
        '% Voix/Exp'; rows ordered by the row label they had in ``data`` and
        renumbered from 0.

    Raises
    ------
    KeyError
        If a 'Nuance Liste.N' column exists without its companion columns.
    """
    data_index = ['Code Insee', 'Nuance Liste', 'Voix', '% Voix/Ins', '% Voix/Exp']
    list_fields = data_index[1:]

    # Collect the per-list frames and concatenate once at the end, rather
    # than growing a frame inside the loop.
    frames = [data[data_index]]
    counter = 1

    # Stop at the first suffix that has no 'Nuance Liste.N' column.
    while 'Nuance Liste' + '.' + str(counter) in data.columns:
        suffix = '.' + str(counter)
        new_data_index = ['Code Insee'] + [field + suffix for field in list_fields]
        new_data = data[new_data_index]
        new_data.columns = data_index
        frames.append(new_data)
        counter += 1

    voting_data = pd.concat(frames)
    voting_data = remove_nan(voting_data)
    # The row labels repeat once per list; a stable sort groups the rows of a
    # commune while keeping its lists in their original order.
    voting_data = voting_data.sort_index(kind='mergesort')
    voting_data.index = range(0, len(voting_data))

    return voting_data

In [83]:
# Long-format results built from `data` by create_voting_data.
voting_data = create_voting_data(data)

In [84]:
voting_data.shape

(353727, 5)

### Once we have a clean list, we will divide it into 3 groups corresponding to the 3 political orientations: Left, Right and Extreme Right (plus a fourth group for the remaining lists)

In [87]:
voting_data['Nuance Liste'].unique()

array(['LEXG', 'LMAJ', 'LCMD', 'LAUT', 'LVEC', 'LSOC', 'LCOP', 'LFN',
       'LDVG', 'LEXD', 'LUG', 'LDVD', 'LREG'], dtype=object)

In [88]:
# Nuance codes assigned to each political group.
# NOTE(review): 'LVEG' (left) is not among the nuances returned by unique()
# above, while 'LVEC' is and sits in other_party - check for a typo.
# NOTE(review): 'LMAJ' occurs in the data but is classed as other, and 'LAUT'
# as right - confirm this classification is intended.
left_party = ['LDVG', 'LCOM', 'LFG', 'LEXG','LSOC','LVEG','LREG','LUG','LCOP']
right_party = ['LDVD','LDLF', 'LUDI','LUD','LMDM', 'LRDG','LLR', 'LAUT'] 
er_party = ['LFN','LEXD',] 
other_party = ['LECO','LDIV', 'LVEC', 'LMAJ', 'LCMD']

In [89]:
# One frame per political group: keep the rows whose nuance belongs to the
# group and renumber them from 0.
left_party_vote = (
    voting_data.loc[voting_data['Nuance Liste'].isin(left_party)]
    .reset_index(drop=True)
)

right_party_vote = (
    voting_data.loc[voting_data['Nuance Liste'].isin(right_party)]
    .reset_index(drop=True)
)

er_party_vote = (
    voting_data.loc[voting_data['Nuance Liste'].isin(er_party)]
    .reset_index(drop=True)
)

other_party_vote = (
    voting_data.loc[voting_data['Nuance Liste'].isin(other_party)]
    .reset_index(drop=True)
)

### Now we sum the results of all the lists of each group (left, right, ...) in each town

In [92]:
def groupby_town(data, original_data):
    """Sum the numeric results per 'Code Insee' and attach the abstention rate.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per list, with a 'Code Insee' column. Only its numeric
        columns are summed; text columns such as 'Nuance Liste' are left out
        of the result.
    original_data : pandas.DataFrame
        Frame providing the 'Code Insee' and '% Abs/Ins' columns.

    Returns
    -------
    pandas.DataFrame
        One row per 'Code Insee' found in ``data``: the key, the summed
        numeric columns, then '% Abs/Ins' taken from ``original_data`` with a
        left join.
    """
    # Select the columns to add up explicitly: summing a text column either
    # concatenates the strings or fails, depending on the pandas version.
    numeric_cols = data.select_dtypes(include=[np.number]).columns
    value_cols = [col for col in numeric_cols if col != 'Code Insee']
    totals = data.groupby('Code Insee')[value_cols].sum().reset_index()

    abs_ins = original_data[['Code Insee', '% Abs/Ins']]  # we add abstention information
    return pd.merge(totals, abs_ins, on='Code Insee', how='left')

In [93]:
# Aggregate each group per 'Code Insee' and attach '% Abs/Ins' from `data`.
# NOTE: each name is rebound from the per-list frame to the aggregated frame,
# so running this cell a second time feeds the aggregated frames back in.
left_party_vote = groupby_town(left_party_vote,data) 
right_party_vote = groupby_town(right_party_vote,data)
er_party_vote = groupby_town(er_party_vote,data)
other_party_vote = groupby_town(other_party_vote,data)

In [102]:
right_party_vote.head() 

Unnamed: 0,Code Insee,Voix,% Voix/Ins,% Voix/Exp,% Abs/Ins
0,1001,9,1.53,3.52,55.1
1,1002,3,1.32,2.65,47.58
2,1004,61,0.79,1.97,58.7
3,1005,10,0.9,2.46,62.21
4,1006,0,0.0,0.0,52.83


In [95]:
left_party_vote.head()

Unnamed: 0,Code Insee,Voix,% Voix/Ins,% Voix/Exp,% Abs/Ins
0,1001,84,14.28,32.81,55.1
1,1002,34,14.97,30.08,47.58
2,1004,1270,16.46,41.11,58.7
3,1005,141,12.74,34.73,62.21
4,1006,17,16.04,34.0,52.83


In [101]:
er_party_vote.head()

Unnamed: 0,Code Insee,Voix,% Voix/Ins,% Voix/Exp,% Abs/Ins
0,1001,58,9.86,22.66,55.1
1,1002,19,8.37,16.81,47.58
2,1004,465,6.03,15.05,58.7
3,1005,84,7.59,20.69,62.21
4,1006,8,7.55,16.0,52.83


### We export these dataframes to excel files

In [97]:
# Export the left-group totals. The context manager saves and closes the
# workbook even if the write fails; ExcelWriter.save() is not available in
# pandas 2.x.
with pd.ExcelWriter('2010_left_party_vote.xlsx') as writer:
    left_party_vote.to_excel(writer, sheet_name='Sheet1')

In [98]:
# Export the right-group totals. The context manager saves and closes the
# workbook even if the write fails; ExcelWriter.save() is not available in
# pandas 2.x.
with pd.ExcelWriter('2010_right_party_vote.xlsx') as writer:
    right_party_vote.to_excel(writer, sheet_name='Sheet1')

In [96]:
# Export the extreme-right-group totals. The context manager saves and closes
# the workbook even if the write fails; ExcelWriter.save() is not available in
# pandas 2.x.
with pd.ExcelWriter('2010_er_party_vote.xlsx') as writer:
    er_party_vote.to_excel(writer, sheet_name='Sheet1')