### This script contains all the functions needed to clean and export the voting results of each political group (left, right or extreme right) for the regional election data.
### The main function is clean_data(file, export_name); it performs the following tasks:
        * Import data.
        * Convert unicode to string.
        * Convert non-numerical departmental code to numerical.
        * Add Insee code.
        * Filter list for voting data. 
        * Group voting data by political parties (left/right/er).
        * Export data

### ----------------- FUNCTIONS THAT WE WILL NEED -----------------

In [1]:
import pandas as pd
import numpy as np 

### We see that the data is in Unicode format, which will cause some trouble later. We will try to convert it to string.

In [2]:
import unicodedata

def unicode_to_string(word):
    """Return *word* as a plain ASCII native string.

    Text is decomposed with NFKD and every character that has no ASCII
    representation (such as the combining accents) is dropped.

    Parameters
    ----------
    word : object
        A single cell value or column title.

    Returns
    -------
    str, None or object
        * the ASCII version of *word* when it is text;
        * ``'Faux'`` when it is the boolean ``False`` (see below);
        * ``None`` when it is missing (``None``/NaN);
        * *word* itself, untouched, for any other non-text value.
    """
    # Some towns are named "Faux" and reach this function as the boolean
    # False. Only genuine booleans are mapped back: the former
    # `word == False` test was also true for the number 0.
    if isinstance(word, (bool, np.bool_)) and not word:
        word = u'Faux'

    text_type = type(u'')  # `unicode` on Python 2, `str` on Python 3
    if isinstance(word, text_type):
        ascii_bytes = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore')
        # Python 2: the encoded bytes already are a `str`.
        # Python 3: decode so that callers get text rather than `bytes`.
        return ascii_bytes if str is bytes else ascii_bytes.decode('ascii')

    if pd.isnull(word):
        return None

    # Numbers (and other non-text values) cannot be normalised; leave them
    # alone instead of raising a TypeError in the middle of a column.
    return word
    
def data_to_string(data):
    """Return a copy of *data* whose titles and text cells are plain strings.

    Every column title is passed through ``unicode_to_string``. A column is
    treated as a text column when its first non-missing value is a unicode
    string; every cell of such a column is converted the same way.

    Parameters
    ----------
    data : pandas.DataFrame
        The raw table; it is not modified.

    Returns
    -------
    pandas.DataFrame
        The converted copy.
    """
    text_type = type(u'')  # `unicode` on Python 2, `str` on Python 3
    new_data = data.copy()

    # Build real lists: on Python 3 `map` is lazy and must not be assigned
    # to a DataFrame.
    new_data.columns = [unicode_to_string(title)
                        for title in new_data.columns.tolist()]

    for col in new_data.columns:
        non_null = new_data[col].dropna()
        if non_null.empty:
            # Nothing to inspect in an all-missing column (the former
            # `.iloc[0]` raised an IndexError here).
            continue
        if isinstance(non_null.iloc[0], text_type):
            new_data[col] = [unicode_to_string(value)
                             for value in new_data[col]]
    return new_data

### Check if there are NaN values in Numeric columns

In [3]:
def check_nan_in_numeric(data):
    """Print the name of every non-string column that has missing values.

    A column counts as a string column when its first non-missing value is
    a ``str``; such columns are skipped. A column with no value at all
    cannot be classified and is reported as well.

    Parameters
    ----------
    data : pandas.DataFrame
        The table to inspect; it is only read.

    Returns
    -------
    None
        The column names are written to standard output, one per line.
    """
    for col in data.columns:
        column = data[col]
        non_null = column.dropna()
        # Guard against all-NaN columns before looking at the first value.
        if not non_null.empty and isinstance(non_null.iloc[0], str):
            continue
        if column.isnull().any():
            print(col)

### The problem here is that the number of parties is not the same in all communities: most of them have 8 parties, but some of them have more than 8. 
### Therefore, before checking for NaN value, we need to re-organize the data.
### We will create a new dataframe that has only 1 voting result per row.
### We start by adding a new column in the original dataset: Code Insee. We will use this as our pivot column.
### Before we can add the Insee code, we need to modify some department codes, such as those of Corse, Reunion, Martinique, etc., because they are not in numerical form.

In [4]:
def convert_departmental_code(data):
    """Return a copy of *data* with numeric codes for special departments.

    Rows whose 'Libelle du departement' contains one of the names below
    (case-insensitive) get the matching number in 'Code du departement'.
    All other rows are left unchanged.

    Parameters
    ----------
    data : pandas.DataFrame
        Must have the columns 'Libelle du departement' and
        'Code du departement'. Labels are expected without accents
        (e.g. 'La Reunion'); the input is not modified.

    Returns
    -------
    pandas.DataFrame
        The corrected copy.
    """
    new_data = data.copy()
    code_list = {'la reunion': 974, 'guyane': 973, 'martinique': 972,
                 'guadeloupe': 971, 'corse': 20}

    # Lower-case the labels once rather than once per department.
    labels = data['Libelle du departement'].str.lower()
    for departement, code in code_list.items():
        # na=False: a missing label is simply "no match" (a NaN in the mask
        # cannot be used for boolean indexing). regex=False: the names are
        # plain substrings, not patterns.
        matches = labels.str.contains(departement, regex=False, na=False)
        new_data.loc[matches, 'Code du departement'] = code
    return new_data

In [5]:
def add_insee_code(data):
    """Return a copy of *data* with a leading 'Code Insee' column.

    The code is ``departement * 1000 + commune`` when the department code
    is below 100. For larger department codes the last digit is dropped
    first, i.e. ``(departement // 10) * 1000 + commune`` (974 and 401 give
    97401).

    Parameters
    ----------
    data : pandas.DataFrame
        Must have numeric 'Code du departement' and 'Code de la commune'
        columns; the input is not modified.

    Returns
    -------
    pandas.DataFrame
        The copy, with 'Code Insee' as its first column.

    Raises
    ------
    ValueError
        If a code cannot be interpreted as a number.
    """
    new_data = data.copy()

    # The department column may have object dtype after the code
    # conversion step; make both operands properly numeric.
    departement = pd.to_numeric(new_data['Code du departement'])
    commune = pd.to_numeric(new_data['Code de la commune'])

    # `//` keeps the integer division the formula needs on Python 3 too
    # (a plain `/` would turn 974 into 97.4).
    prefix = departement.where(departement < 100, departement // 10)
    new_data['Code Insee'] = prefix * 1000 + commune

    # Move the new variable to the first column of the dataframe.
    ordered = ['Code Insee'] + [col for col in new_data.columns
                                if col != 'Code Insee']
    return new_data[ordered]

### Now we create the new dataframe. 

In [6]:
def remove_nan(data):
    """Return the rows of *data* that hold at least one value.

    The first column is ignored: a row is kept when any of the remaining
    columns is non-missing.

    Parameters
    ----------
    data : pandas.DataFrame
        The table to filter; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and the original index labels of
        the rows that were kept.
    """
    # A plain boolean array avoids any index alignment (the index may hold
    # duplicate labels) and, unlike an empty Python list, is never mistaken
    # for a column selection when the frame has no rows.
    keep = data.iloc[:, 1:].notnull().any(axis=1).values.astype(bool)
    return data[keep].copy()

In [7]:
def create_voting_data(data):
    """Reshape the wide result table into one voting result per row.

    The input holds one row per commune with the columns 'Nuance Liste' and
    '% Voix/Exp' for the first list, then 'Nuance Liste.1' and
    '% Voix/Exp.1', 'Nuance Liste.2' and so on for the following lists. The
    blocks are stacked under the unsuffixed names and rows without any
    value are removed through ``remove_nan``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must have 'Code Insee', 'Nuance Liste' and '% Voix/Exp'.

    Returns
    -------
    pandas.DataFrame
        Columns 'Code Insee', 'Nuance Liste' and '% Voix/Exp', with a fresh
        0..n-1 index; the lists of one commune stay together.

    Raises
    ------
    KeyError
        If a 'Nuance Liste.<k>' column has no '% Voix/Exp.<k>' partner.
    """
    data_index = ['Code Insee', 'Nuance Liste', '% Voix/Exp']

    blocks = [data[data_index]]
    counter = 1
    while True:
        suffix = '.' + str(counter)
        list_columns = ['Code Insee',
                        'Nuance Liste' + suffix,
                        '% Voix/Exp' + suffix]
        # Stop at the first list number that has no column. An explicit
        # membership test replaces the former bare `except:`.
        if list_columns[1] not in data.columns:
            break

        block = data[list_columns].copy()  # copy: we rename its columns
        block.columns = data_index
        blocks.append(block)
        counter += 1

    # One concatenation at the end instead of one per list.
    voting_data = pd.concat(blocks)

    voting_data = remove_nan(voting_data)  # we remove all the empty rows
    # Every commune keeps its original row label, so sorting on it gathers
    # its lists; the stable sort keeps them in list order.
    voting_data = voting_data.sort_index(kind='mergesort')
    voting_data.index = range(len(voting_data))

    return voting_data

### Once we have a clean list, we will divide it into 3 groups corresponding to the 3 political orientations: Left, Right and Extreme Right.

In [8]:
# 'Nuance Liste' codes, grouped by political orientation.

# Lists counted as left.
left_party = [
    'LDVG', 'LCOM', 'LFG', 'LEXG', 'LSOC', 'LVEG',
    'LREG', 'LUG', 'LCOP', 'LXG', 'LDG',
]

# Lists counted as right.
# NOTE(review): 'LRDG' looks like a left-leaning label - confirm that it
# belongs in this group.
right_party = [
    'LDVD', 'LDLF', 'LUDI', 'LUD', 'LMDM',
    'LRDG', 'LLR', 'LAUT', 'LDR', 'LDD',
]

# Lists counted as extreme right.
er_party = ['LFN', 'LEXD', 'LXD']

# Lists that belong to none of the three groups above.
other_party = [
    'LECO', 'LDIV', 'LVEC', 'LMAJ', 'LCMD', 'LDV',
    'LGA', 'LVE', 'LEC', 'LCP', 'LRG',
]

### Now we group all the results of the right/left parties in each town.

In [9]:
def groupby_town(data, original_data):
    """Sum the numeric columns of *data* for every 'Code Insee'.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per (commune, list); must have a 'Code Insee' column.
        The input is not modified.
    original_data : object
        Unused; kept so that existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        One row per 'Code Insee' (ascending), with 'Code Insee' followed by
        the summed numeric columns. Text columns such as 'Nuance Liste'
        are left out.
    """
    # Pick the columns to add up explicitly. Older pandas silently dropped
    # text columns during the aggregation; current versions concatenate the
    # strings instead, which adds an unwanted column to the result.
    numeric_columns = data.select_dtypes(include=[np.number]).columns
    value_columns = [col for col in numeric_columns if col != 'Code Insee']

    # We sum all the votes of the right/left/er lists in one town.
    totals = data.groupby('Code Insee')[value_columns].sum()
    return totals.reset_index()

### ----------------- MAIN FUNCTION -----------------

In [10]:
def clean_data(file, export_name):
    """Clean one election file and export the score of each political group.

    The workbook is read, converted to plain strings, given numeric
    department codes and an Insee code, then reshaped to one voting result
    per row. For each of the three groups (left, right, extreme right) the
    '% Voix/Exp' of its lists is summed per commune and written to
    'cleaned_data/<export_name>_<left|right|er>.xlsx'. Lists whose nuance
    is in none of the three groups are ignored.

    Parameters
    ----------
    file : str
        Path of the Excel workbook to read.
    export_name : str
        Prefix of the output files and of the value column, which is named
        '<export_name> vote' in all three results.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(left, right, extreme_right)``, each with the columns
        'Code Insee' and '<export_name> vote'.
    """
    import os  # only needed here, for the output directory

    data = pd.read_excel(file)
    data = data_to_string(data)
    data = convert_departmental_code(data)
    data = add_insee_code(data)
    voting_data = create_voting_data(data)

    left_party = ['LDVG', 'LCOM', 'LFG', 'LEXG', 'LSOC', 'LVEG',
                  'LREG', 'LUG', 'LCOP', 'LXG', 'LDG']
    # NOTE(review): 'LRDG' looks like a left-leaning label - confirm that
    # it belongs in this group.
    right_party = ['LDVD', 'LDLF', 'LUDI', 'LUD', 'LMDM',
                   'LRDG', 'LLR', 'LAUT', 'LDR', 'LDD']
    er_party = ['LFN', 'LEXD', 'LXD']

    output_dir = 'cleaned_data'
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    # The file suffix is paired with its own party list so that each
    # group's votes end up in the file bearing its name.
    groups = (('left', left_party), ('right', right_party), ('er', er_party))

    results = []
    for suffix, parties in groups:
        votes = voting_data[voting_data['Nuance Liste'].isin(parties)]
        votes = votes.reset_index(drop=True)
        votes = groupby_town(votes, data)
        votes.columns = ['Code Insee', export_name + ' vote']

        # Writing straight to a path works on every pandas version
        # (ExcelWriter.save() no longer exists in pandas 2).
        target = os.path.join(output_dir,
                              export_name + '_' + suffix + '.xlsx')
        votes.to_excel(target, sheet_name='Sheet1')
        results.append(votes)

    return tuple(results)

In [18]:
# Clean the three regional elections in turn. Each run writes its own
# output files; `left`, `right` and `er` end up holding the results of the
# last run (2004).
for source_file, label in (
        ('original_data/regionale2015.xlsx', 'regionale2015'),
        ('original_data/regionale2010.xls', 'regionale2010'),
        ('original_data/regionale2004.xls', 'regionale2004'),
):
    left, right, er = clean_data(file=source_file, export_name=label)

In [None]:
import cProfile

#cProfile.run("clean_data('regionale2015.xlsx','regionale2015')")