In [1]:
import pandas as pd
import numpy as np
import scipy
from scipy.stats.stats import pearsonr   
import matplotlib.pyplot as plt

In [7]:
data = pd.read_excel('full_data_right.xlsx')

In [8]:
import unicodedata

def unicode_to_string(word):
    if pd.isnull(word):
        return 
    else:
        return unicodedata.normalize('NFKD', word).encode('ascii','ignore')
    
def data_to_string(data):
    new_data = data.copy()
    
    col_title_unicode = new_data.columns.values.tolist()
    col_title = map(unicode_to_string,col_title_unicode)
    new_data.columns = col_title
    
    for col in new_data.columns:
        not_nan_index = [not ind for ind in new_data[col].isnull()]
        not_nan_value = new_data[col][not_nan_index]
        if type(not_nan_value.iloc[0]) == unicode: #check the first not-NaN value
            new_data[col] = map(unicode_to_string,new_data[col])
            
    return new_data

In [25]:
def float_to_int(data):
    new_data = data.copy()    
    for col in new_data.columns:
        if '% Voix' in col: 
            # Here we add this condition because the variable %voix needs to be in float format 
            # Need to think more about this in case we have more float-fotmat data 
            continue
        else: 
            not_nan_index = [not ind for ind in new_data[col].isnull()]
            not_nan_value = new_data[col][not_nan_index]
            if type(not_nan_value.iloc[0]) == np.float64: #check the first not-NaN value
                new_data[col] = new_data[col].round()
    return new_data

In [29]:
#data2 = data.copy()
#data2 = data_to_string(data2)
#data2 = float_to_int(data2)
data = data_to_string(data)
data = float_to_int(data)

In [30]:
data.head()

Unnamed: 0,Code Insee,NListe,Nuance Liste,Voix,% Voix/Ins,% Voix/Exp,REG,DEP,Libelle de la commune,P12_POP,...,SNHMHO12,SNHM1812,SNHM2612,SNHM5012,SNHMF1812,SNHMF2612,SNHMF5012,SNHMH1812,SNHMH2612,SNHMH5012
0,1001,2,LDVD,0,0.0,0.0,82,1,L'Abergement-Clemenciat,777,...,,,,,,,,,,
1,1001,9,LDLF,7,1.2,2.19,82,1,L'Abergement-Clemenciat,777,...,,,,,,,,,,
2,1001,6,LUD,100,17.12,31.25,82,1,L'Abergement-Clemenciat,777,...,,,,,,,,,,
3,1002,6,LUD,27,12.05,21.26,82,1,L'Abergement-de-Varey,235,...,,,,,,,,,,
4,1002,2,LDVD,0,0.0,0.0,82,1,L'Abergement-de-Varey,235,...,,,,,,,,,,


In [34]:
data.columns.tolist()

['Code Insee',
 'NListe',
 'Nuance Liste',
 'Voix',
 '% Voix/Ins',
 '% Voix/Exp',
 'REG',
 'DEP',
 'Libelle de la commune',
 'P12_POP',
 'P12_POP0014',
 'P12_POP1529',
 'P12_POP3044',
 'P12_POP4559',
 'P12_POP6074',
 'P12_POP7589',
 'P12_POP90P',
 'P12_POPH',
 'P12_H0014',
 'P12_H1529',
 'P12_H3044',
 'P12_H4559',
 'P12_H6074',
 'P12_H7589',
 'P12_H90P',
 'P12_H0019',
 'P12_H2064',
 'P12_H65P',
 'P12_POPF',
 'P12_F0014',
 'P12_F1529',
 'P12_F3044',
 'P12_F4559',
 'P12_F6074',
 'P12_F7589',
 'P12_F90P',
 'P12_F0019',
 'P12_F2064',
 'P12_F65P',
 'P12_POP01P',
 'P12_POP01P_IRAN1',
 'P12_POP01P_IRAN2',
 'P12_POP01P_IRAN3',
 'P12_POP01P_IRAN4',
 'P12_POP01P_IRAN5',
 'P12_POP01P_IRAN6',
 'P12_POP01P_IRAN7',
 'P12_POP0114_IRAN2P',
 'P12_POP0114_IRAN2',
 'P12_POP0114_IRAN3P',
 'P12_POP1524_IRAN2P',
 'P12_POP1524_IRAN2',
 'P12_POP1524_IRAN3P',
 'P12_POP2554_IRAN2P',
 'P12_POP2554_IRAN2',
 'P12_POP2554_IRAN3P',
 'P12_POP55P_IRAN2P',
 'P12_POP55P_IRAN2',
 'P12_POP55P_IRAN3P',
 'C12_POP15P',
 'C12

### We will create a new data frame containing only the variable needed. We drop all the others variables. 

In [35]:
def create_final_data(data):
    new_data = data.copy()
    title = data.columns.tolist()
    dropping_index = []
    keeping_variable = ['Code Insee','% Voix/Ins','% Voix/Exp','Total immigrant','C12_POP15P_CS1','C12_POP15P_CS2','C12_POP15P_CS3','C12_POP15P_CS4','C12_POP15P_CS5', 'C12_POP15P_CS6',
                       'C12_POP15P_CS7', 'C12_POP15P_CS8','SNHM12','Total ']
    for index, variable in enumerate(title): 
        if variable not in keeping_variable:
            dropping_index.append(index)
    new_data = new_data.drop(new_data.columns[dropping_index], axis = 1)
    return new_data

In [36]:
data2 = create_final_data(data)

In [37]:
data2.head() 

Unnamed: 0,Code Insee,% Voix/Ins,% Voix/Exp,C12_POP15P_CS1,C12_POP15P_CS2,C12_POP15P_CS3,C12_POP15P_CS4,C12_POP15P_CS5,C12_POP15P_CS6,C12_POP15P_CS7,C12_POP15P_CS8,Total immigrant,SNHM12
0,1001,0.0,0.0,16,4,80,88,116,72,167,76,25,
1,1001,1.2,2.19,16,4,80,88,116,72,167,76,25,
2,1001,17.12,31.25,16,4,80,88,116,72,167,76,25,
3,1002,12.05,21.26,0,4,20,52,8,32,64,8,4,
4,1002,0.0,0.0,0,4,20,52,8,32,64,8,4,


In [89]:
voix_immigrant = data2[['Code Insee','% Voix/Exp','Total immigrant']]

In [92]:
voix_immigrant = voix_immigrant.dropna()
voix_immigrant.head(3)

Unnamed: 0,Code Insee,% Voix/Exp,Total immigrant
0,1001,0.0,25
1,1001,2.19,25
2,1001,31.25,25


In [93]:
voix_immigrant = voix_immigrant.groupby(['Code Insee', 'Total immigrant'])

In [94]:
voix_immigrant = voix_immigrant.aggregate(np.sum)


In [103]:
voix_immigrant.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,% Voix/Exp
Code Insee,Total immigrant,Unnamed: 2_level_1
1001,25,33.44
1002,4,25.2
1004,1349,29.76
1005,59,31.22
1006,8,42.31


In [104]:
i = voix_immigrant['Total immigrant'].tolist()

KeyError: 'Total immigrant'

In [101]:
v = voix_immigrant['% Voix/Exp'].tolist()


KeyError: 'Total immigrant'

In [100]:
plt.hist(v)
plt.show()

In [58]:
pearsonr(voix_immigrant['% Voix/Exp'], voix_immigrant['Total immigrant'])

(-0.010989411186549234, 0.0010778371926835689)

In [59]:
data3 = data2.iloc[:5]
data3

Unnamed: 0,Code Insee,% Voix/Ins,% Voix/Exp,C12_POP15P_CS1,C12_POP15P_CS2,C12_POP15P_CS3,C12_POP15P_CS4,C12_POP15P_CS5,C12_POP15P_CS6,C12_POP15P_CS7,C12_POP15P_CS8,Total immigrant,SNHM12
0,1001,0.0,0.0,16,4,80,88,116,72,167,76,25,
1,1001,1.2,2.19,16,4,80,88,116,72,167,76,25,
2,1001,17.12,31.25,16,4,80,88,116,72,167,76,25,
3,1002,12.05,21.26,0,4,20,52,8,32,64,8,4,
4,1002,0.0,0.0,0,4,20,52,8,32,64,8,4,


In [74]:
tit = data3.columns.tolist()
tit.remove('% Voix/Ins')

In [76]:
tit.remove('% Voix/Exp')

In [84]:
tit.remove('SNHM12')

ValueError: list.remove(x): x not in list

In [85]:
dat = data3.groupby(tit)

In [86]:
dat.aggregate(np.sum)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,% Voix/Ins,% Voix/Exp,SNHM12
Code Insee,C12_POP15P_CS1,C12_POP15P_CS2,C12_POP15P_CS3,C12_POP15P_CS4,C12_POP15P_CS5,C12_POP15P_CS6,C12_POP15P_CS7,C12_POP15P_CS8,Total immigrant,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1001,16,4,80,88,116,72,167,76,25,18.32,33.44,
1002,0,4,20,52,8,32,64,8,4,12.05,21.26,


In [87]:
data3

Unnamed: 0,Code Insee,% Voix/Ins,% Voix/Exp,C12_POP15P_CS1,C12_POP15P_CS2,C12_POP15P_CS3,C12_POP15P_CS4,C12_POP15P_CS5,C12_POP15P_CS6,C12_POP15P_CS7,C12_POP15P_CS8,Total immigrant,SNHM12
0,1001,0.0,0.0,16,4,80,88,116,72,167,76,25,
1,1001,1.2,2.19,16,4,80,88,116,72,167,76,25,
2,1001,17.12,31.25,16,4,80,88,116,72,167,76,25,
3,1002,12.05,21.26,0,4,20,52,8,32,64,8,4,
4,1002,0.0,0.0,0,4,20,52,8,32,64,8,4,


In [None]:
plt.hist(y,bins=np.arange(min(y), max(y) + 10, 10))
plt.show()

In [47]:
x

array([ 15.93846154,  15.93846154,  15.93846154, ...,   0.        ,
         0.        ,   8.2930232 ])

In [45]:
x = scipy.array([-0.65499887,  2.34644428, 3.0])
y = scipy.array([-1.46049758,  3.86537321, 21.0])
pearsonr(x,y)

(0.79617014831975552, 0.41371200873701036)