In [70]:
import pandas as pd
import numpy as np
import scipy
from scipy.stats.stats import pearsonr   
import matplotlib.pyplot as plt

In [71]:
data_right = pd.read_excel('full_data_right.xlsx')

In [73]:
import unicodedata

def unicode_to_string(word):
    if pd.isnull(word):
        return 
    else:
        return unicodedata.normalize('NFKD', word).encode('ascii','ignore')
    
def data_to_string(data):
    new_data = data.copy()
    
    col_title_unicode = new_data.columns.values.tolist()
    col_title = map(unicode_to_string,col_title_unicode)
    new_data.columns = col_title
    
    for col in new_data.columns:
        not_nan_index = [not ind for ind in new_data[col].isnull()]
        not_nan_value = new_data[col][not_nan_index]
        if type(not_nan_value.iloc[0]) == unicode: #check the first not-NaN value
            new_data[col] = map(unicode_to_string,new_data[col])
            
    return new_data

In [74]:
def float_to_int(data):
    new_data = data.copy()    
    for col in new_data.columns:
        if '%' in col: 
            # Here we add this condition because the variable %voix needs to be in float format 
            # Need to think more about this in case we have more float-fotmat data 
            continue
        else: 
            not_nan_index = [not ind for ind in new_data[col].isnull()]
            not_nan_value = new_data[col][not_nan_index]
            if type(not_nan_value.iloc[0]) == np.float64: #check the first not-NaN value
                new_data[col] = new_data[col].round()
    return new_data

In [75]:
data_right = data_to_string(data_right)
data_right = float_to_int(data_right)

In [76]:
data_right.head()

Unnamed: 0,Code Insee,Voix,% Voix/Ins,% Voix/Exp,% Abs/Ins,REG,DEP,Libelle de la commune,P12_POP,P12_POP0014,...,SNHMHO12,SNHM1812,SNHM2612,SNHM5012,SNHMF1812,SNHMF2612,SNHMF5012,SNHMH1812,SNHMH2612,SNHMH5012
0,1001,107,18.32,33.44,44.69,82,1,L'Abergement-Clemenciat,777,178,...,,,,,,,,,,
1,1002,32,14.28,25.2,41.52,82,1,L'Abergement-de-Varey,235,44,...,,,,,,,,,,
2,1004,1096,13.45,29.76,53.34,82,1,Amberieu-en-Bugey,14233,3038,...,11.0,10.0,13.0,15.0,9.0,11.0,12.0,10.0,14.0,18.0
3,1005,168,14.38,31.22,52.23,82,1,Amberieux-en-Dombes,1642,338,...,,,,,,,,,,
4,1006,22,20.56,42.31,49.53,82,1,Ambleon,110,12,...,,,,,,,,,,


### We will create a new data frame containing only the variable needed. We drop all the others variables. 

In [77]:
def create_final_data(data):
    new_data = data.copy()
    title = data.columns.tolist()
    dropping_index = []
    keeping_variable = ['Code Insee','% Voix/Ins','% Voix/Exp','% immigrant','% C12_POP15P_CS1','% C12_POP15P_CS2','% C12_POP15P_CS3',
                        '% C12_POP15P_CS4','% C12_POP15P_CS5', '% C12_POP15P_CS6','% C12_POP15P_CS7', '% C12_POP15P_CS8',
                        'SNHM12','Total ']
    for index, variable in enumerate(title): 
        if variable not in keeping_variable:
            dropping_index.append(index)
    new_data = new_data.drop(new_data.columns[dropping_index], axis = 1)
    return new_data

In [78]:
data2 = create_final_data(data_right)

In [81]:
data2.head() 

Unnamed: 0,Code Insee,% Voix/Ins,% Voix/Exp,% C12_POP15P_CS1,% C12_POP15P_CS2,% C12_POP15P_CS3,% C12_POP15P_CS4,% C12_POP15P_CS5,% C12_POP15P_CS6,% C12_POP15P_CS7,% C12_POP15P_CS8,% immigrant,SNHM12
0,1001,18.32,33.44,2.588997,0.647249,12.944984,14.239482,18.770227,11.650485,27.022654,12.297735,4.173623,
1,1002,14.28,25.2,0.0,2.116402,10.582011,27.513228,4.232804,16.931217,33.862434,4.232804,2.105263,
2,1004,13.45,29.76,0.0,2.81854,6.451324,15.59592,17.14388,17.197566,25.590551,15.211167,12.048946,13.0
3,1005,14.38,31.22,0.0,5.611068,8.147579,17.524981,18.139892,14.681015,25.28824,10.607225,4.528012,
4,1006,20.56,42.31,0.0,0.0,4.651163,18.604651,18.604651,9.302326,40.697674,9.302326,8.0,


In [82]:
voix_immigrant = data2[['Code Insee','% Voix/Exp','% immigrant']]

In [83]:
voix_immigrant = voix_immigrant.dropna()
voix_immigrant.head()

Unnamed: 0,Code Insee,% Voix/Exp,% immigrant
0,1001,33.44,4.173623
1,1002,25.2,2.105263
2,1004,29.76,12.048946
3,1005,31.22,4.528012
4,1006,42.31,8.0


In [84]:
plt.hist(voix_immigrant['% Voix/Exp'])
plt.show()

In [57]:
plt.scatter(voix_immigrant['% Voix/Exp'], voix_immigrant['% immigrant'])
plt.show() 

  if self._edgecolors == str('face'):


In [69]:
pearsonr(voix_immigrant['% Voix/Exp'], voix_immigrant['% immigrant'])[0]*100

-12.480203475887857

In [108]:
data2.columns.tolist()

['Code Insee',
 '% Voix/Ins',
 '% Voix/Exp',
 '% C12_POP15P_CS1',
 '% C12_POP15P_CS2',
 '% C12_POP15P_CS3',
 '% C12_POP15P_CS4',
 '% C12_POP15P_CS5',
 '% C12_POP15P_CS6',
 '% C12_POP15P_CS7',
 '% C12_POP15P_CS8',
 '% immigrant',
 'SNHM12']

In [146]:
def create_cor_table(data):
    variable_list = data.columns.tolist()
    columns_list = ['D/Immigrants', 'D/Agriculteurs','D/Artisans','D/Cadres','D/Prof_Intermediaire','D/Employes', 
               'D/Ouvrier', 'D/Retraite', 'D/Autre']
    cor_right_party = pd.DataFrame(columns = columns_list)
    counter = 3 # because the first 3 variables in the variable list are not important 
    for col in columns_list:
        extract_data = data[['% Voix/Exp', variable_list[counter]]]
        extract_data = extract_data.dropna()
        cor = pearsonr(extract_data[extract_data.columns[0]], extract_data[extract_data.columns[1]])[0] * 100
        cor_right_party[col] = [cor]
        counter += 1 
    return cor_right_party

In [147]:
correlation_right_party = create_cor_table(data2)

In [148]:
correlation_right_party

Unnamed: 0,D/Immigrants,D/Agriculteurs,D/Artisans,D/Cadres,D/Prof_Intermediaire,D/Employes,D/Ouvrier,D/Retraite,D/Autre
0,17.898836,-1.428497,6.420419,-3.340594,-9.659553,-1.31459,3.44955,-11.855126,-12.480203


In [141]:
columns = ['D/Immigrants', 'D/Agriculteurs','D/Artisans','D/Cadres','D/Prof_Intermediaire','D/Employes', 
               'D/Ouvrier', 'D/Retraite', 'D/Autre']
d = pd.DataFrame(columns = columns)

In [142]:
d['D/Immigrants'] = [1]
d

Unnamed: 0,D/Immigrants,D/Agriculteurs,D/Artisans,D/Cadres,D/Prof_Intermediaire,D/Employes,D/Ouvrier,D/Retraite,D/Autre
0,1,,,,,,,,


In [136]:
voix_agriculteur = data2[['% Voix/Exp','% C12_POP15P_CS1']]
len(voix_agriculteur)

36640

In [101]:
voix_agriculteur = voix_agriculteur.dropna()
len(voix_agriculteur)

36628

In [119]:
pearsonr(voix_agriculteur['% Voix/Exp'], voix_agriculteur['% C12_POP15P_CS1'])[0]

0.17898836389870007

In [74]:
tit = data3.columns.tolist()
tit.remove('% Voix/Ins')

In [76]:
tit.remove('% Voix/Exp')

In [84]:
tit.remove('SNHM12')

ValueError: list.remove(x): x not in list

In [85]:
dat = data3.groupby(tit)

In [86]:
dat.aggregate(np.sum)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,% Voix/Ins,% Voix/Exp,SNHM12
Code Insee,C12_POP15P_CS1,C12_POP15P_CS2,C12_POP15P_CS3,C12_POP15P_CS4,C12_POP15P_CS5,C12_POP15P_CS6,C12_POP15P_CS7,C12_POP15P_CS8,Total immigrant,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1001,16,4,80,88,116,72,167,76,25,18.32,33.44,
1002,0,4,20,52,8,32,64,8,4,12.05,21.26,


In [87]:
data3

Unnamed: 0,Code Insee,% Voix/Ins,% Voix/Exp,C12_POP15P_CS1,C12_POP15P_CS2,C12_POP15P_CS3,C12_POP15P_CS4,C12_POP15P_CS5,C12_POP15P_CS6,C12_POP15P_CS7,C12_POP15P_CS8,Total immigrant,SNHM12
0,1001,0.0,0.0,16,4,80,88,116,72,167,76,25,
1,1001,1.2,2.19,16,4,80,88,116,72,167,76,25,
2,1001,17.12,31.25,16,4,80,88,116,72,167,76,25,
3,1002,12.05,21.26,0,4,20,52,8,32,64,8,4,
4,1002,0.0,0.0,0,4,20,52,8,32,64,8,4,


In [None]:
plt.hist(y,bins=np.arange(min(y), max(y) + 10, 10))
plt.show()

In [47]:
x

array([ 15.93846154,  15.93846154,  15.93846154, ...,   0.        ,
         0.        ,   8.2930232 ])

In [45]:
x = scipy.array([-0.65499887,  2.34644428, 3.0])
y = scipy.array([-1.46049758,  3.86537321, 21.0])
pearsonr(x,y)

(0.79617014831975552, 0.41371200873701036)