# Importing Data

In [1]:
import numpy as np
import pandas as pd
from scipy.optimize import curve_fit
import seaborn as sns

from mpl_toolkits.basemap import Basemap
import matplotlib.pyplot as plt
import matplotlib.colors as colors
%matplotlib inline

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

In [2]:
# Read the three source tables from CSV files located in the working directory.
geo, industry, salary = (
    pd.read_csv(csv_path)
    for csv_path in ('geo.csv', 'industry.csv', 'salary.csv')
)

In [3]:
# Preview the first three rows of the raw geographic table.
geo.head(3)

Unnamed: 0,EU_circo,code_région,nom_région,chef.lieu_région,numéro_département,nom_département,préfecture,numéro_circonscription,nom_commune,codes_postaux,code_insee,latitude,longitude,éloignement
0,Sud-Est,82,Rhône-Alpes,Lyon,1,Ain,Bourg-en-Bresse,1,Attignat,1340,1024,46.283333,5.166667,1.21
1,Sud-Est,82,Rhône-Alpes,Lyon,1,Ain,Bourg-en-Bresse,1,Beaupont,1270,1029,46.4,5.266667,1.91
2,Sud-Est,82,Rhône-Alpes,Lyon,1,Ain,Bourg-en-Bresse,1,Bény,1370,1038,46.333333,5.283333,1.51


In [4]:
# Preview the first three rows of the industry table.
industry.head(3)

Unnamed: 0,CODGEO,LIBGEO,REG,DEP,E14TST,E14TS0ND,E14TS1,E14TS6,E14TS10,E14TS20,E14TS50,E14TS100,E14TS200,E14TS500
0,1001,L'Abergement-Clémenciat,82,1,25,22,1,2,0,0,0,0,0,0
1,1002,L'Abergement-de-Varey,82,1,10,9,1,0,0,0,0,0,0,0
2,1004,Ambérieu-en-Bugey,82,1,996,577,272,63,46,24,9,3,2,0


In [5]:
# Preview the first three rows of the salary table.
salary.head(3)

Unnamed: 0,CODGEO,LIBGEO,SNHM14,SNHMC14,SNHMP14,SNHME14,SNHMO14,SNHMF14,SNHMFC14,SNHMFP14,...,SNHMHO14,SNHM1814,SNHM2614,SNHM5014,SNHMF1814,SNHMF2614,SNHMF5014,SNHMH1814,SNHMH2614,SNHMH5014
0,1004,Ambérieu-en-Bugey,13.7,24.2,15.5,10.3,11.2,11.6,19.1,13.2,...,11.6,10.5,13.7,16.1,9.7,11.8,12.5,11.0,14.9,18.6
1,1007,Ambronay,13.5,22.1,14.7,10.7,11.4,11.9,19.0,13.3,...,11.7,9.8,13.8,14.6,9.2,12.2,12.5,10.2,14.9,16.4
2,1014,Arbent,13.5,27.6,15.6,11.1,11.1,10.9,19.5,11.7,...,11.8,9.3,13.3,16.0,8.9,10.6,12.5,9.6,15.1,18.6


# Data Cleaning

<b>NOTE:</b> Dropping the columns that are not needed for this analysis.

In [6]:
# Columns of `geo` that this notebook treats as not important; removed in place.
unused_columns = [
    'EU_circo',
    'code_région',
    'éloignement',
    'numéro_département',
    'nom_département',
    'préfecture',
    'numéro_circonscription',
    'codes_postaux',
]
geo.drop(unused_columns, axis=1, inplace=True)

In [7]:
# Give the remaining French column headers English names.
# 'codes_postaux' is intentionally not listed: that column has already been
# dropped from `geo`, so a mapping for it would never match anything.
# NOTE(review): 'common_name' holds the commune (municipality) name; the label is
# kept as-is because later cells select rows through geo["common_name"].
geo.rename(columns={'nom_région': 'region_name',
                    'chef.lieu_région': 'region_capital',
                    'nom_commune': 'common_name'}, inplace=True)

In [8]:
# Check the result of dropping and renaming columns on the first three rows.
geo.head(3)

Unnamed: 0,region_name,region_capital,common_name,code_insee,latitude,longitude
0,Rhône-Alpes,Lyon,Attignat,1024,46.283333,5.166667
1,Rhône-Alpes,Lyon,Beaupont,1029,46.4,5.266667
2,Rhône-Alpes,Lyon,Bény,1038,46.333333,5.283333


In [10]:
# Turn the longitude column into floats and remove rows without usable coordinates.
# 1. Work on the text form so a decimal comma can be replaced by a decimal point.
#    Side effect: a missing value becomes the literal string 'nan' at this stage.
geo["longitude"] = geo["longitude"].apply(lambda x: str(x).replace(',','.'))
# 2. Rows whose longitude is the placeholder '-' cannot be parsed: drop them.
mask = geo["longitude"] == '-'
geo.drop(geo[mask].index, inplace=True)
# 3. Convert to float BEFORE looking for missing values: the 'nan' strings produced
#    in step 1 are not recognised as missing by dropna(), but they parse to real NaN.
geo["longitude"] = geo["longitude"].astype(float)
# 4. Now that missing longitudes are genuine NaN again, drop every row that lacks
#    either coordinate (previously missing longitudes slipped through this step).
geo.dropna(subset = ["longitude", "latitude"], inplace=True)

In [11]:
# Keep a single row per INSEE code: the first occurrence wins (the pandas default).
geo.drop_duplicates(subset="code_insee", inplace=True)

In [14]:
# Reference point: the first row whose commune name is exactly "Paris".
paris_row = geo.loc[geo["common_name"] == "Paris"].iloc[0]
paris_lat = paris_row["latitude"]
paris_lon = paris_row["longitude"]

In [16]:
from math import radians, cos, sin, asin, sqrt

def haversine(lon1, lat1, lon2, lat2, radius_km=6371):
    """Return the great-circle distance between two points on a sphere.

    Parameters
    ----------
    lon1, lat1 : float
        Longitude and latitude of the first point, in decimal degrees.
    lon2, lat2 : float
        Longitude and latitude of the second point, in decimal degrees.
    radius_km : float, optional
        Radius of the sphere. Defaults to 6371, the Earth's radius in
        kilometres; the result is expressed in the same unit as this value.

    Returns
    -------
    float
        The haversine distance (kilometres with the default radius).
        NaN is returned when any coordinate is NaN.
    """
    # The trigonometric functions below work in radians.
    lon1, lat1, lon2, lat2 = map(radians, (lon1, lat1, lon2, lat2))

    # Haversine formula.
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2

    # Rounding can push `a` marginally above 1 for (near-)antipodal points, which
    # would make asin() raise a domain error. An explicit comparison is used
    # instead of min() so that a NaN input still propagates as NaN.
    if a > 1.0:
        a = 1.0

    c = 2 * asin(sqrt(a))
    return radius_km * c

# Distance in kilometres from every row of `geo` to Paris, in row order.
# The two coordinate columns are paired directly rather than walking the frame
# with iterrows(), which builds a full Series for every row just to read two values.
distances = [
    haversine(lon, lat, paris_lon, paris_lat)
    for lon, lat in zip(geo["longitude"], geo["latitude"])
]

In [17]:
# Store the computed distances as a new column, labelled with geo's own index so
# each value stays on the row it was computed from.
distance_to_paris = pd.Series(distances, index=geo.index)
geo["distance"] = distance_to_paris