# Fusion des bases de données

Dans ce notebook, on fusionne toutes nos bases de données pour créer notre base de données de travail.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Run the notebook in which the raw data were retrieved and cleaned.
# NOTE(review): presumably this is what defines df_medailles_copy, df_pop and
# df_pib_hab used further down -- confirm against Recuperation_donnees.ipynb.
%run Recuperation_donnees.ipynb

## Fusion des données et création de la base

On commence par ajouter une variable avec la population et une variable avec le PIB par habitant, pour l'année correspondante, à la base des médailles.

In [3]:
# For a given year, join the medal table with the population and the GDP per
# capita of that same year.

def fonction_jointure(annee, medailles=None, population=None, pib_hab=None):
    """Return the medal rows of one year enriched with population and GDP per capita.

    Parameters
    ----------
    annee : int
        Year to select. It is matched against the ``'Annee'`` column of the
        medal table and used to build the yearly column name
        ``"<annee> [YR<annee>]"`` looked up in the two other tables.
    medailles : pandas.DataFrame, optional
        Medal table with at least the columns ``'Annee'`` and ``'Country Code'``.
        Defaults to the notebook-level ``df_medailles_copy``.
    population : pandas.DataFrame, optional
        Table with ``'Country Code'`` and one ``"<annee> [YR<annee>]"`` column
        per year. Defaults to the notebook-level ``df_pop``.
    pib_hab : pandas.DataFrame, optional
        Same layout as ``population``. Defaults to the notebook-level
        ``df_pib_hab``.

    Returns
    -------
    pandas.DataFrame
        The medal rows of ``annee`` followed by two extra columns,
        ``"Population"`` and ``"PIB par hab"``.

    Raises
    ------
    KeyError
        If ``population`` or ``pib_hab`` has no column for ``annee``.

    Notes
    -----
    Both merges use the pandas default (inner join): a medal row whose
    ``'Country Code'`` is missing from either table is dropped from the result.
    None of the input frames is modified.
    """
    # Fall back to the frames living in the notebook namespace so that the
    # historical call ``fonction_jointure(annee)`` keeps working unchanged.
    if medailles is None:
        medailles = df_medailles_copy
    if population is None:
        population = df_pop
    if pib_hab is None:
        pib_hab = df_pib_hab

    # Both tables name their yearly column the same way, so one key is enough.
    annee_txt = str(annee)
    colonne_annee = annee_txt + " [YR" + annee_txt + "]"

    medailles_annee = medailles[medailles['Annee'] == annee]

    # Rename before merging: the two yearly columns share the same name and
    # would otherwise collide in the second merge.
    pop_annee = population[['Country Code', colonne_annee]].rename(
        columns={colonne_annee: "Population"}
    )
    pib_annee = pib_hab[['Country Code', colonne_annee]].rename(
        columns={colonne_annee: "PIB par hab"}
    )

    df_merged = (
        medailles_annee
        .merge(pop_annee, on="Country Code")
        .merge(pib_annee, on="Country Code")
    )
    return df_merged

In [4]:
# Try the function on a single year (1992) and display the first 10 rows
fonction_jointure(1992).head(10)

Unnamed: 0,Country Name,Country Code,Gold,Silver,Bronze,Total,Annee,Lieu,Pays_hote,Code_pays_hote,Population,PIB par hab
0,Unified Team,RUS,45,38,29,112,1992,Barcelona,Spain,ESP,148538197,3098.802639
1,United States,USA,37,34,37,108,1992,Barcelona,Spain,ESP,256514000,25418.990776
2,People's Republic of China,CHN,16,22,16,54,1992,Barcelona,Spain,ESP,1164970000,366.460692
3,Cuba,CUB,14,6,11,31,1992,Barcelona,Spain,ESP,10736386,2057.103595
4,Spain,ESP,13,7,2,22,1992,Barcelona,Spain,ESP,39157685,16112.188915
5,Republic of Korea,KOR,12,5,12,29,1992,Barcelona,Spain,ESP,43747962,8126.67039
6,Hungary,HUN,11,12,7,30,1992,Barcelona,Spain,ESP,10369341,3735.10582
7,France,FRA,8,5,16,29,1992,Barcelona,Spain,ESP,58851216,23813.712246
8,Australia,AUS,7,9,11,27,1992,Barcelona,Spain,ESP,17495000,18604.18827
9,Canada,CAN,7,4,7,18,1992,Barcelona,Spain,ESP,28371264,20879.84833


In [18]:
# Build the working table: for every year present in the medal table, the medal
# rows enriched with that year's population and GDP per capita.

# Collect one frame per year and concatenate once at the end; calling pd.concat
# inside the loop would re-copy the accumulated frame at every iteration.
frames_par_annee = []
for annee in df_medailles_copy['Annee'].unique():
    df_merged = fonction_jointure(annee)
    frames_par_annee.append(df_merged)

# pd.concat raises on an empty list, so fall back to an empty frame in that case.
# The index is deliberately left as is: each yearly frame keeps its own labels,
# so index values repeat from one year to the next.
df = pd.concat(frames_par_annee) if frames_par_annee else pd.DataFrame(columns=[])

df

Unnamed: 0,Country Name,Country Code,Gold,Silver,Bronze,Total,Annee,Lieu,Pays_hote,Code_pays_hote,Population,PIB par hab
0,Unified Team,RUS,45,38,29,112,1992,Barcelona,Spain,ESP,148538197,3098.802639
1,United States,USA,37,34,37,108,1992,Barcelona,Spain,ESP,256514000,25418.990776
2,People's Republic of China,CHN,16,22,16,54,1992,Barcelona,Spain,ESP,1164970000,366.460692
3,Cuba,CUB,14,6,11,31,1992,Barcelona,Spain,ESP,10736386,2057.103595
4,Spain,ESP,13,7,2,22,1992,Barcelona,Spain,ESP,39157685,16112.188915
...,...,...,...,...,...,...,...,...,...,...,...,...
60,Finland,FIN,0,0,2,2,2020,Tokyo,Japan,JPN,5530719,48773.281169
61,Côte d'Ivoire,CIV,0,0,1,1,2020,Tokyo,Japan,JPN,26378275,2325.723705
62,Ghana,GHA,0,0,1,1,2020,Tokyo,Japan,JPN,31072945,2205.529016
63,Republic of Moldova,MDA,0,0,1,1,2020,Tokyo,Japan,JPN,2620495,4547.059721


On ajoute une variable indicatrice qui vaut 1 si le pays est le pays hôte des Jeux de l'année considérée, et 0 sinon.

In [19]:
# Host-country flag: 1 when the row's country code equals the host's code, else 0.
# The boolean comparison is cast to int in the same expression.
df['Pays_hote_oui_non'] = (df['Country Code'] == df['Code_pays_hote']).astype(int)
df

Unnamed: 0,Country Name,Country Code,Gold,Silver,Bronze,Total,Annee,Lieu,Pays_hote,Code_pays_hote,Population,PIB par hab,Pays_hote_oui_non
0,Unified Team,RUS,45,38,29,112,1992,Barcelona,Spain,ESP,148538197,3098.802639,0
1,United States,USA,37,34,37,108,1992,Barcelona,Spain,ESP,256514000,25418.990776,0
2,People's Republic of China,CHN,16,22,16,54,1992,Barcelona,Spain,ESP,1164970000,366.460692,0
3,Cuba,CUB,14,6,11,31,1992,Barcelona,Spain,ESP,10736386,2057.103595,0
4,Spain,ESP,13,7,2,22,1992,Barcelona,Spain,ESP,39157685,16112.188915,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
60,Finland,FIN,0,0,2,2,2020,Tokyo,Japan,JPN,5530719,48773.281169,0
61,Côte d'Ivoire,CIV,0,0,1,1,2020,Tokyo,Japan,JPN,26378275,2325.723705,0
62,Ghana,GHA,0,0,1,1,2020,Tokyo,Japan,JPN,31072945,2205.529016,0
63,Republic of Moldova,MDA,0,0,1,1,2020,Tokyo,Japan,JPN,2620495,4547.059721,0


On crée des variables indicatrices pour l'année.

In [20]:
# One indicator column per value of 'Annee' (Annee_<value>); the original 'Annee'
# column is replaced by these indicators.
# dtype=int is pinned explicitly: since pandas 2.0 get_dummies returns boolean
# columns by default, whereas the rest of the notebook works with 0/1 integers.
df_dummies = pd.get_dummies(df, columns=["Annee"], dtype=int)
df_dummies

Unnamed: 0,Country Name,Country Code,Gold,Silver,Bronze,Total,Lieu,Pays_hote,Code_pays_hote,Population,PIB par hab,Pays_hote_oui_non,Annee_1992,Annee_1996,Annee_2000,Annee_2004,Annee_2008,Annee_2012,Annee_2016,Annee_2020
0,Unified Team,RUS,45,38,29,112,Barcelona,Spain,ESP,148538197,3098.802639,0,1,0,0,0,0,0,0,0
1,United States,USA,37,34,37,108,Barcelona,Spain,ESP,256514000,25418.990776,0,1,0,0,0,0,0,0,0
2,People's Republic of China,CHN,16,22,16,54,Barcelona,Spain,ESP,1164970000,366.460692,0,1,0,0,0,0,0,0,0
3,Cuba,CUB,14,6,11,31,Barcelona,Spain,ESP,10736386,2057.103595,0,1,0,0,0,0,0,0,0
4,Spain,ESP,13,7,2,22,Barcelona,Spain,ESP,39157685,16112.188915,1,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60,Finland,FIN,0,0,2,2,Tokyo,Japan,JPN,5530719,48773.281169,0,0,0,0,0,0,0,0,1
61,Côte d'Ivoire,CIV,0,0,1,1,Tokyo,Japan,JPN,26378275,2325.723705,0,0,0,0,0,0,0,0,1
62,Ghana,GHA,0,0,1,1,Tokyo,Japan,JPN,31072945,2205.529016,0,0,0,0,0,0,0,0,1
63,Republic of Moldova,MDA,0,0,1,1,Tokyo,Japan,JPN,2620495,4547.059721,0,0,0,0,0,0,0,0,1
