<a href="https://colab.research.google.com/github/christoph-fraller/dopp_2020w_group03_ex3/blob/main/dopp_2020w_group03_ex3_with_git.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Generate SSH-Keys for Accessing Git Repository

In [1]:
# import and mount google drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# generate ssh keys (insert your username@github.com + hit enter when prompted for any answer)
# NOTE: accepting every default writes the key pair to /root/.ssh/id_rsa(.pub) with an empty passphrase;
# if a key already exists at that path, ssh-keygen asks before overwriting it
! ssh-keygen -t rsa -b 4096 -C 'christoph-fraller@github.com'

Generating public/private rsa key pair.
Enter file in which to save the key (/root/.ssh/id_rsa): 
/root/.ssh/id_rsa already exists.
Overwrite (y/n)? ^C


In [3]:
# check whether or not the ssh keys have been created ('id_rsa' and 'id_rsa.pub' should be displayed)
! ls /root/.ssh/

id_rsa	id_rsa.pub  known_hosts


In [4]:
# create directory for saving the ssh keys
! mkdir -p /content/drive/MyDrive/Ssh

In [5]:
# copy ssh keys from /root/.ssh/* to /content/drive/MyDrive/Ssh/*
# NOTE: id_rsa is the private key - once copied, anyone who can read this Drive folder can use it
! cp /root/.ssh/id_rsa /content/drive/MyDrive/Ssh/
! cp /root/.ssh/id_rsa.pub /content/drive/MyDrive/Ssh/

In [6]:
# display public ssh key for copy/paste
! cat /content/drive/MyDrive/Ssh/id_rsa.pub

ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAACAQDB/4vlF0AnzwXwMPthgQP5FlcRYOGQXpvaUuFK8XB8vmdt7N5K/o5drPc9uErxFkc7SkBPDCuBNoeZFS2KseqAEPm3PLcrqpPwJc/WPf0mAAXHf4alH0owIoEu+V3fMHV6URMcapA47tgHC3HAlJS5W3m1ctPBhVoqQwZ+pRN+kEdQBxrdGeGBe/Ea4P9jIrv5kfdAneeAhUg0TgtEYHxTHfCYcwEDDkkkbiVdGxlChW87zolnOxa5dJQ3bLCXkqBA+kM5OLw99qTObVmBJsDBorN3XFuAGAZ4UX4j2QnBvWz9RjHngZ8BgceofvTdSiGK3aGWj9L+BYigZhiF+qMOZF2tnF1DmWgiMxXwDZF61+FnXdO+25JBjo/Q1r6i0XIV1OG5iz89zmXev40gdBeQ2cqDiQisqCb46EvZpQQsPwZfDJ7M1RV/evNVVus2Kr5j1yQL3BcrW3dhnqNj1kukJ4fYgT1FUAEkrbqf8C6k6vp1nml9oqKM+lMy+8CQ1zt/bblz2eqoUhOSvuwUH4YIjcKOP4SuwEKfaVSXXjfbMFxgEGkO35up5zpud80TWpQwVSNIsLa/MrAb7OUeF+NgEG4h1UC5jtrc/109USgKNcwBud2N0GdzV4frwRZzoO/BLnOOLWXVj3B/tVWfTh0uGFN3mH4Au0yf5ssxRIEaBQ== christoph-fraller@github.com


In [7]:
# add github to known hosts and adapt file access permissions
# NOTE: '>>' appends, so each run of this cell adds the scanned host keys to known_hosts again;
# ssh-keyscan does not verify the keys it collects
! ssh-keyscan github.com >> /root/.ssh/known_hosts
! chmod 644 /root/.ssh/known_hosts
! chmod 600 /root/.ssh/id_rsa
# test the connection (GitHub answers with a greeting; it does not provide shell access)
! ssh -T git@github.com

# github.com:22 SSH-2.0-babeld-78794f53
# github.com:22 SSH-2.0-babeld-78794f53
# github.com:22 SSH-2.0-babeld-78794f53
Hi christoph-fraller/dopp_2020w_group03_ex3! You've successfully authenticated, but GitHub does not provide shell access.


# Git Setup

In [8]:
# git config settings (replace with your credentials)
! git config --global user.email "christoph.fraller@gmail.com"
! git config --global user.name "christoph-fraller"

In [9]:
# create directory for git repositories
! mkdir -p /content/drive/MyDrive/Git

In [10]:
# git-clone has to be performed only once when setting up the git repo at your google drive
! git clone git@github.com:christoph-fraller/dopp_2020w_group03_ex3.git /content/drive/MyDrive/Git/dopp_2020w_group03_ex3

fatal: destination path '/content/drive/MyDrive/Git/dopp_2020w_group03_ex3' already exists and is not an empty directory.


# Important Shell and Git Commands

**NOTICE:** Always ensure that you are in the right directory when performing git commands (e.g. /content/drive/MyDrive/Git/dopp_2020w_group03_ex3). In case of any issues that might occur when switching directories it is highly recommended to restart the runtime engine (CTRL + M + .).

In [11]:
# check current working directory
! pwd

/content


In [12]:
# switch to specified working directory
%cd /content/drive/MyDrive/Git/dopp_2020w_group03_ex3

/content/drive/MyDrive/Git/dopp_2020w_group03_ex3


In [13]:
# list content of current working directory
! ls

data  dopp_2020w_group03_ex3_with_git.ipynb  README.md


In [14]:
# check git status
! git status

On branch main
Your branch is up to date with 'origin/main'.

nothing to commit, working tree clean


In [15]:
# always perform a git pull before you start working or commit/push some changes
! git pull

Already up to date.


In [16]:
# add a new data file to git repo directly from colab
# at first upload the file into the folder of your google drive
# ! git add /content/drive/MyDrive/Git/dopp_2020w_group03_ex3/data/test.txt
# ! git commit -m 'New file added.'
# ! git push

# Perform these steps every time a new session is started

In [17]:
# copy ssh keys from /content/drive/MyDrive/Ssh/* to /root/.ssh/*
! cp /content/drive/MyDrive/Ssh/id_rsa /root/.ssh/
! cp /content/drive/MyDrive/Ssh/id_rsa.pub /root/.ssh/ 

In [18]:
# add github to known hosts and adapt file access permissions
# NOTE: '>>' appends, so each run of this cell adds the scanned host keys to known_hosts again;
# ssh-keyscan does not verify the keys it collects
! ssh-keyscan github.com >> /root/.ssh/known_hosts
! chmod 644 /root/.ssh/known_hosts
! chmod 600 /root/.ssh/id_rsa
# test the connection (GitHub answers with a greeting; it does not provide shell access)
! ssh -T git@github.com

# github.com:22 SSH-2.0-babeld-78794f53
# github.com:22 SSH-2.0-babeld-78794f53
# github.com:22 SSH-2.0-babeld-78794f53
Hi christoph-fraller/dopp_2020w_group03_ex3! You've successfully authenticated, but GitHub does not provide shell access.


In [19]:
# always perform a git pull before you start working or commit/push some changes
! git pull

Already up to date.


# Data Preprocessing

In [20]:
import pandas as pd

### Load the data

Source: World Bank, World Development Indicators

In [21]:
def load_csv_data(path='/content/drive/MyDrive/Git/dopp_2020w_group03_ex3/data/data_indicators_1.csv'):
    """Read the indicator CSV file into a DataFrame.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the comma-separated file. Defaults to
        ``data_indicators_1.csv`` in the repository checkout on Google Drive,
        so existing calls without arguments keep working.

    Returns
    -------
    pandas.DataFrame
        The file content as parsed by ``pandas.read_csv``. No cleaning is
        done here; no ``na_values`` are passed, so placeholder text such as
        ``..`` stays in the frame as strings.
    """
    return pd.read_csv(path, sep=',')

# load the raw table once; the cells below modify `data` step by step
data = load_csv_data()

In [22]:
data

Unnamed: 0,Time,Time Code,Country Name,Country Code,"GNI per capita, Atlas method (current US$) [NY.GNP.PCAP.CD]","Population, total [SP.POP.TOTL]","Educational attainment, at least completed primary, population 25+ years, total (%) (cumulative) [SE.PRM.CUAT.ZS]","Fertility rate, total (births per woman) [SP.DYN.TFRT.IN]","Unemployment, total (% of total labor force) (modeled ILO estimate) [SL.UEM.TOTL.ZS]","Agriculture, forestry, and fishing, value added (% of GDP) [NV.AGR.TOTL.ZS]",Total natural resources rents (% of GDP) [NY.GDP.TOTL.RT.ZS],Urban population (% of total population) [SP.URB.TOTL.IN.ZS],Access to electricity (% of population) [EG.ELC.ACCS.ZS],Gini index (World Bank estimate) [SI.POV.GINI],Consumer price index (2010 = 100) [FP.CPI.TOTL],External balance on goods and services (% of GDP) [NE.RSB.GNFS.ZS],"Central government debt, total (% of GDP) [GC.DOD.TOTL.GD.ZS]"
0,1960,YR1960,Argentina,ARG,..,20481779,..,3.109,..,..,..,73.611,..,..,..,2.12913319746322E-06,..
1,1960,YR1960,Australia,AUS,..,10276477,..,3.453,..,..,..,81.529,..,..,7.96045785639958,-1.067036411864,..
2,1960,YR1960,Brazil,BRA,..,72179226,..,6.061,..,17.6669984863859,..,46.139,..,..,..,-0.0582390668087098,..
3,1960,YR1960,China,CHN,..,667070000,..,5.756,..,23.1752941976736,..,16.203,..,..,..,-0.122440650295898,..
4,1960,YR1960,France,FRA,..,46621669,..,2.85,..,10.5372165520776,..,61.88,..,..,10.4335934118144,2.43412905154375,..
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13237,,,,,,,,,,,,,,,,,
13238,,,,,,,,,,,,,,,,,
13239,,,,,,,,,,,,,,,,,
13240,Data from database: World Development Indicators,,,,,,,,,,,,,,,,


In [23]:
data.tail(15)

Unnamed: 0,Time,Time Code,Country Name,Country Code,"GNI per capita, Atlas method (current US$) [NY.GNP.PCAP.CD]","Population, total [SP.POP.TOTL]","Educational attainment, at least completed primary, population 25+ years, total (%) (cumulative) [SE.PRM.CUAT.ZS]","Fertility rate, total (births per woman) [SP.DYN.TFRT.IN]","Unemployment, total (% of total labor force) (modeled ILO estimate) [SL.UEM.TOTL.ZS]","Agriculture, forestry, and fishing, value added (% of GDP) [NV.AGR.TOTL.ZS]",Total natural resources rents (% of GDP) [NY.GDP.TOTL.RT.ZS],Urban population (% of total population) [SP.URB.TOTL.IN.ZS],Access to electricity (% of population) [EG.ELC.ACCS.ZS],Gini index (World Bank estimate) [SI.POV.GINI],Consumer price index (2010 = 100) [FP.CPI.TOTL],External balance on goods and services (% of GDP) [NE.RSB.GNFS.ZS],"Central government debt, total (% of GDP) [GC.DOD.TOTL.GD.ZS]"
13227,2020,YR2020,Portugal,PRT,..,..,..,..,5.93300008773804,..,..,..,..,..,..,..,..
13228,2020,YR2020,Puerto Rico,PRI,..,..,..,..,8.41399955749512,..,..,..,..,..,..,..,..
13229,2020,YR2020,Qatar,QAT,..,..,..,..,0.0820000022649765,..,..,..,..,..,..,..,..
13230,2020,YR2020,Romania,ROU,..,..,..,..,3.92199993133545,..,..,..,..,..,..,..,..
13231,2020,YR2020,Rwanda,RWA,..,..,..,..,1.04100000858307,..,..,..,..,..,..,..,..
13232,2020,YR2020,Samoa,WSM,..,..,..,..,8.35499954223633,..,..,..,..,..,..,..,..
13233,2020,YR2020,San Marino,SMR,..,..,..,..,..,..,..,..,..,..,..,..,..
13234,2020,YR2020,Sao Tome and Principe,STP,..,..,..,..,13.6709995269775,..,..,..,..,..,..,..,..
13235,2020,YR2020,Senegal,SEN,..,..,..,..,6.67600011825562,..,..,..,..,..,..,..,..
13236,2020,YR2020,Serbia,SRB,..,..,..,..,12.7220001220703,..,..,..,..,..,..,..,..


### Drop unnecessary rows and columns

In [24]:
# drop the non-data footer rows of the export: real observations carry a country name,
# the trailing blank / "Data from database: ..." rows do not.
# Unlike dropping data.tail(5) in place, re-running this cell does not remove further rows.
# NOTE(review): assumes every footer row leaves 'Country Name' empty, as the displayed ones do.
data = data.dropna(subset=['Country Name'])
# drop the code columns ('YR1960', 'ARG', ...); errors='ignore' keeps a re-run from failing
data = data.drop(columns=['Time Code', 'Country Code'], errors='ignore')
# rename column Time to Year
data = data.rename(columns={'Time': 'Year'})


In [25]:
#data.tail(15)

### Set index

In [26]:
# index the table by (country, year); 'Year' is the column renamed from 'Time' above
data = data.set_index(['Country Name', 'Year'])
data

Unnamed: 0_level_0,Unnamed: 1_level_0,"GNI per capita, Atlas method (current US$) [NY.GNP.PCAP.CD]","Population, total [SP.POP.TOTL]","Educational attainment, at least completed primary, population 25+ years, total (%) (cumulative) [SE.PRM.CUAT.ZS]","Fertility rate, total (births per woman) [SP.DYN.TFRT.IN]","Unemployment, total (% of total labor force) (modeled ILO estimate) [SL.UEM.TOTL.ZS]","Agriculture, forestry, and fishing, value added (% of GDP) [NV.AGR.TOTL.ZS]",Total natural resources rents (% of GDP) [NY.GDP.TOTL.RT.ZS],Urban population (% of total population) [SP.URB.TOTL.IN.ZS],Access to electricity (% of population) [EG.ELC.ACCS.ZS],Gini index (World Bank estimate) [SI.POV.GINI],Consumer price index (2010 = 100) [FP.CPI.TOTL],External balance on goods and services (% of GDP) [NE.RSB.GNFS.ZS],"Central government debt, total (% of GDP) [GC.DOD.TOTL.GD.ZS]"
Country Name,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Argentina,1960,..,20481779,..,3.109,..,..,..,73.611,..,..,..,2.12913319746322E-06,..
Australia,1960,..,10276477,..,3.453,..,..,..,81.529,..,..,7.96045785639958,-1.067036411864,..
Brazil,1960,..,72179226,..,6.061,..,17.6669984863859,..,46.139,..,..,..,-0.0582390668087098,..
China,1960,..,667070000,..,5.756,..,23.1752941976736,..,16.203,..,..,..,-0.122440650295898,..
France,1960,..,46621669,..,2.85,..,10.5372165520776,..,61.88,..,..,10.4335934118144,2.43412905154375,..
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Samoa,2020,..,..,..,..,8.35499954223633,..,..,..,..,..,..,..,..
San Marino,2020,..,..,..,..,..,..,..,..,..,..,..,..,..
Sao Tome and Principe,2020,..,..,..,..,13.6709995269775,..,..,..,..,..,..,..,..
Senegal,2020,..,..,..,..,6.67600011825562,..,..,..,..,..,..,..,..


In [27]:
#drop Year 2020 since too little data available
# NOTE(review): the label is passed as the string '2020'; presumably the 'Year' level holds strings - verify
data.drop('2020',level='Year',inplace=True)
data

Unnamed: 0_level_0,Unnamed: 1_level_0,"GNI per capita, Atlas method (current US$) [NY.GNP.PCAP.CD]","Population, total [SP.POP.TOTL]","Educational attainment, at least completed primary, population 25+ years, total (%) (cumulative) [SE.PRM.CUAT.ZS]","Fertility rate, total (births per woman) [SP.DYN.TFRT.IN]","Unemployment, total (% of total labor force) (modeled ILO estimate) [SL.UEM.TOTL.ZS]","Agriculture, forestry, and fishing, value added (% of GDP) [NV.AGR.TOTL.ZS]",Total natural resources rents (% of GDP) [NY.GDP.TOTL.RT.ZS],Urban population (% of total population) [SP.URB.TOTL.IN.ZS],Access to electricity (% of population) [EG.ELC.ACCS.ZS],Gini index (World Bank estimate) [SI.POV.GINI],Consumer price index (2010 = 100) [FP.CPI.TOTL],External balance on goods and services (% of GDP) [NE.RSB.GNFS.ZS],"Central government debt, total (% of GDP) [GC.DOD.TOTL.GD.ZS]"
Country Name,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Argentina,1960,..,20481779,..,3.109,..,..,..,73.611,..,..,..,2.12913319746322E-06,..
Australia,1960,..,10276477,..,3.453,..,..,..,81.529,..,..,7.96045785639958,-1.067036411864,..
Brazil,1960,..,72179226,..,6.061,..,17.6669984863859,..,46.139,..,..,..,-0.0582390668087098,..
China,1960,..,667070000,..,5.756,..,23.1752941976736,..,16.203,..,..,..,-0.122440650295898,..
France,1960,..,46621669,..,2.85,..,10.5372165520776,..,61.88,..,..,10.4335934118144,2.43412905154375,..
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Samoa,2019,4190,197097,..,..,8.35900020599365,9.75444952942173,..,18.056,..,..,117.555522119693,-14.6108154792665,..
San Marino,2019,..,33860,..,..,..,..,..,97.368,..,..,..,..,..
Sao Tome and Principe,2019,1930,215056,..,..,13.3690004348755,12.4625607991542,..,73.598,..,..,..,..,..
Senegal,2019,1460,16296364,..,..,6.60400009155273,14.7915678290978,..,47.653,..,..,109.251270172291,-14.9211503360035,..


### Extract dataset

In [28]:
# select the GNI per capita column as a Series
# (dtype is still object; missing values show up as the string '..')
dataset1 = data['GNI per capita, Atlas method (current US$) [NY.GNP.PCAP.CD]']
dataset1

Country Name           Year
Argentina              1960      ..
Australia              1960      ..
Brazil                 1960      ..
China                  1960      ..
France                 1960      ..
                               ... 
Samoa                  2019    4190
San Marino             2019      ..
Sao Tome and Principe  2019    1930
Senegal                2019    1460
Serbia                 2019    7030
Name: GNI per capita, Atlas method (current US$) [NY.GNP.PCAP.CD], Length: 13020, dtype: object

# Question 1

In [29]:
# load thresholds
def load_csv_income_thresholds(path='/content/drive/MyDrive/Git/dopp_2020w_group03_ex3/data/data_indicators_1.csv'):
    """Read the income-threshold CSV file into a DataFrame.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the comma-separated file. The default is kept unchanged
        so existing calls without arguments behave as before.

        NOTE(review): the default points at ``data_indicators_1.csv``, the
        same file ``load_csv_data`` reads. This looks like a copy-paste
        placeholder - pass the real thresholds file here (or fix the
        default) once its name is confirmed.

    Returns
    -------
    pandas.DataFrame
        The file content as parsed by ``pandas.read_csv``; no cleaning is applied.
    """
    return pd.read_csv(path, sep=',')

# NOTE(review): with no argument the loader reads data_indicators_1.csv, the same file
# load_csv_data reads - confirm which file holds the income thresholds
income_thresholds = load_csv_income_thresholds()


