## 3.1 Importing and Cleaning

In [8]:
import pandas as pd
# Location of the Gapminder CSV (raw file served from GitHub).
url = (
    'https://raw.githubusercontent.com/jstaf/gapminder/master/gapminder/gapminder.csv'
)
# Download the CSV and parse it into a DataFrame.
df = pd.read_csv(url)

In [9]:
# Grand total of missing values: per-column counts first, then summed.
df.isna().sum().sum()

0

In [10]:
# Number of missing entries in the 'country' column only.
df['country'].isna().sum()

0

In [11]:
# Capitalise the 'country' column name; all other columns are left as they are.
df = df.rename(columns={'country': 'Country'})
df.head()

Unnamed: 0,Country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.85303
2,Afghanistan,Asia,1962,31.997,10267083,853.10071
3,Afghanistan,Asia,1967,34.02,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106


In [12]:
# Give the three measure columns more descriptive names.
df = df.rename(
    columns={
        'lifeExp': 'Life_Expectancy',
        'pop': 'Population',
        'gdpPercap': 'GDP_per_Cap',
    }
)
df.head()

Unnamed: 0,Country,continent,year,Life_Expectancy,Population,GDP_per_Cap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.85303
2,Afghanistan,Asia,1962,31.997,10267083,853.10071
3,Afghanistan,Asia,1967,34.02,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106


In [13]:
# Remove the 'continent' column.
# `columns=` already names the axis, so no `axis` argument is needed, and the
# labels are passed as a list (the documented "single label or list-like" form)
# rather than a set literal.
df = df.drop(columns=['continent'])
df.head()

Unnamed: 0,Country,year,Life_Expectancy,Population,GDP_per_Cap
0,Afghanistan,1952,28.801,8425333,779.445314
1,Afghanistan,1957,30.332,9240934,820.85303
2,Afghanistan,1962,31.997,10267083,853.10071
3,Afghanistan,1967,34.02,11537966,836.197138
4,Afghanistan,1972,36.088,13079460,739.981106


In [14]:
# Show the dtype of every column.
df.dtypes

Country             object
year                 int64
Life_Expectancy    float64
Population           int64
GDP_per_Cap        float64
dtype: object

In [15]:
# Convert Life_Expectancy to whole years.
# A bare astype(int) truncates toward zero (28.801 -> 28), which pushes every
# value downward; rounding first gives the nearest integer instead.
# NOTE: this conversion is lossy either way - the fractional part is discarded.
df['Life_Expectancy'] = df['Life_Expectancy'].round().astype(int)
df.dtypes

Country             object
year                 int64
Life_Expectancy      int64
Population           int64
GDP_per_Cap        float64
dtype: object

In [16]:
# Cast both columns to float in a single astype call (one target dtype for each
# listed column).
df = df.astype(dict.fromkeys(("Life_Expectancy", "Population"), float))
df.dtypes

Country             object
year                 int64
Life_Expectancy    float64
Population         float64
GDP_per_Cap        float64
dtype: object

In [17]:
# How many distinct countries appear in the data.
len(df['Country'].unique())

142

In [18]:
# How many distinct survey years appear in the data.
len(df['year'].unique())

12

In [19]:
# The distinct years themselves, in order of first appearance.
df['year'].unique()

array([1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992, 1997, 2002,
       2007])

## 3.2 Adding Variables

In [20]:
import pandas as pd
# Reload the CSV so this section works on the original column names
# ('pop', 'gdpPercap', ...) instead of the renamed frame from section 3.1.
url = (
    'https://raw.githubusercontent.com/jstaf/gapminder/master/gapminder/gapminder.csv'
)
df = pd.read_csv(url)

In [21]:
# Total GDP per row = population x GDP per capita (element-wise product).
df['Total_GDP'] = df['pop'].mul(df['gdpPercap'])
df.head()

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap,Total_GDP
0,Afghanistan,Asia,1952,28.801,8425333,779.445314,6567086000.0
1,Afghanistan,Asia,1957,30.332,9240934,820.85303,7585449000.0
2,Afghanistan,Asia,1962,31.997,10267083,853.10071,8758856000.0
3,Afghanistan,Asia,1967,34.02,11537966,836.197138,9648014000.0
4,Afghanistan,Asia,1972,36.088,13079460,739.981106,9678553000.0


In [22]:
# Express total GDP in billions under the name 'TotalGDP_Bil'.
# The value is derived from 'pop' and 'gdpPercap' rather than by rescaling and
# renaming 'Total_GDP' in place: the in-place version raises KeyError when the
# cell is run a second time, because 'Total_GDP' no longer exists after the
# rename. The arithmetic (product, then division) is the same as before.
df['TotalGDP_Bil'] = df['pop'] * df['gdpPercap'] / 1_000_000_000
# Remove the unscaled intermediate column if it is present; errors='ignore'
# makes this a no-op on a re-run.
df = df.drop(columns='Total_GDP', errors='ignore')
df.head()

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap,TotalGDP_Bil
0,Afghanistan,Asia,1952,28.801,8425333,779.445314,6.567086
1,Afghanistan,Asia,1957,30.332,9240934,820.85303,7.585449
2,Afghanistan,Asia,1962,31.997,10267083,853.10071,8.758856
3,Afghanistan,Asia,1967,34.02,11537966,836.197138,9.648014
4,Afghanistan,Asia,1972,36.088,13079460,739.981106,9.678553


In [23]:
# Build a combined "<country>_<continent>" label for every row.
df['Country_Cont'] = df['country'].astype(str).add('_').add(df['continent'])
df.head()

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap,TotalGDP_Bil,Country_Cont
0,Afghanistan,Asia,1952,28.801,8425333,779.445314,6.567086,Afghanistan_Asia
1,Afghanistan,Asia,1957,30.332,9240934,820.85303,7.585449,Afghanistan_Asia
2,Afghanistan,Asia,1962,31.997,10267083,853.10071,8.758856,Afghanistan_Asia
3,Afghanistan,Asia,1967,34.02,11537966,836.197138,9.648014,Afghanistan_Asia
4,Afghanistan,Asia,1972,36.088,13079460,739.981106,9.678553,Afghanistan_Asia


## 3.3 Subsetting

In [24]:
import pandas as pd
# Reload the CSV so the subsetting examples run on the original columns,
# without the variables added in section 3.2.
url = (
    'https://raw.githubusercontent.com/jstaf/gapminder/master/gapminder/gapminder.csv'
)
df = pd.read_csv(url)


In [25]:
# Select a single column by label; the result is a Series.
df['country']

0       Afghanistan
1       Afghanistan
2       Afghanistan
3       Afghanistan
4       Afghanistan
           ...     
1699       Zimbabwe
1700       Zimbabwe
1701       Zimbabwe
1702       Zimbabwe
1703       Zimbabwe
Name: country, Length: 1704, dtype: object

In [26]:
# Keep only the rows whose country is Canada (boolean-mask row selection).
df_2 = df.loc[df['country'].eq('Canada')]
df_2.head()

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
240,Canada,Americas,1952,68.75,14785584,11367.16112
241,Canada,Americas,1957,69.96,17010154,12489.95006
242,Canada,Americas,1962,71.3,18985849,13462.48555
243,Canada,Americas,1967,72.13,20819767,16076.58803
244,Canada,Americas,1972,72.88,22284500,18970.57086


In [27]:
# Select several columns at once by passing a list of labels.
cols = [
    'country',
    'continent',
    'pop',
]
df.loc[:, cols]

Unnamed: 0,country,continent,pop
0,Afghanistan,Asia,8425333
1,Afghanistan,Asia,9240934
2,Afghanistan,Asia,10267083
3,Afghanistan,Asia,11537966
4,Afghanistan,Asia,13079460
...,...,...,...
1699,Zimbabwe,Africa,9216418
1700,Zimbabwe,Africa,10704340
1701,Zimbabwe,Africa,11404948
1702,Zimbabwe,Africa,11926563


In [28]:
# Column-wise mean of the three numeric measures.
cols = [
    'lifeExp',
    'pop',
    'gdpPercap',
]
df.loc[:, cols].mean()

lifeExp      5.947444e+01
pop          2.960121e+07
gdpPercap    7.215327e+03
dtype: float64

In [29]:
# Positional lookup of a single cell: first row, first column.
df.iloc[0,0]

'Afghanistan'

In [30]:
# A single integer selects one whole row by position (returned as a Series).
df.iloc[0]

country      Afghanistan
continent           Asia
year                1952
lifeExp           28.801
pop              8425333
gdpPercap     779.445314
Name: 0, dtype: object

In [31]:
# Row slice by position: rows 0, 1 and 2 (the slice end is exclusive).
df.iloc[:3]

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.85303
2,Afghanistan,Asia,1962,31.997,10267083,853.10071


In [32]:
# Same selection as `df.iloc[:3]`, with the start of the slice written out.
df.iloc[0:3]

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.85303
2,Afghanistan,Asia,1962,31.997,10267083,853.10071


In [33]:
# All rows of the first column, selected by position.
df.iloc[:,0]

0       Afghanistan
1       Afghanistan
2       Afghanistan
3       Afghanistan
4       Afghanistan
           ...     
1699       Zimbabwe
1700       Zimbabwe
1701       Zimbabwe
1702       Zimbabwe
1703       Zimbabwe
Name: country, Length: 1704, dtype: object

In [34]:
# All rows, columns at positions 1 and 2 (the slice end, 3, is exclusive).
df.iloc[:, 1:3]

Unnamed: 0,continent,year
0,Asia,1952
1,Asia,1957
2,Asia,1962
3,Asia,1967
4,Asia,1972
...,...,...
1699,Africa,1987
1700,Africa,1992
1701,Africa,1997
1702,Africa,2002


In [35]:
# Every column, restricted to rows where population exceeds 10 million.
df_3 = df.loc[df['pop'].gt(10_000_000)]
df_3.head()

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
2,Afghanistan,Asia,1962,31.997,10267083,853.10071
3,Afghanistan,Asia,1967,34.02,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106
5,Afghanistan,Asia,1977,38.438,14880372,786.11336
6,Afghanistan,Asia,1982,39.854,12881816,978.011439


In [36]:
# Combine two conditions with `&`: Canadian rows from the year 2000 onward.
df.loc[df['country'].eq('Canada') & df['year'].ge(2000)]

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
250,Canada,Americas,2002,79.77,31902268,33328.96507
251,Canada,Americas,2007,80.653,33390141,36319.23501
