# 1. Pandas DataFrame 

## Loading dataset

In [9]:
import pandas as pd

df = pd.read_csv('./gapminder.tsv', sep ='\t')


In [10]:
df.head()

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.85303
2,Afghanistan,Asia,1962,31.997,10267083,853.10071
3,Afghanistan,Asia,1967,34.02,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106


## Learning dataset

In [13]:
df.shape

(1704, 6)

In [14]:
df.columns

Index(['country', 'continent', 'year', 'lifeExp', 'pop', 'gdpPercap'], dtype='object')

In [15]:
df.dtypes

country       object
continent     object
year           int64
lifeExp      float64
pop            int64
gdpPercap    float64
dtype: object

#### NOTE: 
##### Pandas type> object,     Python type> string
##### Pandas type> int64,      Python type> int
##### Pandas type> float64,    Python type> float
#####  Pandas type> datetime64, Python type> datetime (* datetime is found in python standard library, not loaded by default so needs to be imported)

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1704 entries, 0 to 1703
Data columns (total 6 columns):
country      1704 non-null object
continent    1704 non-null object
year         1704 non-null int64
lifeExp      1704 non-null float64
pop          1704 non-null int64
gdpPercap    1704 non-null float64
dtypes: float64(2), int64(2), object(2)
memory usage: 80.0+ KB


## Subsetting column by column name

In [28]:
country_df = df['country']

In [19]:
country_df.head()

0    Afghanistan
1    Afghanistan
2    Afghanistan
3    Afghanistan
4    Afghanistan
Name: country, dtype: object

In [20]:
country_df.tail()

1699    Zimbabwe
1700    Zimbabwe
1701    Zimbabwe
1702    Zimbabwe
1703    Zimbabwe
Name: country, dtype: object

In [30]:
subset_df = df[['country', 'continent', 'year']]

In [26]:
subset_df

Unnamed: 0,country,continent,year
0,Afghanistan,Asia,1952
1,Afghanistan,Asia,1957
2,Afghanistan,Asia,1962
3,Afghanistan,Asia,1967
4,Afghanistan,Asia,1972


## Subsetting rows by index number

In [44]:
df.loc[12] # all info on the 12th row. [12] is index number

country      Albania
continent     Europe
year            1952
lifeExp        55.23
pop          1282697
gdpPercap    1601.06
Name: 12, dtype: object

In [42]:
df.loc[[0, 99, 999]]

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
99,Bangladesh,Asia,1967,43.453,62821884,721.186086
999,Mongolia,Asia,1967,51.253,1149500,1226.04113


In [45]:
df.iloc[-1] # [-1] wouldn't work with loc

country      Zimbabwe
continent      Africa
year             2007
lifeExp        43.487
pop          12311143
gdpPercap     469.709
Name: 1703, dtype: object

In [48]:
number_of_rows = df.shape[0] # get the last row

In [49]:
last_row_index = number_of_rows -1  # subtract 1 from the value to get the last index value

In [50]:
print(df.loc[last_row_index])  #  subset using the index of the last row

country      Zimbabwe
continent      Africa
year             2007
lifeExp        43.487
pop          12311143
gdpPercap     469.709
Name: 1703, dtype: object


In [51]:
print(df.tail(n = 1)) #another way to get the last index value

       country continent  year  lifeExp       pop   gdpPercap
1703  Zimbabwe    Africa  2007   43.487  12311143  469.709298


In [52]:
subset_loc = df.loc[0]
subset_head = df.head(n=1)

In [57]:
type(subset_loc)

pandas.core.series.Series

In [58]:
type(subset_head)

pandas.core.frame.DataFrame

## Subsetting Columns

In [60]:
subset = df.loc[:, ['year', 'pop']]


In [62]:
subset.head()

Unnamed: 0,year,pop
0,1952,8425333
1,1957,9240934
2,1962,10267083
3,1967,11537966
4,1972,13079460


## Subsetting Columns by Range

In [64]:
small_range = list(range(5))
small_range

[0, 1, 2, 3, 4]

In [76]:
# Subset the dataframe with the range

subset = df.iloc[:, small_range]
subset.head()

Unnamed: 0,lifeExp,pop,gdpPercap
0,28.801,8425333,779.445314
1,30.332,9240934,820.85303
2,31.997,10267083,853.10071
3,34.02,11537966,836.197138
4,36.088,13079460,739.981106


In [79]:
# Create a range from 3 to 5 inclusive

small_range = list(range(3, 6))
small_range

[3, 4, 5]

In [78]:
subset = df.iloc[:, small_range]
subset.head()

Unnamed: 0,lifeExp,pop,gdpPercap
0,28.801,8425333,779.445314
1,30.332,9240934,820.85303
2,31.997,10267083,853.10071
3,34.02,11537966,836.197138
4,36.088,13079460,739.981106


In [83]:
# Create a range from 0 to 5 inclusive, every other interger

small_range = list(range(0, 6, 2))
subset = df.iloc[:, small_range]
subset.head()

Unnamed: 0,country,year,pop
0,Afghanistan,1952,8425333
1,Afghanistan,1957,9240934
2,Afghanistan,1962,10267083
3,Afghanistan,1967,11537966
4,Afghanistan,1972,13079460


## Slicing Columns

In [86]:
small_range = list(range(3))
subset = df.iloc[:, small_range]
subset.head()


Unnamed: 0,country,continent,year
0,Afghanistan,Asia,1952
1,Afghanistan,Asia,1957
2,Afghanistan,Asia,1962
3,Afghanistan,Asia,1967
4,Afghanistan,Asia,1972


In [89]:
# Slicing the first 3 columns

subset = df.iloc[:, :3]
subset.head()


Unnamed: 0,country,continent,year
0,Afghanistan,Asia,1952
1,Afghanistan,Asia,1957
2,Afghanistan,Asia,1962
3,Afghanistan,Asia,1967
4,Afghanistan,Asia,1972


In [90]:
small_range = list(range(3, 6))
sugset = df.iloc[:, small_range]
subset.head()

Unnamed: 0,country,continent,year
0,Afghanistan,Asia,1952
1,Afghanistan,Asia,1957
2,Afghanistan,Asia,1962
3,Afghanistan,Asia,1967
4,Afghanistan,Asia,1972


In [92]:
# slicing columns 3 to 5 inclusive

subset = df.iloc[:, 3:6]
subset.head()

Unnamed: 0,lifeExp,pop,gdpPercap
0,28.801,8425333,779.445314
1,30.332,9240934,820.85303
2,31.997,10267083,853.10071
3,34.02,11537966,836.197138
4,36.088,13079460,739.981106


In [None]:
small_range = list(range(0, 6, 2))
subset = df.iloc[:, small_range]
subset.head()


In [100]:
# Slice every other first 5 columns

subset = df.iloc[:, 0:6:2]
subset.head()

Unnamed: 0,country,year,pop
0,Afghanistan,1952,8425333
1,Afghanistan,1957,9240934
2,Afghanistan,1962,10267083
3,Afghanistan,1967,11537966
4,Afghanistan,1972,13079460


## Subsetting Rows and Columns

In [97]:
# We can choose to put values to the left of the comma if we want to select specific rows along with specific columns.

# Using loc
print(df.loc[42, 'country'])

# Using iloc
print(df.iloc[42, 0])


Angola
Angola


## Subsetting Multiple Rows and Columns

In [98]:
# get the 1st, 100th, and 1000th rows
# from the 1st, 4th, and 6th columns
# the columns we are hoping to get are country, lifeExp, gdpPercap

print(df.iloc[[0, 99, 999], [0, 3, 5]])  #[rows], [columns]

         country  lifeExp    gdpPercap
0    Afghanistan   28.801   779.445314
99    Bangladesh   43.453   721.186086
999     Mongolia   51.253  1226.041130


In [99]:
# if we use the column names directly, it makes the code a bit easier to read

print(df.loc[[0, 99, 999], ['country', 'lifeExp', 'gdpPercap']])


         country  lifeExp    gdpPercap
0    Afghanistan   28.801   779.445314
99    Bangladesh   43.453   721.186086
999     Mongolia   51.253  1226.041130


In [101]:
print (df.loc[10:13, ['country', 'lifeExp', 'gdpPercap']])


        country  lifeExp    gdpPercap
10  Afghanistan   42.129   726.734055
11  Afghanistan   43.828   974.580338
12      Albania   55.230  1601.056136
13      Albania   59.280  1942.284244
