In [2]:
# importing modules
import pandas as pd
import numpy as np

In [3]:
# reading the tab-separated country file into a dataframe
data = pd.read_csv("./country_data.tsv", sep="\t")

In [4]:
# inspecting loaded data: previewing the first 10 rows
data.head(10)

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.85303
2,Afghanistan,Asia,1962,31.997,10267083,853.10071
3,Afghanistan,Asia,1967,34.02,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106
5,Afghanistan,Asia,1977,38.438,14880372,786.11336
6,Afghanistan,Asia,1982,39.854,12881816,978.011439
7,Afghanistan,Asia,1987,40.822,13867957,852.395945
8,Afghanistan,Asia,1992,41.674,16317921,649.341395
9,Afghanistan,Asia,1997,41.763,22227415,635.341351


In [5]:
# getting dataframe info: column names, non-null counts, dtypes and memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1704 entries, 0 to 1703
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   country    1704 non-null   object 
 1   continent  1704 non-null   object 
 2   year       1704 non-null   int64  
 3   lifeExp    1704 non-null   float64
 4   pop        1704 non-null   int64  
 5   gdpPercap  1704 non-null   float64
dtypes: float64(2), int64(2), object(2)
memory usage: 80.0+ KB


<h2>Series</h2>

In [6]:
# building a series from a plain list; pandas supplies a default 0..n-1 index
series_1 = pd.Series(data=[4, 7, -5, 3])

In [7]:
# displaying the series: index on the left, values on the right
series_1

0    4
1    7
2   -5
3    3
dtype: int64

In [8]:
# getting values of series as a numpy array
# (to_numpy() is the accessor pandas recommends over the older .values attribute)
series_1.to_numpy()

array([ 4,  7, -5,  3], dtype=int64)

In [9]:
# getting index (the row labels) of series
series_1.index

RangeIndex(start=0, stop=4, step=1)

In [10]:
# creating a series with explicit string labels instead of the default integer index
series_with_index = pd.Series(data=[4, 7, -5, 3], index=list("dbac"))
series_with_index

d    4
b    7
a   -5
c    3
dtype: int64

In [11]:
# getting the index of the labelled series
series_with_index.index

Index(['d', 'b', 'a', 'c'], dtype='object')

In [12]:
# getting element by index label
series_with_index['b']

7

In [13]:
# getting multiple elements by passing a list of index labels
# (the result follows the order of the list)
series_with_index[["a", "c", "d"]]

a   -5
c    3
d    4
dtype: int64

In [14]:
# assigning elements using index label; this changes series_with_index itself
series_with_index['d'] = 6
series_with_index

d    6
b    7
a   -5
c    3
dtype: int64

In [15]:
# multiplying each element by 2 (the result is only displayed, not assigned back)
series_with_index * 2

d    12
b    14
a   -10
c     6
dtype: int64

In [16]:
# checking whether a label is present in the index of the series
'b' in series_with_index

True

In [17]:
# creating series from dict: keys become the index labels, values become the data
states_dict = dict(Ohio=35000, Texas=71000, Oregon=16000, Utah=5000)

states_series = pd.Series(data=states_dict)
states_series

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [18]:
# naming index and data column; both names are set on the existing series
# and show up in its repr
states_series.name = "population"
states_series.index.name = "state"
states_series

state
Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
Name: population, dtype: int64

<h2>Dataframe</h2>

In [19]:
# creating df from a dict of equal-length lists (one key per column)
# NOTE: the dict has its own name so it does not overwrite `data`,
# the country dataframe loaded at the top of the notebook
states_data = {
    "state": ["Ohio", "Ohio", "Ohio", "Nevada", "Nevada", "Nevada"],
    "year": [2000, 2001, 2002, 2001, 2002, 2003],
    "pop": [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}

states_df = pd.DataFrame(data=states_data)
states_df

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [20]:
# getting columns of df (the column labels are held in an Index object)
states_df.columns

Index(['state', 'year', 'pop'], dtype='object')

In [21]:
# selecting a column by name; the column comes back as a series
# (bracket access is needed here: `states_df.pop` would resolve to the DataFrame.pop method)
states_df["pop"]

0    1.5
1    1.7
2    3.6
3    2.4
4    2.9
5    3.2
Name: pop, dtype: float64

In [22]:
# transposing dataframe: rows become columns and columns become rows
states_df.transpose()

Unnamed: 0,0,1,2,3,4,5
state,Ohio,Ohio,Ohio,Nevada,Nevada,Nevada
year,2000,2001,2002,2001,2002,2003
pop,1.5,1.7,3.6,2.4,2.9,3.2


<h2>Index Objects</h2>

In [23]:
# the index of a series is itself an Index object
states_series.index

Index(['Ohio', 'Texas', 'Oregon', 'Utah'], dtype='object', name='state')

In [24]:
# creating index object directly from an array of the integers 0..4
pd.Index(data=np.arange(5))

Int64Index([0, 1, 2, 3, 4], dtype='int64')

In [25]:
# index objects behave as fixed size sets, so a label can be tested with `in`
"Texas" in states_series.index

True

<h2>Essential Functionality</h2>

In [26]:
# displaying the dataframe used in the examples of this section
states_df

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [27]:
# dropping rows by their index labels; the result is a new dataframe
states_df.drop(index=[1, 4])

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
2,Ohio,2002,3.6
3,Nevada,2001,2.4
5,Nevada,2003,3.2


In [28]:
# dropping a column by name
states_df.drop(columns="year")

Unnamed: 0,state,pop
0,Ohio,1.5
1,Ohio,1.7
2,Ohio,3.6
3,Nevada,2.4
4,Nevada,2.9
5,Nevada,3.2


<i>Many functions like drop, which modify the size or shape of a dataframe, can manipulate the object in place (by passing inplace=True) without returning a new object.</i>

In [29]:
# in-place variant of the column drop, kept commented out so it does not run
# and states_df keeps its "year" column:
# states_df.drop("year", axis=1, inplace=True)

<h2>Indexing, Selection & Filtering</h2>

In [30]:
# displaying the labelled series used in the selection examples
series_with_index

d    6
b    7
a   -5
c    3
dtype: int64

In [31]:
# selecting by integer position
# (.iloc is explicit; a bare integer in [] on a label-indexed series relies on a
# positional fallback that newer pandas versions deprecate)
series_with_index.iloc[1]

7

In [32]:
# selecting by index label
series_with_index["b"]

7

In [33]:
# slicing the first three elements by position
series_with_index.iloc[:3]

d    6
b    7
a   -5
dtype: int64

In [34]:
# building a 4x4 dataframe of the integers 0..15, with state names as row labels
state_number_df = pd.DataFrame(
    data=np.arange(16).reshape((4, 4)),
    index=["Ohio", "Colorado", "Utah", "New York"],
    columns=["one", "two", "three", "four"],
)
state_number_df

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [35]:
# selecting a column by name
state_number_df["three"]

Ohio         2
Colorado     6
Utah        10
New York    14
Name: three, dtype: int32

In [36]:
# selecting multiple columns with a list of column names (the result is a dataframe)
state_number_df[["two", "four"]]

Unnamed: 0,two,four
Ohio,1,3
Colorado,5,7
Utah,9,11
New York,13,15


In [37]:
# selecting a row by its index label with .loc (the row comes back as a series)
state_number_df.loc["Utah"]

one       8
two       9
three    10
four     11
Name: Utah, dtype: int32

In [38]:
# selecting a row by integer position with .iloc (position 2 is the third row)
state_number_df.iloc[2]

one       8
two       9
three    10
four     11
Name: Utah, dtype: int32

In [39]:
# selecting multiple rows with a positional slice (the first three rows)
state_number_df.iloc[:3]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11


In [40]:
# selecting rows and columns by position: first row, first two columns
state_number_df.iloc[0, :2]

one    0
two    1
Name: Ohio, dtype: int32

In [41]:
# selecting rows and columns by label: two rows, two columns
state_number_df.loc[["Ohio", "Colorado"], ["three", "four"]]

Unnamed: 0,three,four
Ohio,2,3
Colorado,6,7


In [42]:
# selecting the last two rows and the last two columns with negative positional slices
state_number_df.iloc[-2:, -2:]

Unnamed: 0,three,four
Utah,10,11
New York,14,15


<h2>Function Application & Mapping</h2>

In [44]:
# applying function to each column: the lambda receives one column at a time
# and returns its range (max - min)
state_number_df.apply(lambda column: column.max() - column.min())

one      12
two      12
three    12
four     12
dtype: int32

In [45]:
# applying function to each element
# NOTE(review): pandas 2.1+ deprecates DataFrame.applymap in favour of DataFrame.map;
# switch once the notebook's pandas version provides DataFrame.map
state_number_df.applymap(lambda x: x * 10)

Unnamed: 0,one,two,three,four
Ohio,0,10,20,30
Colorado,40,50,60,70
Utah,80,90,100,110
New York,120,130,140,150


In [48]:
# applying function to each element of a column: the lambda receives one value at a time
state_number_df["two"].apply(lambda value: value / 2)

Ohio        0.5
Colorado    2.5
Utah        4.5
New York    6.5
Name: two, dtype: float64

<h2>Sorting & Ranking</h2>

In [49]:
# sorting by index: rows are reordered by their labels (axis=0), in ascending order
state_number_df.sort_index(axis=0, ascending=True)

In [52]:
# displaying the unsorted series for comparison
series_1

0    4
1    7
2   -5
3    3
dtype: int64

In [51]:
# sorting by values (smallest first); each value keeps its original index label
series_1.sort_values()

2   -5
3    3
0    4
1    7
dtype: int64

In [54]:
# sorting a df by column, largest values of "three" first
state_number_df.sort_values(by="three", ascending=False)

Unnamed: 0,one,two,three,four
New York,12,13,14,15
Utah,8,9,10,11
Colorado,4,5,6,7
Ohio,0,1,2,3


In [55]:
# TODO: ranking series & df — no example in this cell yet (see Series.rank / DataFrame.rank)

<h2>Summarizing & Computing Descriptive Statistics</h2>

In [56]:
# calculating sum of each column
state_number_df.sum()

one      24
two      28
three    32
four     36
dtype: int64

In [57]:
# calculating cumulative sum down the rows of each column
state_number_df.cumsum()

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,6,8,10
Utah,12,15,18,21
New York,24,28,32,36


In [58]:
# calculating common statistics (count, mean, std, min, quartiles, max) for each column
state_number_df.describe()

Unnamed: 0,one,two,three,four
count,4.0,4.0,4.0,4.0
mean,6.0,7.0,8.0,9.0
std,5.163978,5.163978,5.163978,5.163978
min,0.0,1.0,2.0,3.0
25%,3.0,4.0,5.0,6.0
50%,6.0,7.0,8.0,9.0
75%,9.0,10.0,11.0,12.0
max,12.0,13.0,14.0,15.0


<h2>Correlation & Covariance</h2>

In [59]:
# covariance of entire df: one value for every pair of columns
state_number_df.cov()

Unnamed: 0,one,two,three,four
one,26.666667,26.666667,26.666667,26.666667
two,26.666667,26.666667,26.666667,26.666667
three,26.666667,26.666667,26.666667,26.666667
four,26.666667,26.666667,26.666667,26.666667


In [60]:
# covariance between 2 features (two columns selected by name)
state_number_df["one"].cov(state_number_df["three"])

26.666666666666664

In [61]:
# correlation of entire df: one value for every pair of columns
state_number_df.corr()

Unnamed: 0,one,two,three,four
one,1.0,1.0,1.0,1.0
two,1.0,1.0,1.0,1.0
three,1.0,1.0,1.0,1.0
four,1.0,1.0,1.0,1.0


In [62]:
# correlation between 2 features (two columns selected by name)
state_number_df["two"].corr(state_number_df["four"])

1.0

In [63]:
# correlation of every df column with a single series
state_number_df.corrwith(state_number_df["three"])

one      1.0
two      1.0
three    1.0
four     1.0
dtype: float64

<h2>Unique Values, Value Counts & Membership</h2>

In [64]:
# building a series of single letters, one element per character of the string
letter_series = pd.Series(data=[*"cadaabbcc"])
letter_series

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object

In [65]:
# getting unique values in a series, in order of first appearance
letter_series.unique()

array(['c', 'a', 'd', 'b'], dtype=object)

In [66]:
# getting frequency of each value, most frequent first
letter_series.value_counts()

c    3
a    3
b    2
d    1
dtype: int64

In [67]:
# checking membership: True where the element is one of the given values
letter_series.isin(["b", "c"])

0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool