# Data Science Basic Packages

## Numpy array operations

In [1]:
import numpy as np

In [2]:
a = np.array([1,2,3,4])
print(a[1:3])
print(a[1:])
print(a[:-1])
print(a[1:-1])
print(a[0], a[2])
print(a.dtype)

[2 3]
[2 3 4]
[1 2 3]
[2 3]
1 3
int32


In [3]:
A = [1,2,3]
A[0:2]

[1, 2]

In [4]:
B=np.array(A)
B[0:2]

array([1, 2])

In [5]:
b = np.array([1.5, 2.5, 3.5, 4])
print(b.dtype)
a2 = np.array([1, 2, 3, 4], dtype = np.float)
print(a2.dtype)

float64
float64


In [6]:
#Dimension and Shapes
A = np.array([
    [1, 2, 3],
    [4, 5, 6]
])
print(A.shape) # (row,column)
print(A.ndim) #number of dimensions
print(A.size) #number of elements inside matrix

(2, 3)
2
6


In [7]:
B = np.array([
    [
        [12, 11, 10],
        [9, 8, 7],],
    [
        [6, 5, 4],
        [3, 2, 1]
    ]
])

print(B.shape)
print(B.ndim)
print(B.size)

(2, 2, 3)
3
12


In [8]:
# Slicing
A = np.array([
#.   0. 1. 2
    [1, 2, 3], # 0
    [4, 5, 6], # 1
    [7, 8, 9]  # 2
])
print(A[0])
print(A[0][1])
print(A[: ,1])
print(A[0 : 2])
print(A[:2, :2])

[1 2 3]
2
[2 5 8]
[[1 2 3]
 [4 5 6]]
[[1 2]
 [4 5]]


In [9]:
#Summary statistics
a = np.array([1, 2, 3, 4])
print(a.sum())
print(a.mean())
print(a.max())
print(a.min())
print(a.std())
print(a.var())

10
2.5
4
1
1.118033988749895
1.25


In [10]:
A = np.array([
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9]
])
print(A.ndim)
print(A.sum())
print(A.mean(0))
print(A.max())
print(A.min())
print(A.std())
print(A.var())

2
45
[4. 5. 6.]
9
1
2.581988897471611
6.666666666666667


In [11]:
print(A.mean(axis=1)[0])

2.0


In [12]:
print(A.sum(axis=0)) #rowwise
print(A.sum(axis=1)) #columnswise

[12 15 18]
[ 6 15 24]


In [13]:
a = np.array([1, 2, 3, 4])
print(a + 10)
print(a * 10)
a += 10
print(a)

[11 12 13 14]
[10 20 30 40]
[11 12 13 14]


In [14]:
a = np.array([1, 2, 3, 4])
a[[True, True, False, True]]

array([1, 2, 4])

In [26]:
print(a.argmax()) #returns index with highest value
print(a.argmin()) #returns index with lowest value
print(a[a.argmin()])

3
0
1


In [18]:
print(a > 2)
print(a[a > 2])
print(a [~(a > 2)])
print(a[(a % 2 == 0) & (a < 4)])

[False False  True  True]
[3 4]
[1 2]
[2]


In [19]:
A = np.random.randint(100, size=(3, 3))
A

array([[64, 59,  6],
       [ 8, 59, 16],
       [34,  0, 32]])

In [33]:
A[0,2] #(0,2) of matrix

6

In [37]:
A[[0,2], [1]] #first + 3rd row of second column

array([59,  0])

In [20]:
print(A > 30)
print(A[A > 30])

[[ True  True False]
 [False  True False]
 [ True False  True]]
[64 59 59 34 32]


In [21]:
A.sum(axis=0).shape

(3,)

In [25]:
print(A.argmax())
print(A.argmin())
print(A.argmax(axis=0)) #returns index of highest value per row
print(A.argmax(axis=1)) #returns index of highest value per column

0
7
[0 0 2]
[0 1 0]


In [27]:
print(A.min(axis=0)) #return smallest value per row

[8 0 6]


In [69]:
print(np.random.rand(2,4))
print(np.arange(0, 1, .1))
print(np.arange(10).reshape(5, 2))
print(np.linspace(0, 10, 20, endpoint = False))

[[0.81401969 0.53683904 0.81907139 0.14912928]
 [0.65591483 0.94154851 0.56995209 0.69063474]]
[0.  0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9]
[[0 1]
 [2 3]
 [4 5]
 [6 7]
 [8 9]]
[0.  0.5 1.  1.5 2.  2.5 3.  3.5 4.  4.5 5.  5.5 6.  6.5 7.  7.5 8.  8.5
 9.  9.5]


# Pandas

In [42]:
import pandas as pd
import numpy as np

## Series

In [15]:
# In millions
g7_pop = pd.Series([35.467, 63.951, 80.940, 60.665, 127.061, 64.511, 318.523])
g7_pop.name = 'G7 Population in millions'
g7_pop

0     35.467
1     63.951
2     80.940
3     60.665
4    127.061
5     64.511
6    318.523
Name: G7 Population in millions, dtype: float64

In [16]:
print(type(g7_pop))
print(g7_pop.dtype)
print(g7_pop.values)
print(type(g7_pop.values))

<class 'pandas.core.series.Series'>
float64
[ 35.467  63.951  80.94   60.665 127.061  64.511 318.523]
<class 'numpy.ndarray'>


In [4]:
print(g7_pop[0])
print(len(g7_pop))
print(g7_pop.index)

35.467
7
RangeIndex(start=0, stop=7, step=1)


In [33]:
#Can rename the index
g7_pop.index = [
    'Canada',
    'France',
    'Germany',
    'Italy',
    'Japan',
    'United Kingdom',
    'United States',
]
g7_pop

Canada             35.467
France             63.951
Germany            80.940
Italy              60.665
Japan             127.061
United Kingdom     64.511
United States     318.523
Name: G7 Population in millions, dtype: float64

In [34]:
#Creating series using dictionaries
pd.Series({
    'Canada': 35.467,
    'France': 63.951,
    'Germany': 80.94,
    'Italy': 60.665,
    'Japan': 127.061,
    'United Kingdom': 64.511,
    'United States': 318.523
}, name='G7 Population in millions')

Canada             35.467
France             63.951
Germany            80.940
Italy              60.665
Japan             127.061
United Kingdom     64.511
United States     318.523
Name: G7 Population in millions, dtype: float64

In [35]:
#Another way to create series
pd.Series(
    [35.467, 63.951, 80.94, 60.665, 127.061, 64.511, 318.523],
    index=['Canada', 'France', 'Germany', 'Italy', 'Japan', 'United Kingdom',
       'United States'],
    name='G7 Population in millions')

Canada             35.467
France             63.951
Germany            80.940
Italy              60.665
Japan             127.061
United Kingdom     64.511
United States     318.523
Name: G7 Population in millions, dtype: float64

In [36]:
#Create a subset of series
pd.Series(g7_pop, index=['France', 'Germany', 'Italy', 'Spain'])

France     63.951
Germany    80.940
Italy      60.665
Spain         NaN
Name: G7 Population in millions, dtype: float64

In [40]:
#Index
print(g7_pop[['Italy', 'France']])
print("")
print(g7_pop.iloc[0])
print("")
print(g7_pop.iloc[-1])
print("")
print(g7_pop.iloc[[0, 1]])
print("")
print(g7_pop['Canada': 'Italy'])

Italy     60.665
France    63.951
Name: G7 Population in millions, dtype: float64

35.467

318.523

Canada    35.467
France    63.951
Name: G7 Population in millions, dtype: float64

Canada     35.467
France     63.951
Germany    80.940
Italy      60.665
Name: G7 Population in millions, dtype: float64


In [29]:
print(g7_pop > 70)
print("")

print(g7_pop[g7_pop > 70])
print("")

print(g7_pop[g7_pop > g7_pop.mean()])
print("")

print(g7_pop[(g7_pop > 80) & (g7_pop < 200)])
print("")

print(np.log(g7_pop))

Canada            True
France            True
Germany           True
Italy             True
Japan             True
United Kingdom    True
United States     True
Name: G7 Population in millions, dtype: bool

Canada             99.990
France             99.990
Germany            80.940
Italy              99.990
Japan             127.061
United Kingdom     99.990
United States     318.523
Name: G7 Population in millions, dtype: float64

United States    318.523
Name: G7 Population in millions, dtype: float64

Canada             99.990
France             99.990
Germany            80.940
Italy              99.990
Japan             127.061
United Kingdom     99.990
Name: G7 Population in millions, dtype: float64

Canada            4.605070
France            4.605070
Germany           4.393708
Italy             4.605070
Japan             4.844667
United Kingdom    4.605070
United States     5.763695
Name: G7 Population in millions, dtype: float64


In [28]:
#Modifying series
print(g7_pop)
print("")

g7_pop["Canada"] = 30
print(g7_pop)
print("")

g7_pop[g7_pop < 70] = 99.99
print(g7_pop)

Canada             99.990
France             99.990
Germany            80.940
Italy              99.990
Japan             127.061
United Kingdom     99.990
United States     318.523
Name: G7 Population in millions, dtype: float64

Canada             30.000
France             99.990
Germany            80.940
Italy              99.990
Japan             127.061
United Kingdom     99.990
United States     318.523
Name: G7 Population in millions, dtype: float64

Canada             99.990
France             99.990
Germany            80.940
Italy              99.990
Japan             127.061
United Kingdom     99.990
United States     318.523
Name: G7 Population in millions, dtype: float64


## Dataframes

In [43]:
df = pd.DataFrame({
    'Population': [35.467, 63.951, 80.94 , 60.665, 127.061, 64.511, 318.523],
    'GDP': [
        1785387,
        2833687,
        3874437,
        2167744,
        4602367,
        2950039,
        17348075
    ],
    'Surface Area': [
        9984670,
        640679,
        357114,
        301336,
        377930,
        242495,
        9525067
    ],
    'HDI': [
        0.913,
        0.888,
        0.916,
        0.873,
        0.891,
        0.907,
        0.915
    ],
    'Continent': [
        'America',
        'Europe',
        'Europe',
        'Europe',
        'Asia',
        'Europe',
        'America'
    ]
}, columns=['Population', 'GDP', 'Surface Area', 'HDI', 'Continent'])
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
0,35.467,1785387,9984670,0.913,America
1,63.951,2833687,640679,0.888,Europe
2,80.94,3874437,357114,0.916,Europe
3,60.665,2167744,301336,0.873,Europe
4,127.061,4602367,377930,0.891,Asia
5,64.511,2950039,242495,0.907,Europe
6,318.523,17348075,9525067,0.915,America


In [45]:
a = df.melt(id_vars = ["Population", "GDP"]) #returns a new df with columns : population, GDP, variable, value
a

Unnamed: 0,Population,GDP,variable,value
0,35.467,1785387,Surface Area,9984670
1,63.951,2833687,Surface Area,640679
2,80.94,3874437,Surface Area,357114
3,60.665,2167744,Surface Area,301336
4,127.061,4602367,Surface Area,377930
5,64.511,2950039,Surface Area,242495
6,318.523,17348075,Surface Area,9525067
7,35.467,1785387,HDI,0.913
8,63.951,2833687,HDI,0.888
9,80.94,3874437,HDI,0.916


In [64]:
a.pivot(index = ["Population", "GDP"], columns = "variable")["value"].reset_index()

variable,Population,GDP,Continent,HDI,Surface Area
0,35.467,1785387,America,0.913,9984670
1,60.665,2167744,Europe,0.873,301336
2,63.951,2833687,Europe,0.888,640679
3,64.511,2950039,Europe,0.907,242495
4,80.94,3874437,Europe,0.916,357114
5,127.061,4602367,Asia,0.891,377930
6,318.523,17348075,America,0.915,9525067


In [41]:
df.pivot(index="Continent", columns = "HDI", values="GDP")

HDI,0.873,0.888,0.891,0.907,0.913,0.915,0.916
Continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
America,,,,,1785387.0,17348075.0,
Asia,,,4602367.0,,,,
Europe,2167744.0,2833687.0,,2950039.0,,,3874437.0


In [42]:
df.index = [
    'Canada',
    'France',
    'Germany',
    'Italy',
    'Japan',
    'United Kingdom',
    'United States',
]

In [43]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387,9984670,0.913,America
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.665,2167744,301336,0.873,Europe
Japan,127.061,4602367,377930,0.891,Asia
United Kingdom,64.511,2950039,242495,0.907,Europe
United States,318.523,17348075,9525067,0.915,America


In [44]:
print(df.columns)
print(df.index)
print(type(df["Population"]))
print(df.shape)
print(df.size)

Index(['Population', 'GDP', 'Surface Area', 'HDI', 'Continent'], dtype='object')
Index(['Canada', 'France', 'Germany', 'Italy', 'Japan', 'United Kingdom',
       'United States'],
      dtype='object')
<class 'pandas.core.series.Series'>
(7, 5)
35


In [45]:
print(df.info())
print("")
print(df.dtypes)
print("")
print(df["Continent"].unique())
print("")
print(df.describe())

<class 'pandas.core.frame.DataFrame'>
Index: 7 entries, Canada to United States
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Population    7 non-null      float64
 1   GDP           7 non-null      int64  
 2   Surface Area  7 non-null      int64  
 3   HDI           7 non-null      float64
 4   Continent     7 non-null      object 
dtypes: float64(2), int64(2), object(1)
memory usage: 336.0+ bytes
None

Population      float64
GDP               int64
Surface Area      int64
HDI             float64
Continent        object
dtype: object

['America' 'Europe' 'Asia']

       Population           GDP  Surface Area       HDI
count    7.000000  7.000000e+00  7.000000e+00  7.000000
mean   107.302571  5.080248e+06  3.061327e+06  0.900429
std     97.249970  5.494020e+06  4.576187e+06  0.016592
min     35.467000  1.785387e+06  2.424950e+05  0.873000
25%     62.308000  2.500716e+06  3.292250e+05  0.889500
50%     64.511

In [46]:
df["GDP"] = df["GDP"].astype(np.float64)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7 entries, Canada to United States
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Population    7 non-null      float64
 1   GDP           7 non-null      float64
 2   Surface Area  7 non-null      int64  
 3   HDI           7 non-null      float64
 4   Continent     7 non-null      object 
dtypes: float64(3), int64(1), object(1)
memory usage: 336.0+ bytes


In [47]:
df.head()

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387.0,9984670,0.913,America
France,63.951,2833687.0,640679,0.888,Europe
Germany,80.94,3874437.0,357114,0.916,Europe
Italy,60.665,2167744.0,301336,0.873,Europe
Japan,127.061,4602367.0,377930,0.891,Asia


In [48]:
# Indexing and slicing
print(df.loc['Canada']) #access each row using index name
print("")

print(df.iloc[-1]) #Access each row using index number
print("")

print(df['Population']) #Access each column
print("")

print(df[['Population', 'GDP']]) #Access multiple columns

Population           35.467
GDP             1.78539e+06
Surface Area        9984670
HDI                   0.913
Continent           America
Name: Canada, dtype: object

Population          318.523
GDP             1.73481e+07
Surface Area        9525067
HDI                   0.915
Continent           America
Name: United States, dtype: object

Canada             35.467
France             63.951
Germany            80.940
Italy              60.665
Japan             127.061
United Kingdom     64.511
United States     318.523
Name: Population, dtype: float64

                Population         GDP
Canada              35.467   1785387.0
France              63.951   2833687.0
Germany             80.940   3874437.0
Italy               60.665   2167744.0
Japan              127.061   4602367.0
United Kingdom      64.511   2950039.0
United States      318.523  17348075.0


In [49]:
df['Population'].to_frame()

Unnamed: 0,Population
Canada,35.467
France,63.951
Germany,80.94
Italy,60.665
Japan,127.061
United Kingdom,64.511
United States,318.523


In [50]:
#Always use loc and iloc to slice datafraemes
print(df.loc['France': 'Italy'])
print("")

print(df.loc['France': 'Italy', 'Population'])
print("")

print(df.iloc[[0, 1, -1]])
print("")

print(df.iloc[1:3, 3])

         Population        GDP  Surface Area    HDI Continent
France       63.951  2833687.0        640679  0.888    Europe
Germany      80.940  3874437.0        357114  0.916    Europe
Italy        60.665  2167744.0        301336  0.873    Europe

France     63.951
Germany    80.940
Italy      60.665
Name: Population, dtype: float64

               Population         GDP  Surface Area    HDI Continent
Canada             35.467   1785387.0       9984670  0.913   America
France             63.951   2833687.0        640679  0.888    Europe
United States     318.523  17348075.0       9525067  0.915   America

France     0.888
Germany    0.916
Name: HDI, dtype: float64


In [57]:
print(df.reset_index(drop = True)) #inplace = True 
print("")
print(df)

   Population         GDP  Surface Area    HDI Continent
0      35.467   1785387.0       9984670  0.913   America
1      63.951   2833687.0        640679  0.888    Europe
2      80.940   3874437.0        357114  0.916    Europe
3      60.665   2167744.0        301336  0.873    Europe
4     127.061   4602367.0        377930  0.891      Asia
5      64.511   2950039.0        242495  0.907    Europe
6     318.523  17348075.0       9525067  0.915   America

                Population         GDP  Surface Area    HDI Continent
Canada              35.467   1785387.0       9984670  0.913   America
France              63.951   2833687.0        640679  0.888    Europe
Germany             80.940   3874437.0        357114  0.916    Europe
Italy               60.665   2167744.0        301336  0.873    Europe
Japan              127.061   4602367.0        377930  0.891      Asia
United Kingdom      64.511   2950039.0        242495  0.907    Europe
United States      318.523  17348075.0       9525067 

In [16]:
#Conditional selection
print(df['Population'] > 70)
print("")

print(df.loc[df['Population'] > 70])
print("")

print(df.loc[~(df['Population'] > 70)])
print("")

print(df.loc[(df['Population'] > 70) & (df['Population'] < 100)])
print("")

print(df.loc[~(df['Population'] > 70), "Population"])
print("")

print(df.loc[~(df['Continent'] == 'Europe'), "Population"])


Canada            False
France            False
Germany            True
Italy             False
Japan              True
United Kingdom    False
United States      True
Name: Population, dtype: bool

               Population         GDP  Surface Area    HDI Continent
Germany            80.940   3874437.0        357114  0.916    Europe
Japan             127.061   4602367.0        377930  0.891      Asia
United States     318.523  17348075.0       9525067  0.915   America

                Population        GDP  Surface Area    HDI Continent
Canada              35.467  1785387.0       9984670  0.913   America
France              63.951  2833687.0        640679  0.888    Europe
Italy               60.665  2167744.0        301336  0.873    Europe
United Kingdom      64.511  2950039.0        242495  0.907    Europe

         Population        GDP  Surface Area    HDI Continent
Germany       80.94  3874437.0        357114  0.916    Europe

Canada            35.467
France            63.951
Ita

## Modifying dataframes

### Adding new columns

In [58]:
langs = pd.Series(
    ['French', 'German', 'Italian'],
    index=['France', 'Germany', 'Italy'],
    name='Language'
)
df["Language"] = langs
df["GDP per capita"] = df["GDP"] / df["Population"]
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language,GDP per capita
Canada,35.467,1785387.0,9984670,0.913,America,,50339.385908
France,63.951,2833687.0,640679,0.888,Europe,French,44310.284437
Germany,80.94,3874437.0,357114,0.916,Europe,German,47868.013343
Italy,60.665,2167744.0,301336,0.873,Europe,Italian,35733.025633
Japan,127.061,4602367.0,377930,0.891,Asia,,36221.712406
United Kingdom,64.511,2950039.0,242495,0.907,Europe,,45729.239975
United States,318.523,17348075.0,9525067,0.915,America,,54464.12033


In [141]:
df.rename(
    columns={
        'HDI': 'Human Development Index',
        'Anual Popcorn Consumption': 'APC'
    }, index={
        'United States': 'USA',
        'United Kingdom': 'UK',
        'Argentina': 'AR'
    })

Unnamed: 0,Population,GDP,Surface Area,Human Development Index,Continent,Language,GDP per capita
Canada,35.467,1785387,9984670,0.913,America,,50339.385908
France,63.951,2833687,640679,0.888,Europe,French,44310.284437
Germany,80.94,3874437,357114,0.916,Europe,German,47868.013343
Italy,60.665,2167744,301336,0.873,Europe,Italian,35733.025633
Japan,127.061,4602367,377930,0.891,Asia,,36221.712406
UK,64.511,2950039,242495,0.907,Europe,,45729.239975
USA,318.523,17348075,9525067,0.915,America,,54464.12033


In [142]:
df.head()

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language,GDP per capita
Canada,35.467,1785387,9984670,0.913,America,,50339.385908
France,63.951,2833687,640679,0.888,Europe,French,44310.284437
Germany,80.94,3874437,357114,0.916,Europe,German,47868.013343
Italy,60.665,2167744,301336,0.873,Europe,Italian,35733.025633
Japan,127.061,4602367,377930,0.891,Asia,,36221.712406


In [147]:
df.rename(columns={'Language': 'LANGUAGE'}, inplace=True)
df.head()

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,LANGUAGE,GDP per capita
Canada,35.467,1785387,9984670,0.913,America,,50339.385908
France,63.951,2833687,640679,0.888,Europe,French,44310.284437
Germany,80.94,3874437,357114,0.916,Europe,German,47868.013343
Italy,60.665,2167744,301336,0.873,Europe,Italian,35733.025633
Japan,127.061,4602367,377930,0.891,Asia,,36221.712406


### Dropping stuff

In [149]:
#Drop columns
df.drop(["LANGUAGE", "GDP per capita"], axis=1)

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387,9984670,0.913,America
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.665,2167744,301336,0.873,Europe
Japan,127.061,4602367,377930,0.891,Asia
United Kingdom,64.511,2950039,242495,0.907,Europe
United States,318.523,17348075,9525067,0.915,America


In [150]:
df.drop_duplicates(subset = "Continent", keep = 'first')

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,LANGUAGE,GDP per capita
Canada,35.467,1785387,9984670,0.913,America,,50339.385908
France,63.951,2833687,640679,0.888,Europe,French,44310.284437
Japan,127.061,4602367,377930,0.891,Asia,,36221.712406


In [108]:
df.drop('Canada') #axis=0

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,LANGUAGE
France,63.951,2833687,640679,0.888,Europe,French
Germany,80.94,3874437,357114,0.916,Europe,German
Italy,60.665,2167744,301336,0.873,Europe,Italian
Japan,127.061,4602367,377930,0.891,Asia,
United Kingdom,64.511,2950039,242495,0.907,Europe,
United States,318.523,17348075,9525067,0.915,America,


In [109]:
df.drop(['Canada', 'Japan']) #inplace = True

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,LANGUAGE
France,63.951,2833687,640679,0.888,Europe,French
Germany,80.94,3874437,357114,0.916,Europe,German
Italy,60.665,2167744,301336,0.873,Europe,Italian
United Kingdom,64.511,2950039,242495,0.907,Europe,
United States,318.523,17348075,9525067,0.915,America,


In [151]:
df.drop(['LANGUAGE', 'GDP per capita'], axis=1, inplace=True)
df.head()

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387,9984670,0.913,America
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.665,2167744,301336,0.873,Europe
Japan,127.061,4602367,377930,0.891,Asia


### Appending dataframes

In [60]:
# Appending new dataframe
df2 = pd.DataFrame({
    'Population': 3,
    'GDP': 5
}, index=['China'])
df2

Unnamed: 0,Population,GDP
China,3,5


In [61]:
df_append = df.append(df2)
df_append

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language,GDP per capita
Canada,35.467,1785387.0,9984670.0,0.913,America,,50339.385908
France,63.951,2833687.0,640679.0,0.888,Europe,French,44310.284437
Germany,80.94,3874437.0,357114.0,0.916,Europe,German,47868.013343
Italy,60.665,2167744.0,301336.0,0.873,Europe,Italian,35733.025633
Japan,127.061,4602367.0,377930.0,0.891,Asia,,36221.712406
United Kingdom,64.511,2950039.0,242495.0,0.907,Europe,,45729.239975
United States,318.523,17348075.0,9525067.0,0.915,America,,54464.12033
China,3.0,5.0,,,,,


In [154]:
df_append.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8 entries, Canada to China
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Population    8 non-null      float64
 1   GDP           8 non-null      int64  
 2   Surface Area  7 non-null      float64
 3   HDI           7 non-null      float64
 4   Continent     7 non-null      object 
dtypes: float64(3), int64(1), object(1)
memory usage: 384.0+ bytes


In [155]:
df_append.reset_index(inplace=True)
df_append.rename(columns = {"index" : "Country"})
df_append

Unnamed: 0,index,Population,GDP,Surface Area,HDI,Continent
0,Canada,35.467,1785387,9984670.0,0.913,America
1,France,63.951,2833687,640679.0,0.888,Europe
2,Germany,80.94,3874437,357114.0,0.916,Europe
3,Italy,60.665,2167744,301336.0,0.873,Europe
4,Japan,127.061,4602367,377930.0,0.891,Asia
5,United Kingdom,64.511,2950039,242495.0,0.907,Europe
6,United States,318.523,17348075,9525067.0,0.915,America
7,China,3.0,5,,,


In [156]:
df_append.dropna(subset=['HDI'])

Unnamed: 0,index,Population,GDP,Surface Area,HDI,Continent
0,Canada,35.467,1785387,9984670.0,0.913,America
1,France,63.951,2833687,640679.0,0.888,Europe
2,Germany,80.94,3874437,357114.0,0.916,Europe
3,Italy,60.665,2167744,301336.0,0.873,Europe
4,Japan,127.061,4602367,377930.0,0.891,Asia
5,United Kingdom,64.511,2950039,242495.0,0.907,Europe
6,United States,318.523,17348075,9525067.0,0.915,America


In [174]:
print(df_append.fillna(0))
print("")
print(df_append.fillna(method = "ffill"))

            index  Population       GDP  Surface Area    HDI Continent
0          Canada      35.467   1785387     9984670.0  0.913   America
1          France      63.951   2833687      640679.0  0.888    Europe
2         Germany      80.940   3874437      357114.0  0.916    Europe
3           Italy      60.665   2167744      301336.0  0.873    Europe
4           Japan     127.061   4602367      377930.0  0.891      Asia
5  United Kingdom      64.511   2950039      242495.0  0.907    Europe
6   United States     318.523  17348075     9525067.0  0.915   America
7           China       3.000         5           0.0  0.000         0

            index  Population       GDP  Surface Area    HDI Continent
0          Canada      35.467   1785387     9984670.0  0.913   America
1          France      63.951   2833687      640679.0  0.888    Europe
2         Germany      80.940   3874437      357114.0  0.916    Europe
3           Italy      60.665   2167744      301336.0  0.873    Europe
4    

In [135]:
df_sort = df3.sort_values('GDP', ascending = False)
df_sort.head()

Unnamed: 0,index,Population,GDP,Surface Area,HDI,Continent
6,United States,318.523,17348075,9525067.0,0.915,America
4,Japan,127.061,4602367,377930.0,0.891,Asia
2,Germany,80.94,3874437,357114.0,0.916,Europe
5,United Kingdom,64.511,2950039,242495.0,0.907,Europe
1,France,63.951,2833687,640679.0,0.888,Europe


In [157]:
df_drop = df_append.drop_duplicates(subset = "Continent", keep = "first")
df_drop

Unnamed: 0,index,Population,GDP,Surface Area,HDI,Continent
0,Canada,35.467,1785387,9984670.0,0.913,America
1,France,63.951,2833687,640679.0,0.888,Europe
4,Japan,127.061,4602367,377930.0,0.891,Asia
7,China,3.0,5,,,


In [158]:
df_drop.reset_index(drop = True)

Unnamed: 0,index,Population,GDP,Surface Area,HDI,Continent
0,Canada,35.467,1785387,9984670.0,0.913,America
1,France,63.951,2833687,640679.0,0.888,Europe
2,Japan,127.061,4602367,377930.0,0.891,Asia
3,China,3.0,5,,,


### Merging dataframes

In [62]:
df3 = pd.DataFrame({
    'Country' : ['Canada', 'France', 'Germany', 'Singapore'],
    'Population': [35.467, 63.951, 80.94, 0.5],
    'GDP': [
        1785387,
        2833687,
        3874437,
        50000
    ],
    'Languages': [
        'English',
        'French',
        'German',
        'English'
    ]
})
df3

Unnamed: 0,Country,Population,GDP,Languages
0,Canada,35.467,1785387,English
1,France,63.951,2833687,French
2,Germany,80.94,3874437,German
3,Singapore,0.5,50000,English


In [63]:
df["Country"] = df.index
df.reset_index(drop = True, inplace=True)
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language,GDP per capita,Country
0,35.467,1785387.0,9984670,0.913,America,,50339.385908,Canada
1,63.951,2833687.0,640679,0.888,Europe,French,44310.284437,France
2,80.94,3874437.0,357114,0.916,Europe,German,47868.013343,Germany
3,60.665,2167744.0,301336,0.873,Europe,Italian,35733.025633,Italy
4,127.061,4602367.0,377930,0.891,Asia,,36221.712406,Japan
5,64.511,2950039.0,242495,0.907,Europe,,45729.239975,United Kingdom
6,318.523,17348075.0,9525067,0.915,America,,54464.12033,United States


In [64]:
df3[["Country", "Languages"]]

Unnamed: 0,Country,Languages
0,Canada,English
1,France,French
2,Germany,German
3,Singapore,English


In [68]:
df_merge = df.merge(df3[["Country", "Languages"]], on = 'Country', how = 'left')
df_merge

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language,GDP per capita,Country,Languages
0,35.467,1785387.0,9984670,0.913,America,,50339.385908,Canada,English
1,63.951,2833687.0,640679,0.888,Europe,French,44310.284437,France,French
2,80.94,3874437.0,357114,0.916,Europe,German,47868.013343,Germany,German
3,60.665,2167744.0,301336,0.873,Europe,Italian,35733.025633,Italy,
4,127.061,4602367.0,377930,0.891,Asia,,36221.712406,Japan,
5,64.511,2950039.0,242495,0.907,Europe,,45729.239975,United Kingdom,
6,318.523,17348075.0,9525067,0.915,America,,54464.12033,United States,


In [200]:
df_merge["Languages"][4] = "Japanese"
df_merge

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Country,Languages
0,35.467,1785387.0,9984670,0.913,America,Canada,English
1,63.951,2833687.0,640679,0.888,Europe,France,French
2,80.94,3874437.0,357114,0.916,Europe,Germany,German
3,60.665,2167744.0,301336,0.873,Europe,Italy,
4,127.061,4602367.0,377930,0.891,Asia,Japan,Japanese
5,64.511,2950039.0,242495,0.907,Europe,United Kingdom,
6,318.523,17348075.0,9525067,0.915,America,United States,


In [201]:
pivot_table = pd.pivot_table(df_merge, values='GDP',
                    columns=['Continent'], aggfunc=np.mean, fill_value=0)
pivot_table

Continent,America,Asia,Europe
GDP,9566731.0,4602367.0,2956476.75


## Dataframe useful methods

In [202]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Country
0,35.467,1785387.0,9984670,0.913,America,Canada
1,63.951,2833687.0,640679,0.888,Europe,France
2,80.94,3874437.0,357114,0.916,Europe,Germany
3,60.665,2167744.0,301336,0.873,Europe,Italy
4,127.061,4602367.0,377930,0.891,Asia,Japan
5,64.511,2950039.0,242495,0.907,Europe,United Kingdom
6,318.523,17348075.0,9525067,0.915,America,United States


In [203]:
df["Currency"] = df["Continent"].map({"America" : "Dollar", "Europe" : "Euro", "Asia" : "Yen"})
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Country,Currency
0,35.467,1785387.0,9984670,0.913,America,Canada,Dollar
1,63.951,2833687.0,640679,0.888,Europe,France,Euro
2,80.94,3874437.0,357114,0.916,Europe,Germany,Euro
3,60.665,2167744.0,301336,0.873,Europe,Italy,Euro
4,127.061,4602367.0,377930,0.891,Asia,Japan,Yen
5,64.511,2950039.0,242495,0.907,Europe,United Kingdom,Euro
6,318.523,17348075.0,9525067,0.915,America,United States,Dollar


In [206]:
def crisis(gdp) :
    return gdp / 2

crisis(df.loc[0, "GDP"])

892693.5

In [208]:
df["GDP_crisis"] = df["GDP"].apply(crisis)
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Country,Currency,GDP_crisis
0,35.467,1785387.0,9984670,0.913,America,Canada,Dollar,892693.5
1,63.951,2833687.0,640679,0.888,Europe,France,Euro,1416843.5
2,80.94,3874437.0,357114,0.916,Europe,Germany,Euro,1937218.5
3,60.665,2167744.0,301336,0.873,Europe,Italy,Euro,1083872.0
4,127.061,4602367.0,377930,0.891,Asia,Japan,Yen,2301183.5
5,64.511,2950039.0,242495,0.907,Europe,United Kingdom,Euro,1475019.5
6,318.523,17348075.0,9525067,0.915,America,United States,Dollar,8674037.5


In [209]:
df.drop("GDP_crisis", axis = 1, inplace = True)
df.head()

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Country,Currency
0,35.467,1785387.0,9984670,0.913,America,Canada,Dollar
1,63.951,2833687.0,640679,0.888,Europe,France,Euro
2,80.94,3874437.0,357114,0.916,Europe,Germany,Euro
3,60.665,2167744.0,301336,0.873,Europe,Italy,Euro
4,127.061,4602367.0,377930,0.891,Asia,Japan,Yen


In [210]:
df["GDP_crisis"] = df["GDP"].apply(lambda x : x / 2 )
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Country,Currency,GDP_crisis
0,35.467,1785387.0,9984670,0.913,America,Canada,Dollar,892693.5
1,63.951,2833687.0,640679,0.888,Europe,France,Euro,1416843.5
2,80.94,3874437.0,357114,0.916,Europe,Germany,Euro,1937218.5
3,60.665,2167744.0,301336,0.873,Europe,Italy,Euro,1083872.0
4,127.061,4602367.0,377930,0.891,Asia,Japan,Yen,2301183.5
5,64.511,2950039.0,242495,0.907,Europe,United Kingdom,Euro,1475019.5
6,318.523,17348075.0,9525067,0.915,America,United States,Dollar,8674037.5


## Working with external files

In [69]:
df = pd.read_csv('STI2.csv')
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,"Jul 21, 2020",2629.33,2644.51,2622.97,2629.45,2629.45,235880500
1,"Jul 20, 2020",2618.01,2619.78,2595.72,2616.3,2616.3,190816900
2,"Jul 17, 2020",2635.8,2640.38,2618.48,2618.48,2618.48,171927300
3,"Jul 16, 2020",2651.93,2661.95,2621.02,2623.67,2623.67,206579000
4,"Jul 15, 2020",2643.76,2658.9,2641.8,2648.9,2648.9,230774500


In [70]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1380 entries, 0 to 1379
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Date       1380 non-null   object
 1   Open       1380 non-null   object
 2   High       1380 non-null   object
 3   Low        1380 non-null   object
 4   Close      1380 non-null   object
 5   Adj Close  1380 non-null   object
 6   Volume     1380 non-null   object
dtypes: object(7)
memory usage: 75.6+ KB


In [71]:
df["Date"] = pd.to_datetime(df["Date"]) #dayfirst = True

df["Open"] = pd.to_numeric(df['Open'], errors = "coerce")
df["High"] = pd.to_numeric(df['High'], errors = "coerce")
df["Low"] = pd.to_numeric(df['Low'], errors = "coerce")
df["Close"] = pd.to_numeric(df['Close'], errors = "coerce")
df["Adj Close"] = pd.to_numeric(df['Adj Close'], errors = "coerce")
df["Volume"] = pd.to_numeric(df['Volume'], errors = "coerce")



In [75]:
df.sort_values("Date", ascending = True)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
1379,2015-01-22,3352.20,3377.34,3349.11,3370.29,3370.29,314391900.0
1378,2015-01-23,3390.46,3419.30,3388.33,3411.50,3411.50,347330100.0
1377,2015-01-26,3409.36,3409.75,3393.24,3398.52,3398.52,264258600.0
1376,2015-01-27,3414.48,3428.49,3406.89,3412.20,3412.20,279421000.0
1375,2015-01-28,3405.07,3427.19,3404.17,3419.15,3419.15,252717600.0
...,...,...,...,...,...,...,...
4,2020-07-15,2643.76,2658.90,2641.80,2648.90,2648.90,230774500.0
3,2020-07-16,2651.93,2661.95,2621.02,2623.67,2623.67,206579000.0
2,2020-07-17,2635.80,2640.38,2618.48,2618.48,2618.48,171927300.0
1,2020-07-20,2618.01,2619.78,2595.72,2616.30,2616.30,190816900.0


In [226]:
df.sort_values("Date", ascending = True).head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
1379,2015-01-22,3352.2,3377.34,3349.11,3370.29,3370.29,314391900.0
1378,2015-01-23,3390.46,3419.3,3388.33,3411.5,3411.5,347330100.0
1377,2015-01-26,3409.36,3409.75,3393.24,3398.52,3398.52,264258600.0
1376,2015-01-27,3414.48,3428.49,3406.89,3412.2,3412.2,279421000.0
1375,2015-01-28,3405.07,3427.19,3404.17,3419.15,3419.15,252717600.0


In [228]:
ts_min = pd.Timestamp(2016,1,1)
ts_max = pd.Timestamp(2016,12,31)
df.loc[(df["Date"] > ts_min) & (df["Date"] < ts_max)]

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
893,2016-12-30,2898.26,2903.35,2880.76,2880.76,2880.76,
894,2016-12-29,2885.16,2891.22,2878.18,2889.15,2889.15,
895,2016-12-28,2885.68,2900.42,2884.55,2898.30,2898.30,
896,2016-12-27,2861.60,2887.58,2857.91,2885.76,2885.76,
897,2016-12-23,2869.45,2877.88,2866.17,2871.05,2871.05,
...,...,...,...,...,...,...,...
1140,2016-01-08,2710.33,2760.89,2697.94,2751.23,2751.23,
1141,2016-01-07,2797.18,2797.59,2726.92,2729.91,2729.91,
1142,2016-01-06,2838.56,2844.06,2800.78,2804.27,2804.27,
1143,2016-01-05,2836.80,2846.27,2820.42,2834.23,2834.23,


In [76]:
df_pivot = pd.DataFrame(df.set_index('Date').resample(rule = 'q').mean())
df_pivot

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015-03-31,3411.275106,3423.69234,3400.765745,3411.964468,3411.964468,255689600.0
2015-06-30,3423.216774,3433.434194,3411.950161,3421.643226,3421.643226,233315000.0
2015-09-30,3087.169344,3102.139836,3067.002787,3081.491475,3081.491475,304202400.0
2015-12-31,2929.1,2943.204219,2914.982031,2928.621562,2928.621562,
2016-03-31,2707.721967,2724.317049,2686.011311,2704.561967,2704.561967,
2016-06-30,2811.481406,2825.027344,2795.640781,2810.192344,2810.192344,158708700.0
2016-09-30,2867.647778,2878.590476,2855.301587,2868.308571,2868.308571,
2016-12-31,2864.155781,2874.160156,2852.130937,2863.188906,2863.188906,
2017-03-31,3078.053333,3090.088413,3068.53746,3081.91127,3081.91127,256237400.0
2017-06-30,3210.696393,3219.906066,3200.233934,3210.848852,3210.848852,226011100.0


In [None]:
df_pivot.to_excel("STI_quarterly.xlsx") #index = False if don't want index

In [None]:
useful_list = pd.read_pickle(r'useful_list.pickle')