In [1]:
import sys

import numpy as np
import pandas as pd

In [2]:
# 2x3 frame of standard-normal draws with explicit row and column labels.
# No seed is set here, so the values differ from run to run.
df = pd.DataFrame(
    data=np.random.randn(2, 3),
    index=["a", "b"],
    columns=["First", "Second", "Third"],
)


In [3]:
# A bare name on the last line of a cell displays the object.
df

Unnamed: 0,First,Second,Third
a,-0.045366,0.380838,-0.317379
b,-1.13161,0.853028,0.907957


In [4]:
# Row labels of the frame, held in an Index object.
df.index

Index(['a', 'b'], dtype='object')

In [5]:
# Column labels of the frame, also held in an Index object.
df.columns

Index(['First', 'Second', 'Third'], dtype='object')

In [6]:
# Only row labels are supplied here; the column labels are left to pandas.
df2 = pd.DataFrame(data=np.random.randn(2, 3), index=list("ab"))
df2

Unnamed: 0,0,1,2
a,-1.70893,-1.048812,0.486857
b,0.732361,-0.537547,0.195059


In [7]:
# No column labels were passed when df2 was built, so pandas generated a RangeIndex.
df2.columns

RangeIndex(start=0, stop=3, step=1)

In [9]:
# Plain list in, Series out; neither an index nor a name is given.
s1 = pd.Series(data=[1, 2, 3])

In [10]:
# Without an explicit index the Series is labelled 0, 1, 2.
s1

0    1
1    2
2    3
dtype: int64

In [11]:
# A Series may carry a name; it is shown in the repr below the values.
s2 = pd.Series(data=[4, 5, 6], name="b")
s2

0    4
1    5
2    6
Name: b, dtype: int64

In [12]:
# One-column frame built from the unnamed Series; the column label is given explicitly.
pd.DataFrame(s1, columns=["a"])

Unnamed: 0,a
0,1
1,2
2,3


In [13]:
# s2 has a name ("b"), which is used as the column label.
pd.DataFrame(s2)

Unnamed: 0,b
0,4
1,5
2,6


In [14]:
# Dict of Series: each key becomes a column label.
pd.DataFrame({"a": s1, "b": s2})

Unnamed: 0,a,b
0,1,4
1,2,5
2,3,6


In [15]:
# Each dict is one row; its keys become the column labels.
records = [
    {"Wage": 1000, "Name": "Jack", "Age": 21},
    {"Wage": 1500, "Name": "John", "Age": 29},
]
df = pd.DataFrame(records)
df

Unnamed: 0,Wage,Name,Age
0,1000,Jack,21
1,1500,John,29


In [16]:
# Each inner list is one row; the column labels are supplied separately.
rows = [
    [1000, "Jack", 21],
    [1500, "John", 29],
]
df = pd.DataFrame(data=rows, columns=["Wage", "Name", "Age"])
df

Unnamed: 0,Wage,Name,Age
0,1000,Jack,21
1,1500,John,29


In [17]:
# df[key] looks up a column label, not a row position. No column is labelled 0,
# so the lookup raises KeyError. The exception is the point of this cell, so it
# is caught and reported on stderr instead of aborting the notebook.
# (sys is imported in the import cell at the top of the notebook.)
try:
    df[0]
except KeyError:
    print("Key error", file=sys.stderr)

Key error


In [18]:
# A single label inside [] selects one column and returns it as a Series.
df["Wage"]

0    1000
1    1500
Name: Wage, dtype: int64

In [19]:
# A list of labels selects several columns and returns a DataFrame.
df[["Wage", "Name"]]

Unnamed: 0,Wage,Name
0,1000,Jack
1,1500,John


In [20]:
# Unlike a single label, a slice inside [] selects rows (here only the first one).
df[0:1]                           # slice

Unnamed: 0,Wage,Name,Age
0,1000,Jack,21


In [21]:
# The comparison yields a boolean Series; indexing with it keeps the rows where it is True.
df[df.Wage > 1200]               # boolean mask

Unnamed: 0,Wage,Name,Age
1,1500,John,29


In [22]:
# Chained lookup: the first [] picks the column, the second picks the row label within it.
df["Wage"][1]                    # Note order of dimensions

1500

In [23]:
# .loc indexes by label, row first and column second.
df.loc[1, "Wage"]

1500

In [24]:
# .iloc indexes by integer position; negative positions count from the end.
df.iloc[-1,-1]             # Right lower corner of the DataFrame

29

In [25]:
# One row label plus a list of column labels returns those values of the row as a Series.
df.loc[1, ["Name", "Wage"]]

Name    John
Wage    1500
Name: 1, dtype: object

In [26]:
# Load the Kumpula 2017 weather CSV straight from the remote repository (needs network access).
WEATHER_CSV_URL = "https://raw.githubusercontent.com/csmastersUH/data_analysis_with_python_2020/master/kumpula-weather-2017.csv"
wh = pd.read_csv(WEATHER_CSV_URL)

In [27]:
# First five rows, for a quick look at the columns and their values.
wh.head()

Unnamed: 0,Year,m,d,Time,Time zone,Precipitation amount (mm),Snow depth (cm),Air temperature (degC)
0,2017,1,1,00:00,UTC,-1.0,-1.0,0.6
1,2017,1,2,00:00,UTC,4.4,-1.0,-3.9
2,2017,1,3,00:00,UTC,6.6,7.0,-6.5
3,2017,1,4,00:00,UTC,-1.0,13.0,-12.8
4,2017,1,5,00:00,UTC,-1.0,10.0,-17.8


In [29]:
# Year, m and d are date components; taking averages over them is not very interesting,
# so a copy of the frame without those columns is kept as wh2.
wh2 = wh.drop(columns=["Year", "m", "d"])
wh2.head()

Unnamed: 0,Time,Time zone,Precipitation amount (mm),Snow depth (cm),Air temperature (degC)
0,00:00,UTC,-1.0,-1.0,0.6
1,00:00,UTC,4.4,-1.0,-3.9
2,00:00,UTC,6.6,7.0,-6.5
3,00:00,UTC,-1.0,13.0,-12.8
4,00:00,UTC,-1.0,10.0,-17.8


In [31]:
# Summary statistics; the text columns (Time, Time zone) are absent from the output.
# The count row hints at missing data: Snow depth has 358 values against 365 elsewhere.
wh.describe()

Unnamed: 0,Year,m,d,Precipitation amount (mm),Snow depth (cm),Air temperature (degC)
count,365.0,365.0,365.0,365.0,358.0,365.0
mean,2017.0,6.526027,15.720548,1.966301,0.96648,6.527123
std,0.0,3.452584,8.808321,4.858423,3.717472,7.183934
min,2017.0,1.0,1.0,-1.0,-1.0,-17.8
25%,2017.0,4.0,8.0,-1.0,-1.0,1.2
50%,2017.0,7.0,16.0,0.2,-1.0,4.8
75%,2017.0,10.0,23.0,2.7,0.0,12.9
max,2017.0,12.0,31.0,35.0,15.0,19.6


In [32]:
# Same summary for wh2, which no longer has the Year, m and d columns.
wh2.describe()

Unnamed: 0,Precipitation amount (mm),Snow depth (cm),Air temperature (degC)
count,365.0,358.0,365.0
mean,1.966301,0.96648,6.527123
std,4.858423,3.717472,7.183934
min,-1.0,-1.0,-17.8
25%,-1.0,-1.0,1.2
50%,0.2,-1.0,4.8
75%,2.7,0.0,12.9
max,35.0,15.0,19.6


In [33]:
# Distinct values in the column; the nan in the output marks missing observations.
wh["Snow depth (cm)"].unique()

array([-1.,  7., 13., 10., 12.,  9.,  8.,  5.,  6.,  4.,  3., 15., 14.,
        2., nan,  0.])

In [34]:
# All-integer input gives an int64 Series.
pd.Series([1,3,2])

0    1
1    3
2    2
dtype: int64

In [35]:
# A NaN among the integers turns the whole Series into float64.
pd.Series([1,3,2, np.nan])

0    1.0
1    3.0
2    2.0
3    NaN
dtype: float64

In [36]:
# Among strings a missing value stays as None and the dtype is object.
pd.Series(["jack", "joe", None])

0    jack
1     joe
2    None
dtype: object

In [37]:
# Element-wise missing-value test; True marks a missing entry.
wh.isnull()      # returns a boolean mask DataFrame

Unnamed: 0,Year,m,d,Time,Time zone,Precipitation amount (mm),Snow depth (cm),Air temperature (degC)
0,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...
360,False,False,False,False,False,False,False,False
361,False,False,False,False,False,False,False,False
362,False,False,False,False,False,False,False,False
363,False,False,False,False,False,False,False,False


In [38]:
# any(axis=1) reduces the mask across the columns, so this keeps the rows
# that have at least one missing value.
wh[wh.isnull().any(axis=1)]

Unnamed: 0,Year,m,d,Time,Time zone,Precipitation amount (mm),Snow depth (cm),Air temperature (degC)
74,2017,3,16,00:00,UTC,1.8,,3.4
163,2017,6,13,00:00,UTC,0.6,,12.6
308,2017,11,5,00:00,UTC,0.2,,8.4
309,2017,11,6,00:00,UTC,2.0,,7.5
313,2017,11,10,00:00,UTC,3.6,,7.2
321,2017,11,18,00:00,UTC,11.3,,5.9
328,2017,11,25,00:00,UTC,8.5,,4.2


In [39]:
# Drops every row that has a missing value: 358 rows remain.
wh.dropna().shape   # Default axis is 0

(358, 8)

In [40]:
# All 365 rows are kept, but one column is gone.
wh.dropna(axis=1).shape # Drops the columns containing missing values

(365, 7)

In [41]:
# Forward-fill: each missing value takes the last valid observation above it.
# DataFrame.ffill() replaces fillna(method='ffill'), which is deprecated since pandas 2.1.
wh = wh.ffill()
# Re-run the missing-value filter; an empty result means nothing is left to fill.
wh[wh.isnull().any(axis=1)]

Unnamed: 0,Year,m,d,Time,Time zone,Precipitation amount (mm),Snow depth (cm),Air temperature (degC)


In [42]:
# map applies the given callable to every element.
pd.Series(["1","2"]).map(int)                           # str -> int

0    1
1    2
dtype: int64

In [43]:
# The elements are now Python strings, hence dtype object.
pd.Series([1,2]).map(str)                               # int -> str

0    1
1    2
dtype: object

In [44]:
pd.to_numeric(pd.Series([1,1.0]), downcast="integer")   # float -> smallest integer dtype that fits (int8 here)

0    1
1    1
dtype: int8

In [45]:
pd.to_numeric(pd.Series([1,"a"]), errors="coerce")      # a value that cannot be converted becomes NaN

0    1.0
1    NaN
dtype: float64

In [46]:
# astype converts the whole Series to the given type in one call.
pd.Series([1,2]).astype(str)                            # works for a single series

0    1
1    2
dtype: object

In [47]:
# Three integer columns; print the per-column dtypes first, then the frame itself.
df = pd.DataFrame(dict(a=[1, 2, 3], b=[4, 5, 6], c=[7, 8, 9]))
print(df.dtypes)
print(df)

a    int64
b    int64
c    int64
dtype: object
   a  b  c
0  1  4  7
1  2  5  8
2  3  6  9


In [48]:
# Returns a converted copy; df itself is not modified.
df.astype(float)                       # Convert all columns

Unnamed: 0,a,b,c
0,1.0,4.0,7.0
1,2.0,5.0,8.0
2,3.0,6.0,9.0


In [49]:
# A mapping of column label -> type converts only the listed columns;
# column "a" is not listed and keeps its dtype.
target_types = {"b": float, "c": str}
df2 = df.astype(target_types)
print(df2.dtypes)
print(df2)

a      int64
b    float64
c     object
dtype: object
   a    b  c
0  1  4.0  7
1  2  5.0  8
2  3  6.0  9


In [50]:
# The .str accessor applies a string method to every element of the Series.
names = pd.Series(data=["donald", "theresa", "angela", "vladimir"])
names.str.capitalize()

0      Donald
1     Theresa
2      Angela
3    Vladimir
dtype: object

In [51]:
# NOTE(review): presumably a tab-completion prompt (type `names.str.` and press Tab
# to list the available string methods) — confirm intent or delete this cell.
#names.str.

In [52]:
# split() with no arguments breaks each element on whitespace and yields a list per element.
full_names = pd.Series(
    data=["Donald Trump", "Theresa May", "Angela Merkel", "Vladimir Putin"]
)
full_names.str.split()

0      [Donald, Trump]
1       [Theresa, May]
2     [Angela, Merkel]
3    [Vladimir, Putin]
dtype: object

In [53]:
# expand=True spreads the split parts over separate DataFrame columns instead of lists.
full_names.str.split(expand=True)

Unnamed: 0,0,1
0,Donald,Trump
1,Theresa,May
2,Angela,Merkel
3,Vladimir,Putin
