### Pandas Series

In [1]:
import pandas as pd

In [2]:
# Generate an empty Series.
# The dtype is passed explicitly: without it the result depends on the pandas
# version (float64 plus a deprecation warning on 1.x, object from 2.0 onwards).
s = pd.Series(dtype='float64')
print(s)

Series([], dtype: float64)


In [3]:
# Build a Series from a dictionary: the dictionary keys become the index labels
data = dict(a=0., b=1., c=2.)
s = pd.Series(data)
print(s)

a    0.0
b    1.0
c    2.0
dtype: float64


In [4]:
# Build a Series from a dictionary with an explicit index.
# Values are looked up by label in the order given; a label with no matching
# key ('d') is filled with NaN.
data = dict(a=0., b=1., c=2.)
s = pd.Series(
    data,
    index=['b', 'c', 'd', 'a'],
)
print(s)

b    1.0
c    2.0
d    NaN
a    0.0
dtype: float64


In [5]:
# Build a Series from a list of values, labelled 'a' through 'e'
data = list(range(1, 6))
s = pd.Series(data, index=list('abcde'))
s

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [11]:
# Ways to access the elements of a Series -- uncomment a line to try it.
# NOTE(review): assumes `s` is still the Series labelled 'a'..'e' from the previous cell.
#print(s[0])    # integer key on a string-labelled index (s.iloc[0] is the explicit positional form)
#print(s[:3])   # slice of the first three elements
#print(s['a'])  # lookup by index label
#print(s['f'])  # 'f' is not one of the labels 'a'..'e', so this raises KeyError

### Pandas DataFrames

In [17]:
# Create a DataFrame from a list of lists: each inner list is one row,
# and `columns` names the fields in order
data = [
    ['Alex', 10, 'Bob'],
    ['Bob', 12, 'Jane'],
    ['Clark', 13, 'John'],
]
df = pd.DataFrame(data, columns=['Name', 'Age', 'Middle Name'])
df

Unnamed: 0,Name,Age,Middle Name
0,Alex,10,Bob
1,Bob,12,Jane
2,Clark,13,John


In [16]:
# Change the data type of the numeric column to float.
# Passing dtype=float to the constructor applies to every column, including
# the string column 'Name'; pandas >= 2.0 raises on that instead of silently
# skipping it. Casting only 'Age' with astype works on every version.
data = [['Alex', 10], ['Bob', 12], ['Clark', 13]]
df = pd.DataFrame(data, columns=['Name', 'Age']).astype({'Age': float})
df

Unnamed: 0,Name,Age
0,Alex,10.0
1,Bob,12.0
2,Clark,13.0


In [18]:
# Create a DataFrame from a dictionary of Series.
# The Series are aligned on the union of their indexes, so column 'one'
# (which has no label 'd') gets NaN in row 'd'.
d = {
    'one': pd.Series([1, 2, 3], index=list('abc')),
    'two': pd.Series([1, 2, 3, 4], index=list('abcd')),
}

df = pd.DataFrame(d)
print(df)

   one  two
a  1.0    1
b  2.0    2
c  3.0    3
d  NaN    4


In [20]:
# select a single column by name; the result is a Series named after the column
print(df['two'])

a    1
b    2
c    3
d    4
Name: two, dtype: int64


In [25]:
# Add two columns to an existing DataFrame
d = {
    'one': pd.Series([1, 2, 3], index=list('abc')),
    'two': pd.Series([1, 2, 3, 4], index=list('abcd')),
}
df = pd.DataFrame(d)

# A column assigned from a Series is aligned on the index: row 'd' has no
# matching label and is filled with NaN
df['three'] = pd.Series([10, 20, 30], index=list('abc'))

# A column computed element-wise from two existing columns
df['four'] = df['one'] + df['three']
df

Unnamed: 0,one,two,three,four
a,1.0,1,10.0,11.0
b,2.0,2,20.0,22.0
c,3.0,3,30.0,33.0
d,,4,,


In [27]:
# Delete columns in place, using the two available spellings:
#   del df[name]   removes the column
#   df.pop(name)   removes the column and returns it (the returned Series is
#                  not needed here)
# Both 'four' and 'two' are removed, so the resulting frame holds only
# 'one' and 'three' -- the state shown in this cell's output.
del df['four']
df.pop('two')
df

Unnamed: 0,one,three
a,1.0,10.0
b,2.0,20.0
c,3.0,30.0
d,,


In [28]:
# reorder the columns: reindex returns a new frame with the columns in the listed order
df=df.reindex(columns=['three','one'])

In [29]:
# display the frame to check the new column order
df

Unnamed: 0,three,one
a,10.0,1.0
b,20.0,2.0
c,30.0,3.0
d,,


In [31]:
# Load the 'titanic' example dataset from the third-party pydataset package
# (install it first by running `pip install pydataset` in the terminal).
# NOTE(review): this import rebinds the name `data`, which earlier cells use
# for plain lists/dicts -- re-running one of those cells afterwards would make
# `data('titanic')` fail. Consider an alias if the notebook is run out of order.
from pydataset import data

titanic = data('titanic')
# confirm what kind of object was loaded
type(titanic)

pandas.core.frame.DataFrame

In [32]:
# preview the first rows (head() shows five by default)
titanic.head()

Unnamed: 0,class,age,sex,survived
1,1st class,adults,man,yes
2,1st class,adults,man,yes
3,1st class,adults,man,yes
4,1st class,adults,man,yes
5,1st class,adults,man,yes


In [33]:
# Print the first 4 observations (positions 0-3; the end of the slice is exclusive)
print(titanic.iloc[:4])

       class     age  sex survived
1  1st class  adults  man      yes
2  1st class  adults  man      yes
3  1st class  adults  man      yes
4  1st class  adults  man      yes


In [34]:
# Print the fifth and sixth observations (positions 4 and 5)
print(titanic.iloc[4:6])

       class     age  sex survived
5  1st class  adults  man      yes
6  1st class  adults  man      yes


In [35]:
# Print the details of observation 3.
# iloc is position-based and zero-indexed, so position 2 is the third row.
print(titanic.iloc[2])

class       1st class
age            adults
sex               man
survived          yes
Name: 3, dtype: object


In [43]:
# Append the rows of one DataFrame to another.
# DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
# The original row labels are kept, so the combined frame has index 0, 1, 0, 1.
df = pd.DataFrame([[1, 2], [3, 4]], columns=['a', 'b'])
df2 = pd.DataFrame([[1, 2], [3, 4]], columns=['a', 'b'])
df = pd.concat([df, df2])
df

Unnamed: 0,a,b
0,1,2
1,3,4
0,1,2
1,3,4


In [41]:
# drop duplicated rows (rows whose values all match an earlier row); returns a new frame
df=df.drop_duplicates()

In [42]:
# display the frame after removing duplicates
df

Unnamed: 0,a,b
0,1,2
1,3,4


In [39]:
# Drop rows with label 0 (drop works on index labels, not positions).
# NOTE(review): the stored output of this cell shows a row `7, 8`, which none
# of the frames built in the visible cells contain -- it looks like a leftover
# from an earlier run with different data; re-run the notebook to refresh it.
df = df.drop(0)
df

Unnamed: 0,a,b
1,3,4
1,7,8


In [46]:
# Transpose the DataFrame (rows become columns and vice versa),
# keep the transposed frame as `df`, then print it
df = df.T
print(df)

   0  1  0  1
a  1  3  1  3
b  2  4  2  4


In [47]:
# get the data type of each column of the DataFrame
df.dtypes

0    int64
1    int64
0    int64
1    int64
dtype: object

In [48]:
# check whether the DataFrame is empty (a boolean)
df.empty

False

In [49]:
# get the number of dimensions of the DataFrame
df.ndim

2

In [50]:
# get the contents of the DataFrame as a NumPy array (row and column labels are dropped)
df.values

array([[1, 3, 1, 3],
       [2, 4, 2, 4]], dtype=int64)

### Basic Statistics Using Pandas

In [51]:
# get the mean of the values in each column
df.mean()

0    1.5
1    3.5
0    1.5
1    3.5
dtype: float64

In [52]:
# get the standard deviation of each column
# (the sample standard deviation: for the values 1 and 2 it is ~0.707, not 0.5)
df.std()

0    0.707107
1    0.707107
0    0.707107
1    0.707107
dtype: float64

In [53]:
# summarize each column: count, mean, std, min, quartiles and max
df.describe()

Unnamed: 0,0,1,0.1,1.1
count,2.0,2.0,2.0,2.0
mean,1.5,3.5,1.5,3.5
std,0.707107,0.707107,0.707107,0.707107
min,1.0,3.0,1.0,3.0
25%,1.25,3.25,1.25,3.25
50%,1.5,3.5,1.5,3.5
75%,1.75,3.75,1.75,3.75
max,2.0,4.0,2.0,4.0
