In [29]:
import pandas as pd
import numpy as np

In [30]:
# A series is a one-dimensional array-like object containing a sequence of values and an associated array of data labels, called index
obj = pd.Series([4, 7, 5, -3])
# To specify index
obj2 = pd.Series([4, 7, -5, 3], index = ["a", "b", "c", "d"])
print(obj)
print(obj2)

0    4
1    7
2    5
3   -3
dtype: int64
a    4
b    7
c   -5
d    3
dtype: int64


In [31]:
print(obj.values)
print(obj.index)
print(obj2.index)

[ 4  7  5 -3]
RangeIndex(start=0, stop=4, step=1)
Index(['a', 'b', 'c', 'd'], dtype='object')


In [32]:
# Accessing values in pd.Series 

obj[0] = 10

print(obj2["a"])
print(obj[0])
# Accessing multiple values in series
print(obj2[["a","b","c"]])

print('b' in obj2)
print('z' in obj2)

4
10
a    4
b    7
c   -5
dtype: int64
True
False


In [33]:
# Modifying valus in series
print(obj2[obj2 > 0])
print(obj2 * 2)
print(np.exp(obj2))

a    4
b    7
d    3
dtype: int64
a     8
b    14
c   -10
d     6
dtype: int64
a      54.598150
b    1096.633158
c       0.006738
d      20.085537
dtype: float64


In [34]:
# Converting dictionary into seriess
d_data = {
    'Ohio': 30000,
    'Texas': 75000,
    'Oregon': 16000,  
    'Utah': 5000,
}

obj1 = pd.Series(d_data)
print(obj)

# Modifying the index
states = ['California', 'Ohio', 'Oregon', 'Texas']
obj2 = pd.Series(d_data, index = states)
# Since california is not found in d_data, it will return NaN which stands for Not a Number(means NA not available)
# And since Utah is not included, it is excluded from the resulting object
print(obj)

# Detecting missing data
print(obj.isnull())
print(obj.notnull())


0    10
1     7
2     5
3    -3
dtype: int64
0    10
1     7
2     5
3    -3
dtype: int64
0    False
1    False
2    False
3    False
dtype: bool
0    True
1    True
2    True
3    True
dtype: bool


In [35]:
# Modifying Series
print(obj1 + obj2)

print("====================================================================")

obj2.name = 'Population'
obj2.index.name = 'state'
print(obj2)

print("====================================================================")
# Modifying index
obj.index = ['Bob', 'Steve', 'Jeff', 'Ryan']
print(obj)

California         NaN
Ohio           60000.0
Oregon         32000.0
Texas         150000.0
Utah               NaN
dtype: float64
state
California        NaN
Ohio          30000.0
Oregon        16000.0
Texas         75000.0
Name: Population, dtype: float64
Bob      10
Steve     7
Jeff      5
Ryan     -3
dtype: int64


=======================================================================================================================================

DATAFRAME

In [36]:
# Creating DataFrame
data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002, 2003],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]
}

frame = pd.DataFrame(data)
print(frame)
print(frame.head())
print(frame.columns)


    state  year  pop
0    Ohio  2000  1.5
1    Ohio  2001  1.7
2    Ohio  2002  3.6
3  Nevada  2001  2.4
4  Nevada  2002  2.9
5  Nevada  2003  3.2
    state  year  pop
0    Ohio  2000  1.5
1    Ohio  2001  1.7
2    Ohio  2002  3.6
3  Nevada  2001  2.4
4  Nevada  2002  2.9
Index(['state', 'year', 'pop'], dtype='object')


In [37]:
# Specifying column order
obj = pd.DataFrame(data, columns=['year', 'state', 'pop'])

print(obj)
# Showing year column
print(obj.year)

   year   state  pop
0  2000    Ohio  1.5
1  2001    Ohio  1.7
2  2002    Ohio  3.6
3  2001  Nevada  2.4
4  2002  Nevada  2.9
5  2003  Nevada  3.2
0    2000
1    2001
2    2002
3    2001
4    2002
5    2003
Name: year, dtype: int64


In [38]:
frame2 = pd.DataFrame(data, columns = ['year', 'state', 'pop', 'debt'], 
                    index = ['one', 'two', 'three', 'four', 'five', 'six'])

print(frame2)

print("====================================================================")

# Inserting NaN values
frame2['debt'] = 16.5
print(frame2)

frame2['pop'] = np.arange(6)
print(frame2)

       year   state  pop debt
one    2000    Ohio  1.5  NaN
two    2001    Ohio  1.7  NaN
three  2002    Ohio  3.6  NaN
four   2001  Nevada  2.4  NaN
five   2002  Nevada  2.9  NaN
six    2003  Nevada  3.2  NaN
       year   state  pop  debt
one    2000    Ohio  1.5  16.5
two    2001    Ohio  1.7  16.5
three  2002    Ohio  3.6  16.5
four   2001  Nevada  2.4  16.5
five   2002  Nevada  2.9  16.5
six    2003  Nevada  3.2  16.5
       year   state  pop  debt
one    2000    Ohio    0  16.5
two    2001    Ohio    1  16.5
three  2002    Ohio    2  16.5
four   2001  Nevada    3  16.5
five   2002  Nevada    4  16.5
six    2003  Nevada    5  16.5


In [39]:
# Modifying value in dataframe
val = pd.Series([1.5, 3.0, -5.2], index = ['one', 'four', 'five'])

frame2['debt'] = val 
print(frame2)

       year   state  pop  debt
one    2000    Ohio    0   1.5
two    2001    Ohio    1   NaN
three  2002    Ohio    2   NaN
four   2001  Nevada    3   3.0
five   2002  Nevada    4  -5.2
six    2003  Nevada    5   NaN


In [40]:
# Creating a new column in dataframe
frame2['eastern'] = frame2.state == 'Ohio'
print(frame2)

# Deleting column using del keyword
del frame2['eastern']
print(frame2.columns)


       year   state  pop  debt  eastern
one    2000    Ohio    0   1.5     True
two    2001    Ohio    1   NaN     True
three  2002    Ohio    2   NaN     True
four   2001  Nevada    3   3.0    False
five   2002  Nevada    4  -5.2    False
six    2003  Nevada    5   NaN    False
Index(['year', 'state', 'pop', 'debt'], dtype='object')


In [41]:
# Creating dataframe using nested dictionary
pop = {
    'Nevada': {
        2001: 2.4, 
        2002: 2.9,
    }, 
    'Ohio': {
        2000: 1.5,
        2001: 1.7,
        2002: 3.6,
    }
}

frame3 = pd.DataFrame(pop)
print(frame3)

# Transposing dataframe
print(frame3.T)

# The keys inside the inner dict are combined and sorted to form the index in the result unless it is specified
pd.DataFrame(pop, index = [2001, 2002, 2003])

# Naming columns and row
frame3.index.name = 'year'; frame3.columns.name = 'state'
print(frame3)
print(frame3.values)

      Nevada  Ohio
2001     2.4   1.7
2002     2.9   3.6
2000     NaN   1.5
        2001  2002  2000
Nevada   2.4   2.9   NaN
Ohio     1.7   3.6   1.5
state  Nevada  Ohio
year               
2001      2.4   1.7
2002      2.9   3.6
2000      NaN   1.5
[[2.4 1.7]
 [2.9 3.6]
 [nan 1.5]]


In [42]:
# Indexing pandas
index = pd.Index(np.arange(3))
# Inserting one more value into series will return error (index length != series length)
srs = pd.Series([1.5, 3, 5], index = index)
print(srs)

# Reindexing pandas (introduce NaN value if any index value is not already present)
srs.reindex([0, 1, 2, 3, 'a', 'b', 'c', 'd'])

0    1.5
1    3.0
2    5.0
dtype: float64


0    1.5
1    3.0
2    5.0
3    NaN
a    NaN
b    NaN
c    NaN
d    NaN
dtype: float64

In [43]:
# ffill method allow us to forward fill all the missing values in index
obj = pd.Series([1, 2, 3], index = [0, 2, 4])
print(obj)

obj.reindex(np.arange(6), method = 'ffill')

0    1
2    2
4    3
dtype: int64


0    1
1    1
2    2
3    2
4    3
5    3
dtype: int64

In [44]:
# With dataframe, reindex can alter the index, columns or both
data = pd.DataFrame(np.arange(9).reshape((3, 3)),
                    index = ['a', 'b', 'c'],
                    columns = ['Port Dickson', 'Cheras', 'Setapak'])

print(data)

data.reindex(['a', 'b', 'c', 'd'])

columns = ['Cheras', 'Johor', 'Port Dickson']
data.reindex(columns = columns)

   Port Dickson  Cheras  Setapak
a             0       1        2
b             3       4        5
c             6       7        8


Unnamed: 0,Cheras,Johor,Port Dickson
a,1,,0
b,4,,3
c,7,,6


======================================================================================================================================

data.loc & data.iloc

In [54]:
# .loc and .iloc enable us to select a subset of the rows and columns from a Dataframe with numpy like notations
# using either .loc(axis labels) or .iloc(integers)

data = pd.DataFrame(np.arange(16).reshape((4,4)),
                    index = ['Ohio', 'Colorado', 'Utah', 'New York'],
                    columns = ['one', 'two', 'three', 'four'])

print(data)

print(data.loc['Colorado' , ['two', 'three']])

# Same meaning as data.loc['Utah', ['four', 'one', 'two']]
print(data.iloc[2, [3, 0, 1]])

print(data.iloc[2])

          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15
two      5
three    6
Name: Colorado, dtype: int32
four    11
one      8
two      9
Name: Utah, dtype: int32
one       8
two       9
three    10
four     11
Name: Utah, dtype: int32


In [56]:
# Arithmetic and Data Alignment
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index = ['a', 'c', 'd', 'e'])
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1], index = ['a', 'c', 'e', 'f', 'g'])

# The internal data alignment introduces missing values in the label locations that don't overlap
s1 + s2

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

In [None]:
df1 = pd.DataFrame(np.arange)