# DataFrame

Data sets in pandas are usually stored as two-dimensional tables called DataFrames.
A Series is like a single column; a DataFrame is the whole table.

A DataFrame represents a rectangular table of data and contains an ordered collection
of columns, each of which can be a different value type (numeric, string, boolean, etc.).
The DataFrame has both a row and a column index; it can be thought of as a dict of Series all sharing the same index.

In [2]:
import numpy as np
import pandas as pd
from pandas import DataFrame
# Column-oriented input: each key becomes a column, each list holds that column's values.
data = dict(
    state=['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
    year=[2000, 2001, 2002, 2001, 2002, 2003],
    pop=[1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
)
frame = pd.DataFrame(data)

In [3]:
# Display the frame: the dict keys became the columns and the rows received
# a default integer index (0 through 5).
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [6]:
# For large DataFrames, head(n) shows only the first n rows
# (called with no argument it shows the first five).
frame.head(3)   # first 3 rows

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6


In [7]:
# With no argument, head() shows the first five rows.
frame.head()

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [8]:
# tail() is the counterpart of head(): it shows the last five rows.
frame.tail()

Unnamed: 0,state,year,pop
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [9]:
# Passing columns= arranges the DataFrame's columns in the order given.
pd.DataFrame(data, columns=['pop', 'state', 'year'])

Unnamed: 0,pop,state,year
0,1.5,Ohio,2000
1,1.7,Ohio,2001
2,3.6,Ohio,2002
3,2.4,Nevada,2001
4,2.9,Nevada,2002
5,3.2,Nevada,2003


In [10]:
# 'debt' is not a key of data, so that column is created holding missing values;
# index= supplies the row labels.
frame2 = pd.DataFrame(
    data,
    columns=['year', 'state', 'pop', 'debt'],
    index=['one', 'two', 'three', 'four', 'five', 'six'],
)
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,
six,2003,Nevada,3.2,


In [11]:
# List the column labels, then retrieve a single column by key.
print(frame2.columns)
frame2['year']

Index(['year', 'state', 'pop', 'debt'], dtype='object')


one      2000
two      2001
three    2002
four     2001
five     2002
six      2003
Name: year, dtype: int64

In [12]:
# Attribute-style access returns the same column as frame2['year'].
frame2.year

one      2000
two      2001
three    2002
four     2001
five     2002
six      2003
Name: year, dtype: int64

In [13]:
# Attribute-style access to the 'state' column.
frame2.state


one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
six      Nevada
Name: state, dtype: object

# loc attribute Dataframe

Rows can also be retrieved by their label with the special loc attribute (use iloc to retrieve them by integer position).

In [14]:
# Print the whole frame for reference, then select the row labelled 'three' with loc.
print(frame2)
frame2.loc['three']

       year   state  pop debt
one    2000    Ohio  1.5  NaN
two    2001    Ohio  1.7  NaN
three  2002    Ohio  3.6  NaN
four   2001  Nevada  2.4  NaN
five   2002  Nevada  2.9  NaN
six    2003  Nevada  3.2  NaN


year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object

In [15]:
# Select the row labelled 'six'.
frame2.loc['six']

year       2003
state    Nevada
pop         3.2
debt        NaN
Name: six, dtype: object

In [16]:
# Assigning a scalar sets every row of the 'debt' column to that same value.
frame2['debt'] = 16.5
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,16.5
two,2001,Ohio,1.7,16.5
three,2002,Ohio,3.6,16.5
four,2001,Nevada,2.4,16.5
five,2002,Nevada,2.9,16.5
six,2003,Nevada,3.2,16.5


In [17]:
# Assigning a list gives each row its own 'debt' value (one element per row).
frame2['debt'] = [16.5, 17.5, 18.5, 19.5, 20.5, 21.5]
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,16.5
two,2001,Ohio,1.7,17.5
three,2002,Ohio,3.6,18.5
four,2001,Nevada,2.4,19.5
five,2002,Nevada,2.9,20.5
six,2003,Nevada,3.2,21.5


In [18]:
# Assign the 'debt' values from NumPy's arange function: np.arange(6) yields 0..5,
# one value per row. (numpy is imported with the other imports at the top of the notebook.)
frame2['debt'] = np.arange(6)
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,0
two,2001,Ohio,1.7,1
three,2002,Ohio,3.6,2
four,2001,Nevada,2.4,3
five,2002,Nevada,2.9,4
six,2003,Nevada,3.2,5


In [19]:
# .values returns the data as a two-dimensional NumPy array; because the columns
# hold different types, the resulting array has dtype object.
frame2.values

array([[2000, 'Ohio', 1.5, 0],
       [2001, 'Ohio', 1.7, 1],
       [2002, 'Ohio', 3.6, 2],
       [2001, 'Nevada', 2.4, 3],
       [2002, 'Nevada', 2.9, 4],
       [2003, 'Nevada', 3.2, 5]], dtype=object)

# Index Objects

pandas’s Index objects are responsible for holding the axis labels and other metadata
(like the axis name or names)

In [20]:
# Build a small labelled Series and keep a reference to its Index object.
obj = pd.Series(range(3), index=['a', 'b', 'c'])
index = obj.index
index

Index(['a', 'b', 'c'], dtype='object')

In [21]:
# Display the Series: labels on the left, values on the right.
obj

a    0
b    1
c    2
dtype: int64

In [22]:
# Slicing an Index returns another Index.
index[1:]

Index(['b', 'c'], dtype='object')

Index objects are immutable and thus can’t be modified by the user

In [23]:
# Index objects are immutable, so item assignment raises TypeError.
# The error is caught and printed so that this demonstration does not abort
# a "Restart & Run All" of the notebook.
try:
    index['a'] = 8
except TypeError as err:
    print("TypeError:", err)

TypeError: Index does not support mutable operations

# Reindexing

In [24]:
# The example below shows how reindex() conforms a Series to a new index.
# Labels in the new index that have no corresponding entry in the original object are given NaN by default.

In [25]:
# A Series of color names labelled 'b', 'p' and 'y'.
obj3 = pd.Series(['blue', 'purple', 'yellow'], index=['b','p','y'])
obj3

b      blue
p    purple
y    yellow
dtype: object

In [26]:
# Reindexing in pandas can be used to change the row index of a Series, or the
# row and column indexes of a DataFrame.
# None of the labels 4, 5, 7 exist in obj3, so every entry of the result is NaN.
obj3.reindex([4, 5, 7])

4    NaN
5    NaN
7    NaN
dtype: object

In [27]:
# Notice that the new labels are populated with NaN values.
# The fill_value parameter lets us supply a different value for those missing entries.

In [28]:
# Same reindex as before, but labels that are missing from obj3 receive 100 instead of NaN.
obj3.reindex(index=[4, 5, 7], fill_value=100)

4    100
5    100
7    100
dtype: object

# Dropping Entries from an Axis

In [29]:
# Five floats, 0.0 through 4.0, labelled 'a' through 'e'.
obj = pd.Series(data=np.arange(5, dtype=float), index=list('abcde'))
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [30]:
# drop returns a new Series without the 'c' entry; obj itself is not modified.
new_obj = obj.drop('c')
new_obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [32]:
# A 4x4 table holding the integers 0..15, with state names as row labels.
# Note: this rebinds the name `data`, which referred to a plain dict earlier in the notebook.
data = pd.DataFrame(
    data=np.arange(16).reshape(4, 4),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [33]:
# Drop a row by label. The result is a new DataFrame; data keeps all four rows.
data.drop(['Colorado'])

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Utah,8,9,10,11
New York,12,13,14,15


In [34]:
# Dropping every row label leaves an empty frame that still has its four columns.
data.drop(['Colorado','Ohio', 'New York', 'Utah'])

Unnamed: 0,one,two,three,four


In [35]:
# By default, functions like drop return a new object and leave the original untouched.
# The result of the call below is discarded, so obj still contains 'c' afterwards.
# Passing inplace=True would instead modify obj itself rather than return a new object.
obj.drop('c')
obj # Be careful with inplace=True, as it destroys any data that is dropped.

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

# Indexing, Selection, and Filtering

In [36]:
# Four floats, 0.0 through 3.0, labelled 'a' through 'd'.
obj = pd.Series(data=np.arange(4, dtype=float), index=list('abcd'))
obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [37]:
# Select the element at integer position 3 explicitly with iloc.
# Plain obj[3] on a label-indexed Series relies on a deprecated fallback that
# treats integer keys as positions.
obj.iloc[3]

3.0

In [38]:
# An integer slice selects by position: here the elements at positions 2 and 3.
obj[2:4]

c    2.0
d    3.0
dtype: float64

In [39]:
# A list of labels selects those entries, in the order listed.
obj[['b', 'a', 'd']]

b    1.0
a    0.0
d    3.0
dtype: float64

In [110]:
# Select the elements at integer positions 1 and 3 explicitly with iloc.
# Plain obj[[1, 3]] on a label-indexed Series relies on a deprecated fallback that
# treats integer keys as positions.
obj.iloc[[1, 3]]

b    1.0
d    3.0
dtype: float64

In [37]:
# Boolean mask: keep only the entries whose value is less than 2.
obj[obj < 2]

a    0.0
b    1.0
dtype: float64

In [40]:
# Indexing a DataFrame with [] retrieves one or more columns, given either
# a single label or a sequence of labels. Rebuild the 4x4 example table first.
data = pd.DataFrame(
    data=np.arange(16).reshape(4, 4),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [41]:
# A single column label returns that column.
data['two']

Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int32

In [42]:
# A list of labels returns those columns, in the order listed.
data[['three', 'one']]

Unnamed: 0,three,one
Ohio,2,0
Colorado,6,4
Utah,10,8
New York,14,12


In [43]:
# A slice inside [] selects rows by position (here the row at position 2), not columns.
data[2:3] 

Unnamed: 0,one,two,three,four
Utah,8,9,10,11


In [44]:
# A boolean condition inside [] keeps the rows where it holds.
data[data['three'] > 5]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [125]:
# An element-wise comparison yields a boolean DataFrame of the same shape.
data < 5

Unnamed: 0,one,two,three,four
Ohio,True,True,True,True
Colorado,True,False,False,False
Utah,False,False,False,False
New York,False,False,False,False


In [124]:
# Use the boolean frame as a mask to set every value below 5 to 0.
# This modifies data in place, so the cells that follow see the updated values.
data[data < 5] = 0
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


# Selection with loc and iloc

The main distinction between loc and iloc is:
loc is label-based, which means that you have to specify rows and columns based on their row and column labels.
iloc is integer position-based, so you have to specify rows and columns by their integer position 
values (0-based integer position).

In [126]:
# Label-based selection: the 'Colorado' row, columns 'two' and 'four'.
data.loc['Colorado', ['two', 'four']]

two     5
four    7
Name: Colorado, dtype: int32

In [127]:
# Position-based selection: the row at position 2, columns at positions 3, 0, 1 (in that order).
data.iloc[2, [3, 0, 1]]

four    11
one      8
two      9
Name: Utah, dtype: int32

In [43]:
# A single integer returns the row at that position.
data.iloc[2]

one       8
two       9
three    10
four     11
Name: Utah, dtype: int32

In [44]:
# Lists on both axes: rows at positions 1 and 2, columns at positions 3, 0, 1.
data.iloc[[1, 2], [3, 0, 1]]

Unnamed: 0,four,one,two
Colorado,7,0,5
Utah,11,8,9


In [46]:
# Both indexers work with slices in addition to single labels or lists of labels.
# Note that the label slice :'Utah' includes the 'Utah' row itself.
data.loc[:'Utah', 'two']

Ohio        0
Colorado    5
Utah        9
Name: two, dtype: int32

In [47]:
# All rows, first three columns by position.
data.iloc[:, :3]

Unnamed: 0,one,two,three
Ohio,0,0,0
Colorado,0,5,6
Utah,8,9,10
New York,12,13,14


In [48]:
# Keep the rows whose 'three' value exceeds 5, then take the first three columns.
data[data['three'] > 5].iloc[:, :3]

Unnamed: 0,one,two,three
Colorado,0,5,6
Utah,8,9,10
New York,12,13,14


In [1]:
# Load the diabetes data set from a CSV file given by a relative path.
import pandas as pd
d= pd.read_csv('dset_diabetes.csv')
# NOTE(review): read_csv already returns a DataFrame, so this wrapper looks
# redundant — confirm nothing relies on `d` and `df` being separate objects before removing it.
df = pd.DataFrame(d)
# NOTE(review): the displayed 'Unnamed: 9' .. 'Unnamed: 13' columns show no values;
# presumably the file has trailing delimiters — verify against the CSV.
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13
0,6,148,72,35,0,33.6,0.627,50,1,,,,,
1,1,85,66,29,0,26.6,0.351,31,0,,,,,
2,8,183,64,0,0,23.3,0.672,32,1,,,,,
3,1,89,66,23,94,28.1,0.167,21,0,,,,,
4,0,137,40,35,168,43.1,2.288,33,1,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0,,,,,
764,2,122,70,27,0,36.8,0.340,27,0,,,,,
765,5,121,72,23,112,26.2,0.245,30,0,,,,,
766,1,126,60,0,0,30.1,0.349,47,1,,,,,
