# This nb describes the following
## Read from csv
## Skip rows, header, set column names
## Do indexing
## extract rows based on loc (index)
## extract rows based on conditions


In [1]:
import numpy as np
import pandas as pd
import pandas.io

In [2]:
df1=pd.read_csv("GradesNoHeadeExtraRows.csv",header=None, skiprows=1)
df1.head()

Unnamed: 0,0,1,2,3,4
0,John,Bangalore,Data Structures,80,B
1,Raman,Chennai,Data Structures,70,C
2,Venkat,Bangalore,C,90,A
3,Rani,Delhi,Data Structures,95,A
4,Lakshmi,Mumbai,Algorithms,90,A


In [3]:
#This causes the first line to be read as part of data and this gives error
df1=pd.read_csv("GradesNoHeadeExtraRows.csv",header=None)
df1.head()

ParserError: Error tokenizing data. C error: Expected 1 fields in line 2, saw 5


In [4]:
#This takes the 1st line as header
df1=pd.read_csv("GradesNoHeadeExtraRows.csv")
df1.head()
df1.describe()
df1.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 21 entries, (John, Bangalore, Data Structures, 80) to (Sridhar, Ooty, Algorithms, 80)
Data columns (total 1 columns):
This file describes the grades secured by students    21 non-null object
dtypes: object(1)
memory usage: 820.0+ bytes


In [6]:
#This wrongly takes the first line as header, and the header as part of data
df1=pd.read_csv("GradesHeadeExtraRows.csv")
print(df1.head())
print("df[0:2]=", df1[0:2])
print ("df1.loc[name]=", df1.loc["name"]) #as first column is indexed refer by loc["name"]
print ("df1.loc[Raman]=", df1.loc["Raman"]) #as first column is indexed refer by loc["name"]
print ("df1[0:1]", df1[0:1])
#to print a specific row use iloc and not loc
print (df1.iloc[1:3])

                                        This file describes the grades secured by students
name    city      subject         marks                                              grade
John   Bangalore Data Structures 80                                                      B
Raman  Chennai   Data Structures 70                                                      C
Venkat Bangalore C               90                                                      A
Rani   Delhi     Data Structures 95                                                      A
df[0:2]=                                       This file describes the grades secured by students
name  city      subject         marks                                              grade
John Bangalore Data Structures 80                                                      B
df1.loc[name]=                       This file describes the grades secured by students
 city  subject  marks                                              grade
df1.loc[Raman]=  

In [25]:
#Read from csv and set the names to a list of column names. Here skipping extra rows
df2=pd.read_csv("GradesHeadeExtraRows.csv", skiprows=1, header=0)#, names=["name", "city", "subject", "marks", "grade"])
print("Head=", df2.head())
print("Columns=", df1.columns)
df2.columns = ["name", "city", "subject", "marks", "grade"]
print ("df2.city[0:8]", df2.city[0:8]) #Can slice in columns also
print ("df2.city[-3:-1]", df2.city[-3:-1]) #Can slice in columns also
print ("df2.city[0:-1]", df2.city[0:-1]) #Can slice in columns also
print ("len(df2.city)",len(df2.city))


Head=       name       city          subject   marks  grade
0     John  Bangalore  Data Structures      80      B
1    Raman    Chennai  Data Structures      70      C
2   Venkat  Bangalore                C      90      A
3     Rani      Delhi  Data Structures      95      A
4  Lakshmi     Mumbai       Algorithms      90      A
Columns= Index(['name', 'city', 'subject', 'marks', 'grade'], dtype='object')
df2.city[0:8] 0    Bangalore
1      Chennai
2    Bangalore
3        Delhi
4       Mumbai
5      Kolkata
6      Kolkata
7       Mumbai
Name: city, dtype: object
df2.city[-3:-1] 18    Darjeeling
19         Simla
Name: city, dtype: object
df2.city[0:-1] 0      Bangalore
1        Chennai
2      Bangalore
3          Delhi
4         Mumbai
5        Kolkata
6        Kolkata
7         Mumbai
8        Chennai
9      Bangalore
10         Patna
11     Hyderabad
12          Pune
13    Chandigarh
14    Chandigarh
15           Goa
16         Kochi
17       Manipur
18    Darjeeling
19         Simla
N

In [20]:
#Read from csv and set the names to a list of column names. Here NOT skipping extra rows
df1=pd.read_csv("GradesHeader.csv", header=0, names=["name", "city", "subject", "marks", "grade"])
df1.head()
print(df1.columns)
df1.city

Index(['name', 'city', 'subject', 'marks', 'grade'], dtype='object')


0      Bangalore
1        Chennai
2      Bangalore
3          Delhi
4         Mumbai
5        Kolkata
6        Kolkata
7         Mumbai
8        Chennai
9      Bangalore
10         Patna
11     Hyderabad
12          Pune
13    Chandigarh
14    Chandigarh
15           Goa
16         Kochi
17       Manipur
18    Darjeeling
19         Simla
20          Ooty
Name: city, dtype: object

In [19]:
print (df1==df2)

ValueError: Can only compare identically-labeled DataFrame objects

In [16]:
df1.set_index("name", drop=True, inplace=True)
df1.head()

Unnamed: 0_level_0,city,subject,marks,grade
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
John,Bangalore,Data Structures,80,B
Raman,Chennai,Data Structures,70,C
Venkat,Bangalore,C,90,A
Rani,Delhi,Data Structures,95,A
Lakshmi,Mumbai,Algorithms,90,A


In [17]:
df1.ix[0]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


city             Bangalore
subject    Data Structures
marks                   80
grade                    B
Name: John, dtype: object

In [18]:
print (df1.loc[["John", "Rani"]])

           city          subject  marks grade
name                                         
John  Bangalore  Data Structures     80     B
Rani      Delhi  Data Structures     95     A


In [None]:
print (df1.loc["John": "Rani"])

In [None]:
df1.columns

df1.index

In [None]:
print (df1.index)

In [None]:
df1.loc[["John", "Sridhar"]]

In [None]:
df1.head()

In [None]:
df1

In [None]:
df1.columns=[ "city", "sub", "marks", "grade"]

In [None]:
df1.head()
df1.columns
df1.head()

In [None]:
df1["city"]

In [None]:
df1[df1["city"]=="Chennai"]

In [None]:
#For multiple conditions use bitwsie | and &
df1[(df1["city"]=="Chennai")| (df1["marks"]>=40)]

In [None]:
#For multiple conditions use bitwsie | and &
df1[(df1.city == "Chennai") | (df1.city == "Bangalore")]

In [None]:
df1[(((df1.city == "Chennai") | (df1.city=="Bangalore")) & (df1.marks >= 50))]