In [2]:
#Series data structure: a cross between a list and a dictionary - items are stored in order and can also be retrieved by a label, somewhat like an insertion-ordered map (e.g. Java's LinkedHashMap)
import pandas as pd
import numpy as np
nums = [4, 2, 571, 58, 983]
#label the five values 5..9 instead of relying on the default 0..4 labels
numS = pd.Series(data=nums, index=list(range(5, 10)))
numS

5      4
6      2
7    571
8     58
9    983
dtype: int64

In [8]:
#Python marks missing data with None
#in a Series of strings the None is kept as-is, while in a numeric Series it is stored as NaN
#NaN does not compare equal to anything (not even itself), so test for it with np.isnan

nums = [1, 2, 3, 4, None]
words = ['hi', 'hello', None]
n = np.nan

is_missing = np.isnan(n)
print(is_missing)

numeric_series = pd.Series(nums)
print(numeric_series)

True
0    1.0
1    2.0
2    3.0
3    4.0
4    NaN
dtype: float64


In [10]:
#for a list of strings the missing entry is kept as None and the dtype is object
print(pd.Series(words))

0       hi
1    hello
2     None
dtype: object


In [9]:
import csv

%precision 2

#read every row of the CSV into a list of mappings keyed by the header row
#newline='' leaves line-ending handling to the csv module, as the csv documentation recommends
with open('mpg.csv', newline='') as csvfile:
    mpg = list(csv.DictReader(csvfile))

#turn the first row (column name -> value) into a Series whose labels are the column names
trialS = pd.Series(mpg[0])

In [24]:
#the index of trialS holds its labels - here the keys of the first CSV row, i.e. the column names
trialS.index


Index(['', 'manufacturer', 'model', 'displ', 'year', 'cyl', 'trans', 'drv',
       'cty', 'hwy', 'fl', 'class'],
      dtype='object')

In [26]:
#querying a Series: an element can be looked up either by position or by label - if no labels were given, the default labels are the positions 0, 1, 2, ...
#to query by position, we use iloc[]
#to query by label, we use loc[]
#iloc and loc are not methods, they are attributes - which is why they are used with square brackets rather than parentheses
trialS.loc['class']

'compact'

In [10]:
#A DataFrame is a 2-D labelled structure: one index plus several named columns of content
#it can be built from a collection of Series or a collection of dictionaries

#here: one Series per CSV row, for the first three rows
car1, car2, car3 = (pd.Series(mpg[k]) for k in range(3))

#stack the three Series as rows and give the rows custom labels
row_labels = ['m', 'p', 'd']
df = pd.DataFrame(data=[car1, car2, car3], index=row_labels)
df

Unnamed: 0,Unnamed: 1,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class
m,1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact
p,2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact
d,3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact


In [21]:
#making a list of transmission types:
#pick each row by position with iloc, then read its 'trans' entry by label with loc
lst = [df.iloc[i].loc['trans'] for i in range(len(df))]
lst


['auto(l5)', 'manual(m5)', 'manual(m6)']

In [22]:
#simpler way: index the frame with the column name to get the whole column back as a Series
df['trans']

m      auto(l5)
p    manual(m5)
d    manual(m6)
Name: trans, dtype: object

In [12]:
#assigning a single value to a column sets that value in every row
df['cyl'] = 6
df

Unnamed: 0,Unnamed: 1,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class
m,1,audi,a4,1.8,1999,6,auto(l5),f,18,29,p,compact
p,2,audi,a4,1.8,1999,6,manual(m5),f,21,29,p,compact
d,3,audi,a4,2.0,2008,6,manual(m6),f,20,31,p,compact


In [13]:
#cyl is taken from df's 'cyl' column; in this run the in-place += below showed up in df as well (see the comment in the next cell)
#NOTE(review): this write-through is not guaranteed - with pandas Copy-on-Write enabled df would stay unchanged; confirm against the pandas version in use
cyl = df['cyl']
cyl+=4

In [16]:
#changes were made to the original dataframe: to prevent this, make a copy of the df and work on that instead
#NOTE(review): a bare `df` that is not the last line of a cell is not displayed, so the next line shows nothing
df
#presumably meant to undo the earlier += 4 through the same cyl reference
#NOTE(review): this makes the cell depend on run order - running it again applies the -= 4 to whatever cyl is bound to by then
cyl-=4

#take a separate copy of the frame, then rebind cyl to the copy's column before adding 4
cpy = df.copy()
cyl = cpy['cyl']
cyl+=4

cpy

Unnamed: 0,Unnamed: 1,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class
m,1,audi,a4,1.8,1999,6,auto(l5),f,18,29,p,compact
p,2,audi,a4,1.8,1999,6,manual(m5),f,21,29,p,compact
d,3,audi,a4,2.0,2008,6,manual(m6),f,20,31,p,compact


In [17]:
#the original df: its cyl values differ from cpy's above, so the += 4 applied to the copy's column did not reach df
df

Unnamed: 0,Unnamed: 1,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class
m,1,audi,a4,1.8,1999,2,auto(l5),f,18,29,p,compact
p,2,audi,a4,1.8,1999,2,manual(m5),f,21,29,p,compact
d,3,audi,a4,2.0,2008,2,manual(m6),f,20,31,p,compact


In [3]:
#load the whole CSV straight into a DataFrame and preview its first five rows
df = pd.read_csv(filepath_or_buffer='mpg.csv')
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class
0,1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact
1,2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact
2,3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact
3,4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact
4,5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact


In [4]:
#columns is the Index of column labels; the first CSV column has no header and shows up as 'Unnamed: 0'
df.columns

Index(['Unnamed: 0', 'manufacturer', 'model', 'displ', 'year', 'cyl', 'trans',
       'drv', 'cty', 'hwy', 'fl', 'class'],
      dtype='object')

In [9]:
#give the first column (read in as 'Unnamed: 0') a meaningful name
#rename returns a new frame, so rebind df to it instead of mutating the frame with inplace=True
df = df.rename(columns={df.columns[0]: 'Sr. No.'})
df.head()

Unnamed: 0,Sr. No.,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class
0,1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact
1,2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact
2,3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact
3,4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact
4,5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact


In [3]:
#Boolean mask: a 1-D or 2-D array of True/False values that is overlaid on the data frame
#so that only the entries marked True are kept

fields = ['Name', 'Item Purchased', 'Cost']
purchase_1 = pd.Series(['Chris', 'Dog Food', 22.50], index=fields)
purchase_2 = pd.Series(['Kevyn', 'Kitty Litter', 2.50], index=fields)
purchase_3 = pd.Series(['Vinod', 'Bird Seed', 5.00], index=fields)

#one row per purchase, labelled by the store it was made in
store_labels = ['Store 1', 'Store 1', 'Store 2']
df = pd.DataFrame([purchase_1, purchase_2, purchase_3], index=store_labels)

#build the mask first, then index the frame with it to keep only rows costing more than 3.00
cost_above_3 = df['Cost'] > 3.00
moreThan3 = df[cost_above_3]
moreThan3

Unnamed: 0,Name,Item Purchased,Cost
Store 1,Chris,Dog Food,22.5
Store 2,Vinod,Bird Seed,5.0
