# <center>Pandas DataFrames</center>
References:
* https://pandas.pydata.org/pandas-docs/stable/10min.html

- Next to NumPy, pandas is one of the most widely used Python libraries
  * Data in an array must be of the same type
  * Data in a Pandas data frame can be of different data types
- Two commonly used data structures:
  * **Series**: a one-dimensional labeled array
  * **DataFrame**: two-dimensional labeled data structure with columns
- Labeling
  * **index**: the name of each row
  * **column name**: the name of each column
- DataFrames behave very similarly to Excel tables

In [1]:
import pandas as pd
import numpy as np

In [2]:
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not just the last
# one, so cells with several bare expressions show all of their results.
InteractiveShell.ast_node_interactivity = "all"

## 1. Creating DataFrames and viewing data
- DataFrames can be created from:
  * lists
  * numpy arrays
  * dictionaries

In [3]:
# Build a Series of zero-padded ID strings from a plain Python list
# (a one-dimensional array would work the same way).
id_labels = ['01', '02', '10', '50', '30', '20']
s = pd.Series(data=id_labels)
s

0    01
1    02
2    10
3    50
4    30
5    20
dtype: object

In [4]:
# Create a 6x4 DataFrame of random integers in [60, 100).
# Rows are labelled with the ID strings held in `s`; columns are 'A'-'D'.
# A seeded generator is used so the table (and every result derived from it
# below) is the same on each Restart & Run All.
rng = np.random.RandomState(42)
df = pd.DataFrame(rng.randint(60, 100, (6, 4)), index=s, columns=['A','B','C','D'])
df

Unnamed: 0,A,B,C,D
1,74,92,68,69
2,92,83,61,90
10,99,76,94,76
50,85,96,84,78
30,97,66,78,74
20,86,69,94,80


In [5]:
# Records to load: one (quarter, income, state) tuple per row.
x = [
    (1, 0.8, 'NJ'),
    (3, 0.7, 'NY'),
    (2, 0.65, 'PA'),
    (3, 0.25, 'PA'),
    (2, 0.32, 'NJ'),
    (4, 0.95, 'PA'),
    (1, 0.44, 'NY'),
    (4, 0.15, 'NY'),
]

# Turn the list of tuples into a DataFrame, naming one column per tuple field.
p = pd.DataFrame(data=x, columns=['quarter', 'income', 'state'])
p

Unnamed: 0,quarter,income,state
0,1,0.8,NJ
1,3,0.7,NY
2,2,0.65,PA
3,3,0.25,PA
4,2,0.32,NJ
5,4,0.95,PA
6,1,0.44,NY
7,4,0.15,NY


In [6]:
# Build a DataFrame from a dictionary: each key becomes a column name and
# each list supplies that column's values.
data = dict(
    state=['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
    year=[2000, 2001, 2002, 2001, 2002],
    pop=[1.5, 1.7, 3.6, 2.4, 2.9],
)
d = pd.DataFrame(data=data)
d

Unnamed: 0,pop,state,year
0,1.5,Ohio,2000
1,1.7,Ohio,2001
2,3.6,Ohio,2002
3,2.4,Nevada,2001
4,2.9,Nevada,2002


In [7]:
# Load the listings data set; the relative path means listing.csv is looked
# up in the notebook's working directory.
listing = pd.read_csv("listing.csv")
# Preview the first ten rows instead of rendering the whole frame.
listing.head(10)

Unnamed: 0,id,listing_url,scrape_id,host_id,host_name,host_response_rate,host_is_superhost,host_listings_count,host_total_listings_count,city,...,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,cancellation_policy,calculated_host_listings_count,reviews_per_month
0,20469589,https://www.airbnb.com/rooms/20469589,2.017100e+13,18884528,Ann,67%,f,2.0,2.0,Brooklyn,...,,,,,,,,flexible,2,
1,7035579,https://www.airbnb.com/rooms/7035579,2.017100e+13,36885622,James,,f,1.0,1.0,New York,...,,,,,,,,flexible,1,
2,19610687,https://www.airbnb.com/rooms/19610687,2.017100e+13,2597159,Alana,100%,f,2.0,2.0,Brooklyn,...,98.0,10.0,10.0,10.0,10.0,10.0,10.0,strict,2,4.29
3,5783899,https://www.airbnb.com/rooms/5783899,2.017100e+13,23977712,Sarah,100%,t,4.0,4.0,New York,...,93.0,9.0,9.0,10.0,10.0,9.0,9.0,strict,4,5.77
4,7539610,https://www.airbnb.com/rooms/7539610,2.017100e+13,39503857,Craig,,f,1.0,1.0,New York,...,,,,,,,,flexible,1,
5,19842239,https://www.airbnb.com/rooms/19842239,2.017100e+13,140264879,Lucas,100%,f,1.0,1.0,Brooklyn,...,100.0,10.0,10.0,10.0,10.0,10.0,10.0,flexible,1,0.40
6,1655027,https://www.airbnb.com/rooms/1655027,2.017100e+13,8331572,Amy,100%,f,2.0,2.0,New York,...,90.0,9.0,9.0,10.0,10.0,9.0,9.0,strict,2,3.15
7,4053471,https://www.airbnb.com/rooms/4053471,2.017100e+13,3967335,Molly,100%,f,2.0,2.0,Brooklyn,...,89.0,9.0,9.0,9.0,9.0,10.0,9.0,strict,2,0.35
8,5680714,https://www.airbnb.com/rooms/5680714,2.017100e+13,15231014,Govind,100%,f,1.0,1.0,Brooklyn,...,99.0,10.0,10.0,10.0,10.0,9.0,10.0,strict,1,0.52
9,13427702,https://www.airbnb.com/rooms/13427702,2.017100e+13,11791633,Neel,,f,1.0,1.0,Queens,...,,,,,,,,flexible,1,0.07


In [8]:
# Show every column when a DataFrame is displayed (None removes the limit).
# The full option key is used: set_option treats its argument as a pattern,
# and the abbreviated 'max.columns' can match more than one option, which
# makes pandas raise an OptionError.
pd.set_option('display.max_columns', None)

In [9]:
# Preview the first five rows (head() defaults to 5).
listing.head()

Unnamed: 0,id,listing_url,scrape_id,host_id,host_name,host_response_rate,host_is_superhost,host_listings_count,host_total_listings_count,city,state,zipcode,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,price,weekly_price,monthly_price,security_deposit,cleaning_fee,guests_included,extra_people,number_of_reviews,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,cancellation_policy,calculated_host_listings_count,reviews_per_month
0,20469589,https://www.airbnb.com/rooms/20469589,20171000000000.0,18884528,Ann,67%,f,2.0,2.0,Brooklyn,NY,11215.0,Apartment,Private room,2,1.0,1.0,1.0,Real Bed,$90.00,,,$100.00,$50.00,1,$20.00,0,,,,,,,,flexible,2,
1,7035579,https://www.airbnb.com/rooms/7035579,20171000000000.0,36885622,James,,f,1.0,1.0,New York,NY,10024.0,Apartment,Entire home/apt,2,1.0,1.0,1.0,Real Bed,$125.00,$750.00,,,$50.00,1,$0.00,0,,,,,,,,flexible,1,
2,19610687,https://www.airbnb.com/rooms/19610687,20171000000000.0,2597159,Alana,100%,f,2.0,2.0,Brooklyn,NY,11211.0,House,Entire home/apt,4,1.0,1.0,2.0,Real Bed,$150.00,,,$0.00,$68.00,2,$20.00,11,98.0,10.0,10.0,10.0,10.0,10.0,10.0,strict,2,4.29
3,5783899,https://www.airbnb.com/rooms/5783899,20171000000000.0,23977712,Sarah,100%,t,4.0,4.0,New York,NY,10029.0,Apartment,Private room,2,1.0,1.0,1.0,Real Bed,$100.00,,,,$40.00,1,$0.00,174,93.0,9.0,9.0,10.0,10.0,9.0,9.0,strict,4,5.77
4,7539610,https://www.airbnb.com/rooms/7539610,20171000000000.0,39503857,Craig,,f,1.0,1.0,New York,NY,10030.0,Apartment,Private room,1,1.0,1.0,1.0,Real Bed,$200.00,,,,,1,$0.00,0,,,,,,,,flexible,1,


In [10]:
# Dimensions of the frame as (number of rows, number of columns).
listing.shape

(2000, 37)

In [11]:
# Row labels of the frame.
listing.index

RangeIndex(start=0, stop=2000, step=1)

In [12]:
# Column names of the frame.
listing.columns

Index(['id', 'listing_url', 'scrape_id', 'host_id', 'host_name',
       'host_response_rate', 'host_is_superhost', 'host_listings_count',
       'host_total_listings_count', 'city', 'state', 'zipcode',
       'property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms',
       'beds', 'bed_type', 'price', 'weekly_price', 'monthly_price',
       'security_deposit', 'cleaning_fee', 'guests_included', 'extra_people',
       'number_of_reviews', 'review_scores_rating', 'review_scores_accuracy',
       'review_scores_cleanliness', 'review_scores_checkin',
       'review_scores_communication', 'review_scores_location',
       'review_scores_value', 'cancellation_policy',
       'calculated_host_listings_count', 'reviews_per_month'],
      dtype='object')

In [13]:
# Per-column summary: non-null count and dtype.
listing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 37 columns):
id                                2000 non-null int64
listing_url                       2000 non-null object
scrape_id                         2000 non-null float64
host_id                           2000 non-null int64
host_name                         1988 non-null object
host_response_rate                1376 non-null object
host_is_superhost                 1988 non-null object
host_listings_count               1988 non-null float64
host_total_listings_count         1988 non-null float64
city                              1997 non-null object
state                             2000 non-null object
zipcode                           1976 non-null float64
property_type                     2000 non-null object
room_type                         2000 non-null object
accommodates                      2000 non-null int64
bathrooms                         1992 non-null float64
bedrooms   

In [14]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
listing.describe()

Unnamed: 0,id,scrape_id,host_id,host_listings_count,host_total_listings_count,zipcode,accommodates,bathrooms,bedrooms,beds,guests_included,number_of_reviews,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,calculated_host_listings_count,reviews_per_month
count,2000.0,2000.0,2000.0,1988.0,1988.0,1976.0,2000.0,1992.0,1998.0,1996.0,2000.0,2000.0,1544.0,1542.0,1545.0,1537.0,1543.0,1537.0,1536.0,2000.0,1569.0
mean,11513050.0,20171000000000.0,36207680.0,2.046781,2.046781,10671.917004,2.869,1.145582,1.172673,1.583667,1.508,18.264,93.453368,9.56096,9.238835,9.730644,9.736228,9.454782,9.365885,1.661,1.421281
std,6440973.0,0.0,40524560.0,5.096188,5.096188,599.038689,1.92943,0.433284,0.743332,1.121006,1.156116,32.486135,8.289662,0.87646,1.088943,0.663642,0.706485,0.793491,0.887984,1.887554,1.590191
min,8110.0,20171000000000.0,2845.0,0.0,0.0,10001.0,1.0,0.0,0.0,1.0,1.0,0.0,20.0,2.0,2.0,2.0,2.0,4.0,2.0,1.0,0.02
25%,6163242.0,20171000000000.0,5287706.0,1.0,1.0,10024.0,2.0,1.0,1.0,1.0,1.0,1.0,90.0,9.0,9.0,10.0,10.0,9.0,9.0,1.0,0.28
50%,12187860.0,20171000000000.0,20424140.0,1.0,1.0,11104.0,2.0,1.0,1.0,1.0,1.0,5.0,96.0,10.0,10.0,10.0,10.0,10.0,10.0,1.0,0.92
75%,17314080.0,20171000000000.0,51560990.0,2.0,2.0,11221.0,4.0,1.0,1.0,2.0,2.0,20.0,100.0,10.0,10.0,10.0,10.0,10.0,10.0,2.0,2.09
max,21176420.0,20171000000000.0,152877300.0,149.0,149.0,11692.0,16.0,6.0,8.0,13.0,16.0,343.0,100.0,10.0,10.0,10.0,10.0,10.0,10.0,31.0,14.74


## 2. Selection

In [15]:
# Show the frame again as the starting point for the selection examples.
df

Unnamed: 0,A,B,C,D
1,74,92,68,69
2,92,83,61,90
10,99,76,94,76
50,85,96,84,78
30,97,66,78,74
20,86,69,94,80


In [16]:
# Bracket notation with a single column name returns that column as a Series.
df['A']

01    74
02    92
10    99
50    85
30    97
20    86
Name: A, dtype: int64

In [17]:
# Attribute access: returns the same column as df['A'].
df.A

01    74
02    92
10    99
50    85
30    97
20    86
Name: A, dtype: int64

In [18]:
# A list of column names returns a DataFrame with just those columns.
df[['A','B']]

Unnamed: 0,A,B
1,74,92
2,92,83
10,99,76
50,85,96
30,97,66
20,86,69


In [19]:
df[0:3]  # an integer slice inside [] selects rows by position (here the first three)

Unnamed: 0,A,B,C,D
1,74,92,68,69
2,92,83,61,90
10,99,76,94,76


In [20]:
df.loc['01']  # select one row by its index label; the row comes back as a Series

A    74
B    92
C    68
D    69
Name: 01, dtype: int64

In [21]:
# .loc[row_label, column_label] returns the single value at that intersection.
df.loc['01','A']

74

In [22]:
# Exercise: print the 1st and 2nd columns using .loc.
# ':' keeps every row; the label slice 'A':'B' includes both end points.
df.loc[:, 'A':'B']

Unnamed: 0,A,B
1,74,92
2,92,83
10,99,76
50,85,96
30,97,66
20,86,69


In [23]:
df.iloc[2]  # select the third row by integer position (positions start at 0)

A    99
B    76
C    94
D    76
Name: 10, dtype: int64

In [24]:
# .iloc[row_position, column_position] returns the single value at that position.
df.iloc[0,0]

74

In [25]:
# Exercise: print the 3rd and 4th rows and the 1st and 2nd columns using .iloc.
# Positions are 0-based, so rows 2 and 3 and columns 0 and 1 are listed explicitly.
df.iloc[[2, 3], [0, 1]]

Unnamed: 0,A,B
10,99,76
50,85,96


### Select by condition (Boolean index)

In [26]:
# Keep only the rows whose B value is above 80 (boolean mask passed to .loc).
df.loc[df['B'] > 80]

Unnamed: 0,A,B,C,D
1,74,92,68,69
2,92,83,61,90
50,85,96,84,78


In [27]:
# Attach a department code to every row as a new "DEPT" column;
# the list supplies one value per row, in row order.
dept_codes = ['CS', 'BT', 'IS', 'BT', 'BT', 'CS']
df["DEPT"] = dept_codes
df

Unnamed: 0,A,B,C,D,DEPT
1,74,92,68,69,CS
2,92,83,61,90,BT
10,99,76,94,76,IS
50,85,96,84,78,BT
30,97,66,78,74,BT
20,86,69,94,80,CS


In [28]:
# Keep the rows whose DEPT is one of the listed codes.
df.loc[df.DEPT.isin(['BT', 'CS'])]

Unnamed: 0,A,B,C,D,DEPT
1,74,92,68,69,CS
2,92,83,61,90,BT
50,85,96,84,78,BT
30,97,66,78,74,BT
20,86,69,94,80,CS


## 3. Sort

In [29]:
# Announce the operation, then list the rows with their index labels in
# descending order (rows are the default axis for sort_index).
print("sort by index")
df.sort_index(ascending=False)

sort by index


Unnamed: 0,A,B,C,D,DEPT
50,85,96,84,78,BT
30,97,66,78,74,BT
20,86,69,94,80,CS
10,99,76,94,76,IS
2,92,83,61,90,BT
1,74,92,68,69,CS


In [30]:
# Order the rows by column A, largest value first.
df.sort_values('A', ascending=False)

Unnamed: 0,A,B,C,D,DEPT
10,99,76,94,76,IS
30,97,66,78,74,BT
2,92,83,61,90,BT
20,86,69,94,80,CS
50,85,96,84,78,BT
1,74,92,68,69,CS


## 4. Aggregation

In [31]:
df
# Distinct DEPT values.
df.DEPT.unique()
# Number of rows for each DEPT value.
df.DEPT.value_counts()

Unnamed: 0,A,B,C,D,DEPT
1,74,92,68,69,CS
2,92,83,61,90,BT
10,99,76,94,76,IS
50,85,96,84,78,BT
30,97,66,78,74,BT
20,86,69,94,80,CS


array(['CS', 'BT', 'IS'], dtype=object)

BT    3
CS    2
IS    1
Name: DEPT, dtype: int64

In [32]:
# Split the rows into one group per distinct DEPT value.
grouped = df.groupby(by='DEPT')

In [33]:
# Number of rows in each DEPT group.
grouped.size()

DEPT
BT    3
CS    2
IS    1
dtype: int64

In [34]:
# Column-wise totals within each DEPT group.
grouped.sum()

Unnamed: 0_level_0,A,B,C,D
DEPT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
BT,274,245,223,242
CS,160,161,162,149
IS,99,76,94,76


In [35]:
# Sum, mean and standard deviation of column A within each DEPT group.
# The aggregations are named by string so pandas uses its own group-wise
# implementations; passing np.sum / np.mean / np.std to agg is deprecated in
# recent pandas releases. The result columns are still named sum, mean, std.
grouped['A'].agg(['sum', 'mean', 'std'])

Unnamed: 0_level_0,sum,mean,std
DEPT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
BT,274,91.333333,6.027714
CS,160,80.0,8.485281
IS,99,99.0,


In [36]:
# Add a "PRG" (program) column; the list supplies one value per row, in row order.
programs = [
    'graduate',
    'undergraduate',
    'graduate',
    'undergraduate',
    'undergraduate',
    'undergraduate',
]
df['PRG'] = programs
df

Unnamed: 0,A,B,C,D,DEPT,PRG
1,74,92,68,69,CS,graduate
2,92,83,61,90,BT,undergraduate
10,99,76,94,76,IS,graduate
50,85,96,84,78,BT,undergraduate
30,97,66,78,74,BT,undergraduate
20,86,69,94,80,CS,undergraduate


In [37]:
# Group on two keys: one group per distinct (DEPT, PRG) combination.
group_dp = df.groupby(by=['DEPT', 'PRG'])

In [38]:
# Exercise:
# 1) Get the size (number of rows) of each (DEPT, PRG) group.
group_dp.size()
# 2) Get the mean and standard deviation of every grade column for each group.
#    String aliases are used because passing np.mean / np.std to agg is
#    deprecated in recent pandas releases; the result columns keep the same names.
group_dp.agg(['mean', 'std'])

DEPT  PRG          
BT    undergraduate    3
CS    graduate         1
      undergraduate    1
IS    graduate         1
dtype: int64

Unnamed: 0_level_0,Unnamed: 1_level_0,A,A,B,B,C,C,D,D
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std,mean,std,mean,std
DEPT,PRG,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
BT,undergraduate,91.333333,6.027714,81.666667,15.044379,74.333333,11.930353,80.666667,8.326664
CS,graduate,74.0,,92.0,,68.0,,69.0,
CS,undergraduate,86.0,,69.0,,94.0,,80.0,
IS,graduate,99.0,,76.0,,94.0,,76.0,


## 5. Apply
- The apply function takes each row (axis=1) or each column (axis=0) of the dataframe as an input
- Often a **lambda** function (i.e. an anonymous function, a function without a name) is used
   * lambda function: *<font color='green'>lambda x : expression of x </font>*
   * it can be understood as: *<font color='green'> for each x, return expression of x </font>*

In [39]:
# Normalize by row: for every row, compute column A's share of the
# row total A + B + C + D and store it in a new column A_PERC.
# axis=1 makes apply hand the lambda one row at a time.
df['A_PERC'] = df.apply(
    lambda row: row["A"] / row[["A", "B", "C", "D"]].sum(),
    axis=1,
)
df

Unnamed: 0,A,B,C,D,DEPT,PRG,A_PERC
1,74,92,68,69,CS,graduate,0.244224
2,92,83,61,90,BT,undergraduate,0.282209
10,99,76,94,76,IS,graduate,0.286957
50,85,96,84,78,BT,undergraduate,0.247813
30,97,66,78,74,BT,undergraduate,0.307937
20,86,69,94,80,CS,undergraduate,0.261398


In [40]:
# Series.apply: the lambda receives one DEPT value (a string) at a time.
df['DEPT'].apply(lambda x: x.lower())
# DataFrame.apply with axis=1: the lambda receives one whole row at a time,
# so DEPT is looked up inside it. The resulting values match the call above.
df.apply(lambda x: x['DEPT'].lower(),axis=1)

01    cs
02    bt
10    is
50    bt
30    bt
20    cs
Name: DEPT, dtype: object

01    cs
02    bt
10    is
50    bt
30    bt
20    cs
dtype: object

## 6. Missing data

In [41]:
# 4x4 frame with NaN holes in columns A-C (column D is complete), used in the
# following cells to demonstrate detecting, dropping and filling missing values.
n = pd.DataFrame(
    data=[
        [np.nan, 2, np.nan, 0],
        [3, 4, 6, 1],
        [np.nan, np.nan, 10, 5],
        [np.nan, 3, np.nan, 4],
    ],
    columns=['A', 'B', 'C', 'D'],
)
n

Unnamed: 0,A,B,C,D
0,,2.0,,0
1,3.0,4.0,6.0,1
2,,,10.0,5
3,,3.0,,4


In [42]:
# Boolean mask with the same shape as n: True where a value is missing (NaN).
n.isnull()

Unnamed: 0,A,B,C,D
0,True,False,True,False
1,False,False,False,False
2,True,True,False,False
3,True,False,True,False


In [43]:
# Count the missing values in each column (each True counts as 1 when summed).
n.isnull().sum()

A    3
B    1
C    2
D    0
dtype: int64

In [44]:
# True for every column that contains at least one missing value.
n.isnull().any()

A     True
B     True
C     True
D    False
dtype: bool

In [45]:
# Drop the rows whose 'B' value is missing. The result is a new frame;
# n itself is left unchanged.
n.dropna(subset=['B'])

Unnamed: 0,A,B,C,D
0,,2.0,,0
1,3.0,4.0,6.0,1
3,,3.0,,4


In [46]:
# Replace every missing value with 0. The result is a new frame;
# n itself is left unchanged.
n.fillna(0)

Unnamed: 0,A,B,C,D
0,0.0,2.0,0.0,0
1,3.0,4.0,6.0,1
2,0.0,0.0,10.0,5
3,0.0,3.0,0.0,4


In [47]:
# Drop the rows whose 'C' value is missing and keep the result under the same
# name. Rebinding the returned frame replaces inplace=True, which the pandas
# documentation discourages; n ends up holding the same rows either way.
n = n.dropna(subset=['C'])
n

Unnamed: 0,A,B,C,D
1,3.0,4.0,6.0,1
2,,,10.0,5


In [48]:
# Exercise:
listing.head()
# 1) For listing, remove every row that has a NaN in 'bedrooms', 'beds' or
#    'bathrooms'. dropna returns a new frame and leaves listing untouched, so
#    the result is kept in its own variable and only previewed.
listing_rooms_known = listing.dropna(subset=['bedrooms','beds','bathrooms'])
listing_rooms_known.head(10)
# 2) For the 'review_scores_rating' column, replace NaN with the mean of the
#    column. fillna also returns a new object, so the filled column is stored
#    rather than discarded.
rating = listing['review_scores_rating']
rating_filled = rating.fillna(rating.mean())
rating_filled

Unnamed: 0,id,listing_url,scrape_id,host_id,host_name,host_response_rate,host_is_superhost,host_listings_count,host_total_listings_count,city,state,zipcode,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,price,weekly_price,monthly_price,security_deposit,cleaning_fee,guests_included,extra_people,number_of_reviews,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,cancellation_policy,calculated_host_listings_count,reviews_per_month
0,20469589,https://www.airbnb.com/rooms/20469589,20171000000000.0,18884528,Ann,67%,f,2.0,2.0,Brooklyn,NY,11215.0,Apartment,Private room,2,1.0,1.0,1.0,Real Bed,$90.00,,,$100.00,$50.00,1,$20.00,0,,,,,,,,flexible,2,
1,7035579,https://www.airbnb.com/rooms/7035579,20171000000000.0,36885622,James,,f,1.0,1.0,New York,NY,10024.0,Apartment,Entire home/apt,2,1.0,1.0,1.0,Real Bed,$125.00,$750.00,,,$50.00,1,$0.00,0,,,,,,,,flexible,1,
2,19610687,https://www.airbnb.com/rooms/19610687,20171000000000.0,2597159,Alana,100%,f,2.0,2.0,Brooklyn,NY,11211.0,House,Entire home/apt,4,1.0,1.0,2.0,Real Bed,$150.00,,,$0.00,$68.00,2,$20.00,11,98.0,10.0,10.0,10.0,10.0,10.0,10.0,strict,2,4.29
3,5783899,https://www.airbnb.com/rooms/5783899,20171000000000.0,23977712,Sarah,100%,t,4.0,4.0,New York,NY,10029.0,Apartment,Private room,2,1.0,1.0,1.0,Real Bed,$100.00,,,,$40.00,1,$0.00,174,93.0,9.0,9.0,10.0,10.0,9.0,9.0,strict,4,5.77
4,7539610,https://www.airbnb.com/rooms/7539610,20171000000000.0,39503857,Craig,,f,1.0,1.0,New York,NY,10030.0,Apartment,Private room,1,1.0,1.0,1.0,Real Bed,$200.00,,,,,1,$0.00,0,,,,,,,,flexible,1,


Unnamed: 0,id,listing_url,scrape_id,host_id,host_name,host_response_rate,host_is_superhost,host_listings_count,host_total_listings_count,city,state,zipcode,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,price,weekly_price,monthly_price,security_deposit,cleaning_fee,guests_included,extra_people,number_of_reviews,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,cancellation_policy,calculated_host_listings_count,reviews_per_month
0,20469589,https://www.airbnb.com/rooms/20469589,2.017100e+13,18884528,Ann,67%,f,2.0,2.0,Brooklyn,NY,11215.0,Apartment,Private room,2,1.0,1.0,1.0,Real Bed,$90.00,,,$100.00,$50.00,1,$20.00,0,,,,,,,,flexible,2,
1,7035579,https://www.airbnb.com/rooms/7035579,2.017100e+13,36885622,James,,f,1.0,1.0,New York,NY,10024.0,Apartment,Entire home/apt,2,1.0,1.0,1.0,Real Bed,$125.00,$750.00,,,$50.00,1,$0.00,0,,,,,,,,flexible,1,
2,19610687,https://www.airbnb.com/rooms/19610687,2.017100e+13,2597159,Alana,100%,f,2.0,2.0,Brooklyn,NY,11211.0,House,Entire home/apt,4,1.0,1.0,2.0,Real Bed,$150.00,,,$0.00,$68.00,2,$20.00,11,98.0,10.0,10.0,10.0,10.0,10.0,10.0,strict,2,4.29
3,5783899,https://www.airbnb.com/rooms/5783899,2.017100e+13,23977712,Sarah,100%,t,4.0,4.0,New York,NY,10029.0,Apartment,Private room,2,1.0,1.0,1.0,Real Bed,$100.00,,,,$40.00,1,$0.00,174,93.0,9.0,9.0,10.0,10.0,9.0,9.0,strict,4,5.77
4,7539610,https://www.airbnb.com/rooms/7539610,2.017100e+13,39503857,Craig,,f,1.0,1.0,New York,NY,10030.0,Apartment,Private room,1,1.0,1.0,1.0,Real Bed,$200.00,,,,,1,$0.00,0,,,,,,,,flexible,1,
5,19842239,https://www.airbnb.com/rooms/19842239,2.017100e+13,140264879,Lucas,100%,f,1.0,1.0,Brooklyn,NY,11213.0,Apartment,Private room,2,1.0,1.0,1.0,Real Bed,$26.00,,,$100.00,,1,$6.00,1,100.0,10.0,10.0,10.0,10.0,10.0,10.0,flexible,1,0.40
6,1655027,https://www.airbnb.com/rooms/1655027,2.017100e+13,8331572,Amy,100%,f,2.0,2.0,New York,NY,10002.0,Apartment,Entire home/apt,4,1.0,2.0,2.0,Real Bed,$250.00,,"$7,000.00",$500.00,$100.00,4,$25.00,155,90.0,9.0,9.0,10.0,10.0,9.0,9.0,strict,2,3.15
7,4053471,https://www.airbnb.com/rooms/4053471,2.017100e+13,3967335,Molly,100%,f,2.0,2.0,Brooklyn,NY,11222.0,Apartment,Private room,1,1.0,1.0,1.0,Real Bed,$65.00,$425.00,"$1,650.00",,,1,$20.00,13,89.0,9.0,9.0,9.0,9.0,10.0,9.0,strict,2,0.35
8,5680714,https://www.airbnb.com/rooms/5680714,2.017100e+13,15231014,Govind,100%,f,1.0,1.0,Brooklyn,NY,11233.0,Apartment,Private room,2,1.0,1.0,2.0,Real Bed,$90.00,$750.00,"$3,200.00",$95.00,$30.00,2,$50.00,16,99.0,10.0,10.0,10.0,10.0,9.0,10.0,strict,1,0.52
9,13427702,https://www.airbnb.com/rooms/13427702,2.017100e+13,11791633,Neel,,f,1.0,1.0,Queens,NY,11102.0,Apartment,Private room,2,1.0,1.0,1.0,Real Bed,$65.00,,,$95.00,$15.00,1,$0.00,1,,,,,,,,flexible,1,0.07


0        93.453368
1        93.453368
2        98.000000
3        93.000000
4        93.453368
5       100.000000
6        90.000000
7        89.000000
8        99.000000
9        93.453368
10       93.453368
11      100.000000
12      100.000000
13       93.453368
14       95.000000
15      100.000000
16       99.000000
17       80.000000
18      100.000000
19       87.000000
20       87.000000
21       90.000000
22       98.000000
23       93.453368
24       95.000000
25       80.000000
26       97.000000
27       97.000000
28      100.000000
29       93.453368
           ...    
1970     87.000000
1971     83.000000
1972     93.453368
1973     83.000000
1974     90.000000
1975     80.000000
1976     93.453368
1977    100.000000
1978     93.453368
1979     92.000000
1980     95.000000
1981     93.453368
1982    100.000000
1983     92.000000
1984     93.453368
1985     80.000000
1986     99.000000
1987     93.000000
1988     93.453368
1989    100.000000
1990     93.453368
1991     98.

## 7. Duplicates

In [49]:
# Small frame containing a repeated (A, C) combination, used by the
# de-duplication examples that follow.
d = pd.DataFrame(
    dict(
        A=["foo", "foo", "foo", "bar"],
        B=[0, 1, 1, 1],
        C=["A", "A", "B", "A"],
    )
)
d

Unnamed: 0,A,B,C
0,foo,0,A
1,foo,1,A
2,foo,1,B
3,bar,1,A


In [50]:
# Compare rows on columns A and C only; keep=False drops every row whose
# (A, C) pair occurs more than once, so no copy of a duplicate survives.
d.drop_duplicates(subset=['A', 'C'], keep=False)

Unnamed: 0,A,B,C
2,foo,1,B
3,bar,1,A


In [51]:
# Compare rows on columns A and C only; keep="first" retains the first row of
# each duplicated (A, C) pair and drops the later ones.
d.drop_duplicates(subset=['A', 'C'], keep="first")

Unnamed: 0,A,B,C
0,foo,0,A
2,foo,1,B
3,bar,1,A


## 8. Getting data out

In [None]:
# Write df to student.csv in the working directory. header=True writes the
# column names as the first line; index=False leaves the row labels out.
# NOTE(review): the row labels here are the ID strings ('01', '02', ...), so
# index=False drops them from the file — confirm that is intended.
df.to_csv('student.csv', header=True, index=False)