# Pandas

In [1]:
import numpy as np
import pandas as pd

### Series 
series out of  a list

In [3]:
x = pd.Series([10,20,30,40,50])
x

0    10
1    20
2    30
3    40
4    50
dtype: int64

In [4]:
x.index

RangeIndex(start=0, stop=5, step=1)

In [5]:
x.values

array([10, 20, 30, 40, 50], dtype=int64)

In [6]:
x.dtype

dtype('int64')

#### Data with our own labels

In [9]:
data = [450,650,870]
sales = pd.Series(data, index=['Bob','Sally','Tom'])
sales

Bob      450
Sally    650
Tom      870
dtype: int64

In [10]:
sales.index

Index(['Bob', 'Sally', 'Tom'], dtype='object')

In [11]:
sales["Bob"]

450

In [13]:
#same as
sales[0]
#only if the data labels are not numerical ie won't get confused with indexing

450

In [14]:
#use bools to filter out data

In [15]:
sales > 500

Bob      False
Sally     True
Tom       True
dtype: bool

In [16]:
sales[sales > 500]

Sally    650
Tom      870
dtype: int64

In [17]:
#check if data present with index
'Bob' in sales

True

In [18]:
650 in sales #checks the index not to value

False

In [19]:
sales_dict = sales.to_dict()
sales_dict

{'Bob': 450, 'Sally': 650, 'Tom': 870}

In [20]:
sales_ser = pd.Series(sales_dict)
sales_ser

Bob      450
Sally    650
Tom      870
dtype: int64

In [21]:
new_sales = pd.Series(sales, index=['Sally', 'Lucy', 'Bob', 'Tom', 'Darren'])
new_sales
#change order of series but also add new indexes that are auto assignes NaN

Sally     650.0
Lucy        NaN
Bob       450.0
Tom       870.0
Darren      NaN
dtype: float64

In [22]:
#check if values are Nan with numpy
np.isnan(new_sales)

Sally     False
Lucy       True
Bob       False
Tom       False
Darren     True
dtype: bool

In [24]:
#same with pandas
pd.isnull(new_sales)

Sally     False
Lucy       True
Bob       False
Tom       False
Darren     True
dtype: bool

In [26]:
new_sales.index.name = "Sales_Person" #naming indexes
new_sales

Sales_Person
Sally     650.0
Lucy        NaN
Bob       450.0
Tom       870.0
Darren      NaN
dtype: float64

In [27]:
#naming whole series
new_sales.name = "total sales"
new_sales

Sales_Person
Sally     650.0
Lucy        NaN
Bob       450.0
Tom       870.0
Darren      NaN
Name: total sales, dtype: float64

# DataFrames

In [29]:
data = [['Don',870],["Sally",678],['Bob',4756]]
df = pd.DataFrame(data, columns=["Name","Sales"])
df #makes autoincrementing index if one not actually specified

Unnamed: 0,Name,Sales
0,Don,870
1,Sally,678
2,Bob,4756


In [30]:
df_dict = pd.DataFrame(sales_dict)
df_dict

ValueError: If using all scalar values, you must pass an index

In [31]:
df_dict = pd.DataFrame(sales_dict, index=[1,2,3])
df_dict

Unnamed: 0,Bob,Sally,Tom
1,450,650,870
2,450,650,870
3,450,650,870


In [32]:
sales_dict

{'Bob': 450, 'Sally': 650, 'Tom': 870}

In [34]:
df_dict = pd.DataFrame(sales_dict, index=["Sales"])
df_dict

Unnamed: 0,Bob,Sally,Tom
Sales,450,650,870


### Create datafram from list of dictionaries

In [35]:
dict_list = [{'Name':'Tom','Sales':250},{'Name':'Jane','Sales':300},{'Name':'Steve','Sales':350}
            ,{'Name':'Lucy','Sales':400}]

df_dict_list = pd.DataFrame(dict_list)
df_dict_list  #Now it know how we want it formatted because there are keys for column names

Unnamed: 0,Name,Sales
0,Tom,250
1,Jane,300
2,Steve,350
3,Lucy,400


## Create datafram out of series

In [36]:
east = pd.Series([1000,1200,3400],index=['Q1','Q2','Q3'])
west = pd.Series([1100,1300,2400,3500],index=['Q1','Q2','Q3','Q4'])

In [37]:
df_region = pd.DataFrame({'East':east, 'West':west})
df_region

Unnamed: 0,East,West
Q1,1000.0,1100
Q2,1200.0,1300
Q3,3400.0,2400
Q4,,3500


In [38]:
years = ["2015","2016","2017","2018"]
df_region["Years"] = years
df_region
# adds it on but doesn't make sense for this example. just to see how to add columns

Unnamed: 0,East,West,Years
Q1,1000.0,1100,2015
Q2,1200.0,1300,2016
Q3,3400.0,2400,2017
Q4,,3500,2018


In [39]:
df_region = df_region.set_index("Years")  # wrong about quarters, want years instead
df_region

Unnamed: 0_level_0,East,West
Years,Unnamed: 1_level_1,Unnamed: 2_level_1
2015,1000.0,1100
2016,1200.0,1300
2017,3400.0,2400
2018,,3500


In [40]:
#reindex shifts index
new_df = df_region.reindex(["2014","2015","2016","2017","2018","2019","2020"])
new_df

Unnamed: 0_level_0,East,West
Years,Unnamed: 1_level_1,Unnamed: 2_level_1
2014,,
2015,1000.0,1100.0
2016,1200.0,1300.0
2017,3400.0,2400.0
2018,,3500.0
2019,,
2020,,


In [42]:
#can also use reindex on columns maybe for reorganising
new_df = new_df.reindex(columns=["East","South","West"])
new_df

Unnamed: 0_level_0,East,South,West
Years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2014,,,
2015,1000.0,,1100.0
2016,1200.0,,1300.0
2017,3400.0,,2400.0
2018,,,3500.0
2019,,,
2020,,,


## How to deal with missing values.

In [44]:
#replace NaN's with zeros
new_df.fillna(0)

Unnamed: 0_level_0,East,South,West
Years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2014,0.0,0.0,0.0
2015,1000.0,0.0,1100.0
2016,1200.0,0.0,1300.0
2017,3400.0,0.0,2400.0
2018,0.0,0.0,3500.0
2019,0.0,0.0,0.0
2020,0.0,0.0,0.0


In [47]:
new_df.fillna(method='ffill')

Unnamed: 0_level_0,East,South,West
Years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2014,,,
2015,1000.0,,1100.0
2016,1200.0,,1300.0
2017,3400.0,,2400.0
2018,3400.0,,3500.0
2019,3400.0,,3500.0
2020,3400.0,,3500.0


In [48]:
new_df.fillna(method='bfill')

Unnamed: 0_level_0,East,South,West
Years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2014,1000.0,,1100.0
2015,1000.0,,1100.0
2016,1200.0,,1300.0
2017,3400.0,,2400.0
2018,,,3500.0
2019,,,
2020,,,


In [49]:
new_df.interpolate() #default method linear interpolation kinda like making a line of best fit???

Unnamed: 0_level_0,East,South,West
Years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2014,,,
2015,1000.0,,1100.0
2016,1200.0,,1300.0
2017,3400.0,,2400.0
2018,3400.0,,3500.0
2019,3400.0,,3500.0
2020,3400.0,,3500.0


In [51]:
new_df.dropna(axis=1, how='all')  #axis 1 refers to cols, axis 0 refers to row

Unnamed: 0_level_0,East,West
Years,Unnamed: 1_level_1,Unnamed: 2_level_1
2014,,
2015,1000.0,1100.0
2016,1200.0,1300.0
2017,3400.0,2400.0
2018,,3500.0
2019,,
2020,,


In [52]:
new_df.dropna(axis=0, how='all') #if all values in row in nan then drop

Unnamed: 0_level_0,East,South,West
Years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2015,1000.0,,1100.0
2016,1200.0,,1300.0
2017,3400.0,,2400.0
2018,,,3500.0


In [54]:
new_df.dropna(thresh=2) #dnt drop if more

Unnamed: 0_level_0,East,South,West
Years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2015,1000.0,,1100.0
2016,1200.0,,1300.0
2017,3400.0,,2400.0


In [55]:
new_df.drop("2019")

Unnamed: 0_level_0,East,South,West
Years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2014,,,
2015,1000.0,,1100.0
2016,1200.0,,1300.0
2017,3400.0,,2400.0
2018,,,3500.0
2020,,,


In [57]:
new_df.drop("South", axis = 1)

Unnamed: 0_level_0,East,West
Years,Unnamed: 1_level_1,Unnamed: 2_level_1
2014,,
2015,1000.0,1100.0
2016,1200.0,1300.0
2017,3400.0,2400.0
2018,,3500.0
2019,,
2020,,


In [58]:
new_df["East"]

Years
2014       NaN
2015    1000.0
2016    1200.0
2017    3400.0
2018       NaN
2019       NaN
2020       NaN
Name: East, dtype: float64

In [60]:
new_df.iloc[2]

East     1200.0
South       NaN
West     1300.0
Name: 2016, dtype: float64

In [61]:
new_df.iloc[1:3]

Unnamed: 0_level_0,East,South,West
Years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2015,1000.0,,1100.0
2016,1200.0,,1300.0


In [62]:
new_df.loc[["2016","2019"]]

Unnamed: 0_level_0,East,South,West
Years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2016,1200.0,,1300.0
2019,,,


In [63]:
new_df

Unnamed: 0_level_0,East,South,West
Years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2014,,,
2015,1000.0,,1100.0
2016,1200.0,,1300.0
2017,3400.0,,2400.0
2018,,,3500.0
2019,,,
2020,,,


In [64]:
new_df.sort_index(ascending=False)

Unnamed: 0_level_0,East,South,West
Years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020,,,
2019,,,
2018,,,3500.0
2017,3400.0,,2400.0
2016,1200.0,,1300.0
2015,1000.0,,1100.0
2014,,,


In [65]:
new_df.sort_index(ascending=0)

Unnamed: 0_level_0,East,South,West
Years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020,,,
2019,,,
2018,,,3500.0
2017,3400.0,,2400.0
2016,1200.0,,1300.0
2015,1000.0,,1100.0
2014,,,


In [66]:
new_df.sort_values(by=["East"])

Unnamed: 0_level_0,East,South,West
Years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2015,1000.0,,1100.0
2016,1200.0,,1300.0
2017,3400.0,,2400.0
2014,,,
2018,,,3500.0
2019,,,
2020,,,


In [70]:
iris = pd.read_csv("iris.csv")
iris

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [87]:
group_by_species = iris.groupby(by="species", axis=0)
[print(i) for i in group_by_species]
group_by_species

('setosa',     sepal_length  sepal_width  petal_length  petal_width species
0            5.1          3.5           1.4          0.2  setosa
1            4.9          3.0           1.4          0.2  setosa
2            4.7          3.2           1.3          0.2  setosa
3            4.6          3.1           1.5          0.2  setosa
4            5.0          3.6           1.4          0.2  setosa
5            5.4          3.9           1.7          0.4  setosa
6            4.6          3.4           1.4          0.3  setosa
7            5.0          3.4           1.5          0.2  setosa
8            4.4          2.9           1.4          0.2  setosa
9            4.9          3.1           1.5          0.1  setosa
10           5.4          3.7           1.5          0.2  setosa
11           4.8          3.4           1.6          0.2  setosa
12           4.8          3.0           1.4          0.1  setosa
13           4.3          3.0           1.1          0.1  setosa
14           5

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000017273D82670>

In [83]:
iris.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [84]:
group_by_species.describe()

Unnamed: 0_level_0,sepal_length,sepal_length,sepal_length,sepal_length,sepal_length,sepal_length,sepal_length,sepal_length,sepal_width,sepal_width,...,petal_length,petal_length,petal_width,petal_width,petal_width,petal_width,petal_width,petal_width,petal_width,petal_width
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
species,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
setosa,50.0,5.006,0.35249,4.3,4.8,5.0,5.2,5.8,50.0,3.418,...,1.575,1.9,50.0,0.244,0.10721,0.1,0.2,0.2,0.3,0.6
versicolor,50.0,5.936,0.516171,4.9,5.6,5.9,6.3,7.0,50.0,2.77,...,4.6,5.1,50.0,1.326,0.197753,1.0,1.2,1.3,1.5,1.8
virginica,50.0,6.588,0.63588,4.9,6.225,6.5,6.9,7.9,50.0,2.974,...,5.875,6.9,50.0,2.026,0.27465,1.4,1.8,2.0,2.3,2.5


In [88]:
group_by_species = iris.groupby(by=["species","sepal_length"], axis=0)
[print(i) for i in group_by_species]

(('setosa', 4.3),     sepal_length  sepal_width  petal_length  petal_width species
13           4.3          3.0           1.1          0.1  setosa)
(('setosa', 4.4),     sepal_length  sepal_width  petal_length  petal_width species
8            4.4          2.9           1.4          0.2  setosa
38           4.4          3.0           1.3          0.2  setosa
42           4.4          3.2           1.3          0.2  setosa)
(('setosa', 4.5),     sepal_length  sepal_width  petal_length  petal_width species
41           4.5          2.3           1.3          0.3  setosa)
(('setosa', 4.6),     sepal_length  sepal_width  petal_length  petal_width species
3            4.6          3.1           1.5          0.2  setosa
6            4.6          3.4           1.4          0.3  setosa
22           4.6          3.6           1.0          0.2  setosa
47           4.6          3.2           1.4          0.2  setosa)
(('setosa', 4.7),     sepal_length  sepal_width  petal_length  petal_width spec

52           6.9          3.1           4.9          1.5  versicolor)
(('versicolor', 7.0),     sepal_length  sepal_width  petal_length  petal_width     species
50           7.0          3.2           4.7          1.4  versicolor)
(('virginica', 4.9),      sepal_length  sepal_width  petal_length  petal_width    species
106           4.9          2.5           4.5          1.7  virginica)
(('virginica', 5.6),      sepal_length  sepal_width  petal_length  petal_width    species
121           5.6          2.8           4.9          2.0  virginica)
(('virginica', 5.7),      sepal_length  sepal_width  petal_length  petal_width    species
113           5.7          2.5           5.0          2.0  virginica)
(('virginica', 5.8),      sepal_length  sepal_width  petal_length  petal_width    species
101           5.8          2.7           5.1          1.9  virginica
114           5.8          2.8           5.1          2.4  virginica
142           5.8          2.7           5.1          1.9  vi

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [89]:
iris.head(10)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
5,5.4,3.9,1.7,0.4,setosa
6,4.6,3.4,1.4,0.3,setosa
7,5.0,3.4,1.5,0.2,setosa
8,4.4,2.9,1.4,0.2,setosa
9,4.9,3.1,1.5,0.1,setosa


In [106]:
group_by_species.describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,sepal_width,sepal_width,sepal_width,sepal_width,sepal_width,sepal_width,sepal_width,sepal_width,petal_length,petal_length,petal_length,petal_length,petal_length,petal_width,petal_width,petal_width,petal_width,petal_width,petal_width,petal_width,petal_width
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
species,sepal_length,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
setosa,4.3,1.0,3.0,,3.0,3.0,3.0,3.0,3.0,1.0,1.1,...,1.1,1.1,1.0,0.1,,0.1,0.1,0.1,0.1,0.1
setosa,4.4,3.0,3.033333,0.152753,2.9,2.95,3.0,3.1,3.2,3.0,1.333333,...,1.35,1.4,3.0,0.2,3.3993500000000003e-17,0.2,0.2,0.2,0.2,0.2
setosa,4.5,1.0,2.3,,2.3,2.3,2.3,2.3,2.3,1.0,1.3,...,1.3,1.3,1.0,0.3,,0.3,0.3,0.3,0.3,0.3
setosa,4.6,4.0,3.325,0.221736,3.1,3.175,3.3,3.45,3.6,4.0,1.325,...,1.425,1.5,4.0,0.225,0.05,0.2,0.2,0.2,0.225,0.3
setosa,4.7,2.0,3.2,0.0,3.2,3.2,3.2,3.2,3.2,2.0,1.45,...,1.525,1.6,2.0,0.2,0.0,0.2,0.2,0.2,0.2,0.2
setosa,4.8,5.0,3.18,0.204939,3.0,3.0,3.1,3.4,3.4,5.0,1.58,...,1.6,1.9,5.0,0.2,0.07071068,0.1,0.2,0.2,0.2,0.3
setosa,4.9,4.0,3.075,0.05,3.0,3.075,3.1,3.1,3.1,4.0,1.475,...,1.5,1.5,4.0,0.125,0.05,0.1,0.1,0.1,0.125,0.2
setosa,5.0,8.0,3.3625,0.192261,3.0,3.275,3.4,3.5,3.6,8.0,1.45,...,1.6,1.6,8.0,0.2875,0.1457738,0.2,0.2,0.2,0.325,0.6
setosa,5.1,8.0,3.6,0.2,3.3,3.475,3.6,3.8,3.8,8.0,1.5625,...,1.625,1.9,8.0,0.3125,0.1125992,0.2,0.2,0.3,0.4,0.5
setosa,5.2,3.0,3.666667,0.378594,3.4,3.45,3.5,3.8,4.1,3.0,1.466667,...,1.5,1.5,3.0,0.166667,0.05773503,0.1,0.15,0.2,0.2,0.2


In [93]:
iris[iris['species']=='setosa']

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
5,5.4,3.9,1.7,0.4,setosa
6,4.6,3.4,1.4,0.3,setosa
7,5.0,3.4,1.5,0.2,setosa
8,4.4,2.9,1.4,0.2,setosa
9,4.9,3.1,1.5,0.1,setosa


## Using the Iris dataset, do the following:

- Add a rank column to the iris DataFrame, ranking the petal length in an ascending order
- Find the mean of each column for each species
- Create a new DataFrame for each of the species separately - should have 3 in total, make sure each index starts at 0
- Create two final DataFrames: One for the Sepal information, and one for the Petal information. This should be done on the whole dataset so that in each DataFrame we have information about all three species

In [128]:
pd.set_option('display.max_rows', 1000, 'display.max_columns', 1000)

In [119]:
iris2 = pd.read_csv("iris.csv")
iris2

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [120]:
iris2.sort_values(by=["petal_length"], ascending=True)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
22,4.6,3.6,1.0,0.2,setosa
13,4.3,3.0,1.1,0.1,setosa
14,5.8,4.0,1.2,0.2,setosa
35,5.0,3.2,1.2,0.2,setosa
36,5.5,3.5,1.3,0.2,setosa
...,...,...,...,...,...
131,7.9,3.8,6.4,2.0,virginica
105,7.6,3.0,6.6,2.1,virginica
117,7.7,3.8,6.7,2.2,virginica
122,7.7,2.8,6.7,2.0,virginica


In [136]:
iris2["Rank"] = iris["petal_length"].rank(ascending=True, method="max")
print(iris2.sort_values(by=["Rank"], ascending=True))

     sepal_length  sepal_width  petal_length  petal_width     species   Rank
22            4.6          3.6           1.0          0.2      setosa    1.0
13            4.3          3.0           1.1          0.1      setosa    2.0
14            5.8          4.0           1.2          0.2      setosa    4.0
35            5.0          3.2           1.2          0.2      setosa    4.0
36            5.5          3.5           1.3          0.2      setosa   11.0
40            5.0          3.5           1.3          0.3      setosa   11.0
38            4.4          3.0           1.3          0.2      setosa   11.0
42            4.4          3.2           1.3          0.2      setosa   11.0
2             4.7          3.2           1.3          0.2      setosa   11.0
41            4.5          2.3           1.3          0.3      setosa   11.0
16            5.4          3.9           1.3          0.4      setosa   11.0
17            5.1          3.5           1.4          0.3      setosa   23.0

In [126]:
iris3 = pd.read_csv("iris.csv")

In [127]:
means = iris3.groupby(by=["species"], axis=0).mean()
means

Unnamed: 0_level_0,sepal_length,sepal_width,petal_length,petal_width
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
setosa,5.006,3.418,1.464,0.244
versicolor,5.936,2.77,4.26,1.326
virginica,6.588,2.974,5.552,2.026


In [141]:
setosa_df = iris3[iris3['species'] =='setosa'].reset_index()
versicolor_df = iris3[iris3['species'] =='versicolor'].reset_index()
virginica_df = iris3[iris3['species'] =='virginica'].reset_index()

In [142]:
virginica_df

Unnamed: 0,index,sepal_length,sepal_width,petal_length,petal_width,species
0,100,6.3,3.3,6.0,2.5,virginica
1,101,5.8,2.7,5.1,1.9,virginica
2,102,7.1,3.0,5.9,2.1,virginica
3,103,6.3,2.9,5.6,1.8,virginica
4,104,6.5,3.0,5.8,2.2,virginica
5,105,7.6,3.0,6.6,2.1,virginica
6,106,4.9,2.5,4.5,1.7,virginica
7,107,7.3,2.9,6.3,1.8,virginica
8,108,6.7,2.5,5.8,1.8,virginica
9,109,7.2,3.6,6.1,2.5,virginica


Create two final DataFrames: One for the Sepal information, and one for the Petal information. This should be done on the whole dataset so that in each DataFrame we have information about all three species

In [145]:
sepal = iris3[["species","sepal_length","sepal_width"]]
sepal

Unnamed: 0,species,sepal_length,sepal_width
0,setosa,5.1,3.5
1,setosa,4.9,3.0
2,setosa,4.7,3.2
3,setosa,4.6,3.1
4,setosa,5.0,3.6
5,setosa,5.4,3.9
6,setosa,4.6,3.4
7,setosa,5.0,3.4
8,setosa,4.4,2.9
9,setosa,4.9,3.1


In [147]:
petal = iris3[["species", "petal_length", "petal_width"]]
petal

Unnamed: 0,species,petal_length,petal_width
0,setosa,1.4,0.2
1,setosa,1.4,0.2
2,setosa,1.3,0.2
3,setosa,1.5,0.2
4,setosa,1.4,0.2
5,setosa,1.7,0.4
6,setosa,1.4,0.3
7,setosa,1.5,0.2
8,setosa,1.4,0.2
9,setosa,1.5,0.1


#### merge and concat to get back to the same kind of dataframe

In [190]:
frames = [sepal, petal]
reverse = pd.concat(frames, axis=1, join="outer")
reverse

Unnamed: 0,species,sepal_length,sepal_width,species.1,petal_length,petal_width
0,setosa,5.1,3.5,setosa,1.4,0.2
1,setosa,4.9,3.0,setosa,1.4,0.2
2,setosa,4.7,3.2,setosa,1.3,0.2
3,setosa,4.6,3.1,setosa,1.5,0.2
4,setosa,5.0,3.6,setosa,1.4,0.2
5,setosa,5.4,3.9,setosa,1.7,0.4
6,setosa,4.6,3.4,setosa,1.4,0.3
7,setosa,5.0,3.4,setosa,1.5,0.2
8,setosa,4.4,2.9,setosa,1.4,0.2
9,setosa,4.9,3.1,setosa,1.5,0.1


In [181]:
reverse2 = pd.merge(sepal, petal, how="inner")
reverse2

Unnamed: 0,species,sepal_length,sepal_width,petal_length,petal_width
0,setosa,5.1,3.5,1.4,0.2
1,setosa,5.1,3.5,1.4,0.2
2,setosa,5.1,3.5,1.3,0.2
3,setosa,5.1,3.5,1.5,0.2
4,setosa,5.1,3.5,1.4,0.2
...,...,...,...,...,...
7495,virginica,5.9,3.0,5.2,2.3
7496,virginica,5.9,3.0,5.0,1.9
7497,virginica,5.9,3.0,5.2,2.0
7498,virginica,5.9,3.0,5.4,2.3
