# <font color="purple"><h3 align="center">DataFrame Basics Tutorial</h3></font>

## **The DataFrame is the most commonly used object in pandas. It is a table-like data structure containing rows and columns, similar to an Excel spreadsheet.**

In [3]:
import pandas as pd
# Sample weather observations: one keyword per column, six entries per column
# (one entry for each day).
weather_data = dict(
    day=['1/1/2017', '1/2/2017', '1/3/2017', '1/4/2017', '1/5/2017', '1/6/2017'],
    temperature=[32, 35, 28, 24, 32, 31],
    windspeed=[6, 7, 2, 7, 4, 2],
    event=['Rain', 'Sunny', 'Snow', 'Snow', 'Rain', 'windy'],
)

# Build the table from the dict: keys become column names, lists become columns.
# Alternative source (not used here): pd.read_csv("weather_data.csv")
df = pd.DataFrame(data=weather_data)
df

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32,6,Rain
1,1/2/2017,35,7,Sunny
2,1/3/2017,28,2,Snow
3,1/4/2017,24,7,Snow
4,1/5/2017,32,4,Rain
5,1/6/2017,31,2,windy


In [11]:
df.shape  # (rows, columns) tuple; unpack with: rows, columns = df.shape

# Dot notation (df.day, df.windspeed) only works when the column name is a valid
# Python identifier. Square brackets work for any column name, so a label such
# as "02/12-2021" would have to be written df["02/12-2021"]. The previous
# `df.["02/12-2021"]` was a SyntaxError (and this frame has no such column),
# which prevented the whole cell from running, so it has been removed.
# df.columns[-2:]  # the last two column labels

df["windspeed"]  # square-bracket access; last expression, so it is displayed


0    6
1    7
2    2
3    7
4    4
5    2
Name: windspeed, dtype: int64

In [7]:
# Accessing a column with dot notation yields a pandas Series.
type(df.day)

pandas.core.series.Series

## <font color='blue'>Rows</font>

In [12]:
df.head(6)  # first 6 rows -- for this 6-row table, that is every row

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32,6,Rain
1,1/2/2017,35,7,Sunny
2,1/3/2017,28,2,Snow
3,1/4/2017,24,7,Snow
4,1/5/2017,32,4,Rain
5,1/6/2017,31,2,windy


In [10]:
df.describe()  # summary statistics (count, mean, std, min, quartiles, max) of the numeric columns

Unnamed: 0,temperature,windspeed
count,6.0,6.0
mean,30.333333,4.666667
std,3.829708,2.33809
min,24.0,2.0
25%,28.75,2.5
50%,31.5,5.0
75%,32.0,6.75
max,35.0,7.0


In [14]:
df[1:3]  # row slice by position: rows 1 and 2 (the stop value 3 is excluded)

Unnamed: 0,day,temperature,windspeed,event
1,1/2/2017,35,7,Sunny
2,1/3/2017,28,2,Snow


In [18]:
# .iloc[] ("integer location") selects rows of a DataFrame by their integer position.

df.iloc[2]  # a single position returns that row as a Series

df.iloc[2:3]  # a slice returns a DataFrame, even when it holds only one row

Unnamed: 0,day,temperature,windspeed,event
2,1/3/2017,28,2,Snow


## <font color='blue'>Columns</font>

In [9]:
df.columns  # the column labels, held in a pandas Index

Index(['day', 'temperature', 'windspeed', 'event'], dtype='object')

In [10]:
df['day']  # one column as a Series; df.day is equivalent here

0    1/1/2017
1    1/2/2017
2    1/3/2017
3    1/4/2017
4    1/5/2017
5    1/6/2017
Name: day, dtype: object

In [13]:
# A single column selected with square brackets is a pandas Series.
type(df['temperature'])

pandas.core.series.Series

In [12]:
df[['day','temperature']]  # a list of names selects several columns and returns a DataFrame

Unnamed: 0,day,temperature
0,1/1/2017,32
1,1/2/2017,35
2,1/3/2017,28
3,1/4/2017,24
4,1/5/2017,32
5,1/6/2017,31


## <font color='blue'>Operations On DataFrame</font>

In [21]:
df['temperature'].mean()  # arithmetic mean of the temperature column

30.333333333333332

In [23]:
df['temperature'].mean().round(2)  # the same mean, rounded to 2 decimal places

30.33

In [15]:
df[df['temperature']>28]  # keep only the rows whose temperature is above 28

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32,6,Rain
1,1/2/2017,35,7,Sunny
4,1/5/2017,32,4,Rain
5,1/6/2017,31,2,Sunny


In [24]:
df['temperature'] > 32  # element-wise comparison: returns a boolean mask, True where the condition holds

0    False
1     True
2    False
3    False
4    False
5    False
Name: temperature, dtype: bool

In [25]:
# Element-wise comparison: one True/False per row, True where temperature exceeds 32.
greater_than_32_mask = df['temperature'].gt(32)
# Indexing with the boolean Series keeps only the rows flagged True.
df.loc[greater_than_32_mask]

Unnamed: 0,day,temperature,windspeed,event
1,1/2/2017,35,7,Sunny


In [26]:
# Store the boolean mask as a new column named "above 32" (this modifies df itself).
df["above 32"] = greater_than_32_mask
df

Unnamed: 0,day,temperature,windspeed,event,above 32
0,1/1/2017,32,6,Rain,False
1,1/2/2017,35,7,Sunny,True
2,1/3/2017,28,2,Snow,False
3,1/4/2017,24,7,Snow,False
4,1/5/2017,32,4,Rain,False
5,1/6/2017,31,2,windy,False


In [27]:
df[["day", "above 32"]]  # each day alongside its "above 32" flag

Unnamed: 0,day,above 32
0,1/1/2017,False
1,1/2/2017,True
2,1/3/2017,False
3,1/4/2017,False
4,1/5/2017,False
5,1/6/2017,False


In [15]:
# Day(s) with the highest temperature. A single .loc[row_mask, column] lookup
# replaces the chained df['day'][mask] form, selecting rows and column in one step.
df.loc[df['temperature'] == df['temperature'].max(), 'day']

1    1/2/2017
Name: day, dtype: object

In [16]:
df[df['temperature'] == df['temperature'].max()]  # full row(s) with the highest temperature -- comparable to a SQL WHERE clause

Unnamed: 0,day,temperature,windspeed,event
1,1/2/2017,35,7,Sunny


In [17]:
df['temperature'].std()  # standard deviation of the temperature column

3.8297084310253524

In [18]:
df['event'].max()  # strings are compared in sort order, so max() works; mean() would not, since the values are strings

'Sunny'

In [19]:
df.describe()  # summary statistics (count, mean, std, min, quartiles, max) of the numeric columns

Unnamed: 0,temperature,windspeed
count,6.0,6.0
mean,30.333333,4.666667
std,3.829708,2.33809
min,24.0,2.0
25%,28.75,2.5
50%,31.5,5.0
75%,32.0,6.75
max,35.0,7.0


**Search for "pandas Series operations" to find the list of all available operations:**
https://pandas.pydata.org/docs/reference/api/pandas.Series.html

## <font color='blue'>set_index</font>

In [20]:
df.set_index('day')  # returns a new DataFrame indexed by 'day'; df itself is left unchanged

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1/1/2017,32,6,Rain
1/2/2017,35,7,Sunny
1/3/2017,28,2,Snow
1/4/2017,24,7,Snow
1/5/2017,32,4,Rain
1/6/2017,31,2,Sunny


In [3]:
# Make 'day' the row index. Rebinding the result replaces inplace=True so the
# change to df is an explicit assignment; like the original, the cell displays nothing.
df = df.set_index('day')

In [22]:
df  # display the current state of the DataFrame

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1/1/2017,32,6,Rain
1/2/2017,35,7,Sunny
1/3/2017,28,2,Snow
1/4/2017,24,7,Snow
1/5/2017,32,4,Rain
1/6/2017,31,2,Sunny


In [23]:
df.index  # the row labels of the DataFrame

Index(['1/1/2017', '1/2/2017', '1/3/2017', '1/4/2017', '1/5/2017', '1/6/2017'], dtype='object', name='day')

In [24]:
df.loc['1/2/2017']  # label-based lookup: the row whose index label is '1/2/2017', returned as a Series

temperature       35
windspeed          7
event          Sunny
Name: 1/2/2017, dtype: object

In [25]:
# Move the current index back into an ordinary column; rows are numbered from 0 again.
# Rebinding the result replaces inplace=True so the change to df is an explicit assignment.
df = df.reset_index()
df.head()

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32,6,Rain
1,1/2/2017,35,7,Sunny
2,1/3/2017,28,2,Snow
3,1/4/2017,24,7,Snow
4,1/5/2017,32,4,Rain


In [26]:
# Index the rows by event so they can be looked up by label, much like keys in a
# hash map -- except that labels may repeat (several rows share 'Rain' or 'Snow').
# Rebinding the result replaces inplace=True so the change to df is an explicit assignment.
df = df.set_index('event')
df

Unnamed: 0_level_0,day,temperature,windspeed
event,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Rain,1/1/2017,32,6
Sunny,1/2/2017,35,7
Snow,1/3/2017,28,2
Snow,1/4/2017,24,7
Rain,1/5/2017,32,4
Sunny,1/6/2017,31,2


In [29]:
df.loc['Rain']  # every row labelled 'Rain'; the label repeats, so the result is a DataFrame

Unnamed: 0_level_0,day,temperature,windspeed
event,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Rain,1/1/2017,32,6
Rain,1/5/2017,32,4
