# Installing and Getting Started with Pandas

# Pandas DataFrame

In [None]:
import pandas as pd
import numpy as np

# Column data for five Indian states; the lists are aligned by position.
# NOTE(review): the spellings 'Maharastra', 'Karnata' and 'Trivaandrum' are
# kept verbatim because a later cell filters on the exact string 'Karnata'.
state_names = ['Andhra Pradesh', 'Maharastra', 'Karnata', 'Kerala', 'Tamil Nadu']
capital_names = ['Hyderabad', 'Mumbai', 'Bengaluru', 'Trivaandrum', 'Chennai']
literacy_rates = [89, 77, 82, 97, 85]
avg_high_temps = [33, 30, 29, 31, 32]

# Dict keys become the column labels, in insertion order.
df = pd.DataFrame(
    {
        "State": state_names,
        "Capital": capital_names,
        "Literacy %": literacy_rates,
        "Avg High Temp(c)": avg_high_temps,
    }
)

print(df)

            State      Capital  Literacy %  Avg High Temp(c)
0  Andhra Pradesh    Hyderabad          89                33
1      Maharastra       Mumbai          77                30
2         Karnata    Bengaluru          82                29
3          Kerala  Trivaandrum          97                31
4      Tamil Nadu      Chennai          85                32


# Importing data from CSV to DataFrame

In [7]:
import os

import pandas as pd

# Location of the training CSV. Set the TRAIN_CSV environment variable to
# point at your own copy; the fallback is the path that was originally
# hard-coded here, so existing setups keep working unchanged.
train_csv_path = os.environ.get('TRAIN_CSV', '/home/bipinhyoju/Downloads/train.csv')

# Load the whole file into a DataFrame.
data = pd.read_csv(train_csv_path)

# Printing a large frame shows only its head and tail rows with "..." between.
print(data)

                                                    path          name  \
0      /kaggle/input/benetech-making-graphs-accessibl...  cc68f19b708c   
1      /kaggle/input/benetech-making-graphs-accessibl...  0e6a9bca7381   
2      /kaggle/input/benetech-making-graphs-accessibl...  ac9680f519f5   
3      /kaggle/input/benetech-making-graphs-accessibl...  4a6cce801fe8   
4      /kaggle/input/benetech-making-graphs-accessibl...  19938a5762f6   
...                                                  ...           ...   
60573  /kaggle/input/benetech-making-graphs-accessibl...  d3953f6dc2b6   
60574  /kaggle/input/benetech-making-graphs-accessibl...  292f04b08480   
60575  /kaggle/input/benetech-making-graphs-accessibl...  2af23469c1d8   
60576  /kaggle/input/benetech-making-graphs-accessibl...  fc88838dcd7d   
60577  /kaggle/input/benetech-making-graphs-accessibl...  a1733f81c31e   

                          annotation                    image     source  \
0      annotations/cc68f19b708c.jso

# Inspecting data in DataFrame

In [8]:
# Peek at the first two rows of the frame.
first_rows = df.head(2)
print(first_rows)

            State    Capital  Literacy %  Avg High Temp(c)
0  Andhra Pradesh  Hyderabad          89                33
1      Maharastra     Mumbai          77                30


In [10]:
# Peek at the last row of the frame.
last_row = df.tail(1)
print(last_row)

        State  Capital  Literacy %  Avg High Temp(c)
4  Tamil Nadu  Chennai          85                32


In [12]:
# Per-column data types (one entry per column label).
column_dtypes = df.dtypes
print(column_dtypes)

State               object
Capital             object
Literacy %           int64
Avg High Temp(c)     int64
dtype: object


In [13]:
# Row labels of the frame.
row_labels = df.index
print(row_labels)

RangeIndex(start=0, stop=5, step=1)


In [14]:
# Column labels of the frame.
column_labels = df.columns
print(column_labels)

Index(['State', 'Capital', 'Literacy %', 'Avg High Temp(c)'], dtype='object')


In [15]:
# The frame's contents as a plain 2-D array (labels are dropped).
cell_values = df.values
print(cell_values)

[['Andhra Pradesh' 'Hyderabad' 89 33]
 ['Maharastra' 'Mumbai' 77 30]
 ['Karnata' 'Bengaluru' 82 29]
 ['Kerala' 'Trivaandrum' 97 31]
 ['Tamil Nadu' 'Chennai' 85 32]]


In [19]:
# 1. Getting Statistical summary of records

In [18]:
# Summary statistics (count, mean, std, quartiles, min/max) for one column.
literacy_summary = df['Literacy %'].describe()
print(literacy_summary)

count     5.000000
mean     86.000000
std       7.549834
min      77.000000
25%      82.000000
50%      85.000000
75%      89.000000
max      97.000000
Name: Literacy %, dtype: float64


# 2. Sorting records

In [20]:
# Order rows from highest to lowest literacy; `df` itself is not reordered.
ranked_by_literacy = df.sort_values(by='Literacy %', ascending=False)
print(ranked_by_literacy)

            State      Capital  Literacy %  Avg High Temp(c)
3          Kerala  Trivaandrum          97                31
0  Andhra Pradesh    Hyderabad          89                33
4      Tamil Nadu      Chennai          85                32
2         Karnata    Bengaluru          82                29
1      Maharastra       Mumbai          77                30


# 3. Slicing records

In [23]:
# Selecting a single label returns that column as a Series.
capital_column = df['Capital']
capital_column

0      Hyderabad
1         Mumbai
2      Bengaluru
3    Trivaandrum
4        Chennai
Name: Capital, dtype: object

In [24]:
# Selecting with a list of labels returns a DataFrame with just those columns.
wanted_columns = ['State', 'Capital']
print(df[wanted_columns])

            State      Capital
0  Andhra Pradesh    Hyderabad
1      Maharastra       Mumbai
2         Karnata    Bengaluru
3          Kerala  Trivaandrum
4      Tamil Nadu      Chennai


In [25]:
# A slice inside [] selects rows by position: here the first three rows.
first_three_rows = df[0:3]
first_three_rows

Unnamed: 0,State,Capital,Literacy %,Avg High Temp(c)
0,Andhra Pradesh,Hyderabad,89,33
1,Maharastra,Mumbai,77,30
2,Karnata,Bengaluru,82,29


# 4. Filtering data

In [29]:
# Boolean mask: True for rows whose literacy is strictly above 90.
above_ninety = df['Literacy %'] > 90
print(df[above_ninety])

    State      Capital  Literacy %  Avg High Temp(c)
3  Kerala  Trivaandrum          97                31


In [34]:
# Keep only rows whose State matches one of the listed names exactly.
wanted_states = ['Tamil Nadu', 'Karnata']
in_wanted_states = df['State'].isin(wanted_states)
print(df[in_wanted_states])

        State    Capital  Literacy %  Avg High Temp(c)
2     Karnata  Bengaluru          82                29
4  Tamil Nadu    Chennai          85                32


# 5. Rename column

In [35]:
# Rename into a new frame instead of mutating `df` in place: the earlier cells
# index `df['Literacy %']`, so an in-place rename would make them raise
# KeyError when re-run after this cell.
df_renamed = df.rename(columns={'Literacy %': 'Literacy percentage'})
print(df_renamed.head())

            State      Capital  Literacy percentage  Avg High Temp(c)
0  Andhra Pradesh    Hyderabad                   89                33
1      Maharastra       Mumbai                   77                30
2         Karnata    Bengaluru                   82                29
3          Kerala  Trivaandrum                   97                31
4      Tamil Nadu      Chennai                   85                32


# 6. Data Wrangling

In [37]:
import pandas as pd

# First employee table: ids are stored as strings, one name per id.
d = dict(
    Employee_id=['1', '2', '3', '4', '5'],
    Employee_name=['Akshar', 'Jones', 'Kate', 'Mike', 'Tina'],
)

# Pass the column order explicitly (same order as the dict keys).
employee_columns = ['Employee_id', 'Employee_name']
df1 = pd.DataFrame(d, columns=employee_columns)

print(df1)

  Employee_id Employee_name
0           1        Akshar
1           2         Jones
2           3          Kate
3           4          Mike
4           5          Tina


In [38]:
import pandas as pd

# Second employee table; ids '4' and '5' overlap with the first table.
data = dict(
    Employee_id=['4', '5', '6', '7', '8'],
    Employee_name=['Meera', 'Tia', 'Varsha', 'Williams', 'Ziva'],
)

# Pass the column order explicitly (same order as the dict keys).
employee_columns = ['Employee_id', 'Employee_name']
df2 = pd.DataFrame(data, columns=employee_columns)

print(df2)

  Employee_id Employee_name
0           4         Meera
1           5           Tia
2           6        Varsha
3           7      Williams
4           8          Ziva


# a. Merging

In [39]:
# Join the two tables on Employee_id. Only ids present in both frames are
# kept, and the clashing Employee_name columns get _x / _y suffixes.
matched_employees = pd.merge(left=df1, right=df2, on='Employee_id')
print(matched_employees)

  Employee_id Employee_name_x Employee_name_y
0           4            Mike           Meera
1           5            Tina             Tia


# b. Grouping

In [41]:
import pandas as pd
import numpy as np

# Employee table in which the name 'Meera' appears under two different ids.
# NOTE: this rebinds `data` and `df2`, replacing the earlier definitions.
data = dict(
    Employee_id=['4', '5', '6', '7', '8'],
    Employee_name=['Meera', 'Meera', 'Varsha', 'Williams', 'Ziva'],
)

df2 = pd.DataFrame(data)

# Bucket the rows by employee name.
group = df2.groupby(by='Employee_name')

# Pull out every row that belongs to the 'Meera' bucket.
meera_rows = group.get_group('Meera')
print(meera_rows)

  Employee_id Employee_name
0           4         Meera
1           5         Meera


# c. Concatenating

In [42]:
# Stack the two tables vertically. Each frame keeps its own row labels, so the
# labels 0-4 appear twice in the result.
frames_to_stack = [df1, df2]
stacked_employees = pd.concat(frames_to_stack)
print(stacked_employees)

  Employee_id Employee_name
0           1        Akshar
1           2         Jones
2           3          Kate
3           4          Mike
4           5          Tina
0           4         Meera
1           5         Meera
2           6        Varsha
3           7      Williams
4           8          Ziva


# Create a DataFrame by passing Dict of Series

In [43]:
# A Series built from a plain list gets a default 0..n-1 integer index.
sample_values = [100, 200, 300, 400]
series_sample = pd.Series(data=sample_values)
print(series_sample)

0    100
1    200
2    300
3    400
dtype: int64


In [44]:
# Two Series with different label sets: 'Dravid' has a position but no
# matches-played entry.
matches_played = pd.Series(data=[400, 300, 200], index=['Sachin', 'Kohli', 'Raina'])
position = pd.Series(data=[1, 2, 3, 4], index=['Sachin', 'Kohli', 'Raina', 'Dravid'])

d = {
    'Matches played': matches_played,
    'Position': position,
}

# The frame's index is the union of both label sets; a label missing from one
# Series is filled with NaN in that column.
df = pd.DataFrame(d)

print(df)

        Matches played  Position
Dravid             NaN         4
Kohli            300.0         2
Raina            200.0         3
Sachin           400.0         1


# Column Selection, Addition, Deletion

In [45]:
# Rebuild the player frame from two Series with different label sets.
player_labels = ['Sachin', 'Kohli', 'Raina', 'Dravid']
d = {
    'Matches played': pd.Series([400, 300, 200], index=player_labels[:3]),
    'Position': pd.Series([1, 2, 3, 4], index=player_labels),
}

df = pd.DataFrame(d)

# Column selection: indexing with one label returns that column as a Series.
matches_played_column = df['Matches played']
print(matches_played_column)

Dravid      NaN
Kohli     300.0
Raina     200.0
Sachin    400.0
Name: Matches played, dtype: float64


In [None]:
d = {'Matches played': pd.Series([400, 300, 200], index = ['Sachin', 'Kohli', 'Raina']),
     'Position': pd.Series([1, 2, 3, 4], index= ['Sachin', 'Kohli', 'Raina', 'Dravid'])}

df = pd.DataFrame(d)

df['Runrate']