# **Pandas**

### 1) Load the CSV

In [1]:
import pandas as pd

In [3]:
# Load the CSV at the relative path Data/mckinsey.csv into a DataFrame.
df = pd.read_csv("Data/mckinsey.csv")
# A bare last expression renders the frame as the cell output.
df

Unnamed: 0,country,year,population,continent,life_exp,gdp_cap
0,Afghanistan,1952,8425333,Asia,28.801,779.445314
1,Afghanistan,1957,9240934,Asia,30.332,820.853030
2,Afghanistan,1962,10267083,Asia,31.997,853.100710
3,Afghanistan,1967,11537966,Asia,34.020,836.197138
4,Afghanistan,1972,13079460,Asia,36.088,739.981106
...,...,...,...,...,...,...
1699,Zimbabwe,1987,9216418,Africa,62.351,706.157306
1700,Zimbabwe,1992,10704340,Africa,60.377,693.420786
1701,Zimbabwe,1997,11404948,Africa,46.809,792.449960
1702,Zimbabwe,2002,11926563,Africa,39.989,672.038623


### Add a new Row
1) Create a dictionary
2) Create a DataFrame from the dictionary
3) Concatenate the new DataFrame to the existing one

In [4]:
# One record for the new row, keyed by column name.
new_row = {
    'country': 'India',
    'year': 2024,
    'population': 1540000000,
    'continent': 'Asia',
    'life_exp': 80.00,
    'gdp_cap': 939.123,
}

In [5]:
# Show the dictionary to confirm its keys and values.
new_row

{'country': 'India',
 'year': 2024,
 'population': 1540000000,
 'continent': 'Asia',
 'life_exp': 80.0,
 'gdp_cap': 939.123}

In [8]:
# Wrap the dictionary in a list so it becomes a single-row DataFrame
# (one dictionary per row, keys become column names).
pd.DataFrame(data=[new_row])

Unnamed: 0,country,year,population,continent,life_exp,gdp_cap
0,India,2024,1540000000,Asia,80.0,939.123


In [9]:
# Keep the single-row DataFrame so it can be concatenated with df.
df1 = pd.DataFrame(data=[new_row])

In [10]:
# Stack df1 underneath df (row-wise). The result is only displayed, not
# assigned, so df itself is left unchanged.
pd.concat([df, df1], axis=0)

Unnamed: 0,country,year,population,continent,life_exp,gdp_cap
0,Afghanistan,1952,8425333,Asia,28.801,779.445314
1,Afghanistan,1957,9240934,Asia,30.332,820.853030
2,Afghanistan,1962,10267083,Asia,31.997,853.100710
3,Afghanistan,1967,11537966,Asia,34.020,836.197138
4,Afghanistan,1972,13079460,Asia,36.088,739.981106
...,...,...,...,...,...,...
1700,Zimbabwe,1992,10704340,Africa,60.377,693.420786
1701,Zimbabwe,1997,11404948,Africa,46.809,792.449960
1702,Zimbabwe,2002,11926563,Africa,39.989,672.038623
1703,Zimbabwe,2007,12311143,Africa,43.487,469.709298


#### The new row is added, but it keeps its own index label (0) from `df1`. Pass `ignore_index=True` to renumber the index.

In [12]:
# Same row-wise concatenation, but discard the original labels and number
# the rows of the result from 0 upwards.
pd.concat([df, df1], axis=0, ignore_index=True)

Unnamed: 0,country,year,population,continent,life_exp,gdp_cap
0,Afghanistan,1952,8425333,Asia,28.801,779.445314
1,Afghanistan,1957,9240934,Asia,30.332,820.853030
2,Afghanistan,1962,10267083,Asia,31.997,853.100710
3,Afghanistan,1967,11537966,Asia,34.020,836.197138
4,Afghanistan,1972,13079460,Asia,36.088,739.981106
...,...,...,...,...,...,...
1700,Zimbabwe,1992,10704340,Africa,60.377,693.420786
1701,Zimbabwe,1997,11404948,Africa,46.809,792.449960
1702,Zimbabwe,2002,11926563,Africa,39.989,672.038623
1703,Zimbabwe,2007,12311143,Africa,43.487,469.709298


### We can add a new row using loc

In [13]:
# Assigning to a label that does not exist yet makes .loc append a new row.
# The list must supply one value per column, in column order.
# NOTE(review): the label is len(df) + 1, so one integer label is skipped
# (the outputs show label 1705 directly after 1703). len(df) would keep the
# labels contiguous, but changing it also requires updating any later cell
# that refers to the resulting label.
# This cell mutates df in place, so re-running it appends another row.
df.loc[len(df) + 1] = ['new loc',2000, 1350000000, 'Asia',79.5, 913.13]
df

Unnamed: 0,country,year,population,continent,life_exp,gdp_cap
0,Afghanistan,1952,8425333,Asia,28.801,779.445314
1,Afghanistan,1957,9240934,Asia,30.332,820.853030
2,Afghanistan,1962,10267083,Asia,31.997,853.100710
3,Afghanistan,1967,11537966,Asia,34.020,836.197138
4,Afghanistan,1972,13079460,Asia,36.088,739.981106
...,...,...,...,...,...,...
1700,Zimbabwe,1992,10704340,Africa,60.377,693.420786
1701,Zimbabwe,1997,11404948,Africa,46.809,792.449960
1702,Zimbabwe,2002,11926563,Africa,39.989,672.038623
1703,Zimbabwe,2007,12311143,Africa,43.487,469.709298


#### When a column value is missing, this throws an error

In [15]:
# Left commented out on purpose: the list has only five values for six columns
# (the 'continent' value is missing), so this assignment throws an error.
# df.loc[len(df) + 1] = ['new loc',2000, 1350000000,79.5, 913.13]

#### We cannot add a new row using `iloc`, because `iloc` is position-based and cannot extend the DataFrame

In [17]:
# Left commented out on purpose: this throws an error because iloc cannot
# extend the DataFrame with a new row.
# df.iloc[len(df) + 1] = ['new loc',2000, 1350000000, 'Asia',79.5, 913.13]
# df

### Update the row

In [18]:
# Label 0 already exists, so this overwrites that row in place instead of
# appending. One value per column, in column order.
df.loc[0] = ['update loc',2000, 1350000000, 'Asia',79.5, 913.13]
df

Unnamed: 0,country,year,population,continent,life_exp,gdp_cap
0,update loc,2000,1350000000,Asia,79.500,913.130000
1,Afghanistan,1957,9240934,Asia,30.332,820.853030
2,Afghanistan,1962,10267083,Asia,31.997,853.100710
3,Afghanistan,1967,11537966,Asia,34.020,836.197138
4,Afghanistan,1972,13079460,Asia,36.088,739.981106
...,...,...,...,...,...,...
1700,Zimbabwe,1992,10704340,Africa,60.377,693.420786
1701,Zimbabwe,1997,11404948,Africa,46.809,792.449960
1702,Zimbabwe,2002,11926563,Africa,39.989,672.038623
1703,Zimbabwe,2007,12311143,Africa,43.487,469.709298


In [19]:
# iloc addresses rows by position: position 1 is the second row, which is
# overwritten in place.
df.iloc[1] = ['updated iloc',1957, 9240934, 'Asia',30.332, 820.853030]
df

Unnamed: 0,country,year,population,continent,life_exp,gdp_cap
0,update loc,2000,1350000000,Asia,79.500,913.130000
1,updated iloc,1957,9240934,Asia,30.332,820.853030
2,Afghanistan,1962,10267083,Asia,31.997,853.100710
3,Afghanistan,1967,11537966,Asia,34.020,836.197138
4,Afghanistan,1972,13079460,Asia,36.088,739.981106
...,...,...,...,...,...,...
1700,Zimbabwe,1992,10704340,Africa,60.377,693.420786
1701,Zimbabwe,1997,11404948,Africa,46.809,792.449960
1702,Zimbabwe,2002,11926563,Africa,39.989,672.038623
1703,Zimbabwe,2007,12311143,Africa,43.487,469.709298


#### Negative Indexing

In [20]:
df.iloc[-1]  # Works because iloc is position-based: -1 means the last row, whatever its label

country          new loc
year                2000
population    1350000000
continent           Asia
life_exp            79.5
gdp_cap           913.13
Name: 1705, dtype: object

In [22]:
# df.loc[-1]  # Left commented out: loc looks rows up by label, and no row is labelled -1, so this throws an error

### Drop Rows

In [23]:
# Drop rows by index label: the two rows labelled 0 and 1, plus the last row
# of df. The last label is read from the index instead of being hard-coded,
# so the cell keeps working if the label given to the appended row changes.
# This assumes the row appended via .loc is still the last row of df.
# drop() returns a new DataFrame; df itself is not modified.
df.drop([0, 1, df.index[-1]])

Unnamed: 0,country,year,population,continent,life_exp,gdp_cap
2,Afghanistan,1962,10267083,Asia,31.997,853.100710
3,Afghanistan,1967,11537966,Asia,34.020,836.197138
4,Afghanistan,1972,13079460,Asia,36.088,739.981106
5,Afghanistan,1977,14880372,Asia,38.438,786.113360
6,Afghanistan,1982,12881816,Asia,39.854,978.011439
...,...,...,...,...,...,...
1699,Zimbabwe,1987,9216418,Africa,62.351,706.157306
1700,Zimbabwe,1992,10704340,Africa,60.377,693.420786
1701,Zimbabwe,1997,11404948,Africa,46.809,792.449960
1702,Zimbabwe,2002,11926563,Africa,39.989,672.038623


#### The index is not reset after dropping rows. We can use `reset_index()` to reset it

In [24]:
# Renumber the rows from 0. With drop=False (the default) the previous index
# is kept as a regular column named 'index'.
df.reset_index(drop=False)

Unnamed: 0,index,country,year,population,continent,life_exp,gdp_cap
0,0,update loc,2000,1350000000,Asia,79.500,913.130000
1,1,updated iloc,1957,9240934,Asia,30.332,820.853030
2,2,Afghanistan,1962,10267083,Asia,31.997,853.100710
3,3,Afghanistan,1967,11537966,Asia,34.020,836.197138
4,4,Afghanistan,1972,13079460,Asia,36.088,739.981106
...,...,...,...,...,...,...,...
1700,1700,Zimbabwe,1992,10704340,Africa,60.377,693.420786
1701,1701,Zimbabwe,1997,11404948,Africa,46.809,792.449960
1702,1702,Zimbabwe,2002,11926563,Africa,39.989,672.038623
1703,1703,Zimbabwe,2007,12311143,Africa,43.487,469.709298


#### Now the old index is kept as an extra `index` column. Use `reset_index(drop=True)` to discard it

In [25]:
# Renumber the rows from 0 and discard the previous index instead of keeping
# it as a column.
df.reset_index(drop=True)

Unnamed: 0,country,year,population,continent,life_exp,gdp_cap
0,update loc,2000,1350000000,Asia,79.500,913.130000
1,updated iloc,1957,9240934,Asia,30.332,820.853030
2,Afghanistan,1962,10267083,Asia,31.997,853.100710
3,Afghanistan,1967,11537966,Asia,34.020,836.197138
4,Afghanistan,1972,13079460,Asia,36.088,739.981106
...,...,...,...,...,...,...
1700,Zimbabwe,1992,10704340,Africa,60.377,693.420786
1701,Zimbabwe,1997,11404948,Africa,46.809,792.449960
1702,Zimbabwe,2002,11926563,Africa,39.989,672.038623
1703,Zimbabwe,2007,12311143,Africa,43.487,469.709298


### Duplicates

In [26]:
# Column-oriented sample data containing repeated rows, used to demonstrate
# duplicate handling.
df_dup_dict = {
    'brand': ['Yum Yum', 'Yum Yum', 'Indomine', 'Indomine', 'Indomine'],
    'style': ['cup', 'cup', 'pack', 'pack', 'pack'],
    'rating': [14, 4, 15, 15, 15],
}

In [27]:
# Build the demo DataFrame: each dictionary key becomes a column.
df_dup = pd.DataFrame(data=df_dup_dict)

In [28]:
# Display the demo frame; rows 2, 3 and 4 are identical.
df_dup

Unnamed: 0,brand,style,rating
0,Yum Yum,cup,14
1,Yum Yum,cup,4
2,Indomine,pack,15
3,Indomine,pack,15
4,Indomine,pack,15


In [30]:
# Boolean Series: True for every row that repeats an earlier row. With
# keep='first' (the default) the first occurrence is marked False.
df_dup.duplicated(keep='first')

0    False
1    False
2    False
3     True
4     True
dtype: bool

In [32]:
df_dup.loc[[True, True, True, False, False]] # A boolean list acts as a mask: only rows whose entry is True are returned

Unnamed: 0,brand,style,rating
0,Yum Yum,cup,14
1,Yum Yum,cup,4
2,Indomine,pack,15


In [34]:
# iloc accepts the same kind of boolean list as a row mask.
df_dup.iloc[[True, True, True, False, False]]

Unnamed: 0,brand,style,rating
0,Yum Yum,cup,14
1,Yum Yum,cup,4
2,Indomine,pack,15


In [36]:
# Use the duplicated() mask to keep only the rows that repeat an earlier row.
df_dup[df_dup.duplicated()]

Unnamed: 0,brand,style,rating
3,Indomine,pack,15
4,Indomine,pack,15


#### To drop the duplicate rows

In [39]:
df_dup.drop_duplicates() # Returns a copy with fully duplicated rows removed (first occurrence kept)
# With inplace=True the change would be applied to df_dup itself (permanent) instead

Unnamed: 0,brand,style,rating
0,Yum Yum,cup,14
1,Yum Yum,cup,4
2,Indomine,pack,15


In [40]:
# df_dup still has all five rows: drop_duplicates() above returned a new frame.
df_dup

Unnamed: 0,brand,style,rating
0,Yum Yum,cup,14
1,Yum Yum,cup,4
2,Indomine,pack,15
3,Indomine,pack,15
4,Indomine,pack,15


#### Drop Based on Columns

In [44]:
# Compare rows on 'brand' only and keep the first row seen for each brand.
df_dup.drop_duplicates(subset=['brand'], keep='first')

Unnamed: 0,brand,style,rating
0,Yum Yum,cup,14
2,Indomine,pack,15


In [45]:
# Compare rows on 'brand' only and keep the last row seen for each brand.
df_dup.drop_duplicates(subset=['brand'], keep='last')

Unnamed: 0,brand,style,rating
1,Yum Yum,cup,4
4,Indomine,pack,15


In [46]:
# Rows count as duplicates when both 'brand' and 'style' match; the first
# row of each (brand, style) pair is kept.
df_dup.drop_duplicates(
    subset=['brand', 'style'],
    keep='first',
)

Unnamed: 0,brand,style,rating
0,Yum Yum,cup,14
2,Indomine,pack,15


### Diagonal Elements

In [47]:
# Label-based selection: rows labelled 0, 1, 2 and the three named columns.
df.loc[
    [0, 1, 2],
    ['year', 'continent', 'life_exp'],
]

Unnamed: 0,year,continent,life_exp
0,2000,Asia,79.5
1,1957,Asia,30.332
2,1962,Asia,31.997


In [48]:
# Position-based selection: the first three rows, and the columns at
# positions 1, 0 and -1 (the last column), returned in that order.
df.iloc[
    [0, 1, 2],
    [1, 0, -1],
]

Unnamed: 0,year,country,gdp_cap
0,2000,update loc,913.13
1,1957,updated iloc,820.85303
2,1962,Afghanistan,853.10071


### Pandas Builtin Operations

In [49]:
# Selecting one column by name returns it as a Series.
df['life_exp']

0       79.500
1       30.332
2       31.997
3       34.020
4       36.088
         ...  
1700    60.377
1701    46.809
1702    39.989
1703    43.487
1705    79.500
Name: life_exp, Length: 1705, dtype: float64

In [50]:
# Mean of the life_exp column. This includes the rows added or overwritten by
# earlier cells, since those cells changed df in place.
df['life_exp'].mean()

59.51592004692082

In [51]:
# Smallest value in the life_exp column.
df['life_exp'].min()

23.599