In [None]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

## reading data

In [None]:
# Build a small DataFrame from a dict that maps column name -> list of values.
# Every list has the same length (5), one entry per row.
info = dict(
    names=['Suraj', 'Shiv', 'Shivi', 'Anjali', 'Sunny'],
    age=[19, 5, 21, 20, 22],
    height=[5.23, 3.5, 5.1, 5.9, 6.1],
)
d1 = pd.DataFrame(data=info)
d1

In [None]:
# Load the sample CSV; the relative path is resolved against the kernel's
# working directory. Later cells read its 'english', 'math' and 'science' columns.
d2 = pd.read_csv('datasample.csv')
d2

In [None]:
# None removes the column limit, so wide frames are displayed without '...' truncation
pd.set_option('display.max_columns', None)

In [None]:
# Load one worksheet of the Excel workbook into the main DataFrame `df`.
#   sheet_name=1  -> the second sheet (integer sheet positions are 0-based)
#   skiprows=20   -> ignore the first 20 rows of the sheet before the header
#   skipfooter=2  -> ignore the last 2 rows of the sheet
df = pd.read_excel('Canada.xlsx', 
    sheet_name=1,
    skiprows=20,
    skipfooter=2
)
df

## looking at the data
- head(n) - top n rows of the data
- tail(n) - bottom n rows of the data
- sample(n) - random n samples of the data
- info() - column-wise data types, non-null counts and memory usage
- describe() - summary statistics of the data

In [None]:
# first 2 rows
df.head(2)

In [None]:
# last 5 rows (tail defaults to n=5)
df.tail()

In [None]:
# 3 rows picked at random; no random_state is given, so the rows differ on every run
df.sample(3)

In [None]:
# per-column dtype and non-null count, plus the frame's memory usage
df.info()

In [None]:
# default describe(): count/mean/std/min/quartiles/max for the numerical columns only
df.describe()

In [None]:
# object (string) columns instead: count, unique, top (most frequent value) and freq
df.describe(include='object')

## column wise data

In [None]:
# Index holding all column labels
df.columns

In [None]:
# a single label inside [] returns that column as a Series
df['AreaName']

In [None]:
# multiple columns: a list of labels inside [] returns a DataFrame
# (note the double brackets: the outer pair indexes, the inner pair is the list)
df[['AreaName', 'RegName']]

In [None]:
# Year columns are selected by integer labels 1980..2013 (range's stop is exclusive).
years = [*range(1980, 2014)]
df[years]

In [None]:
# column labels can be mixed: 'OdName' is a string label, the years are integer labels
df[['OdName',1980,1990,2000,2010]]

In [None]:
# row-wise access using loc and iloc
# loc is label-based: returns the row whose index label is 1
df.loc[1]

In [None]:
# iloc is position-based: returns the second row (positions are 0-based), whatever its label
df.iloc[1]

In [None]:
# iloc takes integer positions for rows and columns:
# first 10 rows (stop position excluded) and the columns at positions 0-4
df.iloc[:10, [0,1,2,3,4]]

In [None]:
# loc takes labels for rows and columns.
# Unlike iloc, a label slice INCLUDES its stop: rows up to and including label 10
# (11 rows with the default 0..n-1 index produced by read_excel).
df.loc[:10, ['OdName',1980,1990,2000,2010]]

## manipulating data

In [None]:
# element-wise (row by row) sum of three columns; returns a new Series, d2 is unchanged
d2['english'] + d2['math'] + d2['science']

In [None]:
# Store the row-by-row sum of the three subject columns as a new 'total' column.
subject_sum = d2['english'].add(d2['math']).add(d2['science'])
d2['total'] = subject_sum
d2

In [None]:
# Sum across all year columns (one value per row) and keep it as 'total'.
df['total'] = df.loc[:, years].sum(axis='columns')
df

In [None]:
# assigning a scalar broadcasts it to every row of the new column
# ('dummy_col' only serves as an example; it is dropped again further down)
df['dummy_col'] = 1
df

# dropping unwanted columns and rows
- drop() - drop the columns
- dropna() - drop the rows with missing values
- drop_duplicates() - drop the duplicate rows

In [None]:
# Drop the columns that are not needed for the analysis, including the
# 'dummy_col' example column added earlier.
# errors='ignore' keeps this cell re-runnable: on a second run the columns are
# already gone, and a plain drop() would raise KeyError.
cols_to_drop = ['Type','Coverage','AREA','REG','DEV','dummy_col']
df = df.drop(columns=cols_to_drop, errors='ignore')

In [None]:
# Give the descriptive columns short, readable names (old label -> new label).
# Columns that are not listed in the mapping keep their current label.
column_names = {
    'OdName': 'country',
    'AreaName': 'continent',
    'RegName': 'region',
    'DevName': 'status',
}
df = df.rename(columns=column_names)

In [None]:
# ascending order (the default); the sorted copy is only displayed, df itself is unchanged
df.sort_values(by='total')

In [None]:
# descending order, kept by rebinding df: largest 'total' first from here on.
# Index labels are not reset, so row position and index label no longer coincide.
df = df.sort_values(by='total', ascending=False)

In [None]:
# mean 'total' of the first 10 rows, i.e. the 10 largest totals after the descending sort
df.head(10)['total'].mean()

In [None]:
# Bar chart of the first 10 rows of df (the 10 largest totals, since df is
# sorted by 'total' in descending order), with their mean drawn as a dashed
# red reference line.
top10 = df.head(10)
top10_mean = top10['total'].mean()  # computed once, reused for the line and its label

fig, ax = plt.subplots(figsize=(15,6))
sns.barplot(x='country', y='total', data=top10, ax=ax)
ax.axhline(top10_mean, color='r', linestyle='--')
# Label the line with its value: x=5 is the sixth bar (categorical positions
# start at 0) and the +10000 offset (in 'total' units) lifts the text above the line.
ax.text(5, top10_mean + 10000, f"{top10_mean:,.0f}")
ax.set_title("Top 10 countries with highest immigration overall")
# country names are long, so rotate them to avoid overlapping tick labels
ax.tick_params(axis='x', labelrotation=90)
plt.show()

In [None]:
# all country names as a plain Python list, in the frame's current row order
df['country'].tolist()

In [None]:
# Shorten the UK's long official name.
# The nested dict {column: {old: new}} restricts the replacement to the
# 'country' column instead of matching against every cell of the frame.
df = df.replace(
    {'country': {'United Kingdom of Great Britain and Northern Ireland': 'UK'}}
)
df

In [None]:
def _format_lac(total):
    """Return `total` divided by 100000, formatted with two decimals and a ' Lac' suffix."""
    return f"{total/100000:.2f} Lac"

# human-readable text version of 'total', one string per row
df['total_str'] = df['total'].map(_format_lac)
df

# grouping data
- groupby() - group the data
  - groups the data according to one or more columns and then lets us apply aggregations such as mean, sum, max, etc.
- pivot_table() - pivot the data
  - generates a table whose row and column labels come from the given index and columns, with the values aggregated in each cell (mean by default)

In [None]:
# groupby alone only returns a DataFrameGroupBy object; nothing is computed
# until an aggregation (sum, mean, ...) or a selection is applied to it
df.groupby(by='continent')

In [None]:
# getting a particular group subset
# returns only the rows whose 'continent' value is 'Europe'
df.groupby(by='continent').get_group('Europe')

In [None]:
# getting group based stats
# sum of 'total' per continent -> Series indexed by continent
df.groupby(by='continent')['total'].sum()

In [None]:
# selecting a list of columns gives one sum per continent and year -> DataFrame
df.groupby(by='continent')[[1980,1981,1982,1983,1984]].sum()

In [None]:
# same aggregation over every year column: rows = continents, columns = years
df.groupby(by='continent')[years].sum()

In [None]:
# grouping on multiple columns yields a Series with a (continent, status) MultiIndex;
# reset_index() turns those index levels back into regular columns, giving a flat DataFrame
df.groupby(by=['continent','status'])['total'].sum().reset_index()

In [None]:
# Sum every year column per continent: rows = continents, columns = years.
# continent_df is reused by the line plot further down.
continent_df = df.groupby(by='continent')[years].sum()
# heatmap of that continent x year table; 'coolwarm' runs from blue (low) to red (high)
sns.heatmap(continent_df, cmap='coolwarm')

In [None]:
# overall total per continent (Series indexed by continent); plotted in the next cell
df.groupby(by='continent')['total'].sum()

In [None]:
# Overall total per continent as a Series (index = continent), drawn as one bar each.
cdf = df.groupby('continent')['total'].agg('sum')
sns.barplot(x=cdf.index, y=cdf.to_numpy())

In [None]:
# One line per continent across the years: transposing continent_df makes the
# years the index (x axis) and the continents the columns (one line each).
fig, ax = plt.subplots(figsize=(15,6))
# draw on the axes created above instead of relying on pyplot's implicit current axes
sns.lineplot(data=continent_df.T, dashes=False, ax=ax)
plt.show()

In [None]:
# pivot_table needs three things: index, columns and values.
#   index / columns -> categorical columns that become the row / column labels
#   values          -> numerical column that is aggregated inside each cell
# aggfunc='mean' is pandas' default, spelled out here: each cell holds the mean
# 'total' of the rows that share that continent/status pair.
df2 = pd.pivot_table(
    df,
    index='continent',
    columns='status',
    values='total',
    aggfunc='mean',
)

In [None]:
# pandas plot function
# subplots=True draws one pie per column of df2, with one slice per index row;
# autopct prints each slice's share as a percentage with one decimal.
# NOTE(review): df2 was built with pivot_table's default aggregation (mean), so the
# slices compare mean totals, not summed totals - confirm that is what is intended.
df2.plot(kind='pie', subplots=True, figsize=(15,6), autopct='%1.1f%%')

### task - download the pokemon dataset from kaggle.com and put it in the same folder as this notebook