In [None]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

## reading data

In [None]:
# Column-oriented input: each key becomes a column name and each list
# supplies that column's values, one entry per row.
info = {
    'names': ['Suraj', 'Shiv', 'Shivi', 'Anjali', 'Sunny'],
    'age': [19, 5, 21, 20, 22],
    'height': [5.23, 3.5, 5.1, 5.9, 6.1],
}
d1 = pd.DataFrame(data=info)
d1

In [None]:
# Read the CSV file into a DataFrame; the relative path is resolved against
# the notebook's current working directory.
d2 = pd.read_csv('datasample.csv')
d2

In [None]:
# Show every column when a wide DataFrame is displayed (None removes the limit).
pd.options.display.max_columns = None

In [None]:
# Load one worksheet of the Excel workbook into a DataFrame.
df = pd.read_excel('Canada.xlsx', 
    sheet_name=1,  # an integer is a 0-based sheet position, i.e. the second sheet
    skiprows=20,  # skip the first 20 rows of the sheet
    skipfooter=2  # skip the last 2 rows of the sheet
)
df

## looking at the data
- head(n) - top n rows of the data
- tail(n) - bottom n rows of the data
- sample(n) - n randomly selected rows of the data
- info() - column-wise data types and non-null counts
- describe() - summary statistics of the data

In [None]:
# First two rows of the frame.
df.head(n=2)

In [None]:
# Last five rows of the frame (5 is the default, spelled out here).
df.tail(n=5)

In [None]:
# Three rows drawn at random; the selection changes on every run.
df.sample(n=3)

In [None]:
# Print a per-column summary: dtype and non-null count for each column.
df.info()

In [None]:
df.describe() # summary statistics; by default only numeric columns are included

In [None]:
# Restrict the summary to object-dtype (typically string) columns.
df.describe(include='object')

## column wise data

In [None]:
# The Index holding all column labels of the frame.
df.columns

In [None]:
df['AreaName'] # a single label selects one column and returns a Series

In [None]:
# A list of labels selects several columns at once.
df[['AreaName', 'RegName']] # the result is a DataFrame, not a Series

In [None]:
# Integer labels used to select the year columns; range() excludes its end
# value, so this covers 1980 through 2013.
years = list(range(1980, 2014))
df[years]

In [None]:
# String and integer column labels can be mixed in one selection list.
df[['OdName',1980,1990,2000,2010]]

In [None]:
# Row-wise access: loc selects by index label, iloc by integer position.
df.loc[1] # the row whose index label is 1

In [None]:
df.iloc[1] # the row at integer position 1, i.e. the second row

In [None]:
df.iloc[:10, [0,1,2,3,4]] # first 10 rows; with iloc, columns are given as integer positions

In [None]:
df.loc[:10, ['OdName',1980,1990,2000,2010]] # with loc, columns are given as labels; the label slice :10 includes label 10

## manipulating data

In [None]:
# Element-wise sum of the three subject columns; the result is only displayed, not stored.
d2['english'] + d2['math'] + d2['science']

In [None]:
# Store the row-wise sum of the three subject columns as a new 'total' column on d2.
d2['total'] = d2['english'] + d2['math'] + d2['science']
d2

In [None]:
# Per-row total over the columns listed in `years`; axis=1 sums across
# columns within each row.
df['total'] = df[years].sum(axis=1)
df

In [None]:
# Assigning a scalar fills the new column with that value in every row.
df['dummy_col'] = 1
df

## dropping the unwanted columns
- drop() - drop the columns
- dropna() - drop the rows with missing values
- drop_duplicates() - drop the duplicate rows

In [None]:
# Remove the unwanted columns from the frame.
# This cell reassigns df, so on a second run the columns are already gone;
# errors='ignore' skips missing labels instead of raising KeyError, which
# keeps the cell safe to re-run.
cols_to_drop = ['Type','Coverage','AREA','REG','DEV','dummy_col']
df = df.drop(columns=cols_to_drop, errors='ignore')

In [None]:
# Replace the source column labels with shorter, descriptive names.
df = df.rename(
    {
        'OdName': 'country',
        'AreaName': 'continent',
        'RegName': 'region',
        'DevName': 'status',
    },
    axis='columns',
)

In [None]:
# Preview with the smallest totals first; the result is not assigned, so df is unchanged.
df.sort_values('total', ascending=True)

In [None]:
# Keep the frame ordered with the largest totals first.
df = df.sort_values('total', ascending=False)

In [None]:
# Mean of 'total' over the first 10 rows of df.
df['total'].head(10).mean()

In [None]:
# Bar chart of the 10 rows with the largest 'total', with their mean drawn
# as a dashed reference line.
# nlargest picks the top 10 explicitly, so the cell no longer depends on df
# having been sorted by an earlier cell.
top10 = df.nlargest(10, 'total')
top10_mean = top10['total'].mean()  # computed once, reused for line and label

fig, ax = plt.subplots(figsize=(15, 6))
sns.barplot(x='country', y='total', data=top10, ax=ax)
ax.axhline(top10_mean, color='r', linestyle='--')
# Place the label just above the reference line; the offset is in data units.
ax.text(5, top10_mean + 10000, f"mean = {top10_mean:,.0f}")
ax.set_title("Top 10 countries with highest immigration overall")
ax.tick_params(axis='x', labelrotation=90)  # vertical names so long labels do not overlap
plt.show()

In [None]:
# All country names as a plain Python list, in the frame's current row order.
df['country'].to_list()

In [None]:
# Swap the long-form name for the short form wherever that exact value
# occurs in the frame.
df = df.replace(
    to_replace='United Kingdom of Great Britain and Northern Ireland',
    value='UK',
)
df

In [85]:
# Human-readable version of 'total': the value divided by 100000, shown
# with two decimals and a "Lac" suffix.
df['total_str'] = df['total'].map(lambda total: f"{total/100000:.2f} Lac")
df

Unnamed: 0,country,continent,region,status,1980,1981,1982,1983,1984,1985,1986,1987,1988,1989,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,total,total_str
79,India,Asia,Southern Asia,Developing regions,8880,8670,8147,7338,5704,4211,7150,10189,11522,10343,12041,13734,13673,21496,18620,18489,23859,22268,17241,18974,28572,31223,31889,27155,28235,36210,33848,28742,28261,29456,34235,27509,30933,33087,691904,6.92 Lac
36,China,Asia,Eastern Asia,Developing regions,5123,6682,3308,1863,1527,1816,1960,2643,2758,4323,8076,14255,10846,9817,13128,14398,19415,20475,21049,30069,35529,36434,31961,36439,36619,42584,33518,27642,30037,29622,30391,28502,33024,34129,659962,6.60 Lac
183,UK,Europe,Northern Europe,Developed regions,22045,24796,20620,10015,10170,9564,9470,21337,27359,23795,31668,23380,34123,33720,39231,30145,29322,22965,10367,7045,8840,11728,8046,6797,7533,7258,7140,8216,8979,8876,8724,6204,6195,5827,551500,5.51 Lac
136,Philippines,Asia,South-Eastern Asia,Developing regions,6051,5921,5249,4562,3801,3150,4166,7360,8639,11865,12509,12718,13670,20479,19532,15864,13692,11549,8735,9734,10763,13836,11707,12758,14004,18139,18400,19837,24887,28573,38617,36765,34315,29544,511391,5.11 Lac
130,Pakistan,Asia,Southern Asia,Developing regions,978,972,1201,900,668,514,691,1072,1334,2261,2470,3079,4071,4777,4666,4994,9125,13073,9068,9979,15400,16708,15110,13205,13399,14314,13127,10124,8994,7217,6811,7468,11227,12603,241600,2.42 Lac
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123,New Caledonia,Oceania,Melanesia,Developing regions,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,2,5,0.00 Lac
149,San Marino,Europe,Southern Europe,Developed regions,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,5,0.00 Lac
109,Marshall Islands,Oceania,Micronesia,Developing regions,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,2,0.00 Lac
191,Western Sahara,Africa,Northern Africa,Developing regions,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,2,0.00 Lac
