In [1]:
import pandas as pd

### Grouping information

In [101]:
hr_data = pd.read_csv('../data-science-complete-tutorial/Data/HR_comma_sep.csv.txt')

In [4]:
hr_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
satisfaction_level       14999 non-null float64
last_evaluation          14999 non-null float64
number_project           14999 non-null int64
average_montly_hours     14999 non-null int64
time_spend_company       14999 non-null int64
Work_accident            14999 non-null int64
left                     14999 non-null int64
promotion_last_5years    14999 non-null int64
sales                    14999 non-null object
salary                   14999 non-null object
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB


In [5]:
hr_data.rename(columns={'sales':'dept'}, inplace=True)

In [7]:
hr_data.left.unique()

array([1, 0])

In [9]:
gdf = hr_data.groupby(['left'])

In [11]:
gdf.satisfaction_level.mean()

left
0    0.666810
1    0.440098
Name: satisfaction_level, dtype: float64

### Find out mean of average_montly_hours of people in different salary bracket

In [13]:
hr_data.groupby(['salary']).average_montly_hours.mean()

salary
high      199.867421
low       200.996583
medium    201.338349
Name: average_montly_hours, dtype: float64

### Find satisfaction level across different department & salary bracket

In [16]:
hr_data.groupby(['dept','salary']).satisfaction_level.mean().sort_values(ascending=False)

dept         salary
hr           high      0.673111
support      high      0.655035
management   high      0.653333
sales        high      0.648959
support      medium    0.645149
marketing    medium    0.638218
IT           high      0.638193
technical    high      0.625970
sales        medium    0.625327
IT           medium    0.624187
RandD        low       0.623929
technical    medium    0.620968
product_mng  low       0.620909
RandD        medium    0.620349
product_mng  medium    0.619112
             high      0.614118
accounting   high      0.614054
management   low       0.610722
IT           low       0.610099
hr           low       0.608657
marketing    high      0.605250
             low       0.602910
sales        low       0.600838
management   medium    0.597867
technical    low       0.594322
support      low       0.591710
RandD        high      0.586667
accounting   medium    0.583642
hr           medium    0.580306
accounting   low       0.574162
Name: satisfaction_l

### Socio-economic data

In [17]:
gap_data = pd.read_csv('https://raw.githubusercontent.com/edyoda/data-science-complete-tutorial/master/Data/gapminder-FiveYearData.csv')

In [18]:
gap_data.sample(10)

Unnamed: 0,country,year,pop,continent,lifeExp,gdpPercap
1090,Netherlands,2002,16122830.0,Europe,78.53,33724.75778
574,Germany,2002,82350671.0,Europe,78.67,30035.80198
1201,Peru,1957,9146100.0,Americas,46.263,4245.256698
767,Israel,2007,6426679.0,Asia,80.745,25523.2771
915,Madagascar,1967,6334556.0,Africa,42.881,1634.047282
483,Equatorial Guinea,1967,259864.0,Africa,38.987,915.596003
1186,Panama,2002,2990875.0,Americas,74.712,7356.031934
914,Madagascar,1962,5703324.0,Africa,40.848,1643.38711
1403,Somalia,2007,9118773.0,Africa,48.159,926.141068
1227,Poland,1967,31785378.0,Europe,69.61,6557.152776


### Find total population of each continent

In [21]:
gap_data.groupby(['continent']).pop.sum().sort_values(ascending=False)

continent
Asia        3.050733e+10
Americas    7.351438e+09
Africa      6.187586e+09
Europe      6.181115e+09
Oceania     2.129921e+08
Name: pop, dtype: float64

### Get Across continent Per Year gdppercapita
* To calculate percapgdp of each continent we have to do the following
  - Calculate total capital of the continent
  - Calculate total population of the continent
  - Div first by second

In [24]:
gap_data['totalcap'] = gap_data['pop'] * gap_data.gdpPercap

In [28]:
total_cont_cap = gap_data.groupby(['continent','year']).totalcap.sum()

In [34]:
total_cont_pop = gap_data.groupby(['continent','year']).pop.sum()

In [36]:
df1 = pd.DataFrame(total_cont_cap)

In [37]:
df2 = pd.DataFrame(total_cont_pop)

In [41]:
res = pd.concat([df1,df2],axis=1)

In [43]:
res['percapgdpcont'] = res['totalcap']/res['pop']

In [44]:
res

Unnamed: 0_level_0,Unnamed: 1_level_0,totalcap,pop,percapgdpcont
continent,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Africa,1952,311599300000.0,237640500.0,1311.221439
Africa,1957,382677800000.0,264837700.0,1444.951993
Africa,1962,456813600000.0,296516900.0,1540.599054
Africa,1967,595087700000.0,335289500.0,1774.847446
Africa,1972,783756600000.0,379879500.0,2063.171343
Africa,1977,972134700000.0,433061000.0,2244.798509
Africa,1982,1146101000000.0,499348600.0,2295.191944
Africa,1987,1253578000000.0,574834100.0,2180.764349
Africa,1992,1365363000000.0,659081500.0,2071.614521
Africa,1997,1561205000000.0,743833000.0,2098.865031


### Stacking & Unstacking
* Stacking - Columns are getting converted into index


In [46]:
import numpy as np
index = pd.MultiIndex.from_product([[2013, 2014], ['yes','no']],
                                   names=['year', 'death'])
columns = pd.MultiIndex.from_product([['Mumbai', 'Delhi', 'Bangalore'], 
                                      ['two-wheeler', 'four-wheeler']],
                                     names=['city', 'type'])


data = np.random.randint(1,100,(4,6))

accident_data = pd.DataFrame(data, index=index, columns=columns)
accident_data

Unnamed: 0_level_0,city,Mumbai,Mumbai,Delhi,Delhi,Bangalore,Bangalore
Unnamed: 0_level_1,type,two-wheeler,four-wheeler,two-wheeler,four-wheeler,two-wheeler,four-wheeler
year,death,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,yes,74,40,73,40,22,21
2013,no,42,48,30,82,1,85
2014,yes,35,14,56,79,2,32
2014,no,20,65,78,33,71,3


In [48]:
accident_data.columns

MultiIndex([(   'Mumbai',  'two-wheeler'),
            (   'Mumbai', 'four-wheeler'),
            (    'Delhi',  'two-wheeler'),
            (    'Delhi', 'four-wheeler'),
            ('Bangalore',  'two-wheeler'),
            ('Bangalore', 'four-wheeler')],
           names=['city', 'type'])

In [49]:
accident_data.index

MultiIndex([(2013, 'yes'),
            (2013,  'no'),
            (2014, 'yes'),
            (2014,  'no')],
           names=['year', 'death'])

In [51]:
accident_data.stack()

Unnamed: 0_level_0,Unnamed: 1_level_0,city,Bangalore,Delhi,Mumbai
year,death,type,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2013,yes,four-wheeler,21,40,40
2013,yes,two-wheeler,22,73,74
2013,no,four-wheeler,85,82,48
2013,no,two-wheeler,1,30,42
2014,yes,four-wheeler,32,79,14
2014,yes,two-wheeler,2,56,35
2014,no,four-wheeler,3,33,65
2014,no,two-wheeler,71,78,20


In [52]:
accident_data.unstack()

city,Mumbai,Mumbai,Mumbai,Mumbai,Delhi,Delhi,Delhi,Delhi,Bangalore,Bangalore,Bangalore,Bangalore
type,two-wheeler,two-wheeler,four-wheeler,four-wheeler,two-wheeler,two-wheeler,four-wheeler,four-wheeler,two-wheeler,two-wheeler,four-wheeler,four-wheeler
death,no,yes,no,yes,no,yes,no,yes,no,yes,no,yes
year,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3
2013,42,74,48,40,30,73,82,40,1,22,85,21
2014,20,35,65,14,78,56,33,79,71,2,3,32


In [55]:
res.stack()

continent  year               
Africa     1952  totalcap         3.115993e+11
                 pop              2.376405e+08
                 percapgdpcont    1.311221e+03
           1957  totalcap         3.826778e+11
                 pop              2.648377e+08
                                      ...     
Oceania    2002  pop              2.345483e+07
                 percapgdpcont    2.943845e+04
           2007  totalcap         8.073141e+11
                 pop              2.454995e+07
                 percapgdpcont    3.288456e+04
Length: 180, dtype: float64

### Pivot Table

In [57]:
gap_data.sample(10)

Unnamed: 0,country,year,pop,continent,lifeExp,gdpPercap,totalcap
1459,Swaziland,1987,779348.0,Africa,57.678,3984.839812,3105577000.0
444,Ecuador,1952,3548753.0,Americas,48.357,3522.110717,12499100000.0
483,Equatorial Guinea,1967,259864.0,Africa,38.987,915.596003,237930400.0
286,Chile,2002,15497046.0,Americas,77.86,10778.78385,167039300000.0
524,Finland,1992,5041039.0,Europe,75.7,20647.16499,104083200000.0
935,Malawi,2007,13327079.0,Africa,48.303,759.34991,10119920000.0
57,Argentina,1997,36203463.0,Americas,73.275,10967.28195,397053600000.0
1687,Zambia,1987,7272406.0,Africa,50.821,1213.315116,8823720000.0
508,Ethiopia,1972,30770372.0,Africa,43.515,566.243944,17423540000.0
993,Mexico,1997,95895146.0,Americas,73.67,9767.29753,936636400000.0


In [59]:
gap_data.pivot_table(index='country',columns='year',values='pop').sample(10)

year,1952,1957,1962,1967,1972,1977,1982,1987,1992,1997,2002,2007
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Zambia,2672000.0,3016000.0,3421000.0,3900000.0,4506497.0,5216550.0,6100407.0,7272406.0,8381163.0,9417789.0,10595811.0,11746035.0
Serbia,6860147.0,7271135.0,7616060.0,7971222.0,8313288.0,8686367.0,9032824.0,9230783.0,9826397.0,10336594.0,10111559.0,10150265.0
Uruguay,2252965.0,2424959.0,2598466.0,2748579.0,2829526.0,2873520.0,2953997.0,3045153.0,3149262.0,3262838.0,3363085.0,3447496.0
Taiwan,8550362.0,10164215.0,11918938.0,13648692.0,15226039.0,16785196.0,18501390.0,19757799.0,20686918.0,21628605.0,22454239.0,23174294.0
Togo,1219113.0,1357445.0,1528098.0,1735550.0,2056351.0,2308582.0,2644765.0,3154264.0,3747553.0,4320890.0,4977378.0,5701579.0
Greece,7733250.0,8096218.0,8448233.0,8716441.0,8888628.0,9308479.0,9786480.0,9974490.0,10325429.0,10502372.0,10603863.0,10706290.0
Botswana,442308.0,474639.0,512764.0,553541.0,619351.0,781472.0,970347.0,1151184.0,1342614.0,1536536.0,1630347.0,1639131.0
Canada,14785584.0,17010154.0,18985849.0,20819767.0,22284500.0,23796400.0,25201900.0,26549700.0,28523502.0,30305843.0,31902268.0,33390141.0
Myanmar,20092996.0,21731844.0,23634436.0,25870271.0,28466390.0,31528087.0,34680442.0,38028578.0,40546538.0,43247867.0,45598081.0,47761980.0
Cameroon,5009067.0,5359923.0,5793633.0,6335506.0,7021028.0,7959865.0,9250831.0,10780667.0,12467171.0,14195809.0,15929988.0,17696293.0


In [63]:
gap_data.pivot_table(index='continent',columns='year',values='pop', aggfunc=np.median)

year,1952,1957,1962,1967,1972,1977,1982,1987,1992,1997,2002,2007
continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Africa,2668124.5,2885790.5,3145210.0,3473692.5,3945594.5,4522666.0,5668228.5,6635611.5,7140388.5,7805422.5,8821778.5,10093310.5
Americas,3146381.0,3507701.0,3880130.0,4318137.0,4698301.0,5302800.0,5968349.0,6655297.0,7351181.0,7992357.0,8650322.0,9319622.0
Asia,7982342.0,9128546.0,10267083.0,11261690.0,12412593.0,13933198.0,14441916.0,16495304.0,17861905.0,21229759.0,22662365.0,24821286.0
Europe,7199786.5,7507528.0,7814503.0,8140724.0,8444744.0,8741694.5,8962461.0,9101370.5,9272632.0,9527017.0,9518744.0,9493598.0
Oceania,5343003.0,5970988.0,6641759.0,7300207.0,8053050.0,8619500.0,9197425.0,9787207.5,10459825.5,11120715.0,11727414.5,12274973.5


### Pivot Table for country & year for lifeexp

In [64]:
gap_data.pivot_table(index='country',columns='year',values='lifeExp')

year,1952,1957,1962,1967,1972,1977,1982,1987,1992,1997,2002,2007
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Afghanistan,28.801,30.33200,31.99700,34.02000,36.08800,38.43800,39.854,40.822,41.674,41.763,42.129,43.828
Albania,55.230,59.28000,64.82000,66.22000,67.69000,68.93000,70.420,72.000,71.581,72.950,75.651,76.423
Algeria,43.077,45.68500,48.30300,51.40700,54.51800,58.01400,61.368,65.799,67.744,69.152,70.994,72.301
Angola,30.015,31.99900,34.00000,35.98500,37.92800,39.48300,39.942,39.906,40.647,40.963,41.003,42.731
Argentina,62.485,64.39900,65.14200,65.63400,67.06500,68.48100,69.942,70.774,71.868,73.275,74.340,75.320
Australia,69.120,70.33000,70.93000,71.10000,71.93000,73.49000,74.740,76.320,77.560,78.830,80.370,81.235
Austria,66.800,67.48000,69.54000,70.14000,70.63000,72.17000,73.180,74.940,76.040,77.510,78.980,79.829
Bahrain,50.939,53.83200,56.92300,59.92300,63.30000,65.59300,69.052,70.750,72.601,73.925,74.795,75.635
Bangladesh,37.484,39.34800,41.21600,43.45300,45.25200,46.92300,50.009,52.819,56.018,59.412,62.013,64.062
Belgium,68.000,69.24000,70.25000,70.94000,71.44000,72.80000,73.930,75.350,76.460,77.530,78.320,79.441


### Pivot table for department & number_project for satisfaction_level

In [67]:
hr_data.pivot_table(index='dept' ,columns='number_project',values='satisfaction_level', aggfunc=np.mean ) 

number_project,2,3,4,5,6,7
dept,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
IT,0.485659,0.689625,0.714323,0.665042,0.251579,0.1025
RandD,0.48092,0.672308,0.687689,0.641043,0.372885,0.099231
accounting,0.454444,0.653819,0.684434,0.640643,0.246515,0.128
hr,0.432429,0.708981,0.681757,0.685957,0.216481,0.101538
management,0.516962,0.67655,0.673623,0.673983,0.334286,0.146923
marketing,0.483636,0.696883,0.696853,0.680786,0.243725,0.1
product_mng,0.489195,0.680596,0.687939,0.694142,0.379577,0.09875
sales,0.481116,0.696972,0.695983,0.676993,0.288505,0.107313
support,0.4806,0.691275,0.704062,0.684827,0.23569,0.107692
technical,0.479926,0.679533,0.694418,0.699786,0.253024,0.152241


### Time Series

In [72]:
churn_data = pd.read_csv('https://raw.githubusercontent.com/edyoda/data-science-complete-tutorial/master/Data/churn.csv.txt', parse_dates=['last_trip_date','signup_date'])

In [73]:
churn_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 12 columns):
avg_dist                  50000 non-null float64
avg_rating_by_driver      49799 non-null float64
avg_rating_of_driver      41878 non-null float64
avg_surge                 50000 non-null float64
city                      50000 non-null object
last_trip_date            50000 non-null datetime64[ns]
phone                     49604 non-null object
signup_date               50000 non-null datetime64[ns]
surge_pct                 50000 non-null float64
trips_in_first_30_days    50000 non-null int64
luxury_car_user           50000 non-null bool
weekday_pct               50000 non-null float64
dtypes: bool(1), datetime64[ns](2), float64(6), int64(1), object(2)
memory usage: 4.2+ MB


In [77]:
churn_data.set_index('signup_date', inplace=True)

In [81]:
churn_data['2014-01-25':'2014-01-27']

Unnamed: 0_level_0,avg_dist,avg_rating_by_driver,avg_rating_of_driver,avg_surge,city,last_trip_date,phone,surge_pct,trips_in_first_30_days,luxury_car_user,weekday_pct
signup_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2014-01-25,3.67,5.0,4.7,1.10,King's Landing,2014-06-17,iPhone,15.4,4,True,46.2
2014-01-27,3.13,4.9,4.4,1.19,Winterfell,2014-03-15,Android,11.8,14,False,82.4
2014-01-27,26.01,5.0,,1.00,Astapor,2014-01-28,Android,0.0,1,False,100.0
2014-01-27,11.25,5.0,4.0,1.00,Astapor,2014-02-27,Android,0.0,1,False,100.0
2014-01-26,6.12,4.8,5.0,1.00,Astapor,2014-02-07,iPhone,0.0,4,True,25.0
2014-01-25,11.86,5.0,4.9,1.00,Winterfell,2014-06-14,iPhone,0.0,1,False,40.0
2014-01-27,4.48,5.0,,1.00,Winterfell,2014-02-28,iPhone,0.0,1,True,100.0
2014-01-25,5.72,5.0,4.0,1.50,Winterfell,2014-01-26,Android,100.0,1,False,0.0
2014-01-25,6.05,4.5,5.0,1.25,Astapor,2014-03-29,iPhone,100.0,1,False,0.0
2014-01-26,2.47,5.0,4.8,1.00,Astapor,2014-01-30,iPhone,0.0,5,False,100.0


In [83]:
churn_data.reset_index()

Unnamed: 0,signup_date,avg_dist,avg_rating_by_driver,avg_rating_of_driver,avg_surge,city,last_trip_date,phone,surge_pct,trips_in_first_30_days,luxury_car_user,weekday_pct
0,2014-01-25,3.67,5.0,4.7,1.10,King's Landing,2014-06-17,iPhone,15.4,4,True,46.2
1,2014-01-29,8.26,5.0,5.0,1.00,Astapor,2014-05-05,Android,0.0,0,False,50.0
2,2014-01-06,0.77,5.0,4.3,1.00,Astapor,2014-01-07,iPhone,0.0,3,False,100.0
3,2014-01-10,2.36,4.9,4.6,1.14,King's Landing,2014-06-29,iPhone,20.0,9,True,80.0
4,2014-01-27,3.13,4.9,4.4,1.19,Winterfell,2014-03-15,Android,11.8,14,False,82.4
5,2014-01-09,10.56,5.0,3.5,1.00,Winterfell,2014-06-06,iPhone,0.0,2,True,100.0
6,2014-01-24,3.95,4.0,,1.00,Astapor,2014-01-25,Android,0.0,1,False,100.0
7,2014-01-28,2.04,5.0,5.0,1.00,Winterfell,2014-01-29,iPhone,0.0,2,False,100.0
8,2014-01-21,4.36,5.0,4.5,1.00,Winterfell,2014-02-01,Android,0.0,2,False,100.0
9,2014-01-03,2.37,5.0,,1.00,Winterfell,2014-01-05,Android,0.0,1,False,0.0


In [85]:
churn_data.set_index('last_trip_date', inplace=True)

In [86]:
churn_data.index

DatetimeIndex(['2014-06-17', '2014-05-05', '2014-01-07', '2014-06-29',
               '2014-03-15', '2014-06-06', '2014-01-25', '2014-01-29',
               '2014-02-01', '2014-01-05',
               ...
               '2014-05-18', '2014-06-29', '2014-01-19', '2014-07-01',
               '2014-05-31', '2014-06-05', '2014-01-25', '2014-05-22',
               '2014-01-15', '2014-04-20'],
              dtype='datetime64[ns]', name='last_trip_date', length=50000, freq=None)

In [87]:
churn_data['2014-06-17']

Unnamed: 0_level_0,avg_dist,avg_rating_by_driver,avg_rating_of_driver,avg_surge,city,phone,surge_pct,trips_in_first_30_days,luxury_car_user,weekday_pct
last_trip_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2014-06-17,3.67,5.0,4.7,1.10,King's Landing,iPhone,15.4,4,True,46.2
2014-06-17,2.61,4.8,3.9,1.09,Astapor,iPhone,10.5,5,True,63.2
2014-06-17,3.81,5.0,4.7,1.00,King's Landing,iPhone,0.0,1,True,100.0
2014-06-17,5.45,5.0,,1.00,Astapor,iPhone,0.0,0,False,100.0
2014-06-17,3.96,4.7,4.6,1.00,Astapor,iPhone,0.0,1,True,85.7
2014-06-17,1.58,5.0,5.0,1.00,King's Landing,iPhone,0.0,1,True,100.0
2014-06-17,5.58,5.0,4.5,1.00,Astapor,Android,0.0,2,True,85.7
2014-06-17,7.93,4.7,4.2,1.17,Winterfell,iPhone,8.3,0,True,58.3
2014-06-17,5.84,4.8,3.9,1.00,Astapor,iPhone,0.0,8,True,85.7
2014-06-17,23.98,5.0,4.0,1.00,King's Landing,iPhone,0.0,0,True,100.0


In [89]:
churn_data['last_trip_date'] = churn_data.index

In [94]:
date = pd.to_datetime("25th of Feb, 2020")

In [95]:
date

Timestamp('2020-02-25 00:00:00')

In [98]:
pd.Series(range(3), index=pd.date_range('2000', freq='D', periods=3))

2000-01-01    0
2000-01-02    1
2000-01-03    2
Freq: D, dtype: int64

### Exercises

In [100]:
human_data = pd.read_csv('https://raw.githubusercontent.com/edyoda/data-science-complete-tutorial/master/Data/adult.data.txt')

In [None]:
human_data = pd.read_csv('..')