In [3]:
import pandas as pd
df = pd.read_csv('datasets/groupbydata2.csv')
df.head()

Unnamed: 0,date,city,temperature,humidity
0,01/01/2022,lahore,8,60
1,02/01/2022,lahore,10,58
2,03/01/2022,lahore,5,51
3,04/01/2022,lahore,6,49
4,05/01/2022,lahore,12,54


In [4]:
# Column-wise minimum. `date` and `city` hold strings (the result dtype is object),
# so their minimum is the lexicographically smallest string, not a chronological
# or otherwise meaningful "smallest" value.
df.min()

date           01/01/2022
city              karachi
temperature            -7
humidity               49
dtype: object

In [8]:
df.count()

date           15
city           15
temperature    15
humidity       15
dtype: int64

In [11]:
# this works only on numerical values
df.median(numeric_only=True)

temperature     8.0
humidity       68.0
dtype: float64

In [14]:
df.agg(['min', 'max', 'count'])

Unnamed: 0,date,city,temperature,humidity
min,01/01/2022,karachi,-7,49
max,05/01/2022,murree,18,78
count,15,15,15,15


In [17]:
# We can call the `describe()` method on the dataframe to get descriptive statistical measures for all its numeric columns.
df.describe()

Unnamed: 0,temperature,humidity
count,15.0,15.0
mean,6.133333,64.933333
std,8.253715,9.153194
min,-7.0,49.0
25%,-2.0,59.0
50%,8.0,68.0
75%,12.0,71.5
max,18.0,78.0


In [18]:
### b. Applying a Built-in Aggregation Function on a Series Object

In [19]:
df['humidity'].mean()

64.93333333333334

In [20]:
df['humidity'].count()

15

In [22]:
df['humidity'].min()

49

In [23]:
df['humidity'].agg(['min', 'max', 'count', 'median'])

min       49.0
max       78.0
count     15.0
median    68.0
Name: humidity, dtype: float64

In [24]:
df['humidity'].describe()

count    15.000000
mean     64.933333
std       9.153194
min      49.000000
25%      59.000000
50%      68.000000
75%      71.500000
max      78.000000
Name: humidity, dtype: float64

In [25]:
df.humidity

0     60
1     58
2     51
3     49
4     54
5     74
6     71
7     78
8     76
9     70
10    61
11    68
12    69
13    63
14    72
Name: humidity, dtype: int64

In [29]:
def celsius_to_fahrenheit(temp):
    """Convert a temperature from degrees Celsius to degrees Fahrenheit.

    Applies the formula F = C * 9/5 + 32.

    Parameters
    ----------
    temp : number
        Temperature in degrees Celsius.

    Returns
    -------
    float
        The equivalent temperature in degrees Fahrenheit.
    """
    return temp * 9/5 + 32


# Backward-compatible alias: the original (misspelled) name suggested a
# conversion *to* Celsius, but the formula converts Celsius to Fahrenheit.
celcius = celsius_to_fahrenheit

df.temperature.apply(celcius)

0     46.4
1     50.0
2     41.0
3     42.8
4     53.6
5     64.4
6     50.0
7     53.6
8     59.0
9     60.8
10    23.0
11    26.6
12    24.8
13    30.2
14    19.4
Name: temperature, dtype: float64

In [30]:
df.temperature.apply(lambda temp: temp * 9/5 + 32)

0     46.4
1     50.0
2     41.0
3     42.8
4     53.6
5     64.4
6     50.0
7     53.6
8     59.0
9     60.8
10    23.0
11    26.6
12    24.8
13    30.2
14    19.4
Name: temperature, dtype: float64

In [31]:
# How to Compute the Minimum Temperature of Each City?
import pandas as pd

In [32]:
df = pd.read_csv("datasets/groupbydata1.csv")
df

Unnamed: 0,date,city,temperature
0,01/01/2022,lahore,8
1,02/01/2022,lahore,10
2,03/01/2022,lahore,5
3,04/01/2022,lahore,6
4,05/01/2022,lahore,12
5,01/01/2022,karachi,18
6,02/01/2022,karachi,10
7,03/01/2022,karachi,12
8,04/01/2022,karachi,15
9,05/01/2022,karachi,16


In [33]:
### a. Splitting the Dataframe


In [34]:
df[df['city'] == 'murree']

Unnamed: 0,date,city,temperature
10,01/01/2022,murree,-5
11,02/01/2022,murree,-3
12,03/01/2022,murree,-4
13,04/01/2022,murree,-1
14,05/01/2022,murree,-7


In [35]:
df.loc[df.city == 'karachi', :]

Unnamed: 0,date,city,temperature
5,01/01/2022,karachi,18
6,02/01/2022,karachi,10
7,03/01/2022,karachi,12
8,04/01/2022,karachi,15
9,05/01/2022,karachi,16


In [39]:
# Manual "split + apply": mask the rows of one city, pick the temperature
# column in the same .loc call, and reduce it to its minimum.
city_1 = df.loc[df['city'] == 'karachi', 'temperature'].min()
city_2 = df.loc[df['city'] == 'murree', 'temperature'].min()
city_3 = df.loc[df['city'] == 'lahore', 'temperature'].min()


In [44]:
###Combining the cities

# "Combine" step: gather the three per-city minima into one labelled Series.
city = pd.Series(
    [city_1, city_2, city_3],
    index=['kar_min', 'mur_min', 'lah_min'],
    name='city_minimum',
)
city


kar_min    10
mur_min    -7
lah_min     5
Name: city_minimum, dtype: int64

In [45]:
# How to Compute the Minimum Temperature of Each City?

In [47]:
import pandas as pd
df = pd.read_csv("datasets/groupbydata1.csv")
df

Unnamed: 0,date,city,temperature
0,01/01/2022,lahore,8
1,02/01/2022,lahore,10
2,03/01/2022,lahore,5
3,04/01/2022,lahore,6
4,05/01/2022,lahore,12
5,01/01/2022,karachi,18
6,02/01/2022,karachi,10
7,03/01/2022,karachi,12
8,04/01/2022,karachi,15
9,05/01/2022,karachi,16


In [49]:
# a. Step 1: Split Step
# In the split step we divide the data inside the dataframe into multiple groups.
# Since we need to calculate the minimum temperature of each city,
# we will use the groupby() method on the city column of the dataframe.

# This returns a DataFrameGroupBy object, which is an iterable of (group key, small dataframe) pairs,
# one pair per distinct value of the `by` argument passed to the groupby() method.

In [54]:
city_group = df.groupby('city')
city_group

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000021BFCFAA410>

In [56]:
# Iterating over the GroupBy yields one (group key, sub-dataframe) tuple per city,
# so `my_df` is that whole tuple and print() shows it in tuple form.
for my_df in city_group:
    print(my_df)

('karachi',          date     city  temperature
5  01/01/2022  karachi           18
6  02/01/2022  karachi           10
7  03/01/2022  karachi           12
8  04/01/2022  karachi           15
9  05/01/2022  karachi           16)
('lahore',          date    city  temperature
0  01/01/2022  lahore            8
1  02/01/2022  lahore           10
2  03/01/2022  lahore            5
3  04/01/2022  lahore            6
4  05/01/2022  lahore           12)
('murree',           date    city  temperature
10  01/01/2022  murree           -5
11  02/01/2022  murree           -3
12  03/01/2022  murree           -4
13  04/01/2022  murree           -1
14  05/01/2022  murree           -7)


In [60]:
city_group.groups
# df.groupby('city').groups

{'karachi': [5, 6, 7, 8, 9], 'lahore': [0, 1, 2, 3, 4], 'murree': [10, 11, 12, 13, 14]}

In [61]:
# get the data for a specific group
city_group.get_group('karachi')

Unnamed: 0,date,city,temperature
5,01/01/2022,karachi,18
6,02/01/2022,karachi,10
7,03/01/2022,karachi,12
8,04/01/2022,karachi,15
9,05/01/2022,karachi,16


In [62]:
# get the size of a group
city_group.size()

city
karachi    5
lahore     5
murree     5
dtype: int64

In [63]:
# APPLYING A FUNCTION TO EACH GROUP

In [69]:
city_murree = city_group.get_group('murree')
city_murree

Unnamed: 0,date,city,temperature
10,01/01/2022,murree,-5
11,02/01/2022,murree,-3
12,03/01/2022,murree,-4
13,04/01/2022,murree,-1
14,05/01/2022,murree,-7


In [68]:
city_murree.temperature.min()

-7

In [70]:
city_lahore = city_group.get_group('lahore')
city_lahore

Unnamed: 0,date,city,temperature
0,01/01/2022,lahore,8
1,02/01/2022,lahore,10
2,03/01/2022,lahore,5
3,04/01/2022,lahore,6
4,05/01/2022,lahore,12


In [71]:
city_lahore.temperature.mean()

8.2

In [72]:
# we can combine the min value of each group

In [73]:
city_karachi = city_group.get_group('karachi')
city_karachi

Unnamed: 0,date,city,temperature
5,01/01/2022,karachi,18
6,02/01/2022,karachi,10
7,03/01/2022,karachi,12
8,04/01/2022,karachi,15
9,05/01/2022,karachi,16


In [75]:
# Apply step: reduce each group's sub-dataframe to its minimum temperature.
kar_min = city_karachi['temperature'].min()
lah_min = city_lahore['temperature'].min()
mur_min = city_murree['temperature'].min()

In [77]:
# Combine step: one labelled Series holding the minimum of every group.
series = pd.Series(
    [kar_min, lah_min, mur_min],
    index=["K_min", "L_min", "M_min"],
    name="Min Temp",
)
series

K_min    10
L_min     5
M_min    -7
Name: Min Temp, dtype: int64

In [79]:
df.groupby('city').temperature.min()

city
karachi    10
lahore      5
murree     -7
Name: temperature, dtype: int64

In [81]:
city_group.temperature.agg(['min', 'max', 'count', 'mean'])

Unnamed: 0_level_0,min,max,count,mean
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
karachi,10,18,5,14.2
lahore,5,12,5,8.2
murree,-7,-1,5,-4.0


In [4]:
import pandas as pd
df = pd.read_csv("datasets/so_survey_subset.csv", index_col = "Respondent")
df.head()

Unnamed: 0_level_0,MainBranch,Hobbyist,Country,YearsCode,ConvertedComp,LanguageWorkedWith,SocialMedia,Age,Gender
Respondent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,I am a student who is learning to code,Yes,United Kingdom,4.0,,HTML/CSS;Java;JavaScript;Python,Twitter,14.0,Man
2,I am a student who is learning to code,No,Bosnia and Herzegovina,,,C++;HTML/CSS;Python,Instagram,19.0,Man
3,"I am not primarily a developer, but I write co...",Yes,Thailand,3.0,8820.0,HTML/CSS,Reddit,28.0,Man
4,I am a developer by profession,No,United States,3.0,61000.0,C;C++;C#;Python;SQL,Reddit,22.0,Man
5,I am a developer by profession,Yes,Ukraine,16.0,,C++;HTML/CSS;Java;JavaScript;Python;SQL;VBA,Facebook,30.0,Man


In [85]:
df.shape

(88883, 9)

In [87]:
df.loc[df['Country'] == 'United States']

Unnamed: 0_level_0,MainBranch,Hobbyist,Country,YearsCode,ConvertedComp,LanguageWorkedWith,SocialMedia,Age,Gender
Respondent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
4,I am a developer by profession,No,United States,3,61000.0,C;C++;C#;Python;SQL,Reddit,22.0,Man
13,I am a developer by profession,Yes,United States,17,90000.0,Bash/Shell/PowerShell;HTML/CSS;JavaScript;PHP;...,Twitter,28.0,Man
22,I am a developer by profession,Yes,United States,35,103000.0,Bash/Shell/PowerShell;C++;HTML/CSS;JavaScript;...,Instagram,47.0,Man
23,I am a developer by profession,Yes,United States,3,69000.0,Bash/Shell/PowerShell;HTML/CSS;JavaScript;Pyth...,Reddit,22.0,Man
26,I am a developer by profession,Yes,United States,12,114000.0,Bash/Shell/PowerShell;C++;C#;HTML/CSS;JavaScri...,I don't use social media,34.0,Man
...,...,...,...,...,...,...,...,...,...
78292,,No,United States,42,,Bash/Shell/PowerShell;C;Python,I don't use social media,60.0,Man
82717,,No,United States,Less than 1 year,,,Facebook,44.0,Man
83397,,Yes,United States,12,,HTML/CSS;JavaScript;Python;SQL,,27.0,Woman
85642,,No,United States,20,,Go;HTML/CSS,,34.0,"Non-binary, genderqueer, or gender non-conforming"


In [2]:
import pandas as pd
schema = pd.read_csv("datasets/so_survey_subset_schema.csv", index_col = "Column")
schema

Unnamed: 0_level_0,QuestionText
Column,Unnamed: 1_level_1
Respondent,Randomized respondent ID number (not in order ...
MainBranch,Which of the following options best describes ...
Hobbyist,Do you code as a hobby?
Country,In which country do you currently reside?
YearsCode,"Including any education, how many years have y..."
ConvertedComp,Salary converted to annual USD salaries using ...
LanguageWorkedWith,"Which of the following programming, scripting,..."
SocialMedia,What social media site do you use the most?
Age,What is your age (in years)? If you prefer not...
Gender,Which of the following do you currently identi...


In [91]:
schema.loc['Hobbyist']

QuestionText    Do you code as a hobby?
Name: Hobbyist, dtype: object

In [92]:
df['Hobbyist']

Respondent
1        Yes
2         No
3        Yes
4         No
5        Yes
        ... 
88377    Yes
88601     No
88802     No
88816     No
88863    Yes
Name: Hobbyist, Length: 88883, dtype: object

In [93]:
schema.loc['Country']

QuestionText    In which country do you currently reside?
Name: Country, dtype: object

In [94]:
df['Country']

Respondent
1                United Kingdom
2        Bosnia and Herzegovina
3                      Thailand
4                 United States
5                       Ukraine
                  ...          
88377                    Canada
88601                       NaN
88802                       NaN
88816                       NaN
88863                     Spain
Name: Country, Length: 88883, dtype: object

In [96]:
# Returns the count of non-NA values for a series object.
df['Hobbyist'].count()

88883

In [98]:
# Returns a Series containing the count of each unique value (here: Yes / No).
df['Hobbyist'].value_counts()

Hobbyist
Yes    71257
No     17626
Name: count, dtype: int64

In [99]:
# Returns the count of non-NA values for a series object.
df['Country'].count()

88751

In [101]:
df['Country'].value_counts()

Country
United States        20949
India                 9061
Germany               5866
United Kingdom        5737
Canada                3395
                     ...  
Tonga                    1
Timor-Leste              1
North Korea              1
Brunei Darussalam        1
Chad                     1
Name: count, Length: 179, dtype: int64

In [102]:
# Locate the social media in the country PAKISTAN and count them
df.loc[df.Country == 'Pakistan', 'SocialMedia'].value_counts()

SocialMedia
WhatsApp                    266
Facebook                    232
YouTube                     182
LinkedIn                     71
Twitter                      58
Instagram                    41
Reddit                       28
I don't use social media     23
Snapchat                      5
Hello                         1
VK ВКонта́кте                 1
Name: count, dtype: int64

In [105]:
df.loc[df.Country == 'China', :].head()

Unnamed: 0_level_0,MainBranch,Hobbyist,Country,YearsCode,ConvertedComp,LanguageWorkedWith,SocialMedia,Age,Gender
Respondent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
56,I am a developer by profession,No,China,13,51150.0,Bash/Shell/PowerShell;HTML/CSS;Java;SQL,YouTube,33.0,Man
86,"I am not primarily a developer, but I write co...",Yes,China,6,16008.0,Bash/Shell/PowerShell;C#;HTML/CSS;JavaScript;P...,WeChat 微信,26.0,Woman
204,I am a student who is learning to code,Yes,China,1,,Python;Other(s):,,12.0,Man
575,I am a developer by profession,Yes,China,7,30240.0,Bash/Shell/PowerShell;Go;Python;SQL,WeChat 微信,28.0,Man
841,I am a developer by profession,Yes,China,20,84000.0,Go;HTML/CSS;JavaScript;Ruby;SQL;TypeScript,WeChat 微信,35.0,Man


In [5]:
df.loc[df.Country == "China", "SocialMedia"].value_counts()

SocialMedia
WeChat 微信                   403
YouTube                      53
Weibo 新浪微博                   42
I don't use social media     27
Twitter                      27
Reddit                       12
LinkedIn                     11
Facebook                      8
Instagram                     7
Youku Tudou 优酷                7
WhatsApp                      3
VK ВКонта́кте                 1
Name: count, dtype: int64

In [9]:
df.groupby('Country').get_group('Nigeria')

Unnamed: 0_level_0,MainBranch,Hobbyist,Country,YearsCode,ConvertedComp,LanguageWorkedWith,SocialMedia,Age,Gender
Respondent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
205,"I am not primarily a developer, but I write co...",Yes,Nigeria,7,2664.0,C#;HTML/CSS;JavaScript;PHP,YouTube,24.0,Man
348,I am a developer by profession,Yes,Nigeria,2,,HTML/CSS;PHP,LinkedIn,39.0,Man
365,I code primarily as a hobby,Yes,Nigeria,5,,Java;SQL,WhatsApp,,Man
688,I am a developer by profession,Yes,Nigeria,2,1668.0,HTML/CSS;JavaScript;PHP;Python;SQL,Instagram,19.0,Man
885,I am a developer by profession,Yes,Nigeria,4,2770.0,Java;JavaScript;PHP;Python,Facebook,23.0,Man
...,...,...,...,...,...,...,...,...,...
88812,I am a developer by profession,Yes,Nigeria,8,16620.0,HTML/CSS;JavaScript;PHP;Python,WhatsApp,25.0,Man
88821,I am a student who is learning to code,No,Nigeria,Less than 1 year,,,Facebook,20.0,Woman
76960,,No,Nigeria,Less than 1 year,,C,,,
79616,,No,Nigeria,,,,Facebook,,Man


In [11]:
df.groupby('Country').get_group('Nigeria').loc[:, 'SocialMedia']

Respondent
205        YouTube
348       LinkedIn
365       WhatsApp
688      Instagram
885       Facebook
           ...    
88812     WhatsApp
88821     Facebook
76960          NaN
79616     Facebook
86806     WhatsApp
Name: SocialMedia, Length: 522, dtype: object

In [15]:
df.groupby('Country').get_group('Nigeria').loc[:, 'LanguageWorkedWith'].value_counts()

LanguageWorkedWith
HTML/CSS;JavaScript                                                28
HTML/CSS;JavaScript;PHP;SQL                                        24
HTML/CSS;JavaScript;PHP                                            16
Java                                                               13
HTML/CSS;Java;JavaScript;PHP;SQL                                   10
                                                                   ..
Dart;Elixir;Java;PHP;Python;Scala                                   1
C;C++;C#;Java;JavaScript;PHP;SQL;TypeScript;VBA                     1
Assembly;Bash/Shell/PowerShell;C;HTML/CSS;JavaScript;TypeScript     1
C;HTML/CSS;JavaScript;TypeScript                                    1
C#;Elixir;HTML/CSS;JavaScript;PHP;Python;SQL                        1
Name: count, Length: 277, dtype: int64

In [18]:
# Group the respondents by country, then count how often each social media platform occurs within each country
df.groupby('Country')['SocialMedia'].value_counts().head(50)

Country              SocialMedia             
Afghanistan          Facebook                     15
                     YouTube                       9
                     I don't use social media      6
                     WhatsApp                      4
                     Instagram                     1
                     LinkedIn                      1
                     Twitter                       1
Albania              WhatsApp                     18
                     Facebook                     16
                     Instagram                    13
                     YouTube                      10
                     Twitter                       8
                     LinkedIn                      7
                     Reddit                        6
                     I don't use social media      4
                     Snapchat                      1
                     WeChat 微信                     1
Algeria              YouTube                      42


In [19]:
df.head()

Unnamed: 0_level_0,MainBranch,Hobbyist,Country,YearsCode,ConvertedComp,LanguageWorkedWith,SocialMedia,Age,Gender
Respondent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,I am a student who is learning to code,Yes,United Kingdom,4.0,,HTML/CSS;Java;JavaScript;Python,Twitter,14.0,Man
2,I am a student who is learning to code,No,Bosnia and Herzegovina,,,C++;HTML/CSS;Python,Instagram,19.0,Man
3,"I am not primarily a developer, but I write co...",Yes,Thailand,3.0,8820.0,HTML/CSS,Reddit,28.0,Man
4,I am a developer by profession,No,United States,3.0,61000.0,C;C++;C#;Python;SQL,Reddit,22.0,Man
5,I am a developer by profession,Yes,Ukraine,16.0,,C++;HTML/CSS;Java;JavaScript;Python;SQL;VBA,Facebook,30.0,Man


In [20]:
# Give me all the rows but only countries
df.loc[:, "Country"]

Respondent
1                United Kingdom
2        Bosnia and Herzegovina
3                      Thailand
4                 United States
5                       Ukraine
                  ...          
88377                    Canada
88601                       NaN
88802                       NaN
88816                       NaN
88863                     Spain
Name: Country, Length: 88883, dtype: object

In [23]:
# Number of respondents per country, labelled 'Total' so it becomes a
# readable column name when combined with other Series.
tc = df["Country"].value_counts().rename('Total')
tc

Country
United States        20949
India                 9061
Germany               5866
United Kingdom        5737
Canada                3395
                     ...  
Tonga                    1
Timor-Leste              1
North Korea              1
Brunei Darussalam        1
Chad                     1
Name: Total, Length: 179, dtype: int64

In [26]:
# Count respondents per country via groupby + apply. Each group contains a single
# distinct Country value, so value_counts() returns one row per group and the result
# carries the country name twice (group key and counted value) as a two-level index.
# NOTE(review): the counts match df['Country'].value_counts() — this form only
# illustrates apply() on groups.
group = df.groupby('Country')['Country'].apply(lambda x: x.value_counts())
group

Country                                                                   
Afghanistan                           Afghanistan                              44
Albania                               Albania                                  86
Algeria                               Algeria                                 134
Andorra                               Andorra                                   7
Angola                                Angola                                    5
                                                                             ... 
Venezuela, Bolivarian Republic of...  Venezuela, Bolivarian Republic of...     88
Viet Nam                              Viet Nam                                231
Yemen                                 Yemen                                    19
Zambia                                Zambia                                   12
Zimbabwe                              Zimbabwe                                 39
Name: Country, Length: 

In [31]:
# Number of respondents per country whose LanguageWorkedWith answer mentions Python.
# na=False treats a missing answer explicitly as "does not mention Python", so the
# mask is a pure boolean Series and the sum does not rely on NaN being skipped in an
# object column. regex=False makes the match a plain substring test.
pp = df.groupby('Country')['LanguageWorkedWith'].apply(
    lambda langs: langs.str.contains('Python', na=False, regex=False).sum()
)
pp

Country
Afghanistan                              8
Albania                                 23
Algeria                                 40
Andorra                                  0
Angola                                   2
                                        ..
Venezuela, Bolivarian Republic of...    28
Viet Nam                                78
Yemen                                    3
Zambia                                   4
Zimbabwe                                14
Name: LanguageWorkedWith, Length: 179, dtype: int64

In [32]:
pp.name = "Knows Python"

In [33]:
# Place the per-country totals (tc) and the Python counts (pp) side by side,
# aligned on the shared Country index.
new_df = pd.concat([tc, pp], axis='columns')
new_df

Unnamed: 0_level_0,Total,Knows Python
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
United States,20949,10083
India,9061,3105
Germany,5866,2451
United Kingdom,5737,2384
Canada,3395,1558
...,...,...
Tonga,1,0
Timor-Leste,1,1
North Korea,1,0
Brunei Darussalam,1,0


In [34]:
new_df.loc['Nigeria']

Total           522
Knows Python    152
Name: Nigeria, dtype: int64

In [35]:
new_df.loc['China']

Total           664
Knows Python    297
Name: China, dtype: int64

In [36]:
new_df.loc['United States']

Total           20949
Knows Python    10083
Name: United States, dtype: int64

In [40]:
# Share (in %) of each country's respondents who know Python.
new_df['Percentage'] = new_df['Knows Python'].div(new_df['Total']).mul(100)
new_df

Unnamed: 0_level_0,Total,Knows Python,Percentage
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
United States,20949,10083,48.131176
India,9061,3105,34.267741
Germany,5866,2451,41.783157
United Kingdom,5737,2384,41.554820
Canada,3395,1558,45.891016
...,...,...,...
Tonga,1,0,0.000000
Timor-Leste,1,1,100.000000
North Korea,1,0,0.000000
Brunei Darussalam,1,0,0.000000


In [41]:
new_df.loc['Nigeria']

Total           522.000000
Knows Python    152.000000
Percentage       29.118774
Name: Nigeria, dtype: float64