In [1]:
import pandas as pd

# Configure display first: allow up to 200 rows before pandas truncates output.
pd.set_option('display.max_rows', 200)

# Survey responses, indexed by the 'Respondent' column.
df = pd.read_csv('../data/survey_results_public.csv', index_col='Respondent')
# Companion schema file, indexed by its 'Column' field.
schema = pd.read_csv('../data/survey_results_schema.csv', index_col='Column')

# Overview of columns, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 64461 entries, 1 to 65112
Data columns (total 60 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   MainBranch                    64162 non-null  object 
 1   Hobbyist                      64416 non-null  object 
 2   Age                           45446 non-null  float64
 3   Age1stCode                    57900 non-null  object 
 4   CompFreq                      40069 non-null  object 
 5   CompTotal                     34826 non-null  float64
 6   ConvertedComp                 34756 non-null  float64
 7   Country                       64072 non-null  object 
 8   CurrencyDesc                  45472 non-null  object 
 9   CurrencySymbol                45472 non-null  object 
 10  DatabaseDesireNextYear        44070 non-null  object 
 11  DatabaseWorkedWith            49537 non-null  object 
 12  DevType                       49370 non-null  object 
 13  E

In [2]:
# Get standard statistical summary data (count, mean, std, min, quartiles, max)
# for the numeric columns of a DataFrame with .describe()
# NOTE(review): the output recorded with this cell shows extreme values
# (e.g. CompTotal max ~1.1e+247 with std = inf, Age max 279, WorkWeekHrs max 475),
# so the means here are likely distorted by outliers - the median (50%) row is safer to read.
df.describe()

Unnamed: 0,Age,CompTotal,ConvertedComp,WorkWeekHrs
count,45446.0,34826.0,34756.0,41151.0
mean,30.834111,3.190464e+242,103756.1,40.782174
std,9.585392,inf,226885.3,17.816383
min,1.0,0.0,0.0,1.0
25%,24.0,20000.0,24648.0,40.0
50%,29.0,63000.0,54049.0,40.0
75%,35.0,125000.0,95000.0,44.0
max,279.0,1.1111110000000001e+247,2000000.0,475.0


In [3]:
# This year's (2020) survey doesn't have a SocialMedia column,
# so let's look at salary (ConvertedComp) statistics per country instead.
# Note: `series` is the grouped ConvertedComp column (one group per country).
series = df.groupby('Country')['ConvertedComp']

# One aggregate per statistic; each result is indexed by country.
max_salary, min_salary = series.max(), series.min()
median_salary, mean_salary = series.median(), series.mean()

# Assemble the four aggregates side by side into one summary table.
salary_df = pd.DataFrame(
    dict(
        Max=max_salary,
        Min=min_salary,
        Median=median_salary,
        Mean=mean_salary,
    )
)

# Show only a handful of countries of interest.
salary_df.loc[['United States', 'India', 'China', 'United Kingdom']]

Unnamed: 0_level_0,Max,Min,Median,Mean
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
United States,2000000.0,0.0,115000.0,208826.496443
India,1800000.0,0.0,10056.0,28751.271364
China,857352.0,5.0,35861.5,61493.567308
United Kingdom,1000000.0,0.0,67215.0,140849.074189


In [4]:
# Group all survey responses by country.
country_grp = df.groupby('Country')

# One option is to pull out a single country's rows first:
#     country_grp.get_group('United States')
# Instead, count DevType values within every country in one pass,
# then select the country of interest from the result.
devtype_by_country = country_grp['DevType'].value_counts()
devtype_by_country.loc['India']

DevType
Developer, full-stack                                             549
Developer, back-end                                               419
Developer, mobile                                                 346
Developer, back-end;Developer, front-end;Developer, full-stack    289
Developer, front-end                                              215
                                                                 ... 
Engineer, data;Engineer, site reliability;System administrator      1
Engineer, data;Product manager;Senior executive/VP                  1
Engineer, data;System administrator                                 1
Engineer, site reliability;System administrator                     1
Engineering manager;Product manager;Scientist                       1
Name: DevType, Length: 1362, dtype: int64

In [5]:
# Respondents from India who report having worked with Python.
from_india = df['Country'] == 'India'
# na=False: a missing LanguageWorkedWith answer counts as "no match", so the mask is
# strictly boolean instead of relying on how `&` coerces NaN in an object-dtype mask.
# regex=False: match 'Python' as a plain substring, not as a regular expression.
uses_python = df['LanguageWorkedWith'].str.contains('Python', na=False, regex=False)
df[from_india & uses_python]

Unnamed: 0_level_0,MainBranch,Hobbyist,Age,Age1stCode,CompFreq,CompTotal,ConvertedComp,Country,CurrencyDesc,CurrencySymbol,...,SurveyEase,SurveyLength,Trans,UndergradMajor,WebframeDesireNextYear,WebframeWorkedWith,WelcomeChange,WorkWeekHrs,YearsCode,YearsCodePro
Respondent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
22,I am a developer by profession,Yes,,18,Monthly,,,India,Indian rupee,INR,...,Easy,Appropriate in length,,"Computer science, computer engineering, or sof...",,,Just as welcome now as I felt last year,50.0,10,2
63,I am a student who is learning to code,Yes,21.0,17,,,,India,,,...,Easy,Appropriate in length,No,,Angular.js;Django;jQuery;Laravel;Vue.js,,Not applicable - I did not use Stack Overflow ...,,4,
227,I am a developer by profession,Yes,24.0,19,Yearly,840000.0,11728.0,India,Indian rupee,INR,...,Easy,Appropriate in length,No,"Computer science, computer engineering, or sof...",Gatsby,Flask;Spring,Somewhat more welcome now than last year,45.0,5,2
232,I am a student who is learning to code,No,20.0,12,,,,India,,,...,Neither easy nor difficult,Appropriate in length,No,"Computer science, computer engineering, or sof...",Django;jQuery;Laravel,Flask,Just as welcome now as I felt last year,,7,
234,I am a developer by profession,Yes,22.0,16,,,,India,,,...,Easy,Appropriate in length,No,"Computer science, computer engineering, or sof...",Django;Flask,Angular;Angular.js;Django;Flask;jQuery,Just as welcome now as I felt last year,,4,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32165,,Yes,20.0,12,,,,India,,,...,Easy,Appropriate in length,No,"Computer science, computer engineering, or sof...",ASP.NET Core;Django,ASP.NET Core;Django;Drupal;jQuery,Just as welcome now as I felt last year,,8,
37383,,Yes,,,,,,India,,,...,,,,,Django,,,,,
42965,,Yes,,,,,,India,,,...,,,,,Django;Flask,,A lot more welcome now than last year,,,
52184,,Yes,20.0,17,,,,India,,,...,Neither easy nor difficult,Appropriate in length,No,,Angular;Angular.js;jQuery;Ruby on Rails;Vue.js,Django;Flask;React.js,Just as welcome now as I felt last year,,3,


In [28]:
# Number of respondents per country
country_respondents = df['Country'].value_counts()

# Number of respondents per country who report having worked with Python.
# na=False makes missing answers count as "does not know Python" explicitly,
# so every group sums a strictly boolean mask; regex=False matches the literal text.
country_python = country_grp['LanguageWorkedWith'].apply(
    lambda langs: langs.str.contains('Python', na=False, regex=False).sum()
)

# Combine both series into one table (the dict keys become the column names),
# add the percentage of respondents per country that know Python,
# and sort from highest to lowest percentage.
# Built as a single chain (no inplace=True) so re-running the cell always
# rebuilds python_df from its inputs rather than mutating an existing frame.
python_df = (
    pd.concat(
        {'Respondents': country_respondents, 'KnowsPython': country_python},
        axis='columns',
        sort=False,
    )
    .assign(PctKnowsPython=lambda counts: counts['KnowsPython'] / counts['Respondents'] * 100)
    .sort_values(by='PctKnowsPython', ascending=False)
)
python_df

Unnamed: 0,Respondents,KnowsPython,PctKnowsPython
Saint Lucia,1,1,100.0
"Micronesia, Federated States of...",1,1,100.0
Gabon,1,1,100.0
Montenegro,13,9,69.230769
Brunei Darussalam,3,2,66.666667
Guyana,5,3,60.0
Swaziland,7,4,57.142857
Iceland,53,28,52.830189
Finland,349,183,52.43553
Uganda,81,42,51.851852


In [26]:
# Look up a single country's row (Respondents, KnowsPython, PctKnowsPython) by its index label.
# NOTE(review): this cell's execution count (26) is lower than that of the cell that builds
# python_df (28), so it was run out of order - confirm with Restart Kernel -> Run All.
python_df.loc['Japan']

Respondents       297.000000
KnowsPython       132.000000
PctKnowsPython     44.444444
Name: Japan, dtype: float64