In [39]:
import pandas as pd
# Relative path to the survey export.
file = 'Data/Survey.csv'

# Only these columns are read; "Respondent" is listed because it becomes the index.
survey_columns = ["Respondent", "Hobbyist", "Country",
                  "Student", "YearsCode", "ConvertedComp",
                  "LanguageWorkedWith", "Age", "Gender"]

df = pd.read_csv(file, usecols=survey_columns, index_col="Respondent")
df.head()

Unnamed: 0_level_0,Hobbyist,Country,Student,YearsCode,ConvertedComp,LanguageWorkedWith,Age,Gender
Respondent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,Yes,United Kingdom,No,4.0,,HTML/CSS;Java;JavaScript;Python,14.0,Man
2,No,Bosnia and Herzegovina,"Yes, full-time",,,C++;HTML/CSS;Python,19.0,Man
3,Yes,Thailand,No,3.0,8820.0,HTML/CSS,28.0,Man
4,No,United States,No,3.0,61000.0,C;C++;C#;Python;SQL,22.0,Man
5,Yes,Ukraine,No,16.0,,C++;HTML/CSS;Java;JavaScript;Python;SQL;VBA,30.0,Man


In [30]:
# Frequency of each distinct answer in the Gender column.
# value_counts() skips NaN, so the missing answers are counted separately below.
genderCount = df['Gender'].value_counts()
print(genderCount)

nanCount = df['Gender'].isna().sum()
print(f"\nNaN: {nanCount}")

Man                                                            77919
Woman                                                           6344
Non-binary, genderqueer, or gender non-conforming                597
Man;Non-binary, genderqueer, or gender non-conforming            181
Woman;Non-binary, genderqueer, or gender non-conforming          163
Woman;Man                                                        132
Woman;Man;Non-binary, genderqueer, or gender non-conforming       70
Name: Gender, dtype: int64

NaN: 3477


In [31]:
# Relabel 'Man' -> 'Male' and 'Woman' -> 'Female' with Series.map.
# NOTE: with a dict, map() turns every value that is not a key into NaN,
# so all other Gender answers are discarded here (see the NaN count below).
gender_labels = {'Man': 'Male', 'Woman': 'Female'}
df['Gender'] = df['Gender'].map(gender_labels)

genderValueCount = df['Gender'].value_counts()
print(genderValueCount)

nanCount = df['Gender'].isna().sum()
print(f"\nNaN: {nanCount}")

Male      77919
Female     6344
Name: Gender, dtype: int64

NaN: 4620


In [32]:
# Function to calculate the number of languages in a ';'-separated answer
def get_TotalLanguage(x):
    """Return how many languages are listed in a ';'-separated answer.

    Parameters
    ----------
    x : str or missing value
        One cell of the language column, e.g. "C;C++;Python".
        Missing cells (NaN / None / pd.NA) are accepted.

    Returns
    -------
    int
        Number of non-empty entries; 0 for a missing or empty answer.
    """
    # A missing answer must not be stringified: str(NaN) is 'nan', which
    # would be counted as one language.
    if not isinstance(x, str) and pd.isna(x):
        return 0
    # Skip empty tokens so "" and a trailing ';' do not inflate the count.
    return sum(1 for token in str(x).split(';') if token.strip())

In [33]:
# Series.apply calls get_TotalLanguage once per respondent; the resulting
# counts are stored in a new column next to the raw answer.
language_counts = df['LanguageWorkedWith'].apply(get_TotalLanguage)
df['NumberOfLangauge'] = language_counts
df[['LanguageWorkedWith', 'NumberOfLangauge']].head()

Unnamed: 0_level_0,LanguageWorkedWith,NumberOfLangauge
Respondent,Unnamed: 1_level_1,Unnamed: 2_level_1
1,HTML/CSS;Java;JavaScript;Python,4
2,C++;HTML/CSS;Python,3
3,HTML/CSS,1
4,C;C++;C#;Python;SQL,5
5,C++;HTML/CSS;Java;JavaScript;Python;SQL;VBA,7


In [6]:
# df.loc[:, ['ConvertedComp', 'Age']].apply(max, axis=0)

In [35]:
# Keep only the two numeric columns used for the DataFrame.apply examples.
# Note that this rebinds `df`, so the other survey columns are no longer
# reachable through it after this cell.
numeric_columns = ['ConvertedComp', 'Age']
df = df.loc[:, numeric_columns]
df.head()

Unnamed: 0_level_0,ConvertedComp,Age
Respondent,Unnamed: 1_level_1,Unnamed: 2_level_1
1,,14.0
2,,19.0
3,8820.0,28.0
4,61000.0,22.0
5,,30.0


In [36]:
# Drop every row that has at least one NA value (modifies df in place).
df.dropna(axis=0, how='any', inplace=True)
df.head()

Unnamed: 0_level_0,ConvertedComp,Age
Respondent,Unnamed: 1_level_1,Unnamed: 2_level_1
3,8820.0,28.0
4,61000.0,22.0
6,366420.0,28.0
9,95179.0,23.0
13,90000.0,28.0


In [37]:
# DataFrame.apply with axis='index' (same as axis=0) hands each column to
# max, giving one maximum per column.
df.apply(max, axis='index')

ConvertedComp    2000000.0
Age                   99.0
dtype: float64

In [38]:
# DataFrame.apply with axis='columns' (same as axis=1) hands each row to
# max, giving one maximum per respondent.
df.apply(max, axis='columns')

Respondent
3           8820.0
4          61000.0
6         366420.0
9          95179.0
13         90000.0
           ...    
88877    2000000.0
88878     130000.0
88879      82488.0
88881      68745.0
88883      22915.0
Length: 53513, dtype: float64

In [40]:
# New DataFrame to understand APPLYMAP.
# By this point `df` only holds 'ConvertedComp' and 'Age', so selecting the
# text columns from it raises KeyError on a top-to-bottom run. Read them from
# the CSV again so this cell does not depend on re-running the load cell.
text_columns = ["Hobbyist", "Country", "LanguageWorkedWith", "Gender"]
df = pd.read_csv(file, index_col="Respondent",
                 usecols=["Respondent"] + text_columns)
# Fix the column order explicitly, then drop rows with any missing value
# (len() in the next step needs a string in every cell).
df = df.loc[:, text_columns].dropna()
df.head()

Unnamed: 0_level_0,Hobbyist,Country,LanguageWorkedWith,Gender
Respondent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Yes,United Kingdom,HTML/CSS;Java;JavaScript;Python,Man
2,No,Bosnia and Herzegovina,C++;HTML/CSS;Python,Man
3,Yes,Thailand,HTML/CSS,Man
4,No,United States,C;C++;C#;Python;SQL,Man
5,Yes,Ukraine,C++;HTML/CSS;Java;JavaScript;Python;SQL;VBA,Man


In [41]:
# DataFrame.applymap works element-wise: len is called on every single cell.
cell_lengths = df.applymap(len)
cell_lengths.head()

Unnamed: 0_level_0,Hobbyist,Country,LanguageWorkedWith,Gender
Respondent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,3,14,31,3
2,2,22,19,3
3,3,8,8,3
4,2,13,19,3
5,3,7,43,3
