In [1]:
import pandas as pd
file = 'Data/Survey.csv'
df = pd.read_csv(file, index_col="Respondent", 
                 usecols=["Respondent", "Hobbyist", "Country", 
                          "Student", "YearsCode", "ConvertedComp", 
                          "LanguageWorkedWith", "Age", "Gender"])
df

Unnamed: 0_level_0,Hobbyist,Country,Student,YearsCode,ConvertedComp,LanguageWorkedWith,Age,Gender
Respondent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,Yes,United Kingdom,No,4,,HTML/CSS;Java;JavaScript;Python,14.0,Man
2,No,Bosnia and Herzegovina,"Yes, full-time",,,C++;HTML/CSS;Python,19.0,Man
3,Yes,Thailand,No,3,8820.0,HTML/CSS,28.0,Man
4,No,United States,No,3,61000.0,C;C++;C#;Python;SQL,22.0,Man
5,Yes,Ukraine,No,16,,C++;HTML/CSS;Java;JavaScript;Python;SQL;VBA,30.0,Man
...,...,...,...,...,...,...,...,...
88377,Yes,Canada,No,,,HTML/CSS;JavaScript;Other(s):,,Man
88601,No,,,,,,,
88802,No,,,,,,,
88816,No,,,,,,,


In [2]:
# Splitting every language in column 'LanguageWorkedWith' using split()
language = df['LanguageWorkedWith'].str.split(';')
language.head(10)

Respondent
1                  [HTML/CSS, Java, JavaScript, Python]
2                               [C++, HTML/CSS, Python]
3                                            [HTML/CSS]
4                             [C, C++, C#, Python, SQL]
5     [C++, HTML/CSS, Java, JavaScript, Python, SQL,...
6                                        [Java, R, SQL]
7                                [HTML/CSS, JavaScript]
8     [Bash/Shell/PowerShell, C, C++, HTML/CSS, Java...
9     [Bash/Shell/PowerShell, C#, HTML/CSS, JavaScri...
10                 [C#, Go, JavaScript, Python, R, SQL]
Name: LanguageWorkedWith, dtype: object

In [3]:
# Splitting 'LanguageWorkedWith' column 2 times
language = df['LanguageWorkedWith'].str.split(';', n=2)
language.head(10)

Respondent
1                   [HTML/CSS, Java, JavaScript;Python]
2                               [C++, HTML/CSS, Python]
3                                            [HTML/CSS]
4                               [C, C++, C#;Python;SQL]
5       [C++, HTML/CSS, Java;JavaScript;Python;SQL;VBA]
6                                        [Java, R, SQL]
7                                [HTML/CSS, JavaScript]
8     [Bash/Shell/PowerShell, C, C++;HTML/CSS;Java;J...
9     [Bash/Shell/PowerShell, C#, HTML/CSS;JavaScrip...
10                    [C#, Go, JavaScript;Python;R;SQL]
Name: LanguageWorkedWith, dtype: object

In [4]:
# Splitting 'LanguageWorkedWith' 3 times and expanding list to Dataframe
language = df['LanguageWorkedWith'].str.split(';', n=8, expand=True)
language.head(10)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8
Respondent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,HTML/CSS,Java,JavaScript,Python,,,,,
2,C++,HTML/CSS,Python,,,,,,
3,HTML/CSS,,,,,,,,
4,C,C++,C#,Python,SQL,,,,
5,C++,HTML/CSS,Java,JavaScript,Python,SQL,VBA,,
6,Java,R,SQL,,,,,,
7,HTML/CSS,JavaScript,,,,,,,
8,Bash/Shell/PowerShell,C,C++,HTML/CSS,Java,JavaScript,Python,SQL,
9,Bash/Shell/PowerShell,C#,HTML/CSS,JavaScript,Python,Ruby,Rust,SQL,TypeScript;WebAssembly;Other(s):
10,C#,Go,JavaScript,Python,R,SQL,,,


In [5]:
# upper() => converting string to uppercase
upper = df['Country'].str.upper()
upper.head(10)

Respondent
1             UNITED KINGDOM
2     BOSNIA AND HERZEGOVINA
3                   THAILAND
4              UNITED STATES
5                    UKRAINE
6                     CANADA
7                    UKRAINE
8                      INDIA
9                NEW ZEALAND
10                     INDIA
Name: Country, dtype: object

In [6]:
# lower() => converting string to lowercase
lower = df['Country'].str.upper()
lower.head(10)

Respondent
1             UNITED KINGDOM
2     BOSNIA AND HERZEGOVINA
3                   THAILAND
4              UNITED STATES
5                    UKRAINE
6                     CANADA
7                    UKRAINE
8                      INDIA
9                NEW ZEALAND
10                     INDIA
Name: Country, dtype: object

In [7]:
# swapcase() => swapping case of the string. 
swap = df['Country'].str.swapcase()
swap.head(10)

Respondent
1             uNITED kINGDOM
2     bOSNIA AND hERZEGOVINA
3                   tHAILAND
4              uNITED sTATES
5                    uKRAINE
6                     cANADA
7                    uKRAINE
8                      iNDIA
9                nEW zEALAND
10                     iNDIA
Name: Country, dtype: object

In [8]:
# title() => making string into title case. 
title = df['Country'].str.title()
title.head(10)

Respondent
1             United Kingdom
2     Bosnia And Herzegovina
3                   Thailand
4              United States
5                    Ukraine
6                     Canada
7                    Ukraine
8                      India
9                New Zealand
10                     India
Name: Country, dtype: object

In [9]:
# strip(): Used to strip characters passd as an argument from starting and ending of string.
# If nothing is passed as parameter the default value of ' ' is used.

# strips characters "HT" from the start and end of string
strip = df['LanguageWorkedWith'].str.strip("HT")
strip.head(10)

Respondent
1                         ML/CSS;Java;JavaScript;Python
2                                   C++;HTML/CSS;Python
3                                                ML/CSS
4                                   C;C++;C#;Python;SQL
5           C++;HTML/CSS;Java;JavaScript;Python;SQL;VBA
6                                            Java;R;SQL
7                                     ML/CSS;JavaScript
8     Bash/Shell/PowerShell;C;C++;HTML/CSS;Java;Java...
9     Bash/Shell/PowerShell;C#;HTML/CSS;JavaScript;P...
10                        C#;Go;JavaScript;Python;R;SQL
Name: LanguageWorkedWith, dtype: object

In [10]:
# rstrip(): Used to strip characters passd as an argument from the end of string.
# If nothing is passed as parameter the default value of ' ' is used.

# strips characters "HT" from the end of string
rstrip = df['LanguageWorkedWith'].str.rstrip("on")
rstrip.head(10)

Respondent
1                         HTML/CSS;Java;JavaScript;Pyth
2                                     C++;HTML/CSS;Pyth
3                                              HTML/CSS
4                                   C;C++;C#;Python;SQL
5           C++;HTML/CSS;Java;JavaScript;Python;SQL;VBA
6                                            Java;R;SQL
7                                   HTML/CSS;JavaScript
8     Bash/Shell/PowerShell;C;C++;HTML/CSS;Java;Java...
9     Bash/Shell/PowerShell;C#;HTML/CSS;JavaScript;P...
10                        C#;Go;JavaScript;Python;R;SQL
Name: LanguageWorkedWith, dtype: object

In [11]:
# lstrip(): Used to strip characters passd as an argument from the start of string.
# If nothing is passed as parameter the default value of ' ' is used.

# strips characters "HT" from the start of string
lstrip = df['LanguageWorkedWith'].str.lstrip("HT")
lstrip.head(10)

Respondent
1                         ML/CSS;Java;JavaScript;Python
2                                   C++;HTML/CSS;Python
3                                                ML/CSS
4                                   C;C++;C#;Python;SQL
5           C++;HTML/CSS;Java;JavaScript;Python;SQL;VBA
6                                            Java;R;SQL
7                                     ML/CSS;JavaScript
8     Bash/Shell/PowerShell;C;C++;HTML/CSS;Java;Java...
9     Bash/Shell/PowerShell;C#;HTML/CSS;JavaScript;P...
10                        C#;Go;JavaScript;Python;R;SQL
Name: LanguageWorkedWith, dtype: object

In [12]:
# replace(): to replaces the string passed as the first argument in the 
# function with the string passed as the second argument of this function.

# rplace Man with Male
print("Before Replace: \n", df['Gender'].head(10))
replace = df['Gender'].str.replace("Man", "Male")
print()
print("After Replace: \n", replace.head(10))

Before Replace: 
 Respondent
1     Man
2     Man
3     Man
4     Man
5     Man
6     Man
7     Man
8     Man
9     Man
10    NaN
Name: Gender, dtype: object

After Replace: 
 Respondent
1     Male
2     Male
3     Male
4     Male
5     Male
6     Male
7     Male
8     Male
9     Male
10     NaN
Name: Gender, dtype: object


In [13]:
# slice(): special function which is used to truncate the string in the column
# Trancate country columns to 3 characters
slice = df['Country'].str.slice(start=0, stop=3)
slice.head(5)

Respondent
1    Uni
2    Bos
3    Tha
4    Uni
5    Ukr
Name: Country, dtype: object

In [14]:
# Every 3rd character after 1st character
slice = df['Country'].str.slice(start=0, stop=-1, step=3) 
slice.head(5)

Respondent
1      Ut no
2    Bn deev
3        Tin
4       Ut a
5         Ua
Name: Country, dtype: object

In [22]:
# isalpha() => eturns True if all characters in a 
# string of a column are alphabets else returns False

alpha = df['Gender'].str.isalpha()
print("Gender Column \n",alpha.head(10))
print()
alpha2 = df['YearsCode'].str.isalpha()
print("Year Column \n",alpha2.head(10))

Gender Column 
 Respondent
1     True
2     True
3     True
4     True
5     True
6     True
7     True
8     True
9     True
10     NaN
Name: Gender, dtype: object

Year Column 
 Respondent
1     False
2       NaN
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
Name: YearsCode, dtype: object


In [23]:
# isnumeric() => returns True if all characters in a 
# string of a column are numbers else returns False

numeric = df['Gender'].str.isnumeric()
print("Gender Column \n",numeric.head(10))
print()
numeric2 = df['YearsCode'].str.isnumeric()
print("Year Column \n",numeric2.head(10))

Gender Column 
 Respondent
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10      NaN
Name: Gender, dtype: object

Year Column 
 Respondent
1     True
2      NaN
3     True
4     True
5     True
6     True
7     True
8     True
9     True
10    True
Name: YearsCode, dtype: object


In [24]:
# len() => returns length of string in columns
length = df['LanguageWorkedWith'].str.len()
length.head(10)

Respondent
1     31.0
2     19.0
3      8.0
4     19.0
5     43.0
6     10.0
7     19.0
8     63.0
9     98.0
10    29.0
Name: LanguageWorkedWith, dtype: float64