In [1]:
import pandas as pd
import re

In [2]:
df=pd.read_csv('internshala_dataset_raw.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2567 entries, 0 to 2566
Data columns (total 9 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   internship                     2567 non-null   object
 1   company_name                   2567 non-null   object
 2   skills                         2220 non-null   object
 3   perks                          1712 non-null   object
 4   location                       2567 non-null   object
 5   duration                       2567 non-null   object
 6   stipend                        2567 non-null   object
 7   applicants                     2567 non-null   object
 8   ifSkillsorPerksMissingUseThis  2535 non-null   object
dtypes: object(9)
memory usage: 180.6+ KB


Step 1: Create Sample DataFrame

In [4]:
df.sample(2)

Unnamed: 0,internship,company_name,skills,perks,location,duration,stipend,applicants,ifSkillsorPerksMissingUseThis
2535,Node.js Development,Monkhub,"JavaScript, Node.js, GitHub, Firebase, Firebas...",,Work From Home,2 Months,8000-12000 /month,191 applicants,JavaScript\nNode.js\nGitHub\nFirebase\nFirebas...
276,Web Development,Dignity Software Private Limited,"PHP, HTML, CSS, JavaScript, Adobe Photoshop, W...","Certificate, Letter of recommendation, Job offer",Delhi,6 Months,2000 /month,Be an early applicant,PHP\nHTML\nCSS\nJavaScript\nAdobe Photoshop\nW...


Step 2: Replace String Values with Regex in Column

Let's start with replacing string values in column applicants. As you can see the values in the column are mixed.

There are two options:

In [5]:
df['applicants'].unique()

array(['119 applicants', '194 applicants', '113 applicants',
       '183 applicants', '205 applicants', '457 applicants',
       '54 applicants', '618 applicants', 'Be an early applicant',
       '34 applicants', '28 applicants', '32 applicants', '41 applicants',
       '35 applicants', '31 applicants', '46 applicants', '49 applicants',
       '57 applicants', '38 applicants', '43 applicants', '44 applicants',
       '75 applicants', '39 applicants', '37 applicants', '48 applicants',
       '47 applicants', '55 applicants', '66 applicants', '27 applicants',
       '26 applicants', '52 applicants', '62 applicants', '68 applicants',
       '72 applicants', '50 applicants', '78 applicants', '70 applicants',
       '84 applicants', '120 applicants', '139 applicants',
       '33 applicants', '45 applicants', '263 applicants',
       '59 applicants', '95 applicants', '36 applicants', '64 applicants',
       '89 applicants', '86 applicants', '76 applicants', '63 applicants',
       '107 appli

In [6]:
df['applicants']= df['applicants'].str.replace(r'\sapplicants', '', regex=True)
df['applicants'].unique()

array(['119', '194', '113', '183', '205', '457', '54', '618',
       'Be an early applicant', '34', '28', '32', '41', '35', '31', '46',
       '49', '57', '38', '43', '44', '75', '39', '37', '48', '47', '55',
       '66', '27', '26', '52', '62', '68', '72', '50', '78', '70', '84',
       '120', '139', '33', '45', '263', '59', '95', '36', '64', '89',
       '86', '76', '63', '107', '143', '80', '42', '58', '127', '110',
       '309', '91', '30', '82', '93', '85', '129', '117', '29', '60',
       '40', '61', '51', '122', '192', '230', '102', '100', '88', '268',
       '115', '265', '56', '87', '65', '238', '144', '69', '299', '121',
       '233', '125', '79', '174', '155', '73', '104', '135', '156', '167',
       '131', '164', '94', '81', '53', '71', '106', '142', '83', '178',
       '74', '114', '96', '90', '172', '123', '103', '405', '99', '108',
       '150', '185', '145', '202', '184', '128', '130', '147', '77',
       '260', '134', '67', '351', '136', '379', '179', '92', '204', '242

In [7]:
df['applicants_clean']=df['applicants'].str.replace(r'(\sapplicants|Be an early applicant)', '', regex=True)
df['applicants_clean'].unique()

array(['119', '194', '113', '183', '205', '457', '54', '618', '', '34',
       '28', '32', '41', '35', '31', '46', '49', '57', '38', '43', '44',
       '75', '39', '37', '48', '47', '55', '66', '27', '26', '52', '62',
       '68', '72', '50', '78', '70', '84', '120', '139', '33', '45',
       '263', '59', '95', '36', '64', '89', '86', '76', '63', '107',
       '143', '80', '42', '58', '127', '110', '309', '91', '30', '82',
       '93', '85', '129', '117', '29', '60', '40', '61', '51', '122',
       '192', '230', '102', '100', '88', '268', '115', '265', '56', '87',
       '65', '238', '144', '69', '299', '121', '233', '125', '79', '174',
       '155', '73', '104', '135', '156', '167', '131', '164', '94', '81',
       '53', '71', '106', '142', '83', '178', '74', '114', '96', '90',
       '172', '123', '103', '405', '99', '108', '150', '185', '145',
       '202', '184', '128', '130', '147', '77', '260', '134', '67', '351',
       '136', '379', '179', '92', '204', '242', '151', '133', '305

Step 3: Regex replace with capture group

In this step we will take a deeper look on regex and capture groups in Pandas.

They are powerful tool to match a pattern and extract only part of it.

Let's say that we would like to match : 63 applicants but only extract the numbers. In other words, to search for a numeric sequence followed by anything.

We are capturing two groups but will keep only the first one - the numbers.

In [8]:
df['applicants'].unique()

array(['119', '194', '113', '183', '205', '457', '54', '618',
       'Be an early applicant', '34', '28', '32', '41', '35', '31', '46',
       '49', '57', '38', '43', '44', '75', '39', '37', '48', '47', '55',
       '66', '27', '26', '52', '62', '68', '72', '50', '78', '70', '84',
       '120', '139', '33', '45', '263', '59', '95', '36', '64', '89',
       '86', '76', '63', '107', '143', '80', '42', '58', '127', '110',
       '309', '91', '30', '82', '93', '85', '129', '117', '29', '60',
       '40', '61', '51', '122', '192', '230', '102', '100', '88', '268',
       '115', '265', '56', '87', '65', '238', '144', '69', '299', '121',
       '233', '125', '79', '174', '155', '73', '104', '135', '156', '167',
       '131', '164', '94', '81', '53', '71', '106', '142', '83', '178',
       '74', '114', '96', '90', '172', '123', '103', '405', '99', '108',
       '150', '185', '145', '202', '184', '128', '130', '147', '77',
       '260', '134', '67', '351', '136', '379', '179', '92', '204', '242

In [9]:
df['applicants_3']=df['applicants'].replace(to_replace=r"([0-9,\.]+)(.*)", value=r"\1", regex=True)
df['applicants_3'].unique()

array(['119', '194', '113', '183', '205', '457', '54', '618',
       'Be an early applicant', '34', '28', '32', '41', '35', '31', '46',
       '49', '57', '38', '43', '44', '75', '39', '37', '48', '47', '55',
       '66', '27', '26', '52', '62', '68', '72', '50', '78', '70', '84',
       '120', '139', '33', '45', '263', '59', '95', '36', '64', '89',
       '86', '76', '63', '107', '143', '80', '42', '58', '127', '110',
       '309', '91', '30', '82', '93', '85', '129', '117', '29', '60',
       '40', '61', '51', '122', '192', '230', '102', '100', '88', '268',
       '115', '265', '56', '87', '65', '238', '144', '69', '299', '121',
       '233', '125', '79', '174', '155', '73', '104', '135', '156', '167',
       '131', '164', '94', '81', '53', '71', '106', '142', '83', '178',
       '74', '114', '96', '90', '172', '123', '103', '405', '99', '108',
       '150', '185', '145', '202', '184', '128', '130', '147', '77',
       '260', '134', '67', '351', '136', '379', '179', '92', '204', '242

Step 4: Regex replace only special characters

What if we would like to clean or remove all special characters while keeping numbers and letters.

In that case we can use one of the next regex:

r'[^0-9a-zA-Z:,\s]+' - keep numbers, letters, semicolon, comma and space
r'[^0-9a-zA-Z:,]+' - keep numbers, letters, semicolon and comma
So the code looks like:

In [13]:
df['internship'].unique()

array(['Software Testing',
       'Technical Operations - Networking And Monitoring',
       'Software Project Management', 'Web Development',
       'Front End Development', 'Web Design', 'Node.js Development',
       'C++ Development (QT Creation)', 'Quality Control & Testing',
       'Software Support Engineering', 'Full Stack Development (MERN)',
       'Business Analysis',
       'Server Side Software Engineering (Backend-GoLang)',
       'International Technical Support', 'iOS App Development',
       'Mobile App Development', 'AWS Engineering', 'React Development',
       'Product Management', 'Flutter Development', 'Java Development',
       'WordPress Development', 'WordPress Website Design',
       'Cyber Security Audit', 'Android Development', 'Rust Development',
       'Internet Of Things (IoT)',
       'Front End Development (React/Angular)', '.Net Development',
       'PHP Development', 'Full Stack Development', 'Backend Development',
       'Quality Analysis', 'iOS Resea

In [11]:
df4= df['internship'].str.replace(r'[^0-9a-zA-Z:,\s]+', '', regex=True)


In [12]:
df4.unique()

array(['Software Testing',
       'Technical Operations  Networking And Monitoring',
       'Software Project Management', 'Web Development',
       'Front End Development', 'Web Design', 'Nodejs Development',
       'C Development QT Creation', 'Quality Control  Testing',
       'Software Support Engineering', 'Full Stack Development MERN',
       'Business Analysis',
       'Server Side Software Engineering BackendGoLang',
       'International Technical Support', 'iOS App Development',
       'Mobile App Development', 'AWS Engineering', 'React Development',
       'Product Management', 'Flutter Development', 'Java Development',
       'WordPress Development', 'WordPress Website Design',
       'Cyber Security Audit', 'Android Development', 'Rust Development',
       'Internet Of Things IoT', 'Front End Development ReactAngular',
       'Net Development', 'PHP Development', 'Full Stack Development',
       'Backend Development', 'Quality Analysis',
       'iOS Research Video Playback

Step 5: Regex replace numbers or non-digit characters

Now let's check how we can** replace all non digit characters and convert the value to int or remove all numbers from a column**.

Replace all non numeric symbols and map in case of missing
In this example we are going to replace everything which is not a number with a regex. In case of a value which doesn't have a number we will map the value to 0.

In [14]:
df['applicants'].unique()

array(['119', '194', '113', '183', '205', '457', '54', '618',
       'Be an early applicant', '34', '28', '32', '41', '35', '31', '46',
       '49', '57', '38', '43', '44', '75', '39', '37', '48', '47', '55',
       '66', '27', '26', '52', '62', '68', '72', '50', '78', '70', '84',
       '120', '139', '33', '45', '263', '59', '95', '36', '64', '89',
       '86', '76', '63', '107', '143', '80', '42', '58', '127', '110',
       '309', '91', '30', '82', '93', '85', '129', '117', '29', '60',
       '40', '61', '51', '122', '192', '230', '102', '100', '88', '268',
       '115', '265', '56', '87', '65', '238', '144', '69', '299', '121',
       '233', '125', '79', '174', '155', '73', '104', '135', '156', '167',
       '131', '164', '94', '81', '53', '71', '106', '142', '83', '178',
       '74', '114', '96', '90', '172', '123', '103', '405', '99', '108',
       '150', '185', '145', '202', '184', '128', '130', '147', '77',
       '260', '134', '67', '351', '136', '379', '179', '92', '204', '242

In [15]:
df['ap_5']=df['applicants'].str.replace(r'\D+', '', regex=True)
df['ap_5'].unique()

array(['119', '194', '113', '183', '205', '457', '54', '618', '', '34',
       '28', '32', '41', '35', '31', '46', '49', '57', '38', '43', '44',
       '75', '39', '37', '48', '47', '55', '66', '27', '26', '52', '62',
       '68', '72', '50', '78', '70', '84', '120', '139', '33', '45',
       '263', '59', '95', '36', '64', '89', '86', '76', '63', '107',
       '143', '80', '42', '58', '127', '110', '309', '91', '30', '82',
       '93', '85', '129', '117', '29', '60', '40', '61', '51', '122',
       '192', '230', '102', '100', '88', '268', '115', '265', '56', '87',
       '65', '238', '144', '69', '299', '121', '233', '125', '79', '174',
       '155', '73', '104', '135', '156', '167', '131', '164', '94', '81',
       '53', '71', '106', '142', '83', '178', '74', '114', '96', '90',
       '172', '123', '103', '405', '99', '108', '150', '185', '145',
       '202', '184', '128', '130', '147', '77', '260', '134', '67', '351',
       '136', '379', '179', '92', '204', '242', '151', '133', '305

In [62]:
df['ap_5'].sort_values().unique()

array(['', '100', '1000', '101', '102', '103', '104', '105', '106', '107',
       '108', '110', '111', '112', '113', '114', '115', '116', '117',
       '118', '119', '120', '121', '122', '123', '124', '125', '126',
       '127', '128', '129', '130', '131', '132', '133', '134', '135',
       '136', '137', '138', '139', '140', '142', '143', '144', '145',
       '147', '149', '150', '151', '153', '154', '155', '156', '157',
       '160', '161', '162', '163', '164', '166', '167', '169', '170',
       '171', '172', '173', '174', '177', '178', '179', '181', '182',
       '183', '184', '185', '186', '187', '190', '191', '192', '194',
       '197', '199', '202', '203', '204', '205', '206', '208', '209',
       '211', '212', '214', '215', '219', '220', '222', '223', '225',
       '229', '230', '231', '233', '236', '238', '240', '241', '242',
       '245', '246', '247', '252', '254', '256', '257', '259', '26',
       '260', '263', '265', '266', '268', '27', '271', '275', '277',
       '279', '28

In [68]:
f= df['ap_5'] ==''
df[f].fillna

<bound method DataFrame.fillna of                                 internship  \
8                Quality Control & Testing   
9             Software Support Engineering   
10           Full Stack Development (MERN)   
11                       Business Analysis   
12                        Software Testing   
...                                    ...   
2559                Mobile App Development   
2563                       Summer Research   
2564  Academic Research (Computer Science)   
2565                      Computer Science   
2566                      Computer Science   

                                           company_name  \
8                 Veritos Infosolutions Private Limited   
9                                                   Ori   
10                                          Blackcoffer   
11                                          Blackcoffer   
12    iTalent India Management Consultants Private L...   
...                                                 ...   


In [71]:
df['ap_5'].isnull

<bound method Series.isnull of 0       119
1       194
2       113
3       183
4       205
       ... 
2562     30
2563       
2564       
2565       
2566       
Name: ap_5, Length: 2567, dtype: object>

In [115]:
df['regex_1']= df['applicants'].apply(lambda x: re.findall(r'\d+', x))
df['regex_1']

0       [119]
1       [194]
2       [113]
3       [183]
4       [205]
        ...  
2562     [30]
2563       []
2564       []
2565       []
2566       []
Name: regex_1, Length: 2567, dtype: object

In [121]:
type(df['regex_1'])

pandas.core.series.Series

In [123]:
df['regex_1'].values

array([list(['119']), list(['194']), list(['113']), ..., list([]),
       list([]), list([])], dtype=object)

In [92]:
df['regex_1'][0]

['119']

In [145]:
df['regex_1'][0][0]

'119'

In [117]:
len(df['regex_1'])

2567

In [77]:
df[['applicants', 'regex_1']]

Unnamed: 0,applicants,regex_1
0,119 applicants,[119]
1,194 applicants,[194]
2,113 applicants,[113]
3,183 applicants,[183]
4,205 applicants,[205]
...,...,...
2562,30 applicants,[30]
2563,Be an early applicant,[]
2564,Be an early applicant,[]
2565,Be an early applicant,[]


In [150]:
for i in range(len(df['ap_5'])):
    print(df['ap_5'][i])

119
194
113
183
205
457
54
618












34


28







32





41





41



35





31


46


49


34


57

32


38


49
35



43


44






75





39








37

34


48

47














47


55





66

31



31









27


26


32



52






62
31


68

37









66



68

47




72



34









37
26
50



27

35

27
78

47





27
37
70

34


84

120
37
139
75

27





66
33

45

263
59
120
95



36
28
119

64
66
89

43
31
55
39


75

86





59

76



63






35



27

107

68
31



44

143
80
42


37
33
39
58
127

110
42



36
36
309
43


38
66



32
27
72




55
62

39
91
43
33


30
120




82




35






93

75


85

33
129



32
38







117





29


60
38

31
48

40





64


61


51

37
60




37


45

29
45

44
























38

122

27
192


49
48
49
48

27







70
93


230
40
95







34


34








43






31

102


68
29
29
29
29
29
29
29
29
29
29
29
29
29
29
29
29
29
29
29
29
29
29
29
29
29
39
39
39
39
80
93
84
28

100
32
61

88
268

55

In [167]:
from warnings import catch_warnings
import numpy as np
y=0
x=[]
for i in range(len(df['regex_1'])):
    try: 
        x.append(df['regex_1'][i][0])
    
    except:
        y=y+1
    


In [169]:
x,y

(['119',
  '194',
  '113',
  '183',
  '205',
  '457',
  '54',
  '618',
  '34',
  '28',
  '32',
  '41',
  '41',
  '35',
  '31',
  '46',
  '49',
  '34',
  '57',
  '32',
  '38',
  '49',
  '35',
  '43',
  '44',
  '75',
  '39',
  '37',
  '34',
  '48',
  '47',
  '47',
  '55',
  '66',
  '31',
  '31',
  '27',
  '26',
  '32',
  '52',
  '62',
  '31',
  '68',
  '37',
  '66',
  '68',
  '47',
  '72',
  '34',
  '37',
  '26',
  '50',
  '27',
  '35',
  '27',
  '78',
  '47',
  '27',
  '37',
  '70',
  '34',
  '84',
  '120',
  '37',
  '139',
  '75',
  '27',
  '66',
  '33',
  '45',
  '263',
  '59',
  '120',
  '95',
  '36',
  '28',
  '119',
  '64',
  '66',
  '89',
  '43',
  '31',
  '55',
  '39',
  '75',
  '86',
  '59',
  '76',
  '63',
  '35',
  '27',
  '107',
  '68',
  '31',
  '44',
  '143',
  '80',
  '42',
  '37',
  '33',
  '39',
  '58',
  '127',
  '110',
  '42',
  '36',
  '36',
  '309',
  '43',
  '38',
  '66',
  '32',
  '27',
  '72',
  '55',
  '62',
  '39',
  '91',
  '43',
  '33',
  '30',
  '120',
  '82'