In [None]:
!pip install argostranslate

In [None]:
import pandas as pd
import re
import argostranslate.package
import argostranslate.translate

In [None]:
# Colab-only: mount Google Drive so the dataset paths under /content/drive
# used by the load and export cells resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Load data into a DataFrame

In [None]:
# Read the scraped CSV from the mounted Drive into a DataFrame.
raw_csv_path = '/content/drive/MyDrive/Datasets/Scraped/NIDS.csv'
data = pd.read_csv(raw_csv_path)

### Data Exploration

In [None]:
data.shape

(140, 4)

In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140 entries, 0 to 139
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Name             140 non-null    object
 1   Education        71 non-null     object
 2   Experiences      91 non-null     object
 3   Graduation_Year  140 non-null    int64 
dtypes: int64(1), object(3)
memory usage: 4.5+ KB


In [None]:
data.head()

Unnamed: 0,Name,Education,Experiences,Graduation_Year
0,Paul Yoan KOME,Ecole Supérieure Privée d'Ingénierie et de Tec...,"Within the executive B2B division, SFR needs i...",1973
1,Marwen Ben Mihoub,Ecole Supérieure Privée d'Ingénierie et de Tec...,Currently part of the Attack and Penetration T...,2017
2,Florin Kiss,,.lumen builds glasses that empower the blind t...,0
3,Mehdi JAAFAR,,-Conception and full stack implementation of a...,0
4,Ayoub Belhadjyahia,Ecole Supérieure Privée d'Ingénierie et de Tec...,,2024


In [None]:
data.isnull().sum()

Name                0
Education          69
Experiences        49
Graduation_Year     0
dtype: int64

In [None]:
data['Name'].value_counts()

Paul Yoan KOME          1
Jérémie L.              1
Nesrine Talbi           1
Siwar Dahmani           1
Mohamed Aymen Ourabi    1
                       ..
sara ibrahim            1
HNIA M'hamed            1
Ahmed Belkahla          1
Oussama FILALI          1
Dr Magda Chelly         1
Name: Name, Length: 140, dtype: int64

In [None]:
data.nunique()

Name               140
Education           71
Experiences         91
Graduation_Year     18
dtype: int64

In [None]:
# Cast every column to str. Gotcha: missing values become the literal string
# 'nan' (the "empty experiences" filter further down matches on it), and
# Graduation_Year stops being an integer column from this point on.
data = data.astype(str)

### Data Cleaning


#### Remove duplicate profiles

In [None]:
# Keep only the first row seen for each Name; later rows repeating a Name are dropped.
data = data.drop_duplicates(subset='Name', keep='first')

In [None]:
data.shape

(140, 4)

In [None]:
data.head(10)

Unnamed: 0,Name,Education,Experiences,Graduation_Year
0,Paul Yoan KOME,Ecole Supérieure Privée d'Ingénierie et de Tec...,"Within the executive B2B division, SFR needs i...",1973
1,Marwen Ben Mihoub,Ecole Supérieure Privée d'Ingénierie et de Tec...,Currently part of the Attack and Penetration T...,2017
2,Florin Kiss,,.lumen builds glasses that empower the blind t...,0
3,Mehdi JAAFAR,,-Conception and full stack implementation of a...,0
4,Ayoub Belhadjyahia,Ecole Supérieure Privée d'Ingénierie et de Tec...,,2024
5,Yasmine Karaoui,Ecole Supérieure Privée d'Ingénierie et de Tec...,"During my summer internship, I had the opportu...",2024
6,Yosr GHOZZI,,Skills: Automatic Learning · Improvement of bu...,0
7,Christopher (Kit) Henry,,Cyber Security Program Manager / System Engine...,0
8,Arij M'tar,Ecole Supérieure Privée d'Ingénierie et de Tec...,BUSINESS INTELLIGENCE Project: Development of ...,2020
9,raed bahria,Ecole Supérieure Privée d'Ingénierie et de Tec...,,2024


#### Remove leading and trailing whitespace from text

In [None]:
# Trim surrounding whitespace so whitespace-only entries become '' and are
# caught by the empty-experience filter that follows.
data['Experiences'] = data['Experiences'].str.strip()

In [None]:
# Drop leading/trailing whitespace from the Education text.
data['Education'] = data['Education'].str.strip()

#### Remove empty experiences

In [None]:
# Values treated as an empty Experiences entry ('nan' is what astype(str)
# produced for missing values).
empty_markers = ['', 'nan', 'No Result']
has_experience = ~data['Experiences'].isin(empty_markers)
data = data[has_experience]

In [None]:
data.shape

(91, 4)

In [None]:
data

Unnamed: 0,Name,Education,Experiences,Graduation_Year
0,Paul Yoan KOME,Ecole Supérieure Privée d'Ingénierie et de Tec...,"Within the executive B2B division, SFR needs i...",1973
1,Marwen Ben Mihoub,Ecole Supérieure Privée d'Ingénierie et de Tec...,Currently part of the Attack and Penetration T...,2017
2,Florin Kiss,,.lumen builds glasses that empower the blind t...,0
3,Mehdi JAAFAR,,-Conception and full stack implementation of a...,0
5,Yasmine Karaoui,Ecole Supérieure Privée d'Ingénierie et de Tec...,"During my summer internship, I had the opportu...",2024
...,...,...,...,...
132,Omar YAZIDI,Ecole Supérieure Privée d'Ingénierie et de Tec...,"Design, Implementation and Evaluation of an An...",2017
134,BALLESTEROS Hugo,,Governance:\n- Development of security policie...,0
135,Richard Dufour,,Skills: DNS · Nmap · AngularJS · Python (progr...,0
138,Roman Vitkovitsky,,Dedicated Plankholder and Servant Leader Opera...,0


#### Translation of Experiences to English

In [None]:
# Language pair used by the translation helper: French source, English target.
from_code = "fr"
to_code = "en"

# Refresh the remote package index and list what can be installed.
argostranslate.package.update_package_index()
available_packages = argostranslate.package.get_available_packages()

# Pick the first package matching the language pair. A None sentinel is used
# so a missing package produces a clear error instead of a bare StopIteration.
package_to_install = next(
    (
        pkg
        for pkg in available_packages
        if pkg.from_code == from_code and pkg.to_code == to_code
    ),
    None,
)
if package_to_install is None:
    raise RuntimeError(
        f"No Argos Translate package available for {from_code!r} -> {to_code!r}"
    )

# Download the package archive and install it from the downloaded path.
argostranslate.package.install_from_path(package_to_install.download())

# Define a function to translate a text from French to English
def translate_fr_to_en(text):
    """Translate ``text`` with Argos Translate using the module-level language pair.

    Args:
        text: Value to translate. Only ``str`` values are translated.

    Returns:
        The translated string for ``str`` input (from ``from_code`` to
        ``to_code``); any non-string value is returned unchanged.
    """
    # Non-string values are passed straight through.
    if not isinstance(text, str):
        return text
    return argostranslate.translate.translate(text, from_code, to_code)

In [None]:
# Run every Experiences entry through the translation helper and store the result
# back in the same column.
translated_experiences = data['Experiences'].apply(translate_fr_to_en)
data['Experiences'] = translated_experiences

In [None]:
data.head(10)

Unnamed: 0,Name,Education,Experiences,Graduation_Year
0,Paul Yoan KOME,Ecole Supérieure Privée d'Ingénierie et de Tec...,"Within the executive B2B division, SFR needs i...",1973
1,Marwen Ben Mihoub,Ecole Supérieure Privée d'Ingénierie et de Tec...,Currently part of the Attack and Penetration T...,2017
2,Florin Kiss,,.lumen builds glasses that empower the blind t...,0
3,Mehdi JAAFAR,,-Conception and full stack implementation of a...,0
5,Yasmine Karaoui,Ecole Supérieure Privée d'Ingénierie et de Tec...,"During my summer internship, I had the opportu...",2024
6,Yosr GHOZZI,,Skills: Automatic Learning · Improvement of bu...,0
7,Christopher (Kit) Henry,,Cyber Security Program Manager / System Engine...,0
8,Arij M'tar,Ecole Supérieure Privée d'Ingénierie et de Tec...,BUSINESS INTELLIGENCE Project: Development of ...,2020
10,Saif Faleh,Ecole Supérieure Privée d'Ingénierie et de Tec...,Design and Development of an application for D...,2014
15,Talel CHELBI,Ecole Supérieure Privée d'Ingénierie et de Tec...,Cyber security compliance\nCyber security asse...,2017


#### Extraction of graduation year from ESPRIT

In [None]:
# Regular expression pattern to match a standalone four-digit number (a candidate year)
year_pattern = r"\b\d{4}\b"

# Function to extract graduation year
def extract_grad_year(education_str):
    """Return the last four-digit number found in ``education_str`` as an int.

    Args:
        education_str: Education text to scan. Non-string values (e.g. a float
            NaN) are treated as having no year.

    Returns:
        int: The last match of ``year_pattern`` converted to ``int``, or ``0``
        when the input is not a string or contains no four-digit number.
    """
    # Guard against non-string input; re.findall would raise TypeError on it.
    if not isinstance(education_str, str):
        return 0
    years = re.findall(year_pattern, education_str)
    # Return a plain int (not a pd.Series) so Series.apply yields a Series with
    # a single consistent type instead of a DataFrame mixing str years and int 0.
    return int(years[-1]) if years else 0

# Apply extract_grad_year to the 'Education' column and overwrite the existing
# 'Graduation_Year' column (already present in the loaded CSV) with the result.
data['Graduation_Year'] = data['Education'].apply(extract_grad_year)

In [None]:
data.head(10)

Unnamed: 0,Name,Education,Experiences,Graduation_Year
0,Paul Yoan KOME,Ecole Supérieure Privée d'Ingénierie et de Tec...,"Within the executive B2B division, SFR needs i...",1973
1,Marwen Ben Mihoub,Ecole Supérieure Privée d'Ingénierie et de Tec...,Currently part of the Attack and Penetration T...,2017
2,Florin Kiss,,.lumen builds glasses that empower the blind t...,0
3,Mehdi JAAFAR,,-Conception and full stack implementation of a...,0
5,Yasmine Karaoui,Ecole Supérieure Privée d'Ingénierie et de Tec...,"During my summer internship, I had the opportu...",2024
6,Yosr GHOZZI,,Skills: Automatic Learning · Improvement of bu...,0
7,Christopher (Kit) Henry,,Cyber Security Program Manager / System Engine...,0
8,Arij M'tar,Ecole Supérieure Privée d'Ingénierie et de Tec...,BUSINESS INTELLIGENCE Project: Development of ...,2020
10,Saif Faleh,Ecole Supérieure Privée d'Ingénierie et de Tec...,Design and Development of an application for D...,2014
15,Talel CHELBI,Ecole Supérieure Privée d'Ingénierie et de Tec...,Cyber security compliance\nCyber security asse...,2017


In [None]:
data[data['Education']=='nan'].count()

Name               48
Education          48
Experiences        48
Graduation_Year    48
dtype: int64

In [None]:
data.Graduation_Year.value_counts()

0       49
2020     7
2019     6
2017     5
2022     4
2023     4
2014     3
2018     3
2024     2
2025     2
1973     1
2016     1
2008     1
2021     1
2013     1
2010     1
Name: Graduation_Year, dtype: int64

In [None]:
data.shape

(91, 4)

#### Export results to CSV

In [None]:
# Write the cleaned frame to Drive; the row index is not written.
clean_csv_path = '/content/drive/MyDrive/Datasets/Clean/NIDS.csv'
data.to_csv(clean_csv_path, index=False)