In [243]:
import pandas as pd
import os

In [244]:
path = os.path.join('OriginalData', 'skills.csv')
data = pd.read_csv(path)

In [245]:
data

Unnamed: 0,user_id,skill
0,1,Mühendislik
1,1,Eğitim
2,2,Android
3,2,Java
4,2,3D Studio Max
...,...,...
1398438,66273,CI/CD
1398439,66273,Terraform
1398440,66273,MongoDB
1398441,66273,Go (Programming Language)


In [246]:
# Show amount of rows
data.shape

(1398443, 2)

In [247]:
# Show amount of unique skills
data['skill'].nunique()

53025

In [248]:
# Lowercase all the skills
data['skill'] = data['skill'].str.lower()

In [249]:
# remove trailing and leading spaces
data['skill'] = data['skill'].str.strip()

In [250]:
# if key is found in dataframe, we will rename to value from this dict
skill_rename_dict = {
    'sql': 'sql',
    'selenium': 'selenium',
    'microsoft': 'microsoft',
    'amazon': 'aws',
    'aws': 'aws',
    'liderlik': 'leadership',
    'lider': 'leadership',
    'oyun': 'game',
    'agile': 'agile',
    'algorit': 'algorithm',
    'program': 'programming',
    'proje': 'project',
    'web': 'web',
}

# Rename now
for key, value in skill_rename_dict.items():
    data['skill'] = data['skill'].apply(lambda x: value if key in x else x)

In [252]:
# Rows to drop
drop_keywords = ['eğitim', 'mühendislik',
                'yabancı dil', 'yabancı dil bilgisi',
                'yabancı', 'ingilizce', 'i̇ngilizce', 'almanca', 'fransızca',
                'rusça', 'arapça', 'türkçe'
                ]
# If keyword is found in skill name, we will drop the row
for keyword in drop_keywords:
    data = data[~data['skill'].str.contains(keyword)]

In [253]:
# Drop rows where skill is only found once in the dataset
data = data.groupby('skill').filter(lambda x: len(x) > 1)

In [254]:
# Remove duplicates
data = data.drop_duplicates()

In [255]:
# Show amount of unique skills after cleaning
data['skill'].nunique()

23451

In [256]:
# Show skills that appear the most
data['skill'].value_counts()

sql                       34539
java                      24911
javascript                24304
c#                        23745
programming               23002
                          ...  
time sensitive network        1
hxc                           1
igaming                       1
neo                           1
b-pro                         1
Name: skill, Length: 23451, dtype: int64

In [257]:
# Show amount of rows
data.shape

(1272196, 2)

In [258]:
# We show how many different skills a user has
data = data.groupby('user_id')['skill'].nunique().reset_index()
# Rename skill into skill_count
data = data.rename(columns={'skill': 'skill_count'})

In [259]:
# Save data to a new csv file in PreparedData folder
data.to_csv(os.path.join('PreparedData', 'skills.csv'), index=False)