In [908]:
import pandas as pd
import os

In [909]:
# Read the raw skills table once; `data_backup` stays untouched while `data`
# is the working copy that the following cells transform.
path = os.path.join('OriginalData', 'skills.csv')
data_backup = pd.read_csv(path)
data = data_backup.copy()

In [910]:
# Work on the first 1000 rows only. `.copy()` detaches the subset from the
# full frame so the column assignments in later cells operate on an
# independent DataFrame instead of a slice (avoids SettingWithCopyWarning).
data = data.head(1000).copy()

In [911]:
# Number of distinct skill values at this point, i.e. before the
# normalisation cells below have run.
data.skill.nunique()

481

In [912]:
# Normalise case: every skill becomes lowercase
data['skill'] = data.skill.str.lower()

In [913]:
# Trim whitespace from both ends of every skill
data['skill'] = data.skill.str.strip()

In [914]:
# Truncate every skill to its first three words. Longer skills are shortened,
# not removed. Splitting on whitespace and re-joining with single spaces also
# collapses runs of inner whitespace.
# The .str accessors propagate missing values instead of raising, which a
# plain Python lambda over the split result would do on NaN.
data['skill'] = data['skill'].str.split().str[:3].str.join(' ')

In [915]:
from fuzzywuzzy import fuzz
from collections import defaultdict

def group_strings(strings, reference_strings, similarity_threshold):
    """Bucket candidate strings under every reference string they resemble.

    Each reference is scored against each candidate with
    ``fuzz.token_set_ratio``; a candidate is added to the reference's bucket
    when the score is at least ``similarity_threshold``.

    Args:
        strings: Candidate strings to classify. Iterated once per reference,
            so it must be re-iterable (e.g. a list).
        reference_strings: Strings used as bucket keys.
        similarity_threshold: Minimum score (inclusive) for a match.

    Returns:
        A ``defaultdict(list)`` keyed by reference string. Only references
        with at least one match are present; matches keep the order of
        ``strings``. A candidate may appear under several references.
    """
    matches = defaultdict(list)
    for reference in reference_strings:
        for candidate in strings:
            if fuzz.token_set_ratio(reference, candidate) < similarity_threshold:
                continue
            matches[reference].append(candidate)
    return matches

In [916]:
# Seed keywords for each skill category (mixed English / Turkish).
it_groups = [
    "Web", "Programming", "C", "C++", "Python", "Java", ".NET",
    "command line", "yapay zeka", "Project", "Testing", "Git", "Oyun",
    "Development", "LaTeX", "Math", "Technology", "Data", "Database",
    "Mühendis", "Sorun", "Object Oriented", "Code",
]
it_groups.extend([
    "Linux", "Windows", "Mac", "Android", "iOS", "SQL", "Database",
    "Network", "Security", "Cloud", "DevOps", "Data", "Machine Learning",
])
business_group = ["Finance", "Marketing", "Business", "İş", "Pazarlama"]
design_group = [
    "Design", "Photoshop", "Illustrator", "InDesign", "After Effects",
    "Premiere", "3D", "Animation", "Video", "Audio", "Motion Graphics",
]

# Lowercase every keyword so it lines up with the lowercased skill column
it_groups = list(map(str.lower, it_groups))
business_group = list(map(str.lower, business_group))
design_group = list(map(str.lower, design_group))

# Flat list of all keywords: IT first, then business, then design
skill_groups = [*it_groups, *business_group, *design_group]
skill_groups

['web',
 'programming',
 'c',
 'c++',
 'python',
 'java',
 '.net',
 'command line',
 'yapay zeka',
 'project',
 'testing',
 'git',
 'oyun',
 'development',
 'latex',
 'math',
 'technology',
 'data',
 'database',
 'mühendis',
 'sorun',
 'object oriented',
 'code',
 'linux',
 'windows',
 'mac',
 'android',
 'ios',
 'sql',
 'database',
 'network',
 'security',
 'cloud',
 'devops',
 'data',
 'machine learning',
 'finance',
 'marketing',
 'business',
 'i̇ş',
 'pazarlama',
 'design',
 'photoshop',
 'illustrator',
 'indesign',
 'after effects',
 'premiere',
 '3d',
 'animation',
 'video',
 'audio',
 'motion graphics']

In [917]:
# Distinct skill strings in ascending order
skills = sorted(data['skill'].unique().tolist())

In [918]:
# Fuzzy-match every unique skill against the category keywords.
# The reference list is rebuilt from the per-category lists instead of being
# read from `skill_groups`, because this cell rebinds `skill_groups` from a
# keyword list to the result dict; the outcome therefore no longer depends on
# whether the cell has been run before.
# dict.fromkeys collapses repeated keywords (e.g. 'data', 'database') while
# keeping their order, so a keyword's matches are not appended twice.
# 50 is the minimum similarity score passed to group_strings.
reference_keywords = list(dict.fromkeys(it_groups + business_group + design_group))
skill_groups = group_strings(skills, reference_keywords, 50)

In [919]:
def _skills_for(keywords):
    """Return the sorted, de-duplicated skills matched by any of ``keywords``."""
    matched = set()
    for keyword, members in skill_groups.items():
        if keyword in keywords:
            matched.update(members)
    return sorted(matched)


# Flatten the fuzzy-match buckets into one sorted skill list per category
it_skills = _skills_for(it_groups)
business_skills = _skills_for(business_group)
design_skills = _skills_for(design_group)


In [920]:
# Add one zero-initialised counter column per skill category
for counter_column in ('it_skills_count', 'business_skills_count', 'design_skills_count'):
    data[counter_column] = 0


In [921]:
# For every user, count how many of their skill rows belong to each category
# and store that total on each of the user's rows.
# This is done with vectorized pandas operations rather than a Python loop
# over users, so the whole frame is not re-masked once per user and the
# notebook-level `skills` list is not overwritten as a side effect.
category_members = {
    'it_skills_count': it_skills,
    'business_skills_count': business_skills,
    'design_skills_count': design_skills,
}
for count_column, members in category_members.items():
    # 1 where the row's skill is in the category, else 0
    is_member = data['skill'].isin(members).astype('int64')
    # Per-user total, broadcast back onto every row of that user
    per_user_total = is_member.groupby(data['user_id']).transform('sum')
    # groupby skips rows with a missing user_id; those rows keep a count of 0
    data[count_column] = per_user_total.fillna(0).astype('int64')

# Drop the skill column
data = data.drop(columns=['skill'])

# Drop duplicates: rows that differed only by skill are now identical
data = data.drop_duplicates()

In [923]:
# Save data to a new csv file in the PreparedData folder.
# The folder is created first because to_csv does not create missing
# directories and would fail on a fresh checkout.
output_dir = 'PreparedData'
os.makedirs(output_dir, exist_ok=True)
data.to_csv(os.path.join(output_dir, 'skills.csv'), index=False)