In [14]:
import pandas as pd
import os

In [15]:
path = os.path.join('OriginalData', 'skills.csv')
data = pd.read_csv(path)
data_backup = data.copy()

In [16]:
# Show amount of unique skills
data['skill'].nunique()

53025

In [17]:
# Lowercase all the skills
data['skill'] = data['skill'].str.lower()

In [18]:
# remove trailing and leading spaces
data['skill'] = data['skill'].str.strip()

In [19]:
# Remove skills that are longer than 3 words
data['skill'] = data['skill'].str.split().apply(lambda x: ' '.join(x[:3]))

In [20]:
from fuzzywuzzy import fuzz
from collections import defaultdict

def group_strings(strings, reference_strings, similarity_threshold):
    groups = defaultdict(list)
    for i, string1 in enumerate(reference_strings):
        for j, string2 in enumerate(strings):
            similarity = fuzz.token_set_ratio(string1, string2)
            if similarity >= similarity_threshold:
                groups[string1].append(string2)
    return groups

In [21]:
it_groups = ["Web", "Programming", "C", "C++", "Python", "Java", ".NET", "command line", "yapay zeka", "Project", "Testing", "Git", "Oyun", "Development", "LaTeX", "Math", "Technology", "Data", "Database", "Mühendis", "Sorun", "Object Oriented", "Code"]
it_groups += ["Linux", "Windows", "Mac", "Android", "iOS", "SQL", "Database", "Network", "Security", "Cloud", "DevOps", "Data", "Machine Learning"]
business_group = ["Finance", "Marketing", "Business", "İş", "Pazarlama", ]
design_group = ["Design", "Photoshop", "Illustrator", "InDesign", "After Effects", "Premiere", "3D", "Animation", "Video", "Audio", "Motion Graphics"]
# Lower all the groups
it_groups = [x.lower() for x in it_groups]
business_group = [x.lower() for x in business_group]
design_group = [x.lower() for x in design_group]
skill_groups = it_groups + business_group + design_group
skill_groups

['web',
 'programming',
 'c',
 'c++',
 'python',
 'java',
 '.net',
 'command line',
 'yapay zeka',
 'project',
 'testing',
 'git',
 'oyun',
 'development',
 'latex',
 'math',
 'technology',
 'data',
 'database',
 'mühendis',
 'sorun',
 'object oriented',
 'code',
 'linux',
 'windows',
 'mac',
 'android',
 'ios',
 'sql',
 'database',
 'network',
 'security',
 'cloud',
 'devops',
 'data',
 'machine learning',
 'finance',
 'marketing',
 'business',
 'i̇ş',
 'pazarlama',
 'design',
 'photoshop',
 'illustrator',
 'indesign',
 'after effects',
 'premiere',
 '3d',
 'animation',
 'video',
 'audio',
 'motion graphics']

In [22]:
skills = data.skill.unique().tolist()
skills.sort()

In [23]:
skill_groups = group_strings(skills, skill_groups, 50)

In [24]:
# Combine it skills
it_skills = []
for key, value in skill_groups.items():
    if key in it_groups:
        it_skills += value
it_skills = list(set(it_skills))
it_skills.sort()



# Combine business skills
business_skills = []
for key, value in skill_groups.items():
    if key in business_group:
        business_skills += value
business_skills = list(set(business_skills))
business_skills.sort()

# Combine design skills
design_skills = []
for key, value in skill_groups.items():
    if key in design_group:
        design_skills += value
design_skills = list(set(design_skills))
design_skills.sort()


In [25]:
# Create a new column for the skill groups count for each user
data['it_skills_count'] = 0
data['business_skills_count'] = 0
data['design_skills_count'] = 0


In [26]:
# Group the data by user
grouped_data = data.groupby('user_id')

# Empty list to keep track of users completed in for loop
users_completed = []

# For each user, count the number of skills in each group
for user_id, group in grouped_data:
    # Get the skills of the user
    skills = group['skill'].tolist()
    # Count the number of skills in each group
    it_skills_count = len([x for x in skills if x in it_skills])
    business_skills_count = len([x for x in skills if x in business_skills])
    design_skills_count = len([x for x in skills if x in design_skills])

    # Update the data
    data.loc[data['user_id'] == user_id, 'it_skills_count'] = it_skills_count
    data.loc[data['user_id'] == user_id, 'business_skills_count'] = business_skills_count
    data.loc[data['user_id'] == user_id, 'design_skills_count'] = design_skills_count

    # Add the user to the list of completed users
    users_completed.append(user_id)
    # Show progress
    print(f'{len(users_completed)} / {len(grouped_data)}')

# Drop the skill column
data = data.drop(columns=['skill'])

# Drop duplicates
data = data.drop_duplicates()

1 / 62402
2 / 62402
3 / 62402
4 / 62402
5 / 62402
6 / 62402
7 / 62402
8 / 62402
9 / 62402
10 / 62402
11 / 62402
12 / 62402
13 / 62402
14 / 62402
15 / 62402
16 / 62402
17 / 62402
18 / 62402
19 / 62402
20 / 62402
21 / 62402
22 / 62402
23 / 62402
24 / 62402
25 / 62402
26 / 62402
27 / 62402
28 / 62402
29 / 62402
30 / 62402
31 / 62402
32 / 62402
33 / 62402
34 / 62402
35 / 62402
36 / 62402
37 / 62402
38 / 62402
39 / 62402
40 / 62402
41 / 62402
42 / 62402
43 / 62402
44 / 62402
45 / 62402
46 / 62402
47 / 62402
48 / 62402
49 / 62402
50 / 62402
51 / 62402
52 / 62402
53 / 62402
54 / 62402
55 / 62402
56 / 62402
57 / 62402
58 / 62402
59 / 62402
60 / 62402
61 / 62402
62 / 62402
63 / 62402
64 / 62402
65 / 62402
66 / 62402
67 / 62402
68 / 62402
69 / 62402
70 / 62402
71 / 62402
72 / 62402
73 / 62402
74 / 62402
75 / 62402
76 / 62402
77 / 62402
78 / 62402
79 / 62402
80 / 62402
81 / 62402
82 / 62402
83 / 62402
84 / 62402
85 / 62402
86 / 62402
87 / 62402
88 / 62402
89 / 62402
90 / 62402
91 / 62402
92 / 624

In [27]:
# Save data to a new csv file in PreparedData folder
data.to_csv(os.path.join('PreparedData', 'skills.csv'), index=False)