# Preparing the work_experiences.csv file

In [119]:
import pandas as pd
import os
import numpy as np

In [120]:
path = os.path.join('OriginalData', 'work_experiences.csv')
data = pd.read_csv(path)

In [121]:
data

Unnamed: 0,user_id,company_id,location,start_year_month
0,53442,2651,"Istanbul, Turkey",201509
1,34558,815,"Istanbul, Turkey",201210
2,63761,26354,,200010
3,10738,89,,201610
4,8711,3113,"Istanbul, Turkey",201801
...,...,...,...,...
187014,22180,15065,Gebze,201205
187015,55822,25076,stajer,201506
187016,13750,1607,,201901
187017,3679,1414,"Ankara, Turkey",201807


In [122]:
# Show summary of NaN values
data.isnull().sum()

user_id                 0
company_id              0
location            61414
start_year_month        0
dtype: int64

In [123]:
# Show how many unique values there are for each column
data.nunique()

user_id             57079
company_id          20837
location             6127
start_year_month      397
dtype: int64

Change - Assume Nan locations are unknown

In [124]:
data.fillna('Unknown', inplace=True)

Format date

In [125]:
# Put a '-' at in between the year and day in the format "yyyydd"
data['start_year_month'] = data['start_year_month'].apply(lambda x: str(x)[:4] + '-' + str(x)[4:])
# Turn the start_year_month column into a datetime object
data['start_year_month'] = pd.to_datetime(data['start_year_month'], format='%Y-%m')
# Rename the column to "start_date"
data.rename(columns={'start_year_month': 'start_date'}, inplace=True)

Sort by user 

In [126]:
data.sort_values(by='user_id', ascending=True, inplace=True)

In [127]:
data.head()

Unnamed: 0,user_id,company_id,location,start_date
147720,0,0,Serbest Çalışmalar,2005-09-01
174454,0,0,Visual Studio Asp.Net Developer,2005-09-01
180157,2,10,"Mersin, Turkey",2018-06-01
19762,2,7,"Elazig, Turkey",2017-06-01
760,2,9,"Elazig, Turkey",2016-12-01


Mark people who started work after 2019

In [128]:
data["StartAfter2019"] = 0 
data.loc[data['start_date'] >= '2019-01-01',"StartAfter2019"] = 1

Create #companies changed column

In [129]:
# Create a new column called "companies_changed" for each user_id
# This column will be the amount of companies the user has changed
data['companies_worked'] = data.groupby('user_id')['company_id'].transform('nunique')

## The location column is all over the place, so we normalize it.

Some users worked only in turkey, others abroad and/or in Turkey
Mark users as TR, MIX, or Other if unknown

In [130]:
# save location data for inspection
data.location.value_counts().to_frame().to_csv('locations.csv')

In [131]:
# Make country column lowercase
data['location'] = data['location'].apply(lambda x: str(x).lower())

In [132]:
from unidecode import unidecode

In [133]:
data.location = data.location.astype(str)
locations =  data.location.unique().tolist()
locations = [unidecode(string) for string in locations]
locations.sort()

In [134]:
from fuzzywuzzy import fuzz
from collections import defaultdict

def group_strings(strings, reference_strings, similarity_threshold):
    groups = {key: [] for key in reference_strings.keys()}
    groups["Other"] = []
    key_list = [key for key in reference_strings.keys()]
    for i, string1 in enumerate(strings):
        max_similarity = 0
        similarities = []
        for key, values in reference_strings.items():
            similarities.append(max([fuzz.partial_ratio(string1, val) for val in values]))
        idx = np.argmax(similarities)
        if similarities[idx] >= similarity_threshold:
            groups[key_list[idx]].append((string1,similarities[idx]))
        else:
            groups["Other"].append((string1,similarities[idx]))
    for key in groups.keys():
        groups[key] = [string for string, sim in sorted(groups[key], key=lambda x: -x[1])]
    return groups

In [135]:
location_groups = {
    "TR":["Turkey", "Türkiye","Adana", "Adiyaman", "Afyonkarahisar", "Agri", "Aksaray", "Amasya", "Ankara", "Antalya", "Ardahan", "Artvin", "Aydin", 
    "Balikesir", "Bartin", "Batman", "Bayburt", "Bilecik", "Bingol", "Bitlis", "Bolu", "Burdur", "Bursa", "Canakkale", "Cankiri", "Corum", "Denizli",
     "Diyarbakir", "Düzce", "Edirne", "Elazig", "Erzincan", "Erzurum", "EskiSehir", "Gaziantep", "Giresun", "GümüShane", "HakkAri", "Hatay", "Igdir", 
     "Isparta", "İstanbul", "İzmir", "Kahramanmaras", "Karabük", "Karaman", "Kars", "Kastamonu", "Kayseri", "Kilis", "Kirikkale", "Kirklareli", "KirSehir",
      "Kocaeli", "Konya", "Kütahya", "Malatya", "Manisa", "Mardin", "Mersin", "Mugla", "MuS", "NevSehir", "Nigde", "Ordu", "Osmaniye", "Rize", "Sakarya", 
      "Samsun", "Sanliurfa", "Siirt", "Sinop", "Sivas", "Sirnak", "Tekirdag", "Tokat", "Trabzon", "Tunceli", "USak", "Van", "Yalova", "Yozgat", "Zonguldak",
      "Akbank"],
    "Unsure":["Unknown"]
}

In [136]:
groups= group_strings(locations, location_groups, 60)

In [137]:
for key,value in groups.items():
    data.loc[data.location.isin(groups[key]), "country"]  = key

New features for working days

In [143]:
# Create a new column called "max_duration" for each user_id
# This is the maximum difference between the start dates of the user's work experiences
data['max_duration'] = data.groupby('user_id')['start_date'].transform('max') - data.groupby('user_id')['start_date'].transform('min')

In [144]:
# For rows where the max_duration is 0, calculate the time between it's start_date and today
data.loc[data['max_duration'] == '0 days', 'max_duration'] = pd.to_datetime('2019-01-01') - data.loc[data['max_duration'] == '0 days', 'start_date']

In [145]:
# Show user 53442
data[data['user_id'] == 53442]

Unnamed: 0,user_id,company_id,location,start_date,StartAfter2019,companies_worked,country,max_duration
44058,53442,2651,"istanbul, turkey",2018-02-01,0,3,TR,1918 days
57313,53442,3803,unknown,2013-09-01,0,3,Unsure,1918 days
23603,53442,4957,"kocaeli, türkiye",2012-11-01,0,3,,1918 days
0,53442,2651,"istanbul, turkey",2015-09-01,0,3,TR,1918 days


In [147]:
# Drop location column
data.drop('location', axis=1, inplace=True)
# Drop start_date column
data.drop('start_date', axis=1, inplace=True)

Now we only want a single row for each user_id, a summary cleaned up for each user

In [148]:
# Show user 53442
data[data['user_id'] == 53442]

Unnamed: 0,user_id,company_id,StartAfter2019,companies_worked,country,max_duration
44058,53442,2651,0,3,TR,1918 days
57313,53442,3803,0,3,Unsure,1918 days
23603,53442,4957,0,3,,1918 days
0,53442,2651,0,3,TR,1918 days


In [149]:
# Drop company_id column
data.drop('company_id', axis=1, inplace=True)
# Leave only the first row for each user_id
data.drop_duplicates(subset='user_id', keep='first', inplace=True)

In [150]:
# Show user 53442
data[data['user_id'] == 53018]

Unnamed: 0,user_id,StartAfter2019,companies_worked,country,max_duration
95334,53018,0,4,TR,1765 days


- Sort and convert days to int

In [151]:
data.sort_values(by='user_id', inplace=True)
data.max_duration = data.max_duration.apply(lambda x: x.days)

In [152]:
data

Unnamed: 0,user_id,StartAfter2019,companies_worked,country,max_duration
147720,0,0,1,,4870
180157,2,0,3,TR,547
30839,5,0,1,,579
161935,7,0,2,TR,883
133573,10,0,2,TR,1734
...,...,...,...,...,...
186660,66269,0,1,TR,5357
157505,66270,0,1,,1491
74600,66271,0,8,TR,6543
13280,66272,0,5,TR,2344


In [153]:
# Save data to a new csv file in PreparedData folder
data.to_csv(os.path.join('PreparedData', 'work_experiences.csv'), index=False)

## For the country column
TR means user only worked at TR  
MIX means user worked at TR and Other  
Other means user only worked abroad  