# Preparing the work_experiences.csv file

In [88]:
import pandas as pd
import os
pd.options.display.max_rows = 50

In [89]:
# Read the raw work-experience records from the Originaldata folder and
# preview the first rows to see the available columns.
path = os.path.join('Originaldata', 'work_experiences.csv')
df = pd.read_csv(path)
df.head()

Unnamed: 0,user_id,company_id,location,start_year_month
0,53442,2651,"Istanbul, Turkey",201509
1,34558,815,"Istanbul, Turkey",201210
2,63761,26354,,200010
3,10738,89,,201610
4,8711,3113,"Istanbul, Turkey",201801


Check for missing (NaN) values in each column

In [90]:
# Count the missing entries per column.
df.isna().sum()

user_id                 0
company_id              0
location            61414
start_year_month        0
dtype: int64

Check unique values

In [91]:
# Number of distinct values in each column.
df.nunique()

user_id             57079
company_id          20837
location             6127
start_year_month      397
dtype: int64

Fix up date formatting - convert to datetime

In [92]:
# start_year_month holds integers in "yyyymm" form (e.g. 201509 = September 2015).
# Parse them in one vectorised call instead of reformatting row by row;
# each value becomes the timestamp of the first day of that month.
df['start_year_month'] = pd.to_datetime(df['start_year_month'].astype(str), format='%Y%m')
# The column now carries full dates, so rename it to "start_date".
df = df.rename(columns={'start_year_month': 'start_date'})

Sort by user, then by start date within each user

In [93]:
# Order the jobs chronologically within each user.
df = df.sort_values(by=['user_id', 'start_date'])

In [94]:
# Preview the frame after sorting by user and start date.
df.head()

Unnamed: 0,user_id,company_id,location,start_date
147720,0,0,Serbest Çalışmalar,2005-09-01
174454,0,0,Visual Studio Asp.Net Developer,2005-09-01
760,2,9,"Elazig, Turkey",2016-12-01
19762,2,7,"Elazig, Turkey",2017-06-01
180157,2,10,"Mersin, Turkey",2018-06-01


#### Mark rows where the start date is on or after 2019-01-01

In [95]:
# 1 when the job starts on or after 2019-01-01, otherwise 0.
started_2019_or_later = df['start_date'] >= '2019-01-01'
df["StartAfter2019"] = started_2019_or_later.astype('int64')

## Remove duplicates

Some users have multiple jobs starting on the same day at the same company; keep only the first of each and delete the rest.

In [96]:
# Sort chronologically per user, then keep one row for every
# (user, start date, company) combination.
df = (
    df.sort_values(by=['user_id', 'start_date'])
    .drop_duplicates(subset=['user_id', 'start_date', 'company_id'], keep='first')
)

### New features

- companies_worked : # unique companies worked at
- positions_worked : # positions, recounting the ones at the same company

In [97]:
# Per-user counts over company_id, broadcast back onto every row of that user:
# distinct companies, and total rows (repeat stints at one company count again).
companies_by_user = df.groupby('user_id')['company_id']
df['companies_worked'] = companies_by_user.transform('nunique')
df['positions_worked'] = companies_by_user.transform('count')

Check if these two columns are ever different

In [101]:
# Shape of the subset where the two counts disagree.
counts_differ = df['companies_worked'].ne(df['positions_worked'])
df.loc[counts_differ].shape

(82379, 7)

Yes, they differ for many rows, so let's keep both columns.

Working Length

- Number of days worked at each position
     - Subtract first starting date from the next, or from 2019-01-01 if there is no next start date

In [105]:
# Days spent in each position: the gap between a job's start date and the same
# user's next start date. This relies on df being sorted by user_id and start_date.
# A user's latest job has no successor, so 2019-01-01 stands in as its end date.
# The fill is chained on the shifted Series itself rather than calling
# fillna(inplace=True) on a column selection, which pandas deprecates and which
# no longer updates the frame once copy-on-write is enabled.
next_start_date = (
    df.groupby('user_id')['start_date']
    .shift(-1)
    .fillna(pd.Timestamp('2019-01-01'))
)
# NOTE(review): a latest job that starts after 2019-01-01 gets a negative
# days_worked here — confirm whether such rows should be clipped to 0.
df['days_worked'] = (next_start_date - df['start_date']).dt.days

In [106]:
# Display the frame (row display is capped by pd.options.display.max_rows).
df

Unnamed: 0,user_id,company_id,location,start_date,StartAfter2019,companies_worked,positions_worked,days_worked
147720,0,0,Serbest Çalışmalar,2005-09-01,0,1,1,4870
760,2,9,"Elazig, Turkey",2016-12-01,0,3,3,182
19762,2,7,"Elazig, Turkey",2017-06-01,0,3,3,365
180157,2,10,"Mersin, Turkey",2018-06-01,0,3,3,214
30839,5,15,İstanbul,2017-06-01,0,1,1,579
...,...,...,...,...,...,...,...,...
24178,66273,1509,,2017-08-01,0,10,10,396
92682,66273,2412,,2018-09-01,0,10,10,30
108913,66273,3843,,2018-10-01,0,10,10,273
184334,66273,13444,,2019-07-01,1,10,10,92


Create four new features
- Max days worked
- Min days worked
- Average days worked
- Total days worked


In [110]:
# Create four new columns from each user's days_worked values, repeated on
# every row of that user: longest stint, shortest stint, mean stint
# (cast to an integer) and the total number of days.
days_by_user = df.groupby('user_id')['days_worked']
df['max_days_worked'] = days_by_user.transform('max')
df['min_days_worked'] = days_by_user.transform('min')
df['avg_days_worked'] = days_by_user.transform('mean').astype(int)
df['total_days_worked'] = days_by_user.transform('sum')

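Now re-sort and drop duplicate (user, start date, company) rows again, keeping the first of each. This repeats the earlier de-duplication step, so it should not remove anything new.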

In [None]:
# Same sort + de-duplication as earlier in the notebook, kept as a safety net.
df_sorted = df.sort_values(by=['user_id', 'start_date'])
df = df_sorted.drop_duplicates(
    subset=['user_id', 'start_date', 'company_id'],
    keep='first',
)

## Ignore location column for now as it is very hard to accurately categorize it

The location column is free text with many different spellings, so we normalize it (lowercase, ASCII transliteration) before grouping.

Some users worked only in Turkey, others abroad and/or in Turkey.
Mark users as TR, MIX, or Other if unknown.

In [None]:
# Dump how often each raw location string occurs, for manual inspection.
location_counts = df['location'].value_counts().to_frame()
location_counts.to_csv('locations.csv')

In [None]:
# Lowercase the location column. Values are cast to str first, so missing
# entries turn into the literal text 'nan'.
df['location'] = df['location'].astype(str).str.lower()

In [None]:
from unidecode import unidecode

In [None]:
# Collect the distinct location strings, transliterated to ASCII and sorted.
df['location'] = df['location'].astype(str)
locations = sorted(unidecode(raw_name) for raw_name in df['location'].unique())

In [None]:
from fuzzywuzzy import fuzz
from collections import defaultdict

def group_strings(strings, reference_strings, similarity_threshold):
    """Assign each string to the reference group it resembles most.

    Every string is scored against every reference value with
    ``fuzz.partial_ratio``; a group's score is the best score among its
    values. The string goes to the highest-scoring group when that score
    reaches ``similarity_threshold``, otherwise to the extra ``"Other"`` group.
    When several groups tie, the one listed first in ``reference_strings`` wins.

    Args:
        strings: Iterable of candidate strings to classify.
        reference_strings: Mapping of group label -> list of reference values.
        similarity_threshold: Minimum score required to join a reference group.

    Returns:
        dict mapping every label in ``reference_strings`` plus ``"Other"`` to
        the list of strings assigned to it, ordered by descending score.
    """
    labels = list(reference_strings)
    groups = {label: [] for label in labels}
    groups["Other"] = []

    for candidate in strings:
        # Best score per group; a group with no reference values scores 0.
        scores = [
            max(
                (fuzz.partial_ratio(candidate, ref) for ref in reference_strings[label]),
                default=0,
            )
            for label in labels
        ]
        if scores:
            # Index of the first maximum, done in plain Python so the function
            # does not depend on numpy (which this notebook never imports).
            best = max(range(len(scores)), key=scores.__getitem__)
            best_score = scores[best]
        else:
            # No reference groups at all: everything falls through to "Other".
            best, best_score = None, 0

        if best is not None and best_score >= similarity_threshold:
            groups[labels[best]].append((candidate, best_score))
        else:
            groups["Other"].append((candidate, best_score))

    # Drop the scores, keeping the members ordered from best to worst match.
    for label in groups:
        ranked = sorted(groups[label], key=lambda pair: -pair[1])
        groups[label] = [member for member, _ in ranked]
    return groups

In [None]:
# Reference names used to label location strings via group_strings.
# "TR" lists country names, what appear to be Turkish province names, and one
# company name ("Akbank"); "Unsure" holds the literal "Unknown".
# NOTE(review): the candidate strings are lowercased and ASCII-transliterated
# before matching, while these references are mixed-case and some keep
# diacritics — confirm whether fuzz.partial_ratio normalises case/accents.
location_groups = {
    "TR":["Turkey", "Türkiye","Adana", "Adiyaman", "Afyonkarahisar", "Agri", "Aksaray", "Amasya", "Ankara", "Antalya", "Ardahan", "Artvin", "Aydin", 
    "Balikesir", "Bartin", "Batman", "Bayburt", "Bilecik", "Bingol", "Bitlis", "Bolu", "Burdur", "Bursa", "Canakkale", "Cankiri", "Corum", "Denizli",
     "Diyarbakir", "Düzce", "Edirne", "Elazig", "Erzincan", "Erzurum", "EskiSehir", "Gaziantep", "Giresun", "GümüShane", "HakkAri", "Hatay", "Igdir", 
     "Isparta", "İstanbul", "İzmir", "Kahramanmaras", "Karabük", "Karaman", "Kars", "Kastamonu", "Kayseri", "Kilis", "Kirikkale", "Kirklareli", "KirSehir",
      "Kocaeli", "Konya", "Kütahya", "Malatya", "Manisa", "Mardin", "Mersin", "Mugla", "MuS", "NevSehir", "Nigde", "Ordu", "Osmaniye", "Rize", "Sakarya", 
      "Samsun", "Sanliurfa", "Siirt", "Sinop", "Sivas", "Sirnak", "Tekirdag", "Tokat", "Trabzon", "Tunceli", "USak", "Van", "Yalova", "Yozgat", "Zonguldak",
      "Akbank"],
    "Unsure":["Unknown"]
}

In [None]:
# Label every distinct location; 60 is the minimum score to join a group.
groups = group_strings(locations, location_groups, similarity_threshold=60)

In [None]:
# Turn the grouping result into a lookup: location string -> group label.
label_by_location = {
    name: label
    for label, members in groups.items()
    for name in members
}
# The grouped strings were ASCII-transliterated with unidecode, so apply the
# same transliteration to the column before the lookup. Matching the raw
# column directly misses every location that contains a non-ASCII character
# and leaves those rows without a country.
df["country"] = df["location"].astype(str).map(unidecode).map(label_by_location)

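For the country column (intended meaning; the cell above only assigns a per-row label of TR, Unsure, or Other, and MIX is not computed yet):
- TR means the user only worked in Turkey  
- MIX means the user worked both in Turkey and abroad  
- Other means the user only worked abroad  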

## Keep single row for each user

Drop some cols

In [112]:
# Remove the per-job columns before collapsing to one row per user.
df = df.drop(columns=['company_id', 'location', 'start_date'])

In [113]:
# Keep only the first row of each user. Columns that vary per job
# (e.g. StartAfter2019, days_worked) therefore keep that first row's value.
df = df.drop_duplicates(subset='user_id', keep='first')

In [116]:
# Save df to a new csv file in the Prepareddata folder, creating the folder
# first so the write does not fail when it does not exist yet.
os.makedirs('Prepareddata', exist_ok=True)
df.to_csv(os.path.join('Prepareddata', 'work_experiences.csv'), index=False)