In [431]:
import pandas as pd
import numpy as np

In [432]:
df = pd.read_csv("OriginalData/education.csv")
df

Unnamed: 0,user_id,school_name,degree,fields_of_study,start_year_month,end_year_month
0,0,Anadolu Üniversitesi,,,,
1,0,Adıyaman Üniversitesi,,,,
2,0,Fırat Üniversitesi,,,,
3,1,Fırat Üniversitesi,Yüksek Lisans,Yazılım Mühendisliği,,
4,1,Fırat Üniversitesi,Lisans,Yazılım Mühendisliği,,
...,...,...,...,...,...,...
142570,66272,Halmstad University,,Computer Science,,
142571,66272,Bogazici University,,Chemistry,,
142572,66273,Istanbul Technical University,Bachelor's degree,Electronic and Communication Engineering,,
142573,66273,Bogazici University,Master's degree,Electrical and Electronics Engineering,201509.0,


School_names

In [433]:
df["school_name"].value_counts()

Anadolu Üniversitesi                              6562
İstanbul Üniversitesi                             4891
Sakarya Üniversitesi                              4310
Kocaeli Üniversitesi                              3836
İstanbul Teknik Üniversitesi                      3835
                                                  ... 
Aydin Science High School                            1
Corum Anatolian Teacher High School                  1
건국대학교                                                1
Barbaros İlköğretim Okulu/Bulancak/Giresun           1
Sivas Şehit Muhammet Onur Demir Anadolu Lisesi       1
Name: school_name, Length: 11113, dtype: int64

- There are about 11k schools, it doesn't really make sense to retain this information
- It could be sensible to assign a rank to each school, and have these as values, but I don't think an API exists to help me with that
- So I will drop school_name later

Check how many entries each user has

In [434]:
df["user_id"].value_counts()

56967    25
43913    25
19566    25
30605    20
22125    15
         ..
27885     1
27887     1
47860     1
27889     1
33137     1
Name: user_id, Length: 66271, dtype: int64

In [435]:
df[df["user_id"] == 22125]

Unnamed: 0,user_id,school_name,degree,fields_of_study,start_year_month,end_year_month
47864,22125,Dumlupinar University,,,,
47865,22125,Alberk q.a technic,,Iso 9001:2015 baş denetçi,,
47866,22125,BSI Training Academy,,13485:2016 Baş Denetçi,,
47867,22125,Anadolu Üniversitesi,,,,
47868,22125,Atatürk Üniversitesi,,,,
47869,22125,İstanbul Üniversitesi,,,,
47870,22125,•\t5 S & Kaizen Achievement Certificate (2017)...,,,,
47871,22125,•\tİSO 140001 Achievement Certificate (2017),,,,
47872,22125,•\tProduction Management Training (2019),,,,
47873,22125,İstanbul Üniversitesi,,Leadership and Business Management (2019),,


Some people have lots of entries but not much to show for it
- It seems like they list any place they took a course from, or did a speaking event in

### Degrees

In [436]:
pd.set_option('display.max_rows', 500)

In [437]:
df.degree.unique().shape

(5889,)

In [438]:
df.degree.value_counts().to_frame().iloc[0:10]

Unnamed: 0,degree
Lisans Derecesi,19144
Bachelor's degree,13993
Yüksek Lisans (Master),5246
Master's degree,4859
Lisans,2974
Master of Science - MS,2344
Bachelor of Science - BS,2051
High School Diploma,1720
Lise,1602
High School,1503


Lots of unique values, group these into the following categories:
- High school = 0
- Pre - Bachelors = 1
- Bachelor = 2
- Masters = 3
- PhD = 4
- Other = 2 - assuming bachelor like
Consider the highest degree earned only

In [439]:
df.degree = df.degree.astype(str)
degrees =  df.degree.unique().tolist()
degrees.sort()

In [440]:
from fuzzywuzzy import fuzz
from collections import defaultdict

def group_strings(strings, reference_strings, similarity_threshold):
    groups = {key: [] for key in reference_strings.keys()}
    groups["Other"] = []
    key_list = [key for key in reference_strings.keys()]
    for i, string1 in enumerate(strings):
        max_similarity = 0
        similarities = []
        for key, values in reference_strings.items():
            similarities.append(max([fuzz.token_sort_ratio(string1, val) for val in values]))
        idx = np.argmax(similarities)
        if similarities[idx] >= similarity_threshold:
            groups[key_list[idx]].append((string1,similarities[idx]))
        else:
            groups["Other"].append((string1,similarities[idx]))
    for key in groups.keys():
        groups[key] = [string for string, sim in sorted(groups[key], key=lambda x: -x[1])]
    return groups

In [441]:
degree_groups = {
    "Bachelor":["Bachelor", "Bachelors", "Lisans","Lisans Derecesi", "Bachelors Degree", "Bachelor of Science", "Bachelor of Applied Science",
    "Lisans (Açiköğretim)",  "Undergraduate","Açiköğretim Lisans","Engineer's degree","Faculty of engineering and architecture","Business"],

    "Masters": ["Masters", "Yüksek Lisans", "Master of ", "Master of Science", "Master of Business", "Master Degree","MBA", "Graduate Degree", "Graduate",
    "Bachelor of Architecture ","Bachelor of Science"],

    "High_School": ["High School", "Lise","Endüstri lise"],

    "Pre-Bachelors": ["Ön Lisans", "On Lisans Derecesi","Onlisans", "ÖNLİSANS","Pre Bachelor","Önlisans AÖF","not graduated","Associate Degree",
    "AÇIKÖĞRETİM FAKÜLTESİ - ÖNLİSANS","Associate","Yüksekokul",'Lisans(terk)','Açiköğretim',"two-year degree","Minor's Degree", "Grade", "Associate of Arts"],
    
    "PhD": ["PhD", "Doktora"]
}


In [442]:
groups= group_strings(degrees, degree_groups, 50)

In [443]:
groups

{'Bachelor': [' Bachelor of Science',
  ' Undergraduate',
  ' undergraduate',
  'BACHELOR',
  'Bachelor',
  'Bachelor ',
  'Bachelor Of Science',
  'Bachelor of Science',
  'Bachelor of Science ',
  'Bachelor of science',
  'Bachelors',
  'Bachelors Degree',
  'Bachelors degree',
  'Bachelors degree ',
  'Bachelor\x80\x99s Degree',
  'Bachelor´s degree.',
  'Business',
  "Engineer's Degree",
  "Engineer's degree",
  "Engineer's degree,",
  'Engineer’s Degree',
  'Faculty of  Engineering and Architecture',
  'Faculty of Engineering and Architecture',
  'Faculty of engineering and architecture',
  'LISANS',
  'Lisans',
  'Lisans ',
  'Lisans Derecesi',
  'Lisans Derecesi ',
  'Lisans Derecesi  ',
  'Lisans derecesi',
  'UNDERGRADUATE',
  'UnderGraduate',
  'Undergraduate',
  'Undergraduate ',
  'bachelor',
  'bachelor of science',
  'lisans',
  'lisans ',
  'lisans derecesi',
  'undergraduate',
  'undergraduate ',
  'Faculity of Engineering and Architecture',
  ' Bachelor Degree',
  ' Ba

Repalce the values in the dataframe with the groups

In [444]:
for key,value in groups.items():
    df.loc[df.degree.isin(groups[key]), "degree"]  = key

In [445]:
df.degree.unique()

array(['Pre-Bachelors', 'Masters', 'Bachelor', 'Other', 'High_School',
       'PhD'], dtype=object)

Assign numerical values

In [446]:
vals = {"High_School" :0 ,  "Pre-Bachelors": 1,"Bachelor" : 2, "Other": 2, "Masters" : 3, "PhD": 4}
df.replace({"degree": vals}, inplace=True)

In [447]:
# Group by dataframe by user_id, keep the minimum start date and maximum end date, and the highest degree for each user, and the corresponding field of study
df = df.groupby('user_id').agg({'start_year_month': 'min', 'end_year_month': 'max', 'degree': 'max', 'fields_of_study': 'first'}).reset_index()

Now lets categorize the fields

In [448]:
df.fields_of_study.unique().tolist()

[None,
 'Yazılım Mühendisliği',
 'Bilgisayar Mühendisliği',
 'Bilgisayar Yazılımı Mühendisliği',
 'Software Engineering',
 'yazılım mühendisliği',
 'Adli Bilişim Mühendisliği',
 'Computer Software Engineering',
 'Computer Engineering',
 'Electrical and Electronics Engineering',
 'Veterinary Medicine',
 'Adli Bilim ve Teknolojisi',
 'Geotechnical Engineer',
 'Remote Sensing and Geographic Information Systems',
 'Yönetim Bilişim Sistemleri',
 'Mechatronics, Robotics, and Automation Engineering',
 'Computer Science',
 'Software Engineer',
 'Bilgisayar Programlama/Programcı, Genel',
 'Computer Engineer',
 'Bilgisayar Programlama',
 'Elektrik ve Elektronik Mühendisliği',
 'Otomotiv Mühendisliği',
 'Makine Mühendisliği',
 'Mechatronics Engineer',
 'İşletme ve Yönetim, Genel',
 'İstatistik',
 'Adli Bilişim Mühendisliği ',
 'İnşaat Mühendisliği',
 'Bilgisayar Teknolojisi ve programlama',
 'Digital Forensic Engineering',
 'Business Administration and Management, General',
 'Computer Programmer'

Lots of fields, lets use the same approach as above and put these into broad categories

In [449]:
df.fields = df.fields_of_study.astype(str)
fields =  df.fields_of_study.unique().tolist()

  df.fields = df.fields_of_study.astype(str)


In [450]:
field_groups = {
    "Programming":["Yazilim", "Yazilim Mühendisligi", "Bilgisayar", "Computer Science", "Computer Engineering", "Software Engineering", "Data Science", "Cybersecurity"],
    "Engineering": ["Engineering", "Mühendislik", "Mühendisliği", "Electronics Engineering","Elektrik","Makine", "Endüstri", "Automation","Marine","Automotive Engineer", 
    "Communication Engineer", "Environmental engineer","Energy systems engineer","Industrial engineer","Mechanical engineer","Nuclear engineer","Petroleum engineer"
    ],
    "Business": ["İşletme", "Ekonomi", "Business", "Economics", "Management", "Finans", "Finance","Actuarial Science"],
    "Science": ["Fen", "Biyoloji", "Kimya", "Bilim", "Science", "Biology", "Chemistry", "Physics", "Materials", "Food Science", "Aquaculture"],
    "Mathematics": ["Matematik", "Mathematics","Istatistil","Statistics"],
    "Other": ["Scene Arts", "Political Science",]
}

In [451]:
groups = group_strings(fields, field_groups, 50)
groups 

{'Programming': ['Software Engineering',
  'Computer Engineering',
  'Computer Science',
  'software engineering',
  ' Software Engineering ',
  'Software Engineering ',
  'Software  Engineering',
  'Software engineering',
  'computer engineering',
  'SOFTWARE ENGINEERING ',
  'Computer Science ',
  'Computer Engineering ',
  'Computer engineering ',
  'Data Science',
  'Cybersecurity',
  'Computer  Engineering',
  'Computer engineering',
  'COMPUTER SCIENCE',
  'Computer science',
  'Bilgisayar',
  ' Computer Engineering',
  'computer science',
  'CyberSecurity',
  'COMPUTER ENGINEERING',
  ' Computer Engineering ',
  'computer Engineering',
  'Data Science ',
  'Computer   Engineering',
  'Computer Enginieering',
  'Computer Enginneering',
  'Sofware Engineering',
  'Yazilim Muhendisligi',
  'Computer Enginering',
  'Computer Sciences',
  'Computer engineerig',
  'Computer Engineerin',
  'Compuer Science',
  'Software Enginering',
  'computer engineerig',
  'Compter Engineering',
  '

In [452]:
for key,value in groups.items():
    df.loc[df.fields_of_study.isin(groups[key]), "fields_of_study"]  = key

In [453]:
df

Unnamed: 0,user_id,start_year_month,end_year_month,degree,fields_of_study
0,0,,,1,Other
1,1,,,3,Other
2,2,,,2,Engineering
3,3,,,2,Other
4,4,201709.0,,2,Other
...,...,...,...,...,...
66266,66269,,,3,Engineering
66267,66270,,,3,Science
66268,66271,,,4,Other
66269,66272,,,2,Other


## Fix up dates

Fill nan dates temporarily 

In [454]:
df.fillna(190001.0, inplace=True)

In [455]:
# Put a '-' at in between the year and day in the format "yyyydd"
df['start_year_month'] = df['start_year_month'].apply(lambda x: str(x)[:4] + '-' + str(x)[4:6])
df['end_year_month'] = df['end_year_month'].apply(lambda x: str(x)[:4] + '-' + str(x)[4:6])

In [456]:
# Turn the start_year_month column into a datetime object
df['start_year_month'] = pd.to_datetime(df['start_year_month'], format='%Y-%m')
df['end_year_month'] = pd.to_datetime(df['end_year_month'], format='%Y-%m')

Add column to indicate whether study is ongoing and finished as of 2019
- Finished if end date 2019
- Ongoing if started after 2019 or started before 2019 and ended after 2019

In [457]:
df["Edu_Finished"] = 0
df.loc[(df["end_year_month"] < '2019-01-01') & (df["end_year_month"] > '1900-01-01'), "Edu_Finished"] = 1


df["Edu_Ongoing"] = 0
df.loc[df["start_year_month"] > '2019-01-01',"Edu_Ongoing"] = 1
df.loc[(df["start_year_month"] < '2019-01-01') & (df["end_year_month"] > '2019-01-01'), "Edu_Ongoing"] = 1

In [458]:
print(df[df["Edu_Finished"] ==1].shape[0] , " Not finished edu")
print(df[df["Edu_Ongoing"] ==1].shape[0], "Ongoing edu")
print(df[(df["Edu_Finished"] ==1) & (df["Edu_Ongoing"] ==1)].shape[0] , " finished and ongoing (should be zero)")
print(df[(df["Edu_Finished"] ==1) & ~(df["Edu_Ongoing"] ==1)].shape[0], "finished, and not ongoing")
print(df[~(df["Edu_Finished"] ==1) & (df["Edu_Ongoing"] ==1)].shape[0], "not finished and ongoing")
print(df[~(df["Edu_Finished"] ==1) & ~(df["Edu_Ongoing"] ==1)].shape[0], "not finished and not ongoing")

1225  Not finished edu
5293 Ongoing edu
1  finished and ongoing (should be zero)
1224 finished, and not ongoing
5292 not finished and ongoing
59754 not finished and not ongoing


Check the finished and ongoing ones

In [459]:
df[(df["Edu_Finished"] == 1) & (df["Edu_Ongoing"] ==1)]

Unnamed: 0,user_id,start_year_month,end_year_month,degree,fields_of_study,Edu_Finished,Edu_Ongoing
50897,50900,2021-10-01,2016-07-01,2,Engineering,1,1


- One entry, ignore

### Check the neither finished nor ongoing ones

In [460]:
df[~(df["Edu_Finished"] ==1) & ~(df["Edu_Ongoing"] ==1)]

Unnamed: 0,user_id,start_year_month,end_year_month,degree,fields_of_study,Edu_Finished,Edu_Ongoing
0,0,1900-01-01,1900-01-01,1,Other,0,0
1,1,1900-01-01,1900-01-01,3,Other,0,0
2,2,1900-01-01,1900-01-01,2,Engineering,0,0
3,3,1900-01-01,1900-01-01,2,Other,0,0
4,4,2017-09-01,1900-01-01,2,Other,0,0
...,...,...,...,...,...,...,...
66266,66269,1900-01-01,1900-01-01,3,Engineering,0,0
66267,66270,1900-01-01,1900-01-01,3,Science,0,0
66268,66271,1900-01-01,1900-01-01,4,Other,0,0
66269,66272,1900-01-01,1900-01-01,2,Other,0,0


Those with a start date but no end date

In [461]:
df[~(df["Edu_Finished"] ==1) & ~(df["Edu_Ongoing"] ==1) & (~(df["start_year_month"] == '1900-01-01')& (df["end_year_month"] == '1900-01-01'))].shape

(224, 7)

Set these all to ongoing if they started after 2015, and finished if they started before 

In [462]:
df.loc[~(df["Edu_Finished"] ==1) & ~(df["Edu_Ongoing"] ==1) & ((df["start_year_month"] >= '2015-01-01')& (df["end_year_month"] == '1900-01-01')),"Edu_Ongoing"] = 1
df.loc[~(df["Edu_Finished"] ==1) & ~(df["Edu_Ongoing"] ==1) & ((df["start_year_month"] <= '2015-01-01')& (df["end_year_month"] == '1900-01-01')),"Edu_Finished"] = 1

Those with no start date but an end date

In [463]:
df[~(df["Edu_Finished"] ==1) & ~(df["Edu_Ongoing"] ==1) & ((df["start_year_month"] == '1900-01-01')& ~(df["end_year_month"] == '1900-01-01'))].shape
# one entry ignore

(1, 7)

Those with both dates nan

In [464]:
df[~(df["Edu_Finished"] ==1) & ~(df["Edu_Ongoing"] ==1) & ((df["start_year_month"] == '1900-01-01')& (df["end_year_month"] == '1900-01-01'))]
# one entry ignore

Unnamed: 0,user_id,start_year_month,end_year_month,degree,fields_of_study,Edu_Finished,Edu_Ongoing


Set these to have "Edu_Status_Unknown"= True

In [465]:
df["Unknown_Edu_Status"] = 0
df.loc[~(df["Edu_Finished"] ==1) & ~(df["Edu_Ongoing"] ==1) & ((df["start_year_month"] == '1900-01-01')| (df["end_year_month"] == '1900-01-01')),"Unknown_Edu_DStatus"] = 1

Rename columns

In [466]:
col_names = {'start_year_month': 'start_date_edu',
'end_year_month': 'end_date_edu',
'fields_of_study': 'field_edu',
'degree': 'degree_edu',
'user_id': 'user_id'
}

df.rename(columns=col_names, inplace=True)

In [468]:
df

Unnamed: 0,user_id,start_date_edu,end_date_edu,degree_edu,field_edu,Edu_Finished,Edu_Ongoing,Unknown_Edu_Status,Unknown_Edu_DStatus
0,0,1900-01-01,1900-01-01,1,Other,1,0,0,
1,1,1900-01-01,1900-01-01,3,Other,1,0,0,
2,2,1900-01-01,1900-01-01,2,Engineering,1,0,0,
3,3,1900-01-01,1900-01-01,2,Other,1,0,0,
4,4,2017-09-01,1900-01-01,2,Other,0,1,0,
...,...,...,...,...,...,...,...,...,...
66266,66269,1900-01-01,1900-01-01,3,Engineering,1,0,0,
66267,66270,1900-01-01,1900-01-01,3,Science,1,0,0,
66268,66271,1900-01-01,1900-01-01,4,Other,1,0,0,
66269,66272,1900-01-01,1900-01-01,2,Other,1,0,0,


Drop date columns

In [469]:
df.drop(columns=['start_date_edu','end_date_edu'], inplace=True)

Save

In [471]:
df.to_csv("PreparedData/education.csv", index=False)