In [1]:
# Finalised mobilities started in 2014 - KA1 - src https://data.europa.eu/data/datasets/erasmus-mobility-statistics-2014-2018?locale=en

url = 'https://data.europa.eu/euodp/data/storage/f/2020-08-11T140550/Finalised%20mobilities%20started%20in%202018%20-%20KA1.csv'

import pandas as pd
import numpy as np

In [13]:
df = pd.read_csv(url, on_bad_lines='skip', sep=';')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [14]:
df.columns

Index(['Project Reference', 'Academic Year', 'Mobility Start Month',
       'Mobility End Month', 'Mobility Duration', 'Activity (mob)',
       'Field of Education', 'Participant Nationality', 'Education Level',
       'Participant Gender', 'Participant Profile', 'Special Needs',
       'Fewer Opportunities', 'Participant Age', 'Sending Country Code',
       'Sending City', 'Sending Organization', 'Receiving Country Code',
       'Receiving City', 'Receiving Organization', 'Participants'],
      dtype='object')

In [15]:
df['Activity (mob)'].value_counts()

Student mobility for studies between Programme Countries         221921
Student mobility for traineeships between Programme Countries     92979
VET learners traineeships in companies abroad                     67722
Youth Exchanges - Programme Countries                             59034
Staff mobility for teaching between Programme Countries           33043
Structured Courses/Training Events                                28139
Staff mobility for training between Programme Countries           26030
Mobility of youth workers - Programme Countries                   21442
Youth Exchanges - Partner Countries                               20619
Staff training abroad                                             16389
Mobility of VET learners (2 weeks up to 3 months)                 14294
VET learners traineeships in vocational institutes abroad         14245
Student mobility for Studies To/From Partner Countries            12687
Mobility of youth workers - Partner Countries                   

In [16]:
df.drop(df.loc[df['Activity (mob)']!='Student mobility for studies between Programme Countries'].index, inplace=True)
df['Activity (mob)'].value_counts()

Student mobility for studies between Programme Countries    221921
Name: Activity (mob), dtype: int64

In [17]:
df.drop(df.loc[df['Participants']!=1].index, inplace=True)

In [18]:
df = df[['Participant Gender','Participant Age','Sending Country Code','Education Level','Receiving Country Code']]
df.head()

Unnamed: 0,Participant Gender,Participant Age,Sending Country Code,Education Level,Receiving Country Code
174,Female,24,AT,ISCED-6 - First cycle / Bachelor’s or equivale...,SE
876,Male,22,BE,ISCED-7 - Second cycle / Master’s or equivalen...,IE
877,Female,23,BE,ISCED-7 - Second cycle / Master’s or equivalen...,UK
878,Female,21,BE,ISCED-6 - First cycle / Bachelor’s or equivale...,IE
907,Male,19,BE,ISCED-6 - First cycle / Bachelor’s or equivale...,ES


In [19]:
from sklearn.model_selection import train_test_split

X = df[['Participant Gender','Participant Age','Education Level','Sending Country Code']]
Y = df['Receiving Country Code']

##Taking samples of the data for training of model and testing of the model
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42, shuffle=True)

In [20]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
#One Hot Encoder for independent variables
enc = OneHotEncoder(handle_unknown='ignore', sparse=False)
ct = ColumnTransformer([("encoder transformer", enc, ['Participant Gender','Participant Age','Education Level','Sending Country Code'])], remainder="passthrough")

#Label Encoder for dependent variable
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict

le = LabelEncoder()
le.fit(Y)
y_train_transformed = le.transform(y_train)

# How to reverse encoding
#print(label_encoder.inverse_transform(Y))

In [21]:
from sklearn.pipeline import Pipeline
from sklearn import tree



#Decision tree
dtc = tree.DecisionTreeClassifier()

# Define the pipeline steps. 
pipeline = Pipeline(steps=[
                            ("ct", ct),    # step 1 in pipeline: Transform the columns we specified above (onehote encode state)
                            ("dtc", dtc)    # step 2 in pipeline: Fit the chosen model using the transformed data
                        ])

model = pipeline.fit(x_train,y_train_transformed)
y_pred = model.predict(x_test)
y_pred = le.inverse_transform(y_pred)

# Concatenate the test dataset (what we hope to predict as close as possible) with the predictions made by our model:
pred = pd.concat([ 
                    y_test.reset_index(drop=True), 
                    pd.DataFrame(y_pred, columns=["predicted"])
                ], axis=1)
print(pred)

      Receiving Country Code predicted
0                         IE        UK
1                         FR        ES
2                         AT        PL
3                         IT        IT
4                         ES        ES
...                      ...       ...
42679                     CZ        CZ
42680                     PL        PL
42681                     PL        PL
42682                     HR        ES
42683                     ES        ES

[42684 rows x 2 columns]


In [22]:
pred['correct'] = pred.apply(lambda x: 1 if x['Receiving Country Code']==(x['predicted']) else 0, axis=1)

print(pred)

      Receiving Country Code predicted  correct
0                         IE        UK        0
1                         FR        ES        0
2                         AT        PL        0
3                         IT        IT        1
4                         ES        ES        1
...                      ...       ...      ...
42679                     CZ        CZ        1
42680                     PL        PL        1
42681                     PL        PL        1
42682                     HR        ES        0
42683                     ES        ES        1

[42684 rows x 3 columns]


In [23]:
succes_rate  = pred['correct'].sum()/len(pred)
print(succes_rate)


0.21188267266423016
