In [14]:
# import
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import os

In [20]:
# Get data (only train data)
path = os.path.join('PreparedData', 'train.csv')
df = pd.read_csv(path)

In [21]:
df.head(5)

Unnamed: 0,user_id,location,moved_after_2019,companies_worked,max_duration,Business,Design,IT,Other,N_Skills,...,German,Turkish,French,Spanish,Chinese,NLanguages,start_date_edu,end_date_edu,degree_edu,field_edu
0,1301,Turkey,1,3,1035,2,0,4,17,23,...,0,5,0,0,0,4,2023-01-01,2023-01-01,2,Other
1,6950,Turkey,0,4,1402,0,0,17,32,49,...,0,0,0,0,0,3,2023-01-01,2023-01-01,1,Other
2,4880,Turkey,0,4,1216,0,2,4,15,21,...,0,5,0,0,0,2,2023-01-01,2023-01-01,2,Other
3,26046,Turkey,0,1,1856,0,0,7,23,30,...,0,0,0,0,0,2,2023-01-01,2023-01-01,1,Business
4,11005,Turkey,0,3,2769,0,1,10,37,48,...,0,5,0,0,0,3,2023-01-01,2023-01-01,1,Other


In [22]:
# Feature Engineering (one hot encode)
df = pd.get_dummies(df,columns= ["location","field_edu"], drop_first=True)
df.head(5)

Unnamed: 0,user_id,moved_after_2019,companies_worked,max_duration,Business,Design,IT,Other,N_Skills,English,...,end_date_edu,degree_edu,location_Netherlands,location_Other,location_Pakistan,location_Turkey,location_United Kingdom,field_edu_Engineering,field_edu_Other,field_edu_Science
0,1301,1,3,1035,2,0,4,17,23,3,...,2023-01-01,2,0,0,0,1,0,0,1,0
1,6950,0,4,1402,0,0,17,32,49,3,...,2023-01-01,1,0,0,0,1,0,0,1,0
2,4880,0,4,1216,0,2,4,15,21,2,...,2023-01-01,2,0,0,0,1,0,0,1,0
3,26046,0,1,1856,0,0,7,23,30,2,...,2023-01-01,1,0,0,0,1,0,0,0,0
4,11005,0,3,2769,0,1,10,37,48,3,...,2023-01-01,1,0,0,0,1,0,0,1,0


In [25]:
df.drop(["end_date_edu","start_date_edu"], axis=1, inplace=True)

In [26]:
df.head()

Unnamed: 0,user_id,moved_after_2019,companies_worked,max_duration,Business,Design,IT,Other,N_Skills,English,...,NLanguages,degree_edu,location_Netherlands,location_Other,location_Pakistan,location_Turkey,location_United Kingdom,field_edu_Engineering,field_edu_Other,field_edu_Science
0,1301,1,3,1035,2,0,4,17,23,3,...,4,2,0,0,0,1,0,0,1,0
1,6950,0,4,1402,0,0,17,32,49,3,...,3,1,0,0,0,1,0,0,1,0
2,4880,0,4,1216,0,2,4,15,21,2,...,2,2,0,0,0,1,0,0,1,0
3,26046,0,1,1856,0,0,7,23,30,2,...,2,1,0,0,0,1,0,0,0,0
4,11005,0,3,2769,0,1,10,37,48,3,...,3,1,0,0,0,1,0,0,1,0


In [27]:
# Test train split
X_train, X_test, y_train, y_test = train_test_split(df.drop('moved_after_2019', axis=1), df['moved_after_2019'], test_size=0.2, random_state=42)

In [28]:
# Train model using training data
LogReg = LogisticRegression()
LogReg.fit(X_train, y_train)

LogisticRegression()

In [29]:
# Predict
y_pred = LogReg.predict(X_test)

In [31]:
# Scoring the model
LogReg.score(X_test, y_test)

0.6145794039984911

In [32]:
# Evaluate
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.62      0.96      0.75      6474
           1       0.54      0.08      0.14      4130

    accuracy                           0.61     10604
   macro avg       0.58      0.52      0.44     10604
weighted avg       0.59      0.61      0.51     10604

[[6198  276]
 [3811  319]]


In [33]:
# Do prediction on test data
path = os.path.join('PreparedData', 'test.csv')
df_test = pd.read_csv(path)

# Feature Engineering (one hot encode)
df_test = pd.get_dummies(df_test, drop_first=True)

In [34]:
df_test.head(5)

Unnamed: 0,user_id,moved_after_2019,companies_worked,max_duration,Business,Design,IT,Other,N_Skills,English,...,end_date_edu_2025-07-01,end_date_edu_2025-08-01,end_date_edu_2025-09-01,end_date_edu_2025-10-01,end_date_edu_2026-06-01,end_date_edu_2026-07-01,end_date_edu_2027-12-01,field_edu_Engineering,field_edu_Other,field_edu_Science
0,17449,2,3,4017,1,0,11,22,34,2,...,0,0,0,0,0,0,0,1,0,0
1,33967,2,2,730,2,9,1,19,31,5,...,0,0,0,0,0,0,0,0,1,0
2,2110,2,2,0,3,0,2,8,13,2,...,0,0,0,0,0,0,0,0,1,0
3,55082,2,2,2011,0,0,12,25,37,2,...,0,0,0,0,0,0,0,0,0,0
4,37165,2,1,944,0,0,0,4,4,4,...,0,0,0,0,0,0,0,1,0,0


In [35]:
# Drop missing columns
missing_cols = set( X_train.columns ) - set( df_test.columns )
for c in missing_cols:
    df_test[c] = 0
df_test = df_test[X_train.columns]


In [36]:
df_test.head(5)

Unnamed: 0,user_id,companies_worked,max_duration,Business,Design,IT,Other,N_Skills,English,German,...,NLanguages,degree_edu,location_Netherlands,location_Other,location_Pakistan,location_Turkey,location_United Kingdom,field_edu_Engineering,field_edu_Other,field_edu_Science
0,17449,3,4017,1,0,11,22,34,2,0,...,2,1,0,0,0,1,0,1,0,0
1,33967,2,730,2,9,1,19,31,5,0,...,2,4,0,0,0,1,0,0,1,0
2,2110,2,0,3,0,2,8,13,2,1,...,4,2,0,0,0,1,0,0,1,0
3,55082,2,2011,0,0,12,25,37,2,0,...,1,1,0,0,0,1,0,0,0,0
4,37165,1,944,0,0,0,4,4,4,0,...,2,1,0,0,0,1,0,1,0,0


In [37]:
# Predict
y_pred = LogReg.predict(df_test)

In [38]:
# Show prediction
df_test['moved_after_2019'] = y_pred

In [39]:
# Show moved_after_2019 values
df_test['moved_after_2019'].value_counts()

0    12440
1      815
Name: moved_after_2019, dtype: int64

In [41]:
df['moved_after_2019'].value_counts()

0    32496
1    20523
Name: moved_after_2019, dtype: int64

In [52]:
# Turn into submission by keeping only user id and moved_after_2019
df_test = df_test[['user_id', 'moved_after_2019']]

In [53]:
df_test

Unnamed: 0,user_id,moved_after_2019
0,17449,0
1,33967,0
2,2110,0
3,55082,0
4,37165,0
...,...,...
13250,32847,0
13251,20054,0
13252,7029,0
13253,56130,0


In [54]:
# Save into csv without index
df_test.to_csv('submission.csv', index=False)