In [413]:
# import
import os

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

In [414]:
# Read the prepared training set (the only labelled data) into `df`
data_dir, file_name = 'PreparedData', 'train.csv'
path = os.path.join(data_dir, file_name)
df = pd.read_csv(path)

In [415]:
# Peek at the first five rows of the raw training frame (head() defaults to n=5)
df.head()

Unnamed: 0,user_id,industry,location,moved_after_2019,companies_worked,max_duration,worked_abroad,English,German,Turkish,French,Spanish,Chinese,NLanguages,education_level,fields_of_study,education_count,attended_university,different_university_count,recent_study_label
0,1301,Information Technology and Services,Turkey,1,3,1035,0,3,0,5,0,0,0,4,bachelor,it,2,1,2,0
1,6950,Internet,Turkey,0,4,1402,0,3,0,0,0,0,0,3,other,it,1,1,1,0
2,4880,Other,Turkey,0,4,1216,0,2,0,5,0,0,0,2,bachelor,it,2,1,2,0
3,26046,Other,Turkey,0,1,1856,0,2,0,0,0,0,0,2,master,other,3,0,3,0
4,11005,Other,Turkey,0,3,2769,0,3,0,5,0,0,0,3,other,business,2,1,2,0


In [416]:
# List every column label of the freshly loaded training frame (before one-hot encoding)
df.columns

Index(['user_id', 'industry', 'location', 'moved_after_2019',
       'companies_worked', 'max_duration', 'worked_abroad', 'English',
       'German', 'Turkish', 'French', 'Spanish', 'Chinese', 'NLanguages',
       'education_level', 'fields_of_study', 'education_count',
       'attended_university', 'different_university_count',
       'recent_study_label'],
      dtype='object')

In [417]:
# Feature engineering: one-hot encode the categorical text columns
# (industry, location, education_level, fields_of_study).
# drop_first=True omits the first level of each category from the dummy columns.
# NOTE(review): `df` is overwritten with its encoded version, and the bare `df`
# below renders the whole frame; `df.head()` would be enough for a sanity check.
df = pd.get_dummies(data=df, drop_first=True)
df

Unnamed: 0,user_id,moved_after_2019,companies_worked,max_duration,worked_abroad,English,German,Turkish,French,Spanish,...,industry_Computer Networking,industry_Computer Software,industry_Information Technology and Services,industry_Internet,industry_Other,location_Turkey,education_level_master,education_level_other,fields_of_study_it,fields_of_study_other
0,1301,1,3,1035,0,3,0,5,0,0,...,0,0,1,0,0,1,0,0,1,0
1,6950,0,4,1402,0,3,0,0,0,0,...,0,0,0,1,0,1,0,1,1,0
2,4880,0,4,1216,0,2,0,5,0,0,...,0,0,0,0,1,1,0,0,1,0
3,26046,0,1,1856,0,2,0,0,0,0,...,0,0,0,0,1,1,1,0,0,1
4,11005,0,3,2769,0,3,0,5,0,0,...,0,0,0,0,1,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53014,57247,0,2,4232,0,2,0,5,0,0,...,0,0,1,0,0,1,0,0,0,0
53015,37784,0,2,1553,0,4,0,5,0,0,...,0,0,0,0,1,1,0,0,0,1
53016,33229,0,3,3042,0,2,0,2,0,0,...,0,0,1,0,0,1,0,0,0,1
53017,12165,1,3,396,0,3,1,5,0,0,...,0,0,0,1,0,1,1,0,0,1


In [418]:
# Hold out 10% of the labelled rows for validation (fixed seed for reproducibility).
# NOTE(review): user_id remains part of the feature matrix; it looks like a row
# identifier rather than a predictor - confirm whether it should be excluded
# (the prediction cell on test.csv would have to change in step).
target = 'moved_after_2019'
features = df.drop(columns=target)
labels = df[target]
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.1,
    random_state=42,
)

In [419]:
# Fit a logistic regression with library-default hyperparameters on the training split.
# NOTE(review): the features are on very different scales (user_id, max_duration
# vs. 0/1 dummies) and are passed unscaled - presumably worth checking for a
# ConvergenceWarning and trying a scaler / larger max_iter; confirm before changing.
LogReg = LogisticRegression()
LogReg.fit(X=X_train, y=y_train)

In [420]:
# Predict class labels for the held-out validation split
y_pred = LogReg.predict(X=X_test)

# TODO: unfinished note from the original author ("If prediction is close to 0.45, ...");
# presumably about handling predictions near a decision threshold - intent to be confirmed.

In [421]:
# Score the fitted model on the held-out split (mean accuracy)
LogReg.score(X=X_test, y=y_test)

0.6112787627310449

In [422]:
# Evaluate the validation predictions: per-class precision / recall / F1, then
# the confusion matrix (rows = true class, columns = predicted class).
# classification_report and confusion_matrix come from the top import cell, so
# all of the notebook's dependencies are declared in one place.
# Must be run before y_pred is reused for the test.csv predictions further down.
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.62      0.96      0.75      3232
           1       0.52      0.07      0.13      2070

    accuracy                           0.61      5302
   macro avg       0.57      0.51      0.44      5302
weighted avg       0.58      0.61      0.51      5302

[[3088  144]
 [1917  153]]


In [423]:
# Reload the training CSV and one-hot encode it in a single step (same
# transformation as applied to `df` above).
# NOTE(review): df_train is not referenced by any later cell visible in this
# notebook - confirm whether this cell is still needed.
path = os.path.join('PreparedData', 'train.csv')
df_train = pd.get_dummies(pd.read_csv(path), drop_first=True)

In [424]:
# Load the unlabelled test set and apply the same one-hot encoding as for training.
# The encoding is computed independently of the training frame, so the resulting
# dummy columns depend on which categories occur in test.csv.
path = os.path.join('PreparedData', 'test.csv')
df_test = pd.get_dummies(pd.read_csv(path), drop_first=True)

In [425]:
# Inspect the first five encoded test rows (head() defaults to n=5)
df_test.head()

Unnamed: 0,user_id,companies_worked,max_duration,worked_abroad,English,German,Turkish,French,Spanish,Chinese,...,industry_Computer Networking,industry_Computer Software,industry_Information Technology and Services,industry_Internet,industry_Other,location_Turkey,education_level_master,education_level_other,fields_of_study_it,fields_of_study_other
0,17449,3,4017,0,2,0,4,0,0,0,...,0,0,0,0,1,1,0,1,0,1
1,33967,2,730,0,5,0,5,0,0,1,...,0,1,0,0,0,1,0,0,0,1
2,2110,2,1797,1,2,1,5,0,0,0,...,0,0,0,0,1,1,0,0,0,0
3,55082,2,2011,1,2,0,2,0,0,0,...,0,0,0,1,0,1,0,1,0,0
4,37165,1,944,0,4,0,5,0,0,0,...,0,0,0,0,1,1,0,1,1,0


In [426]:
# NOTE(review): repeats the preview from the previous cell; candidate for removal.
df_test.head(n=5)

Unnamed: 0,user_id,companies_worked,max_duration,worked_abroad,English,German,Turkish,French,Spanish,Chinese,...,industry_Computer Networking,industry_Computer Software,industry_Information Technology and Services,industry_Internet,industry_Other,location_Turkey,education_level_master,education_level_other,fields_of_study_it,fields_of_study_other
0,17449,3,4017,0,2,0,4,0,0,0,...,0,0,0,0,1,1,0,1,0,1
1,33967,2,730,0,5,0,5,0,0,1,...,0,1,0,0,0,1,0,0,0,1
2,2110,2,1797,1,2,1,5,0,0,0,...,0,0,0,0,1,1,0,0,0,0
3,55082,2,2011,1,2,0,2,0,0,0,...,0,0,0,1,0,1,0,1,0,0
4,37165,1,944,0,4,0,5,0,0,0,...,0,0,0,0,1,1,0,1,1,0


In [427]:
# Predict labels for the submission set.
# df_test was one-hot encoded separately from the training frame, so select the
# training feature columns explicitly instead of passing the whole frame:
#   * the model sees the same columns in the same order it was fitted on;
#   * extra columns are ignored, so the cell stays re-runnable after
#     'moved_after_2019' has been attached to df_test further down;
#   * a training column missing from the test data raises a KeyError here
#     rather than producing predictions from misaligned features.
y_pred = LogReg.predict(df_test[X_train.columns])

In [428]:
# Attach the predictions to the test frame as a new 'moved_after_2019' column
# (this mutates df_test in place; nothing is displayed by this cell)
df_test['moved_after_2019'] = y_pred

In [429]:
# Distribution of the predicted classes across the test set
predicted_labels = df_test['moved_after_2019']
predicted_labels.value_counts()

0    12523
1      732
Name: moved_after_2019, dtype: int64

In [430]:
# Reduce the frame to the submission layout: the user id plus its predicted label.
# Note that df_test is rebound here, so the feature columns are gone afterwards.
submission_columns = ['user_id', 'moved_after_2019']
df_test = df_test[submission_columns]

In [431]:
# Display the two-column submission frame (user_id, moved_after_2019) as a final check
df_test

Unnamed: 0,user_id,moved_after_2019
0,17449,0
1,33967,0
2,2110,0
3,55082,0
4,37165,0
...,...,...
13250,32847,0
13251,20054,0
13252,7029,0
13253,56130,0


In [432]:
# Write the submission file to the working directory; index=False keeps the
# row index out of the CSV so it only contains user_id and moved_after_2019.
df_test.to_csv(path_or_buf='submission.csv', index=False)