### import necessary libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### exploratory analysis

In [None]:
# Load aug_train.csv (path relative to the working directory) into `train`.
train = pd.read_csv('aug_train.csv')

In [None]:
# Preview the first rows of `train`.
train.head()

In [None]:
# (rows, columns) of `train`.
train.shape

In [None]:
# Column dtypes and non-null counts of `train`.
train.info()

In [None]:
# Load aug_test.csv (path relative to the working directory) into `test`.
test = pd.read_csv('aug_test.csv')

In [None]:
# Preview the first rows of `test`.
test.head()

In [None]:
# (rows, columns) of `test`.
test.shape

In [None]:
# Column dtypes and non-null counts of `test`.
test.info()

#### checking missing values

In [None]:
# Count NaN entries per column of the raw training frame and print them.
missing_per_column = train.isna().sum()
print("Missing values in the train dataset:", "\n", missing_per_column)

In [None]:
# Draw one count plot per categorical column on a 4x2 grid.
# ax.flat walks the grid row by row, so the columns land on
# ax[0,0], ax[0,1], ax[1,0], ... and the last cell (ax[3,1]) stays empty.
fig, ax = plt.subplots(4, 2, figsize=(30, 20))
categorical_columns = [
    'gender',
    'enrolled_university',
    'major_discipline',
    'experience',
    'company_size',
    'last_new_job',
    'company_type',
]
for axis, column in zip(ax.flat, categorical_columns):
    sns.countplot(x=column, data=train, ax=axis)

#### checking how balanced the target variable is

In [None]:
# Bar chart of how many rows fall into each value of the 'target' column.
sns.countplot(x='target', data=train)

# data cleaning

### missing values

In [None]:
# Per-column NaN counts, shown again right before the missing values are filled.
missing_per_column = train.isna().sum()
print("Missing values in the train dataset:", "\n", missing_per_column)

In [None]:
# Replace every NaN in `train`, in place, with the literal string 'Unknown'.
# No column subset is given, so this applies to all columns.
# NOTE(review): a numeric column containing NaN would become object dtype - confirm
# that only categorical columns have missing values.
train.fillna('Unknown', inplace=True)

In [None]:
# Re-check dtypes and non-null counts after filling missing values.
train.info()

In [None]:
# Per-column NaN counts after the fill.
train.isna().sum()

### balancing

In [None]:
# Separate the label from the features: pop() removes the 'target' column
# from `train` in place and returns it, so `train` holds only feature
# columns from here on.
y = train.pop('target')

In [None]:
from imblearn.over_sampling import RandomOverSampler
from collections import Counter


# Resample (train, y) with RandomOverSampler using a fixed seed, then print the
# class counts before and after resampling.
# NOTE(review): x_ros / y_ros are not used by the later visible cells - the
# train/test split and the model are built from `train` and `y`. Confirm whether
# the oversampled data was meant to be used for training.
rand=RandomOverSampler(random_state=42)
x_ros, y_ros = rand.fit_resample(train, y)
print(f"Imbalanced target class: {Counter(y)} Balanced target class:{Counter(y_ros)}")


In [None]:
# Class counts after oversampling. y_ros is passed on its own: it is not a
# column of `train` (the target was dropped from it) and has a different
# length, so supplying data=train alongside it was misleading.
sns.countplot(x=y_ros)

### attribute 'city'

In [None]:
# Split each city label on "_" and keep the second piece as an int64.
# Assumes every value looks like "<prefix>_<integer>"; the cast raises otherwise.
city_parts = train['city'].str.split("_", expand=True)
city_adjusted = city_parts[1].astype('int64')

In [None]:
# Overwrite the original city labels with their integer form.
train['city'] = city_adjusted

In [None]:
# Preview `train` after the city column was converted.
train.head()

### normalization (encoding categorical attributes)

In [None]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [None]:
# Categorical columns with a natural order; they are encoded as ordered codes.
ordinal_atributes = [
    'education_level',
    'experience',
    'company_size',
    'last_new_job',
]

In [None]:
# Categorical columns without an order; they are one-hot encoded.
cardinal_atributes = [
    'gender',
    'relevent_experience',
    'enrolled_university',
    'major_discipline',
    'company_type',
]

In [None]:
# Single-step pipeline that maps each ordinal column to its position in an
# explicit, ordered category list. The lists are given in the same order as
# ordinal_atributes: education_level, experience, company_size, last_new_job.
# 'Unknown' (the fill value for missing data) is always the lowest category.
# NOTE(review): 'never' is ranked above '>4' for last_new_job - confirm that
# this is the intended position.
ordinal_pipeline = Pipeline(steps=[
    (
        'OrdinalEncoder',
        OrdinalEncoder(categories=[
            ['Unknown', 'Primary School', 'High School', 'Graduate', 'Masters', 'Phd'],
            ['Unknown', '<1', *(str(years) for years in range(1, 21)), '>20'],
            ['Unknown', '<10', '10/49', '50-99', '100-500', '500-999', '1000-4999', '5000-9999', '10000+'],
            ['Unknown', '1', '2', '3', '4', '>4', 'never'],
        ]),
    ),
])

In [None]:
# Fit the ordinal encoder and replace the ordinal columns with their codes.
encoded_ordinals = ordinal_pipeline.fit_transform(train[ordinal_atributes])
train[ordinal_atributes] = encoded_ordinals
# One-hot encode the unordered categorical columns.
train = pd.get_dummies(data=train, columns=cardinal_atributes)

In [None]:
# Preview `train` after encoding.
train.head()

In [None]:
# Inspect the converted city column.
train['city']

### split data

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for evaluation; stratify on y so both parts keep
# the same class proportions, and fix the seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    train,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
# Show the shapes of the four resulting pieces.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

In [None]:
# Preview the training features.
X_train.head()

### apply model
#### logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression
import seaborn as sn

In [None]:
# Fit a logistic regression on the training split (fit() returns the fitted
# estimator itself), then predict hard class labels for the held-out rows.
logistic_regression = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)
lr_prediction = logistic_regression.predict(X_test)

In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 of the logistic regression on the test split.
print(classification_report(y_true=y_test, y_pred=lr_prediction))

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of test rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=lr_prediction)

In [None]:
from collections import Counter
# Number of test rows per class, for reading the accuracy against the class balance.
Counter(y_test)

In [None]:
from sklearn import metrics

# roc_curve sweeps a decision threshold over continuous scores, so it needs the
# predicted probability of the positive class rather than hard predict() labels
# (hard labels collapse the curve to a single operating point).
# pos_label must be a label that actually occurs in y_test; a hard-coded 2 is
# only right if 2 is one of the classes. Taking the label from the fitted model
# keeps it aligned with column 1 of predict_proba.
positive_class = logistic_regression.classes_[1]
lr_scores = logistic_regression.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, lr_scores, pos_label=positive_class)

In [None]:
# False-positive rates returned by roc_curve.
print(fpr)

In [None]:
# True-positive rates returned by roc_curve.
print(tpr)

In [None]:
# Score thresholds returned by roc_curve.
print(thresholds)

In [None]:
# from sklearn.model_selection import cross_val_score
# from sklearn import svm

# clf = svm.SVC(kernel='linear', C=1, random_state=42)
# scores = cross_val_score(clf, X_train, y_train, cv=5)

In [None]:
# scores

In [None]:
# from sklearn import metrics

# scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='f1_macro')

In [None]:
# scores