In [33]:
import pandas as pd

# Read data

In [34]:
DATA_PATH = 'data/'
DATASET_PATH = DATA_PATH + 'aug_train.csv'
label_col = 'target'

In [35]:
df_master = pd.read_csv(DATASET_PATH)
df = df_master.copy()
df.head()

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,8949,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,,,1,36,1.0
1,29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0
2,11561,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5,,,never,83,0.0
3,33241,city_115,0.789,,No relevent experience,,Graduate,Business Degree,<1,,Pvt Ltd,never,52,1.0
4,666,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0.0


### Select proper columns

In [36]:
import joblib
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

MODELS_DIR = 'models'
SCALER_PATH = MODELS_DIR + '/scaler.joblib'

FEATURE_LIST = ['training_hours', 'city_development_index', 'gender']
LABEL_COLUMN = 'target'
NUMERICAL_COLUMNS = ['training_hours', 'city_development_index']
CATEGORICAL_COLUMNS = ['gender']

# Filter on used columns
df = df[FEATURE_LIST + [LABEL_COLUMN]]

df.head()


Unnamed: 0,training_hours,city_development_index,gender,target
0,36,0.92,Male,1.0
1,47,0.776,Male,0.0
2,83,0.624,,0.0
3,52,0.789,,1.0
4,8,0.767,Male,0.0


### Scale continuous feautres

In [37]:
scaler = StandardScaler()
scaler.fit(df[NUMERICAL_COLUMNS])
joblib.dump(scaler, MODELS_DIR + '/scaler.joblib')
numerical_features_scaled = scaler.transform(df[NUMERICAL_COLUMNS])
numerical_features_scaled_df = pd.DataFrame(data=numerical_features_scaled, columns=NUMERICAL_COLUMNS)

numerical_features_scaled_df.head()


Unnamed: 0,training_hours,city_development_index
0,-0.488985,0.738919
1,-0.305825,-0.42841
2,0.293607,-1.66059
3,-0.222571,-0.323026
4,-0.955209,-0.501368


### Categorical features

In [38]:
# encode data
one_hot_encoder = OneHotEncoder()
one_hot_encoder.fit(df[CATEGORICAL_COLUMNS])
joblib.dump(one_hot_encoder, MODELS_DIR + '/one_hot_encoder.joblib')
categorical_features_encoded = one_hot_encoder.transform(df[CATEGORICAL_COLUMNS])
categorical_features_encoded_df = pd.DataFrame.sparse.from_spmatrix(
    data=categorical_features_encoded, columns=one_hot_encoder.get_feature_names_out()
)

# Model Building

## Model training

- Split dataset

In [39]:
# join dataframe
final_df = numerical_features_scaled_df.join(categorical_features_encoded_df).join(df[LABEL_COLUMN])

# Split data
X, y = final_df.drop(columns=[LABEL_COLUMN]), final_df[LABEL_COLUMN]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

def get_scaler(training_data: pd.DataFrame = None, training_mode: bool = False):
    if training_mode:
        scaler = StandardScaler()
        scaler.fit(training_data)
        joblib.dump(scaler, SCALER_PATH)
    else:
        scaler = joblib.load(SCALER_PATH)
    return scaler


- Train model

In [40]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()

In [41]:
X_train.shape

(12835, 6)

In [42]:
y_train.shape

(12835,)

In [43]:
model.fit(X_train, y_train)



In [44]:
joblib.dump(model, MODELS_DIR + '/model.joblib')

['models/model.joblib']

## Model evaluation

In [45]:
y_pred = model.predict(X_test)



In [46]:
y_pred[y_pred < 0] = 0

y_pred

array([0., 0., 0., ..., 0., 0., 0.])

In [47]:
import numpy as np
from sklearn.metrics import mean_squared_log_error

def compute_rmsle(y_test: np.ndarray, y_pred: np.ndarray, precision: int = 2) -> float:
    rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
    return round(rmsle, precision)

In [48]:
compute_rmsle(y_test, y_pred)

0.34

In [49]:
test = df.sample(50)

test.to_csv('data/test.csv')

# Model Inference

In [50]:
df_test_master = pd.read_csv(DATA_PATH +  'test.csv')
df = df_test_master.copy()
df.head()

Unnamed: 0.1,Unnamed: 0,training_hours,city_development_index,gender,target
0,14367,18,0.624,,0.0
1,3924,44,0.843,Male,0.0
2,1530,50,0.855,Female,0.0
3,573,104,0.624,Male,1.0
4,17608,35,0.91,Male,0.0


In [51]:
df = df[FEATURE_LIST]
df.head()

Unnamed: 0,training_hours,city_development_index,gender
0,18,0.624,
1,44,0.843,Male
2,50,0.855,Female
3,104,0.624,Male
4,35,0.91,Male


In [62]:
import joblib

scaler = joblib.load('models\scaler.joblib')
scaler.fit(df[NUMERICAL_COLUMNS])

numerical_features_scaled = scaler.transform(df[NUMERICAL_COLUMNS])
numerical_features_test_scaled_df = pd.DataFrame(data=numerical_features_scaled, columns=NUMERICAL_COLUMNS)

numerical_features_test_scaled_df.head()


Unnamed: 0,training_hours,city_development_index
0,-0.777153,-1.417842
1,-0.369281,0.218819
2,-0.275157,0.308499
3,0.571962,-1.417842
4,-0.510467,0.719533


In [53]:
# encode data
one_hot_encoder = joblib.load('models\one_hot_encoder.joblib')
one_hot_encoder.fit(df[CATEGORICAL_COLUMNS])

categorical_features_encoded = one_hot_encoder.transform(df[CATEGORICAL_COLUMNS])
categorical_features_test_encoded_df = pd.DataFrame.sparse.from_spmatrix(
    data=categorical_features_encoded, columns=one_hot_encoder.get_feature_names_out()
)

categorical_features_test_encoded_df.head()

Unnamed: 0,gender_Female,gender_Male,gender_Other,gender_nan
0,0.0,0.0,0.0,1.0
1,0.0,1.0,0.0,0.0
2,1.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0
4,0.0,1.0,0.0,0.0


In [54]:
final_test_df = numerical_features_test_scaled_df.join(categorical_features_test_encoded_df)
final_test_df.head()


Unnamed: 0,training_hours,city_development_index,gender_Female,gender_Male,gender_Other,gender_nan
0,-0.777153,-1.417842,0.0,0.0,0.0,1.0
1,-0.369281,0.218819,0.0,1.0,0.0,0.0
2,-0.275157,0.308499,1.0,0.0,0.0,0.0
3,0.571962,-1.417842,0.0,1.0,0.0,0.0
4,-0.510467,0.719533,0.0,1.0,0.0,0.0


In [55]:
model = joblib.load('models\model.joblib')
predictions = model.predict(final_test_df)



In [56]:
predictions

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [63]:
list(predictions)

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0]

In [57]:
predictions.shape

(50,)

In [136]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Load the ground truth test labels from a file
test_labels = pd.read_csv(DATA_PATH +  'sample_submission.csv')['SalePrice']
test_labels.head()

# Calculate the RMSE and MAE for the predicted test data
rmse = np.sqrt(mean_squared_error(test_labels, predictions))
mae = mean_absolute_error(test_labels, predictions)

print('RMSE:', rmse)
print('MAE:', mae)

FileNotFoundError: [Errno 2] No such file or directory: 'data/sample_submission.csv'

# Testing dataframe equality

In [None]:
final_df.to_parquet(DATA_PATH + 'processed_df.parquet', index=False)

ImportError: Unable to find a usable engine; tried using: 'pyarrow', 'fastparquet'.
A suitable version of pyarrow or fastparquet is required for parquet support.
Trying to import the above resulted in these errors:
 - Missing optional dependency 'pyarrow'. pyarrow is required for parquet support. Use pip or conda to install pyarrow.
 - Missing optional dependency 'fastparquet'. fastparquet is required for parquet support. Use pip or conda to install fastparquet.

In [None]:
!ls ../../data/house-prices/

In [None]:
processed_df = pd.read_parquet(DATA_PATH + 'processed_df.parquet')
processed_df.head()

In [None]:
pd.testing.assert_frame_equal(processed_df, final_df)

In [None]:
pd.testing.assert_frame_equal(processed_df, final_df.drop(columns=[label_col]))