# Preprocessing

## Load the datasets

In [7]:
import pandas as pd

# Read the development (labelled) and evaluation (unlabelled) tables.
# In both files the first CSV column is used as the row index.
dataset_paths = (
    "fall_project_dataset/development.csv",
    "fall_project_dataset/evaluation.csv",
)
df_dev, df_eval = (pd.read_csv(path, index_col=0) for path in dataset_paths)

## Reduce the cardinality of the OCCP column

We map the OCCP column code to its text representation and we keep only the first 3 characters

In [8]:
import csv

# Build a lookup from the numeric OCCP code to its text representation.
# The file is ';'-separated with exactly two fields per row (code;text). The
# code is converted to float so the keys compare equal to the values stored in
# the OCCP column (assumes OCCP is loaded as a float column -- TODO confirm).
# The `with` block guarantees the file handle is closed after reading.
with open('produced_documents/occp_to_string.csv', 'r', newline='') as occp_file:
    d = {float(code): text for code, text in csv.reader(occp_file, delimiter=';')}

# Reduced-cardinality lookup: keep only the first 3 characters of each text.
occp_prefix = {code: text[0:3] for code, text in d.items()}

# Apply the same encoding to both datasets so the evaluation data seen by the
# model at prediction time matches the training data. Codes that are missing
# from the lookup become NaN instead of raising a TypeError.
for frame in (df_dev, df_eval):
    frame["OCCP"] = frame["OCCP"].map(occp_prefix)

## Prepare the preprocessing pipeline

We normalize the numeric features and one-hot encode the categorical ones

In [9]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Continuous columns, standardised with StandardScaler.
numeric_features = ['PINCP', 'WKHP']
numeric_transformer = StandardScaler()

# Every categorical column that was considered, and the subset excluded from the model.
all_categorical_features = ['FDEYEP', 'ENG', 'OC', 'COW', 'HICOV', 'LANP', 'FER', 'MIGSP', 'SCHL', 'MIG', 'VPS', 'MIL', 'MAR', 'OCCP', 'PAOC', 'PUBCOV', 'DEAR', 'JWAP', 'JWDP', 'POBP', 'SEX', 'RAC1P']
# NOTE: "MIG" and "VPS" were marked as uncertain drops (?) by the author.
features_to_drop = ["SCHL", "POBP", "RAC1P", "MIG", "MIGSP", "LANP", "PAOC", "VPS", "DEAR", "FER"]

# Categorical columns actually fed to the encoder: the candidates minus the
# dropped ones. This is a set *difference* (not a symmetric difference), sorted
# so that the column order is deterministic. It evaluates to
# ['COW', 'ENG', 'FDEYEP', 'HICOV', 'JWAP', 'JWDP', 'MAR', 'MIL', 'OC', 'OCCP', 'PUBCOV', 'SEX'].
categorical_features = sorted(set(all_categorical_features) - set(features_to_drop))

# errors="ignore" keeps this cell re-runnable once the columns are already gone.
df_dev = df_dev.drop(columns=features_to_drop, errors="ignore")

# Categories that were not seen during fit are ignored at transform time
# instead of raising an error.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Scale the numeric columns and one-hot encode the categorical ones.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

## Train-test split

In [10]:
from sklearn.model_selection import train_test_split
from collections import Counter

# JWMNP is the prediction target; every other remaining column is a predictor.
y = df_dev["JWMNP"]
X = df_dev.drop(columns=["JWMNP"])

# Hold out 10% of the development data; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

# (target value, number of training samples) pairs, ordered by target value.
train_target_counts = Counter(y_train)
actual_samples = sorted(train_target_counts.items())

# Print the number of samples in each class
print(actual_samples)

[(1.0, 704), (2.0, 649), (3.0, 547), (4.0, 217), (5.0, 5124), (6.0, 392), (7.0, 995), (8.0, 825), (9.0, 137), (10.0, 10669), (11.0, 121), (12.0, 963), (13.0, 230), (14.0, 112), (15.0, 13916), (16.0, 116), (17.0, 285), (18.0, 337), (19.0, 42), (20.0, 13569), (21.0, 44), (22.0, 217), (23.0, 116), (24.0, 59), (25.0, 6067), (26.0, 50), (27.0, 66), (28.0, 101), (29.0, 18), (30.0, 14124), (31.0, 7), (32.0, 61), (33.0, 25), (34.0, 30), (35.0, 2721), (36.0, 20), (37.0, 34), (38.0, 40), (39.0, 9), (40.0, 3799), (41.0, 1), (42.0, 32), (43.0, 6), (44.0, 7), (45.0, 5515), (46.0, 13), (47.0, 13), (48.0, 28), (49.0, 8), (50.0, 1483), (51.0, 3), (52.0, 8), (53.0, 5), (54.0, 4), (55.0, 388), (56.0, 16), (57.0, 3), (58.0, 19), (59.0, 25), (60.0, 4779), (61.0, 4), (62.0, 10), (63.0, 5), (64.0, 3), (65.0, 173), (66.0, 9), (67.0, 5), (68.0, 11), (69.0, 6), (70.0, 325), (72.0, 8), (73.0, 1), (74.0, 2), (75.0, 498), (76.0, 1), (77.0, 1), (78.0, 4), (80.0, 225), (83.0, 1), (85.0, 32), (87.0, 3), (88.0, 2), (

## Apply RandomOverSampler

We apply RandomOverSampler on the training dataset to slightly increase the number of samples in the minority classes in order to make SMOTENC work correctly

In [11]:
from imblearn.over_sampling import RandomOverSampler

# Target count per class: classes with fewer than 10 training samples are
# raised to 10, every other class keeps its current count.
# Alternative considered by the author: use the mean class size,
# int(len(y_train) / len(Counter(y_train))), as the floor instead of 10.
desired_samples = {label: max(count, 10) for label, count in actual_samples}

# Randomly duplicate existing rows until each class reaches its target count.
ros = RandomOverSampler(sampling_strategy=desired_samples, random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

# Print the number of samples in each class
resampled_counts = Counter(y_resampled)
print(sorted(resampled_counts.items()))

[(1.0, 704), (2.0, 649), (3.0, 547), (4.0, 217), (5.0, 5124), (6.0, 392), (7.0, 995), (8.0, 825), (9.0, 137), (10.0, 10669), (11.0, 121), (12.0, 963), (13.0, 230), (14.0, 112), (15.0, 13916), (16.0, 116), (17.0, 285), (18.0, 337), (19.0, 42), (20.0, 13569), (21.0, 44), (22.0, 217), (23.0, 116), (24.0, 59), (25.0, 6067), (26.0, 50), (27.0, 66), (28.0, 101), (29.0, 18), (30.0, 14124), (31.0, 10), (32.0, 61), (33.0, 25), (34.0, 30), (35.0, 2721), (36.0, 20), (37.0, 34), (38.0, 40), (39.0, 10), (40.0, 3799), (41.0, 10), (42.0, 32), (43.0, 10), (44.0, 10), (45.0, 5515), (46.0, 13), (47.0, 13), (48.0, 28), (49.0, 10), (50.0, 1483), (51.0, 10), (52.0, 10), (53.0, 10), (54.0, 10), (55.0, 388), (56.0, 16), (57.0, 10), (58.0, 19), (59.0, 25), (60.0, 4779), (61.0, 10), (62.0, 10), (63.0, 10), (64.0, 10), (65.0, 173), (66.0, 10), (67.0, 10), (68.0, 11), (69.0, 10), (70.0, 325), (72.0, 10), (73.0, 10), (74.0, 10), (75.0, 498), (76.0, 10), (77.0, 10), (78.0, 10), (80.0, 225), (83.0, 10), (85.0, 32),

## Apply SMOTENC

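We apply SMOTENC (Synthetic Minority Over-sampling Technique for Nominal and Continuous) on the training dataset to increase the number of samples in the minority classes. This step is currently disabled: the cell below is commented out and the model is fitted on the original training split.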

In [12]:
# from imblearn.over_sampling import SMOTENC

# desired_samples = {k: v if v >=int(len(y_train)/len(Counter(y_train))) else int(len(y_train)/len(Counter(y_train))) for k, v in desired_samples.items()}

# # smote_nc = SMOTENC(categorical_features, random_state=42)
# smote_nc = SMOTENC(categorical_features, random_state=42, sampling_strategy=desired_samples)
# X_resampled_nc, y_resampled_nc = smote_nc.fit_resample(X_resampled, y_resampled)

# # Print the number of samples in each class
# print(sorted(Counter(y_resampled_nc).items()))

# Pipeline definition and fit

In [16]:

from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor

# Full model: preprocessing (scaling + one-hot encoding) followed by a random
# forest regressor. The forest is seeded so that repeated runs of the notebook
# give the same model, consistent with the random_state=42 used elsewhere.
pipe = Pipeline(steps=[('preprocessor', preprocessor),
                       ('regressor', RandomForestRegressor(random_state=42))])

# The model is fitted on the original training split; the resampled variants
# are kept below as alternatives.
# pipe.fit(X_resampled_nc, y_resampled_nc)
# pipe.fit(X_resampled, y_resampled)
pipe.fit(X_train, y_train)

# GridSearch

Best params:  

criterion: 'poisson' OR 'squared_error',  
max_features: 'sqrt' (but probably None),  
bootstrap: True,  
max_depth: None,  
min_samples_split: 2,  
min_samples_leaf: 1

In [38]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

# Pipeline whose regressor hyper-parameters are tuned below. The forest is
# seeded so that the grid-search scores and the selected parameters are
# reproducible, consistent with the random_state=42 used elsewhere.
pipe = Pipeline(steps=[('preprocessor', preprocessor),
                       ('regressor', RandomForestRegressor(random_state=42))])

# The 'regressor__' prefix routes each parameter to the pipeline's 'regressor' step.
param_grid={
    'regressor__n_estimators': [10, 50, 100],       # Number of trees in the forest
    'regressor__criterion': ['squared_error', 'friedman_mse', 'poisson'],
    'regressor__max_features': ['sqrt', 'log2'],
    'regressor__bootstrap': [True, False],
    'regressor__max_depth': [None, 10, 20, 30],     # Maximum depth of the trees
    'regressor__min_samples_split': [2, 5, 10],     # Minimum number of samples required to split a node
    'regressor__min_samples_leaf': [1, 2, 4]        # Minimum number of samples required at each leaf node
}

# Exhaustive search over the grid, scored by R^2 with the default
# cross-validation, using 8 parallel jobs.
search = GridSearchCV(pipe, param_grid, scoring='r2', n_jobs=8)

search.fit(X_train, y_train)

# Report the best cross-validated score, the parameters that achieved it and
# the score of the refitted best estimator on the held-out test split.
print("Best parameter (CV score=%0.3f)" % search.best_score_)
print(f"Best parameters: {search.best_params_}")
print("Best estimator score on test data: %.3f" % search.best_estimator_.score(X_test, y_test))



Best parameter (CV score=0.570)
Best parameters: {'regressor__bootstrap': False, 'regressor__criterion': 'squared_error', 'regressor__max_depth': None, 'regressor__max_features': 'sqrt', 'regressor__min_samples_leaf': 1, 'regressor__min_samples_split': 2, 'regressor__n_estimators': 100}
Best estimator score on test data: 0.621


In [28]:
# Baseline: the same pipeline with the regressor's default hyper-parameters.
# The forest is seeded so the score below is reproducible across runs,
# consistent with the random_state=42 used elsewhere.
pipe = Pipeline(steps=[('preprocessor', preprocessor),
                       ('regressor', RandomForestRegressor(random_state=42))])

pipe.fit(X_train, y_train)
# Score of the fitted pipeline on the held-out test split (shown as the cell output).
pipe.score(X_test, y_test)

0.799686521508713

# Results

In [30]:
from sklearn.metrics import r2_score
import numpy as np

# Score of the grid-search result on the held-out split, using raw predictions.
test_score = search.score(X_test, y_test)
print(test_score)

# R^2 again after rounding every prediction to the nearest integer, to see
# how much rounding changes the metric.
raw_predictions = search.predict(X_test)
y_pred = np.rint(raw_predictions)
rounded_r2 = r2_score(y_test, y_pred)
print(rounded_r2)

0.7837014001331504
0.7836799090728417


# Create the submission file

In [None]:
# NOTE(review): df_eval must carry the same OCCP encoding as the training data
# (text representation truncated to 3 characters) before predicting -- confirm
# this has been applied. Dropping the unused columns from df_eval should not be
# needed because the ColumnTransformer selects its inputs by column name --
# TODO confirm.
# NOTE(review): `pipe` is whichever pipeline was fitted last; the tuned model
# from the grid search is available as search.best_estimator_.
y_pred_2 = pipe.predict(df_eval)

# One (Id, Predicted) row per evaluation sample, keyed by the DataFrame index.
data = list(zip(df_eval.index, y_pred_2))

# Save the data to a CSV file
with open('submissions/submission_1.csv', 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(['Id', 'Predicted'])  # Header row
    csvwriter.writerows(data)