# Project Assignment September Call, A.Y. 2022/2023

In [2]:
import pandas as pd

df_dev = pd.read_csv("fall_project_dataset/development.csv", index_col=0)
df_eval = pd.read_csv("fall_project_dataset/evaluation.csv", index_col=0)

# Create a dictionary from the OCCP code to the text representation
import csv
reader = csv.reader(open('produced_documents/occp_to_string.csv', 'r'), delimiter=';')
d = {}
for row in reader:
   k, v = row
   k = float(k)
   d[k] = v
d

# Map the OCCP column to its text values
df_dev["OCCP"] = df_dev["OCCP"].map(d)
print(df_dev["OCCP"].head(5))

# Keep only the first 3 characters 
df_dev["OCCP"] = df_dev["OCCP"].apply(lambda occp : occp[0:3])
print(df_dev["OCCP"].head(5))

Id
0    CLN-First-Line Supervisors Of Housekeeping And...
1                                         SAL-Cashiers
2             SAL-Real Estate Brokers And Sales Agents
3                             ENG-Mechanical Engineers
4                                         SAL-Cashiers
Name: OCCP, dtype: object
Id
0    CLN
1    SAL
2    SAL
3    ENG
4    SAL
Name: OCCP, dtype: object


## Preprocessing step

I scale all the numeric features through a standard scaling and encode all the categorical features through one-hot encoding

In [56]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

numeric_features = ['PINCP', 'WKHP']
numeric_transformer = StandardScaler()

categorical_features = ['FDEYEP', 'ENG', 'OC', 'COW', 'HICOV', 'LANP', 'FER', 'MIGSP', 'SCHL', 'MIG', 'VPS', 'MIL', 'MAR', 'OCCP', 'PAOC', 'PUBCOV', 'DEAR', 'JWAP', 'JWDP', 'POBP', 'SEX', 'RAC1P']
categorical_transformer = OneHotEncoder(handle_unknown='infrequent_if_exist')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

## Pipeline definition

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression(max_iter=10000))])

## Train-test split

In [57]:
from sklearn.model_selection import train_test_split

X = df_dev.drop(columns=["JWMNP"])
y = df_dev["JWMNP"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Fit the model

In [32]:
clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))

model score: 0.644


## Pipeline with random forest

In [33]:
from sklearn.ensemble import RandomForestRegressor

forest = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', RandomForestRegressor(10, random_state=42))])
forest.fit(X_train, y_train)
print("model score: %.3f" % forest.score(X_test, y_test))

model score: 0.739


## GridSearch RandomForest

First method with RandomForestRegressor

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

pipe = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', RandomForestRegressor())])

param_grid={
    'classifier__n_estimators': [10, 50, 100],
    'classifier__max_features': [1.0, 'sqrt', 'log2'],
    'classifier__bootstrap': [True, False]
}

search = GridSearchCV(pipe, param_grid, n_jobs=6)

search.fit(X_train, y_train)

print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(f"Best parameters: {search.best_params_}")
print("Best estimator score on test data: %.3f" % search.best_estimator_.score(X_test, y_test))

Second method with RandomForestClassifier

Best parameter (CV score=0.267)  
Best parameters: {'classifier__bootstrap': True, 'classifier__criterion': 'gini', 'classifier__max_features': 1.0, 'classifier__n_estimators': 100}  
Best estimator score on test data: 0.889  

#### Reducing the dimension of the OCCP column:

Best parameter (CV score=0.267)  
Best parameters: {'classifier__bootstrap': True, 'classifier__criterion': 'gini', 'classifier__max_features': 1.0, 'classifier__n_estimators': 100}  
Best estimator score on test data: 0.888  

In [59]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

pipe = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', RandomForestClassifier())])

param_grid={
    'classifier__n_estimators': [10, 50, 100],
    'classifier__criterion': ['gini', 'entropy', 'log_loss'],
    'classifier__max_features': [1.0, 'sqrt', 'log2'],
    'classifier__bootstrap': [True, False]
}

search = GridSearchCV(pipe, param_grid, scoring="f1_macro", n_jobs=6)

search.fit(X_train, y_train)

print("Best parameter (CV score=%0.3f)" % search.best_score_)
print(f"Best parameters: {search.best_params_}")
print("Best estimator score on test data: %.3f" % search.best_estimator_.score(X_test, y_test))


The least populated class in y has only 1 members, which is less than n_splits=5.


A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



Best parameter (CV score=0.267)
Best parameters: {'classifier__bootstrap': True, 'classifier__criterion': 'gini', 'classifier__max_features': 1.0, 'classifier__n_estimators': 100}
Best estimator score on test data: 0.888


# Dimensionality reduction implementation

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier

red_dim_forest = Pipeline(steps=[('preprocessor', preprocessor),
                      ('dim_reduction', TruncatedSVD(n_components=100)),
                      ('classifier', RandomForestClassifier(random_state=42, criterion='gini', max_features=1.0, n_estimators=100, n_jobs=6))])

red_dim_forest.fit(X_train, y_train)
print("model score: %.3f" % red_dim_forest.score(X_test, y_test))

model score: 0.647


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier

red_dim_forest = Pipeline(steps=[('preprocessor', preprocessor),
                      ('dim_reduction', TruncatedSVD(n_components=500)),
                      ('classifier', RandomForestClassifier(random_state=42, criterion='gini', max_features=1.0, n_estimators=100, n_jobs=6))])

red_dim_forest.fit(X_train, y_train)
print("model score: %.3f" % red_dim_forest.score(X_test, y_test))

# Graphical Analysis

In [23]:
import plotly.express as px
# cityCount[cityCount["city"].str.contains("gsv")]
fig = px.histogram(df_dev[(df_dev["MIGSP"]!=0) & (df_dev["MIGSP"]!=6)], x="MIGSP")
fig.update_layout(
    bargap=0
)
fig.show()