# import

In [35]:
import os
import re
import pandas as pd
import numpy as np
from utils import *

# 0. Load data

In [43]:
# author code
x_tr_df = pd.read_csv('train_folder/x_train_ids.csv', index_col=0)
tr_dir = "train_folder/txt_files/"
tr_files = x_tr_df['filename'].values


tr_texts = []
for tr_file in tr_files:
    f = open(os.path.join(tr_dir, tr_file), "r")
    ff = f.read()
    ff = clean1(ff)
    tr_texts.append(ff)
# tr_texts has all the texts in it

In [44]:
#y_train data
y_tr_df = pd.read_csv('./Y_train_predilex.csv', index_col=0)

In [45]:
y_tr_df.head()

Unnamed: 0_level_0,sexe,date_accident,date_consolidation
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,homme,1991-04-09,n.c.
1,homme,2005-06-10,2010-01-19
2,femme,1997-09-26,n.c.
3,femme,1982-08-07,1982-11-07
4,homme,1996-11-26,n.c.


# 1. Training

In [46]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate, GridSearchCV, RandomizedSearchCV
from xgboost import XGBClassifier

In [47]:
# clean text without using stop words nor lemmitizer
def clean_text(text: str) -> str:
    """ remove unwanted characters from text """
    # remove line breaks
    text = text.replace('\n', '')
    # remove quotes
    text = text.replace('"', '')
    #lowercase text
    text = text.lower()
    #remove punctuation and special characters ?:!.,[]();
    text = text.translate(str.maketrans('', '', '?:!.,;[]()*-'))
    # remove possessive pronouns termination
    text = text.replace("'s", "")
    # remove unnecessery white spaces
    text = re.sub(r"\s\s+", " ", text)
        
    return text

In [48]:
# drop rows with unknow gender in training set
gender_na_idx = y_tr_df[y_tr_df['sexe'] == 'n.c.'].index
y_tr_df = y_tr_df[y_tr_df['sexe'] != 'n.c.']
y_tr_df.reset_index(inplace=True, drop=True)

tr_texts = [clean_text(tr_texts[i]) for i in range(len(tr_texts)) if i not in gender_na_idx]

In [51]:
tfidf_vec = TfidfVectorizer(ngram_range=(1, 2), max_features=100, norm='l2', sublinear_tf=True)
x_train = tfidf_vec.fit_transform(tr_texts)
y_train = y_tr_df['sexe'].map({'homme': 1, 'femme': 0}).values

In [52]:
c_val, c_count = np.unique(y_train, return_counts=True)
for i, c in enumerate(c_val):
    print(f"Number of samples for class {c}: {c_count[i]}")

Number of samples for class 0: 206
Number of samples for class 1: 559


In [53]:
def apply_cv(cls, cv=3):
    cv_results = cross_validate(cls, x_train, y_train, scoring='accuracy', return_train_score=True, cv=cv)
    print(f"CV Accuracy: {round(cv_results['train_score'].mean() * 100, 2)}%")

In [54]:
def perform_grid_search(cls, param_grid):
    """ perform a grid search on a classifier using 3-fold cross validation and the passed prameters """
    grid_search = GridSearchCV(estimator=cls, 
                               param_grid=param_grid,
                               scoring='accuracy',
                               cv=3,
                               verbose=1)
    grid_search.fit(x_train, y_train)
    print(f"The best hyperparameters from Grid Seach are: {grid_search.best_params_}")
    return grid_search.best_estimator_

In [56]:
random_state = 42

param_grid = {
    'max_depth': [3, 5],
    'learning_rate': [.01, .1],
    'subsample': [.7, .9, 1],
    'colsample_bytree': [.7, .9, 1],
}
xgb_cls = XGBClassifier(random_state=random_state)
xgb_cls = perform_grid_search(xgb_cls, param_grid)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


The best hyperparameters from Grid Seach are: {'colsample_bytree': 0.9, 'learning_rate': 0.1, 'max_depth': 5, 'subsample': 0.9}


[Parallel(n_jobs=1)]: Done 108 out of 108 | elapsed:   11.9s finished


In [62]:
apply_cv(xgb_cls, 10)

CV Accuracy: 100.0%


In [58]:
xgb_cls.fit(x_train, y_train)

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.9, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.1, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=42, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=0.9, tree_method=None,
              validate_parameters=False, verbosity=None)

# 2. Predict

In [63]:
#x_test data
x_te_df = pd.read_csv('test_folder/x_test_ids.csv', index_col=0)
te_dir = "test_folder/txt_files/"
te_files = x_te_df['filename'].values
te_texts = []

for te_file in te_files:
    f = open(os.path.join(te_dir, te_file), "r")
    ff = f.read()
    ff = clean1(ff)
    ff = clean_text(ff)
    te_texts.append(ff)
# tr_texts has all the texts in it

In [65]:
x_test = tfidf_vec.transform(te_texts).toarray()

In [68]:
res = xgb_cls.predict(x_test)

In [73]:
np.save('results/gender.npy', res)