## Load Data

In [None]:
import pandas as pd
import numpy as np
data = pd.read_csv('nus_data_full_rand.csv', encoding= 'unicode_escape', index_col = False)

labels = data['E_OCC']
data['rand'] = pd.Series(np.random.uniform(0,1,len(labels.index)))
data = data.sort_values(by=['rand'])

## Pre-Processing

In [3]:
# Function to remove Punctuation

import string

def remove_punc(text):
    text_nopunc = "".join([char for char in text if char not in string.punctuation])  # discard all punctuation
    return text_nopunc

data['desc_clean'] = data['E_OCC_Desc'].apply(lambda x: remove_punc(x))

#data.head()

In [4]:
# Function to Tokenize words

import re

def tokenise(text):
    tokens = re.split('\W+', text)
    return tokens

data['desc_tokenised'] = data['desc_clean'].apply(lambda x: tokenise(x.lower()))

#data.head()

In [5]:
# Function to remove stopwords

import nltk
nltk.download('stopwords')

stopword = nltk.corpus.stopwords.words('english')

def remove_stopwords(lst):
    text = [word for word in lst if word not in stopword]  # Remove all stopwords
    return text

data['desc_nostop'] = data['desc_tokenised'].apply(lambda x: remove_stopwords(x))

#data.head()

[nltk_data] Downloading package stopwords to C:\Users\Wu
[nltk_data]     Puyue\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Lemmatizing

import nltk
nltk.download('wordnet')

wn = nltk.WordNetLemmatizer()

def lemmatizing(token):
    text = [wn.lemmatize(word) for word in token]
    return text

data['desc_lemmatized'] = data['desc_nostop'].apply(lambda x: lemmatizing(x))

# data.head(5)

In [7]:
def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [wn.lemmatize(word) for word in tokens if word not in stopword]
    return text

# 3 Word-Embedding Methods & Corresponding RFC

# 1. TF-IDF

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(data['E_OCC_Desc'])
X_tfidf_array = pd.DataFrame(X_tfidf.toarray())

In [10]:
cols = ['TENH','SEX','RACE','ID_TYP','MARITAL_ST', 'E_EMPST', 'E_IND_Desc_LE', 'EDUC_N', 'AGE_G']

train_features_tfidf = pd.concat([data[cols], X_tfidf_array], axis = 1)
#train_features_tfidf

In [11]:
# split the data into train and test
from sklearn.model_selection import train_test_split
train_x, test_x, train_y, test_y = train_test_split(train_features_tfidf,labels,test_size=0.2,random_state=23, stratify=labels)
#train_x

In [None]:
weights = dict(labels.value_counts())
total = len(labels.index)
unique = labels.nunique()
print(weights)
for k, v in weights.items():
    weights[k] = (total-v)/total/(unique-1)
#weights

## Model Fitting and Hyperparameters Tuning (TF-IDF)

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import time

start_time = time.time()
rfc = RandomForestClassifier()
parameter_space = {
    'bootstrap': [True],
    'max_depth': [20, 25, 30],
    'max_features': [8, 10, 12],
    'min_samples_leaf': [2, 3],
    'min_samples_split': [10, 12, 14],
    'n_estimators': [100, 200]
    # 'class_weight': [weights]
}

clf = GridSearchCV(rfc, parameter_space, n_jobs=-1, cv=5)
clf.fit(train_x, train_y)
print('Time taken for training the model: '+ str(time.time() - start_time))

optimised_rf = clf.best_estimator_

# Best parameter set
print('Best parameters found:\n', clf.best_params_)
print('Mean test score:', max(clf.cv_results_['mean_test_score']))



Time taken for training the model: 37.11024451255798
Best parameters found:
 {'bootstrap': True, 'max_depth': 30, 'max_features': 12, 'min_samples_leaf': 2, 'min_samples_split': 14, 'n_estimators': 200}
Mean test score: 0.31837477258944813


In [21]:
# All results
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
#for mean, std, params in zip(means, stds, clf.cv_results_['params']):
 #   print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

In [None]:
y_true, y_pred = test_y , optimised_rf.predict(test_x)
from sklearn.metrics import classification_report

# precision = true positive/(true positive + false positive)
# recall = true positive/(true positive + false negative)
# f1 score = 2 * (precision * recall)/(precision + recall)

#print('Results on the test set:')
#print(classification_report(y_true, y_pred))
from sklearn.metrics import accuracy_score
accuracy_score(y_true, y_pred)
from sklearn.metrics import confusion_matrix
#print(confusion_matrix(y_true, y_pred))

#  2. Bag of Words (CountVectorizer)


In [16]:
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer(analyzer=clean_text)
X_counts = count_vect.fit_transform(data['E_OCC_Desc'])
X_counts_array = pd.DataFrame(X_counts.toarray())

In [17]:
cols = ['TENH','SEX','RACE','ID_TYP','MARITAL_ST', 'E_EMPST', 'E_IND_Desc_LE', 'EDUC_N', 'AGE_G']

train_features = pd.concat([data[cols], X_counts_array], axis = 1)
#train_features

In [19]:
# split the data into train and test
from sklearn.model_selection import train_test_split
train_x2, test_x2, train_y2, test_y2 = train_test_split(train_features,labels,test_size=0.2,random_state=23, stratify=labels)
#train_x2

In [None]:
weights = dict(labels.value_counts())
total = len(labels.index)
unique = labels.nunique()
print(weights)
for k, v in weights.items():
    weights[k] = (total-v)/total/(unique-1)
#weights

## Model Fitting and Hyperparameters Tuning (BoW)

In [22]:
start_time = time.time()
rfc = RandomForestClassifier()
parameter_space = {
    'bootstrap': [True],
    'max_depth': [20, 25, 30, 35],
    'max_features': [8, 10, 12],
    'min_samples_leaf': [2, 3],
    'min_samples_split': [10, 12, 14],
    'n_estimators': [100, 200]
    # 'class_weight': [weights]
}

clf = GridSearchCV(rfc, parameter_space, n_jobs=-1, cv=5)
clf.fit(train_x2, train_y2)
print('Time taken for training the model: '+ str(time.time() - start_time))

optimised_rf = clf.best_estimator_

# Best parameter set
print('Best parameters found:\n', clf.best_params_)
print('Mean test score:', max(clf.cv_results_['mean_test_score']))



Time taken for training the model: 46.83394145965576
Best parameters found:
 {'bootstrap': True, 'max_depth': 25, 'max_features': 12, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 200}
Mean test score: 0.32989690721649484


In [None]:
y_true, y_pred = test_y2 , optimised_rf.predict(test_x2)
from sklearn.metrics import classification_report

# precision = true positive/(true positive + false positive)
# recall = true positive/(true positive + false negative)
# f1 score = 2 * (precision * recall)/(precision + recall)

#print('Results on the test set:')
#print(classification_report(y_true, y_pred))
from sklearn.metrics import accuracy_score
accuracy_score(y_true, y_pred)
from sklearn.metrics import confusion_matrix
#print(confusion_matrix(y_true, y_pred))

# N-Grams

In [24]:
ngram_vect = CountVectorizer(ngram_range=(2,2), analyzer=clean_text)
X_counts2 = ngram_vect.fit_transform(data['E_OCC_Desc'])
X_counts2_array = pd.DataFrame(X_counts2.toarray())

In [25]:
cols = ['TENH','SEX','RACE','ID_TYP','MARITAL_ST', 'E_EMPST', 'E_IND_Desc_LE', 'EDUC_N', 'AGE_G']

train_features_NG = pd.concat([data[cols], X_counts2_array], axis = 1)
#train_features_NG

In [26]:
# split the data into train and test
#from sklearn.model_selection import train_test_split
train_x3, test_x3, train_y3, test_y3 = train_test_split(train_features_NG,labels,test_size=0.2,random_state=23, stratify=labels)
#train_x3

In [None]:
weights = dict(labels.value_counts())
total = len(labels.index)
unique = labels.nunique()
print(weights)
for k, v in weights.items():
    weights[k] = (total-v)/total/(unique-1)
#weights

## Model Fitting and Hyperparameters Tuning (N-Grams)

In [27]:
start_time = time.time()
rfc = RandomForestClassifier()
parameter_space = {
    'bootstrap': [True],
    'max_depth': [20, 25, 30],
    'max_features': [8, 10, 12],
    'min_samples_leaf': [2, 3],
    'min_samples_split': [10, 12, 14],
    'n_estimators': [100, 200]
    # 'class_weight': [weights]
}

clf = GridSearchCV(rfc, parameter_space, n_jobs=-1, cv=5)
clf.fit(train_x3, train_y3)
print('Time taken for training the model: '+ str(time.time() - start_time))

optimised_rf = clf.best_estimator_

# Best parameter set
print('Best parameters found:\n', clf.best_params_)
print('Mean test score:', max(clf.cv_results_['mean_test_score']))



Time taken for training the model: 33.70927691459656
Best parameters found:
 {'bootstrap': True, 'max_depth': 30, 'max_features': 12, 'min_samples_leaf': 2, 'min_samples_split': 12, 'n_estimators': 200}
Mean test score: 0.3292904790782292


In [None]:
y_true, y_pred = test_y3 , optimised_rf.predict(test_x3)
from sklearn.metrics import classification_report

# precision = true positive/(true positive + false positive)
# recall = true positive/(true positive + false negative)
# f1 score = 2 * (precision * recall)/(precision + recall)

#print('Results on the test set:')
#print(classification_report(y_true, y_pred))
from sklearn.metrics import accuracy_score
accuracy_score(y_true, y_pred)
from sklearn.metrics import confusion_matrix
#print(confusion_matrix(y_true, y_pred))