# Loading and Pre-processing

In [None]:
import pandas as pd
import numpy as np
data = pd.read_csv('nus_data_full_rand.csv', encoding= 'unicode_escape', index_col = False)

labels = data['E_OCC']
data['rand'] = pd.Series(np.random.uniform(0,1,len(labels.index)))
data = data.sort_values(by=['rand'])

In [2]:
# Function to remove Punctuation

import string

def remove_punc(text):
    text_nopunc = "".join([char for char in text if char not in string.punctuation])  # discard all punctuation
    return text_nopunc

data['desc_clean'] = data['E_OCC_Desc'].apply(lambda x: remove_punc(x))

# data.head()

In [3]:
# Function to Tokenize words

import re

def tokenise(text):
    tokens = re.split('\W+', text)
    return tokens

data['desc_tokenised'] = data['desc_clean'].apply(lambda x: tokenise(x.lower()))

# data.head()

In [4]:
# Function to remove stopwords

import nltk
nltk.download('stopwords')

stopword = nltk.corpus.stopwords.words('english')

def remove_stopwords(lst):
    text = [word for word in lst if word not in stopword]  # Remove all stopwords
    return text

data['desc_nostop'] = data['desc_tokenised'].apply(lambda x: remove_stopwords(x))

# data.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Eugene\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Lemmatizing

import nltk
nltk.download('wordnet')

wn = nltk.WordNetLemmatizer()

def lemmatizing(token):
    text = [wn.lemmatize(word) for word in token]
    return text

data['desc_lemmatized'] = data['desc_nostop'].apply(lambda x: lemmatizing(x))

# data

In [6]:
def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [wn.lemmatize(word) for word in tokens if word not in stopword]
    return text

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer(analyzer=clean_text)
X_counts = count_vect.fit_transform(data['E_OCC_Desc'])

In [None]:
X_counts_array = pd.DataFrame(X_counts.toarray())
# X_counts_array

In [9]:
cols = ['TENH','SEX','RACE','ID_TYP','MARITAL_ST', 'E_EMPST', 'E_IND_Desc_LE', 'EDUC_N', 'AGE_G']

train_features = pd.concat([data[cols], X_counts_array], axis = 1)
# train_features

## Modelling 

In [10]:
# split the data into train and test
from sklearn.model_selection import train_test_split
train_x, test_x, train_y, test_y = train_test_split(train_features,labels,test_size=0.2,random_state=23, stratify=labels)

In [11]:
#Feature Scaling

from sklearn.preprocessing import StandardScaler
import numpy as np

sc = StandardScaler()
train_x_scaled = sc.fit_transform(train_x)
test_x_scaled = sc.transform(test_x)
train_y = np.array(train_y)

In [12]:
print(train_features.shape)
print(train_x.shape)
print(test_x.shape)

(2062, 2460)
(1649, 2460)
(413, 2460)


## Running PCA 

In [18]:
from sklearn.decomposition import PCA

pca = PCA()
train_x_scaled = pca.fit_transform(train_x_scaled)
test_x_scaled = pca.transform(test_x_scaled)

In [19]:
# if n_components is not set all components are kept: n_components == min(n_samples, n_features)

print(train_x.shape)
print(test_x.shape)

(1649, 1649)
(413, 1649)


In [20]:
explained_variance = pca.explained_variance_ratio_
explained_variance[0:20]

array([0.01878803, 0.01560147, 0.01371565, 0.01320122, 0.01148068,
       0.00955322, 0.00838511, 0.00775567, 0.0074108 , 0.00678715,
       0.00663369, 0.0064126 , 0.00597942, 0.00568011, 0.00549014,
       0.00515012, 0.00466549, 0.00463785, 0.00453808, 0.00449982])

In [21]:
# checking for components/dimensions to cover 95% variance

def select_n_components(var_ratio, goal_var: float) -> int:
    # Set initial variance explained so far
    total_variance = 0.0
    
    # Set initial number of features
    n_components = 0
    
    # For the explained variance of each feature:
    for explained_variance in var_ratio:
        
        # Add the explained variance to the total
        total_variance += explained_variance
        
        # Add one to the number of components
        n_components += 1
        
        # If we reach our goal level of explained variance
        if total_variance >= goal_var:
            # End the loop
            break
            
    # Return the number of components
    return n_components

# Run function
select_n_components(explained_variance, 0.95)

802

In [22]:
# sanity check

import numpy as np
arr = np.array(explained_variance).tolist()
count = 0
for i in arr[0:select_n_components(explained_variance, 0.95)]:
    count += i
print (count)

print(explained_variance.size)

0.9502226944683085
1649


In [23]:
from sklearn.decomposition import PCA

pca = PCA(n_components=802)
pca.fit(train_x_scaled)

train_x_scaled_pca = pca.transform(train_x_scaled)
test_x_scaled_pca = pca.transform(test_x_scaled)

## RFC with PCA variables 

In [24]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import time

start_time = time.time()
rfc = RandomForestClassifier()
parameter_space = {
    'bootstrap': [True],
    'max_depth': [25, 30, 35],
    'max_features': [4, 5, 6],
    'min_samples_leaf': [3, 5],
    'min_samples_split': [10, 12],
    'n_estimators': [100, 200],
    # 'class_weight': [weights]
}

clf = GridSearchCV(rfc, parameter_space, n_jobs=-1, cv=5)
clf.fit(train_x_scaled_pca, train_y)
print('Time taken for training the model: '+ str(time.time() - start_time))

optimised_rf = clf.best_estimator_

# Best parameter set
print('Best parameters found:\n', clf.best_params_)
print('Mean test score:', max(clf.cv_results_['mean_test_score']))



Time taken for training the model: 249.20458364486694
Best parameters found:
 {'bootstrap': True, 'max_depth': 35, 'max_features': 5, 'min_samples_leaf': 3, 'min_samples_split': 12, 'n_estimators': 200}
Mean test score: 0.28987197199963155


In [None]:
y_true, y_pred_pca = test_y , optimised_rf.predict(test_x_scaled_pca)
from sklearn.metrics import classification_report

# precision = true positive/(true positive + false positive)
# recall = true positive/(true positive + false negative)
# f1 score = 2 * (precision * recall)/(precision + recall)

#print('Results on the test set:')
#print(classification_report(y_true, y_pred_pca))

In [None]:
from sklearn.metrics import accuracy_score
# accuracy_score(y_true, y_pred_pca)

In [None]:
from sklearn.metrics import confusion_matrix
# print(confusion_matrix(y_true, y_pred_pca))