# Loading and Pre-processing

In [None]:
import pandas as pd
import numpy as np
data = pd.read_csv('nus_data_full_rand.csv', encoding= 'unicode_escape', index_col = False)

# data.head()

In [None]:
labels = data['E_OCC']
data['rand'] = pd.Series(np.random.uniform(0,1,len(labels.index)))
data = data.sort_values(by=['rand'])

In [None]:
# Function to remove Punctuation

import string

def remove_punc(text):
    text_nopunc = "".join([char for char in text if char not in string.punctuation])  # discard all punctuation
    return text_nopunc

data['desc_clean'] = data['E_OCC_Desc'].apply(lambda x: remove_punc(x))

#data.head()

In [None]:
# Function to Tokenize words

import re

def tokenise(text):
    tokens = re.split('\W+', text)
    return tokens

data['desc_tokenised'] = data['desc_clean'].apply(lambda x: tokenise(x.lower()))

# data.head()

In [None]:
# Function to remove stopwords

import nltk
nltk.download('stopwords')

stopword = nltk.corpus.stopwords.words('english')

def remove_stopwords(lst):
    text = [word for word in lst if word not in stopword]  # Remove all stopwords
    return text

data['desc_nostop'] = data['desc_tokenised'].apply(lambda x: remove_stopwords(x))

# data.head()

In [None]:
# Lemmatizing

import nltk
nltk.download('wordnet')

wn = nltk.WordNetLemmatizer()

def lemmatizing(token):
    text = [wn.lemmatize(word) for word in token]
    return text

data['desc_lemmatized'] = data['desc_nostop'].apply(lambda x: lemmatizing(x))

# data.head()

In [9]:
def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [wn.lemmatize(word) for word in tokens if word not in stopword]
    return text

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer(analyzer=clean_text)
X_counts = count_vect.fit_transform(data['E_OCC_Desc'])

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(data['E_OCC_Desc'])

In [11]:
X_counts_array = pd.DataFrame(X_counts.toarray())

X_tfidf_array = pd.DataFrame(X_tfidf.toarray())

In [12]:
cols = ['TENH','SEX','RACE','ID_TYP','MARITAL_ST', 'E_EMPST', 'E_IND_Desc_LE', 'EDUC_N', 'AGE_G']

train_features_counts = pd.concat([data[cols], X_counts_array], axis = 1)

train_features_tfidf = pd.concat([data[cols], X_tfidf_array], axis = 1)

In [None]:
# print(tfidf_vect.vocabulary_)

In [14]:
data['2_digit'] = data['E_OCC'].apply(lambda x: x//10)
labels2 = data['2_digit']

# 1. SVM

## a. CountVectorizer 3-digit codes

In [15]:
from sklearn.model_selection import train_test_split
train_x, test_x, train_y, test_y = train_test_split(train_features_counts,labels,test_size=0.2,random_state=1, stratify=labels)

In [16]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
import time
start_time = time.time()

svcmodel = OneVsRestClassifier(SVC())

parameters_space = {
    "estimator__C": [1,10],
    "estimator__kernel": ["poly","rbf"],
    "estimator__degree":[2, 3, 4],
}

svc_tunning = GridSearchCV(svcmodel, parameters_space, n_jobs=-1, cv=5)

svc_tunning.fit(train_x, train_y)
print('Time taken for training the model: '+ str(time.time() - start_time))

#print(svc_tunning.best_score_)
#print(svc_tunning.best_params_)

optimised_svc = svc_tunning.best_estimator_

# Best parameter set
print('Best parameters found:\n', svc_tunning.best_params_)
print('Mean test score:', max(svc_tunning.cv_results_['mean_test_score']))







Time taken for training the model: 809.7255673408508
Best parameters found:
 {'estimator__C': 10, 'estimator__degree': 2, 'estimator__kernel': 'rbf'}
Mean test score: 0.35779260157671317


In [17]:
optimised_svc.score(test_x, test_y)

0.4019370460048426

In [None]:
y_true, y_pred = test_y , optimised_svc.predict(test_x)
from sklearn.metrics import classification_report

#print('Results on the test set:')
#print(classification_report(y_true, y_pred))

## b. CountVectorizer 2-digit codes

In [22]:
train_x2, test_x2, train_y2, test_y2 = train_test_split(train_features_counts,labels2,test_size=0.2,random_state=1, stratify=labels2)

In [28]:
start_time = time.time()

svcmodel = OneVsRestClassifier(SVC())

parameters_space = {
    "estimator__C": [1,10],
    "estimator__kernel": ["poly","rbf"],
    "estimator__degree":[2, 3, 4],
    "estimator__gamma" :['scale']
}

svc_tunning = GridSearchCV(svcmodel, parameters_space, n_jobs=-1, cv=5)

svc_tunning.fit(train_x2, train_y2)
print('Time taken for training the model: '+ str(time.time() - start_time))

#print(svc_tunning.best_score_)
#print(svc_tunning.best_params_)

SVMModel_2 = svc_tunning.best_estimator_

# Best parameter set
print('Best parameters found:\n', svc_tunning.best_params_)
print('Mean test score:', max(svc_tunning.cv_results_['mean_test_score']))



Time taken for training the model: 608.6899673938751
Best parameters found:
 {'estimator__C': 10, 'estimator__degree': 3, 'estimator__gamma': 'scale', 'estimator__kernel': 'poly'}
Mean test score: 0.4493632504548211


In [29]:
SVMModel_2.score(test_x2, test_y2)

0.44552058111380144

In [None]:
y_true, y_pred = test_y2 , SVMModel_2.predict(test_x2)
from sklearn.metrics import classification_report

#print('Results on the test set:')
#print(classification_report(y_true, y_pred))

## c. TfidfVectorizer 3-digit codes

In [31]:
train_x3, test_x3, train_y3, test_y3 = train_test_split(train_features_tfidf,labels,test_size=0.2,random_state=1, stratify=labels)

In [32]:
start_time = time.time()

svcmodel = OneVsRestClassifier(SVC())

parameters_space = {
    "estimator__C": [1,10],
    "estimator__kernel": ["poly","rbf"],
    "estimator__degree":[2, 3, 4],
}

svc_tunning = GridSearchCV(svcmodel, parameters_space, n_jobs=-1, cv=5)

svc_tunning.fit(train_x3, train_y3)
print('Time taken for training the model: '+ str(time.time() - start_time))

#print(svc_tunning.best_score_)
#print(svc_tunning.best_params_)

SVMModel_3 = svc_tunning.best_estimator_

# Best parameter set
print('Best parameters found:\n', svc_tunning.best_params_)
print('Mean test score:', max(svc_tunning.cv_results_['mean_test_score']))







Time taken for training the model: 958.8412699699402
Best parameters found:
 {'estimator__C': 10, 'estimator__degree': 2, 'estimator__kernel': 'rbf'}
Mean test score: 0.3638568829593693


In [33]:
SVMModel_3.score(test_x3, test_y3)

0.4019370460048426

In [None]:
y_true, y_pred = test_y3 , SVMModel_3.predict(test_x3)
from sklearn.metrics import classification_report

#print('Results on the test set:')
#print(classification_report(y_true, y_pred))

## d. TfidfVectorizer 2-digit codes

In [35]:
train_x4, test_x4, train_y4, test_y4 = train_test_split(train_features_tfidf,labels2,test_size=0.2, random_state=1, stratify=labels2)

In [36]:
start_time = time.time()

svcmodel = OneVsRestClassifier(SVC())

parameters_space = {
    "estimator__C": [1,10],
    "estimator__kernel": ["poly","rbf"],
    "estimator__degree":[2, 3, 4],
    "estimator__gamma" :['scale'],
}

svc_tunning = GridSearchCV(svcmodel, parameters_space, n_jobs=-1, cv=5)

svc_tunning.fit(train_x4, train_y4)
print('Time taken for training the model: '+ str(time.time() - start_time))

#print(svc_tunning.best_score_)
#print(svc_tunning.best_params_)

SVMModel_4 = svc_tunning.best_estimator_

# Best parameter set
print('Best parameters found:\n', svc_tunning.best_params_)
print('Mean test score:', max(svc_tunning.cv_results_['mean_test_score']))



Time taken for training the model: 689.7800760269165
Best parameters found:
 {'estimator__C': 10, 'estimator__degree': 2, 'estimator__gamma': 'scale', 'estimator__kernel': 'poly'}
Mean test score: 0.45967252880533654


In [37]:
SVMModel_4.score(test_x4, test_y4)

0.43341404358353514

In [None]:
y_true, y_pred = test_y4 , SVMModel_4.predict(test_x4)
from sklearn.metrics import classification_report

#print('Results on the test set:')
#print(classification_report(y_true, y_pred))

# 2. Logistic regression 

## a. CountVectorizer 3-digit codes

In [39]:
train_x5, test_x5, train_y5, test_y5 = train_test_split(train_features_counts,labels,test_size=0.2,random_state=1, stratify=labels)

In [40]:
from sklearn.linear_model import LogisticRegression
LogRegModel_1 = OneVsRestClassifier(LogisticRegression(solver='lbfgs',max_iter=500))

LogRegModel_1.fit(train_x5, train_y5)

OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=500,
                                                 multi_class='warn',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [41]:
LogRegModel_1.score(test_x5, test_y5)

0.39951573849878935

In [None]:
y_true, y_pred = test_y5 , LogRegModel_1.predict(test_x5)
from sklearn.metrics import classification_report

#print('Results on the test set:')
#print(classification_report(y_true, y_pred))

## b. CountVectorizer 2-digit codes

In [43]:
train_x6, test_x6, train_y6, test_y6 = train_test_split(train_features_counts,labels2,test_size=0.2,random_state=1, stratify=labels2)

In [44]:
LogRegModel_2 = OneVsRestClassifier(LogisticRegression(solver='lbfgs',max_iter=500))

LogRegModel_2.fit(train_x6, train_y6)

OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=500,
                                                 multi_class='warn',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [45]:
LogRegModel_2.score(test_x6, test_y6)

0.48184019370460046

In [None]:
y_true, y_pred = test_y6 , LogRegModel_2.predict(test_x6)
from sklearn.metrics import classification_report

#print('Results on the test set:')
#print(classification_report(y_true, y_pred))

## c. TfidfVectorizer 3-digit codes

In [47]:
train_x7, test_x7, train_y7, test_y7 = train_test_split(train_features_tfidf,labels,test_size=0.2,random_state=1, stratify=labels)

In [48]:
LogRegModel_3 = OneVsRestClassifier(LogisticRegression(solver='lbfgs',max_iter=500))

LogRegModel_3.fit(train_x7, train_y7)

OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=500,
                                                 multi_class='warn',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [49]:
LogRegModel_3.score(test_x7, test_y7)

0.3559322033898305

In [None]:
y_true, y_pred = test_y7 , LogRegModel_3.predict(test_x7)
from sklearn.metrics import classification_report

#print('Results on the test set:')
#print(classification_report(y_true, y_pred))

## d. TfidfVectorizer 2-digit codes

In [51]:
train_x8, test_x8, train_y8, test_y8 = train_test_split(train_features_tfidf,labels2,test_size=0.2,random_state=1, stratify=labels2)

In [52]:
LogRegModel_4 = OneVsRestClassifier(LogisticRegression(solver='lbfgs',max_iter=500))

LogRegModel_4.fit(train_x8, train_y8)

OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=500,
                                                 multi_class='warn',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [53]:
LogRegModel_4.score(test_x8, test_y8)

0.44552058111380144

In [None]:
y_true, y_pred = test_y8 , LogRegModel_4.predict(test_x8)
from sklearn.metrics import classification_report

#print('Results on the test set:')
#print(classification_report(y_true, y_pred))