<a href="https://colab.research.google.com/github/cosmicishan/Classification-on-text-data/blob/main/Job_tag_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Importing Libraries and Dataset**




In [1]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

from gensim.models import Word2Vec

In [2]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Datasets/73-Strings/Train.csv')

In [3]:
X = df.iloc[:,1].values
y = df.iloc[:,-1].values

# **Text Preprocessing**

In [4]:
#Removing contraction from the text.

def decontracted(phrase):
    # specific
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase 

In [5]:
#We are removing the words from the stop words list: 'no', 'nor', 'not' 
#we are including them into stop words list

stopwords= set(['br', 'the', 'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've",\
            "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', \
            'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their',\
            'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', \
            'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', \
            'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', \
            'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after',\
            'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further',\
            'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',\
            'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very', \
            's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', \
            've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn',\
            "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',\
            "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", \
            'won', "won't", 'wouldn', "wouldn't"])

In [6]:
from tqdm import tqdm
preprocessed_description = []
# tqdm is for printing the status bar
for sentance in tqdm(X):
    sentance = str(sentance)
    sentance = re.sub(r"http\S+", "", sentance) #Remove links
    sentance = BeautifulSoup(sentance, 'lxml').get_text() #Remove tags
    sentance = decontracted(sentance) #Remove Contraction
    sentance = re.sub("\S*\d\S*", "", sentance).strip() #Remove words with numbers
    sentance = re.sub('[^A-Za-z]+', ' ', sentance) #Remove Special Character
    sentance = ' '.join(e.lower() for e in sentance.split() if e.lower() not in stopwords)
    preprocessed_description.append(sentance.strip())

100%|██████████| 6045/6045 [00:02<00:00, 2713.29it/s]


# **Featurization**

## *Bag of Words*


In [7]:
#BoW
count_vect = CountVectorizer() #in scikit-learn
count_vect.fit(preprocessed_description)
print("some feature names : ", count_vect.get_feature_names()[:10])
print('='*50)

final_counts = count_vect.transform(preprocessed_description)
print("the type of count vectorizer : ",type(final_counts))
print("the shape of out text BOW vectorizer : ",final_counts.get_shape())
print("the number of unique words : ", final_counts.get_shape()[1])

some feature names :  ['aaa', 'aad', 'aadc', 'aaftink', 'aagp', 'aagps', 'aallon', 'aam', 'aaon', 'aap']
the type of count vectorizer :  <class 'scipy.sparse.csr.csr_matrix'>
the shape of out text BOW vectorizer :  (6045, 34494)
the number of unique words :  34494


In [8]:
#splitting the data
X_train, X_test, y_train, y_test = train_test_split(final_counts, y, test_size = 0.25)

**Training bag of word model on SVM algorithm**

In [9]:
classifier = SVC(kernel = 'sigmoid')
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

bow_svm = accuracy_score(y_test,y_pred)

print("Accuracy of bag of words model in SVM alorithm : ",bow_svm)

Accuracy of bag of words model in SVM alorithm :  0.621031746031746


**Training bag of word model on Random Forest algorithm**

In [10]:
estimators = list(range(1,20))
cv_score = []

for n in estimators:
    random_forest = RandomForestClassifier(n_estimators = n, criterion = 'entropy')
    scores = cross_val_score(random_forest, X_train, y_train, cv=10, scoring='accuracy')
    cv_score.append(scores.mean())

bow_rf = max(cv_score)

print("Accuracy of bag of words model in Random Forest algorithm : ",bow_rf)

Accuracy of bag of words model in Random Forest algorithm :  0.40811622954167515


**Training bag of word model on Naive Bayes algorithm**

In [11]:
X_train_dense = X_train.toarray()
X_test_dense = X_test.toarray()

classifier = GaussianNB()
classifier.fit(X_train_dense, y_train)

y_pred = classifier.predict(X_test_dense)

bow_nb = accuracy_score(y_test,y_pred)

print("Accuracy of bag of words model in Naive Bayes algorithm : ",bow_nb)

Accuracy of bag of words model in Naive Bayes algorithm :  0.5244708994708994


##*Bi-Grams and n-Grams*

In [12]:
#bi-gram, tri-gram and n-gram

count_vect = CountVectorizer(ngram_range=(1,2), min_df=10, max_features=5000)
final_bigram_counts = count_vect.fit_transform(preprocessed_description)
print("the type of count vectorizer : ",type(final_bigram_counts))
print("the shape of out text BOW vectorizer : ",final_bigram_counts.get_shape())
print("the number of unique words including both unigrams and bigrams : ", final_bigram_counts.get_shape()[1])

the type of count vectorizer :  <class 'scipy.sparse.csr.csr_matrix'>
the shape of out text BOW vectorizer :  (6045, 5000)
the number of unique words including both unigrams and bigrams :  5000


In [13]:
#splitting the data
X_train, X_test, y_train, y_test = train_test_split(final_bigram_counts, y, test_size = 0.25)

**Training unigrams/bigrams model on SVM algorithm**

In [14]:
classifier = SVC(kernel = 'sigmoid')
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

grams_svm = accuracy_score(y_test,y_pred)

print("Accuracy of unigrams/bigrams model in SVM algorithm : ",grams_svm)

Accuracy of unigrams/bigrams model in SVM algorithm :  0.6283068783068783


**Training unigrams/bigrams model on Random Forest algorithm**

In [15]:
estimators = list(range(1,20))
cv_score = []

for n in estimators:
    random_forest = RandomForestClassifier(n_estimators = n, criterion = 'entropy')
    scores = cross_val_score(random_forest, X_train, y_train, cv=10, scoring='accuracy')
    cv_score.append(scores.mean())

grams_rf = max(cv_score)

print("Accuracy of unigrams/bigrams model in Random Forest algorithm : ",grams_rf)

Accuracy of unigrams/bigrams model in Random Forest algorithm :  0.4092355418113215


**Training unigrams/bigrams model on Naive Bayes algorithm**

In [16]:
X_train_dense = X_train.toarray()
X_test_dense = X_test.toarray()

classifier = GaussianNB()
classifier.fit(X_train_dense, y_train)

y_pred = classifier.predict(X_test_dense)

grams_nb = accuracy_score(y_test,y_pred)

print("Accuracy of unigrams/bigrams model in Naive Bayes algorithm : ",grams_nb)

Accuracy of unigrams/bigrams model in Naive Bayes algorithm :  0.5304232804232805


## *TF_IDF*


In [17]:
#tf-idf

tf_idf_vect = TfidfVectorizer(ngram_range=(1,2), min_df=10)
tf_idf_vect.fit(preprocessed_description)
print("some sample features(unique words in the corpus)",tf_idf_vect.get_feature_names()[0:10])
print('='*50)

final_tf_idf = tf_idf_vect.transform(preprocessed_description)
print("the type of count vectorizer ",type(final_tf_idf))
print("the shape of out text TFIDF vectorizer ",final_tf_idf.get_shape())
print("the number of unique words including both unigrams and bigrams ", final_tf_idf.get_shape()[1])

some sample features(unique words in the corpus) ['ab', 'ab formerly', 'ab publ', 'ab sweden', 'ability', 'abitibi', 'abitibi greenstone', 'able', 'abroad', 'absorption']
the type of count vectorizer  <class 'scipy.sparse.csr.csr_matrix'>
the shape of out text TFIDF vectorizer  (6045, 7682)
the number of unique words including both unigrams and bigrams  7682


In [18]:
#splitting the data
X_train, X_test, y_train, y_test = train_test_split(final_tf_idf, y, test_size = 0.25)

**Training TF_IDF model on SVM algorithm**

In [19]:
classifier = SVC(kernel = 'sigmoid')
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

idf_svm = accuracy_score(y_test,y_pred)

print("Accuracy of TF-IDF model in SVM algorithm : ",idf_svm)

Accuracy of TF-IDF model in SVM algorithm :  0.6825396825396826


**Training TF_IDF model on Random Forest algorithm**

In [20]:
estimators = list(range(1,20))
cv_score = []

for n in estimators:
    random_forest = RandomForestClassifier(n_estimators = n, criterion = 'entropy')
    scores = cross_val_score(random_forest, X_train, y_train, cv=10, scoring='accuracy')
    cv_score.append(scores.mean())

idf_rf = max(cv_score)

print("Accuracy of TF_IDF model in Random Forest algorithm : ",idf_rf)

Accuracy of TF_IDF model in Random Forest algorithm :  0.41208341842440516


**Training TF_IDF model on Naive Bayes algorithm**

In [21]:
X_train_dense = X_train.toarray()
X_test_dense = X_test.toarray()

classifier = GaussianNB()
classifier.fit(X_train_dense, y_train)

y_pred = classifier.predict(X_test_dense)

idf_nb = accuracy_score(y_test,y_pred)

print("Accuracy of TF_IDF model in Naive Bayes algorithm : ",idf_nb)

Accuracy of TF_IDF model in Naive Bayes algorithm :  0.5357142857142857


## *Word2Vec*

In [22]:
#Training Word2Vec using our own Text corpus.

list_of_sentance=[]
for sentance in preprocessed_description:
    list_of_sentance.append(sentance.split())

w2v_model=Word2Vec(list_of_sentance,min_count=5,size=50, workers=4)
print("Words similar to Company according to our corpus : ",w2v_model.wv.most_similar('company'))
print('='*50)
print("Words similar to Technologies according to our corpus : ",w2v_model.wv.most_similar('technologies'))

w2v_words = list(w2v_model.wv.vocab)
print('='*50)
print("Number of words that occured minimum 5 times : ",len(w2v_words))
print("Sample words ", w2v_words[0:50])

Words similar to Company according to our corpus :  [('principally', 0.6553246974945068), ('selling', 0.6519771218299866), ('empowered', 0.6504414677619934), ('tackle', 0.6491935849189758), ('acquiring', 0.6440767049789429), ('several', 0.6370248794555664), ('owning', 0.6361712217330933), ('sector', 0.6359339356422424), ('provision', 0.6304469108581543), ('cannabis', 0.6293812394142151)]
Words similar to Technologies according to our corpus :  [('diagnostics', 0.9453083872795105), ('bd', 0.9140072464942932), ('science', 0.8999008536338806), ('advanced', 0.8853769898414612), ('semiconductor', 0.8843767642974854), ('biomedical', 0.8799190521240234), ('pharmaceutical', 0.8762993812561035), ('medical', 0.8727603554725647), ('laboratory', 0.8724081516265869), ('pharmaceuticals', 0.8669586181640625)]
Number of words that occured minimum 5 times :  8311
Sample words  ['partners', 'inc', 'provides', 'advertising', 'agency', 'services', 'specializing', 'direct', 'response', 'media', 'campaigns'

  if np.issubdtype(vec.dtype, np.int):


## *Converting text into vectors using Avg W2V and TFIDF-W2V*

### *Average Word2Vec*

In [23]:
# average Word2Vec
# compute average word2vec for each description.
sent_vectors = []; # the avg-w2v for each sentence/description is stored in this list
for sent in tqdm(list_of_sentance): # for each description/sentence
    sent_vec = np.zeros(50) 
    cnt_words =0; # num of words with a valid vector in the sentence/description
    for word in sent: # for each word in a description/sentence
        if word in w2v_words:
            vec = w2v_model.wv[word]
            sent_vec += vec
            cnt_words += 1
    if cnt_words != 0:
        sent_vec /= cnt_words
    sent_vectors.append(sent_vec)

print(len(sent_vectors))

100%|██████████| 6045/6045 [00:20<00:00, 297.21it/s]

6045





In [24]:
#splitting the data
X_train, X_test, y_train, y_test = train_test_split(sent_vectors, y, test_size = 0.25)

**Training Average Word2Vec model on SVM algorithm**

In [25]:
classifier = SVC(kernel = 'sigmoid')
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

w2v_svm = accuracy_score(y_test,y_pred)

print("Accuracy of Word2Vec model in SVM algorithm : ",w2v_svm)

Accuracy of Word2Vec model in SVM algorithm :  0.34656084656084657


**Training Average Word2Vec model on Random Forest algorithm**

In [26]:
estimators = list(range(1,20))
cv_score = []

for n in estimators:
    random_forest = RandomForestClassifier(n_estimators = n, criterion = 'entropy')
    scores = cross_val_score(random_forest, X_train, y_train, cv=10, scoring='accuracy')
    cv_score.append(scores.mean())

w2v_rf = max(cv_score)

print("Accuracy of Word2Vec model in Random Forest algorithm : ",w2v_rf)

Accuracy of Word2Vec model in Random Forest algorithm :  0.44627933210802195


**Training Average Word2Vec model on Naive Bayes**

In [27]:
classifier = GaussianNB()
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

w2v_nb = accuracy_score(y_test,y_pred)

print("Accuracy of Word2Vec model in Naive Bayes algorithm : ",w2v_nb)

Accuracy of Word2Vec model in Naive Bayes algorithm :  0.4312169312169312


### *TF-IDF Word2Vec*

In [28]:
model = TfidfVectorizer()
model.fit(preprocessed_description)
# we are converting a dictionary with word as a key, and the idf as a value
dictionary = dict(zip(model.get_feature_names(), list(model.idf_)))

# TF-IDF weighted Word2Vec
tfidf_feat = model.get_feature_names() # tfidf words/col-names

tfidf_sent_vectors = []; # the tfidf-w2v for each sentence/description is stored in this list
row=0;
for sent in tqdm(list_of_sentance): # for each description/sentence 
    sent_vec = np.zeros(50) # as word vectors are of zero length
    weight_sum =0; # num of words with a valid vector in the sentence/description
    for word in sent: # for each word in a description/sentence
        if word in w2v_words and word in tfidf_feat:
            vec = w2v_model.wv[word]
            # dictionary[word] = idf value of word in whole courpus
            # sent.count(word) = tf valeus of word in this review
            tf_idf = dictionary[word]*(sent.count(word)/len(sent))
            sent_vec += (vec * tf_idf)
            weight_sum += tf_idf
    if weight_sum != 0:
        sent_vec /= weight_sum
    tfidf_sent_vectors.append(sent_vec)
    row += 1

100%|██████████| 6045/6045 [02:42<00:00, 37.22it/s]


In [29]:
#splitting the data
X_train, X_test, y_train, y_test = train_test_split(tfidf_sent_vectors, y, test_size = 0.25)

**Training TF-IDF Word2Vec model on SVM**

In [30]:
classifier = SVC(kernel = 'sigmoid')
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

tfw2v_svm = accuracy_score(y_test,y_pred)

print("Accuracy of TF-IDF Word2Vec model in SVM algorithm : ",tfw2v_svm)

Accuracy of TF-IDF Word2Vec model in SVM algorithm :  0.2791005291005291


**Training TF-IDF Word2Vec model on Random Forest**

In [31]:
estimators = list(range(1,20))
cv_score = []

for n in estimators:
    random_forest = RandomForestClassifier(n_estimators = n, criterion = 'entropy')
    scores = cross_val_score(random_forest, X_train, y_train, cv=10, scoring='accuracy')
    cv_score.append(scores.mean())

tfw2v_rf = max(cv_score)

print("Accuracy of TF_IDF Word2Vec model in Random Forest algorithm : ",tfw2v_rf)

Accuracy of TF_IDF Word2Vec model in Random Forest algorithm :  0.45443689159883693


**Training TF-IDF Word2Vec model on Naive Bayes**

In [32]:
classifier = GaussianNB()
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

tfw2v_nb = accuracy_score(y_test,y_pred)

print("Accuracy of TF_IDF Word2Vec model in Naive Bayes algorithm : ",tfw2v_nb)

Accuracy of TF_IDF Word2Vec model in Naive Bayes algorithm :  0.3994708994708995


# **Printing out table of accuracies on different models and algorithms**

In [58]:
from prettytable import PrettyTable

accuracy = {
    "Bag of word": {
        "SVM": {
            "acc": bow_svm
        },
        "Random Forest": {
            "acc": bow_rf
        },
        "Naive Bayes": {
            "acc": bow_nb
        }
    },
    "Bi-grams and N-grams": {
        "SVM": {
            "acc": grams_svm
        },
        "Random Forest": {
            "acc": grams_rf
        },
        "Naive Bayes": {
            "acc": grams_nb
        }
    },
    "TF-IDF": {
        "SVM": {
            "acc": idf_svm
        },
        "Random Forest": {
            "acc": idf_rf
        },
        "Naive Bayes": {
            "acc": idf_nb
        }
    },
    "Average Word2Vec": {
        "SVM": {
            "acc": w2v_svm
        },
        "Random Forest": {
            "acc": w2v_rf
        },
        "Naive Bayes": {
            "acc": w2v_nb
        }
    },
    "Tf-IDF word2vec": {
        "SVM": {
            "acc": tfw2v_svm
        },
        "Random Forest": {
            "acc": tfw2v_rf
        },
        "Naive Bayes": {
            "acc": tfw2v_nb
        }
    }
}

table_fields = ['Model', 'Algorithm', 'Accuracy']
pt = PrettyTable(table_fields)
pt.padding_width = 5
pt.title = 'Accuracy'
for accuracy1, technique in accuracy.items():
    pt.add_row(["{}".format(accuracy1), "", ""])
    for name, algo_acc in technique.items():
        pt.add_row(["", name, algo_acc["acc"]])
print(pt)

+------------------------------------------------------------------------------------+
|                                      Accuracy                                      |
+------------------------------+-----------------------+-----------------------------+
|            Model             |       Algorithm       |           Accuracy          |
+------------------------------+-----------------------+-----------------------------+
|         Bag of word          |                       |                             |
|                              |          SVM          |      0.621031746031746      |
|                              |     Random Forest     |     0.40811622954167515     |
|                              |      Naive Bayes      |      0.5244708994708994     |
|     Bi-grams and N-grams     |                       |                             |
|                              |          SVM          |      0.6283068783068783     |
|                              |     Random