In [None]:
import re
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
import string
import inflect
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
%cd /content/drive/MyDrive/Colab Notebooks/

/content/drive/MyDrive/Colab Notebooks


In [None]:
def read_files(file_loc):
  '''
  This function reads txt data from a file in the drive

  args - a string containing the files location
  returns - a list containing the text data
  '''
  dataset = []
  with open(file_loc, 'r') as train_file:
    # i = 0
    # max = 20000
    for line in train_file:
      # if max>=i:
        dataset.append(line)
      # else: 
      #   break
      # i += 1

  return dataset

In [None]:
def separate_labels(dataset):
  '''This function will separate the labels/class and examples/documents from the dataset'''
  labels = []
  documents = []

  for line in dataset:
    splitted_line = line.strip().split('\t', 2)
    labels.append(splitted_line[1])
    documents.append(splitted_line[2])

  return labels, documents

In [None]:
def remove_url(documents):
  '''This function removes URL's from Texts'''
  url_removed = []

  # Your code here
  for line in documents:
    url_removed.append(re.sub('http[s]?://\S+', '', line))

  return url_removed

In [None]:
def remove_hashtag(documents):
  '''This function will remove all occurences of # from the texts'''
  hashtag_removed = []

  # map hashtag to space
  translator = str.maketrans('#', ' '*len('#'), '')

  for line in documents:
    hashtag_removed.append(line.translate(translator))

  return hashtag_removed

In [None]:
def remove_whitespaces(documents):
  '''This function removes multiple whitespaces and replace them with a single whitespace'''
  whitespace_removed = []

  for line in documents:
    whitespace_removed.append(' '.join(line.split()))

  return whitespace_removed

In [None]:
def text_lowercasing(documents):
  lowercased_docs = []

  for line in documents:
    lowercased_docs.append(line.lower())

  return lowercased_docs

In [None]:
def tokenize_sentence(documents):
  '''This function takes a line and provides tokens/words by splitting them using NLTK'''
  
  tokenized_docs = []
  
  for line in documents:
    tokenized_docs.append(word_tokenize(line))

  return tokenized_docs

In [None]:
def char_n_gram_ready(documents):
  '''An n-gram is a contiguous sequence of n items from a given sample of text or speech'''
  joined_docs = []

  for line in documents:
    joined_docs.append(' '.join(line))

  return joined_docs

In [None]:
def remove_punctuation(documents):

  punct_removed = []

  for doc in documents:
    temp = []
    for word in doc:
      if word not in string.punctuation:
        temp.append(word)
    
    punct_removed.append(temp)

  return punct_removed

In [None]:
def remove_stopwords(documents):
  
  stopword_removed = []

  stop_words = set(stopwords.words('english'))

  for doc in documents:
    temp = []
    for word in doc:
      if word not in stop_words:
        temp.append(word)
    
    stopword_removed.append(temp)

  return stopword_removed

In [None]:
def apply_stemmer(documents):
  stemmed_docs = []
  
  stemmer = PorterStemmer()

  for doc in documents:
    stemmed_docs.append([stemmer.stem(plural) for plural in doc])

  return stemmed_docs

In [None]:
def identity(X):
  return X

In [None]:
def vec_tfidf(tfidf = True):

  if tfidf:
    vec = TfidfVectorizer(preprocessor = identity, analyzer='word',
                          tokenizer = identity, ngram_range = (1,3))
    # vec = TfidfVectorizer(preprocessor = identity, 
    #                       tokenizer = identity)
  else:
    # vec = CountVectorizer(preprocessor = identity, lowercase=True, analyzer='char',
    #                       tokenizer = identity, ngram_range = (2,5))
    
    vec = CountVectorizer(preprocessor = identity,
                          tokenizer = identity)
    
  return vec

In [None]:
def SVM_Static(train_docs, train_lbls, test_docs, test_lbls):

  vec = vec_tfidf(tfidf = True)
    
  # combines the vectorizer with the SVM classifier
  classifier = Pipeline([('vec', vec),
                         ('cls', SVC(kernel='linear'))])
  
  classifier.fit(train_docs, train_lbls)

    # predict is for predicting label for document test data by using predict method
  prediction = classifier.predict(test_docs)

  print("SVM Accuracy = ", accuracy_score(test_lbls, prediction))
  print()

  print(classification_report(test_lbls, prediction, labels=classifier.classes_, digits=3))


In [None]:
def Naive_Bayes(train_docs, train_lbls, test_docs, test_lbls):

  vec = vec_tfidf(tfidf = False)
    
  # combines the vectorizer with the Naive Bayes classifier
  classifier = Pipeline([('vec', vec),
                         ('cls', MultinomialNB())])
  
  classifier.fit(train_docs, train_lbls)

  prediction = classifier.predict(test_docs)

  print("Naive Bayes Accuracy = ", accuracy_score(test_lbls, prediction))
  print()

  print(classification_report(test_lbls, prediction, labels=classifier.classes_, digits=3))

In [None]:
def pre_processing(documents):

  documents = remove_url(documents)

  # documents = remove_hashtag(documents)

  documents = remove_whitespaces(documents)

  # documents = text_lowercasing(documents)

  documents = tokenize_sentence(documents)

  documents = remove_punctuation(documents)

  documents = remove_stopwords(documents)

  documents = apply_stemmer(documents)

  # If we use character n_gram you have to enable it | else comment the below line
  documents = char_n_gram_ready(documents)

  return documents

In [None]:
def main():
  print('Reading The Dataset...')
  
  # Reading the training data
  training_dataset = read_files('/content/drive/MyDrive/Colab Notebooks/datasets/corona_data/train.tsv')
  train_labels, train_docs = separate_labels(training_dataset)

  # Reading the test data
  test_dataset = read_files('/content/drive/MyDrive/Colab Notebooks/datasets/corona_data/test.tsv')
  test_labels, test_docs = separate_labels(test_dataset)
  
  # calling the pre processing dunction
  train_docs = pre_processing(train_docs)
  test_docs = pre_processing(test_docs)
  # print(train_docs)

  print('\nTraining the Classifier...')
  SVM_Static(train_docs, train_labels, test_docs, test_labels)
  Naive_Bayes(train_docs, train_labels, test_docs, test_labels)

  for lbl, doc in zip(train_labels[:5], train_docs[:5]):
    print(lbl)
    print(doc)
    print()

if __name__ == '__main__':
  main()

Reading The Dataset...

Training the Classifier...
SVM Accuracy =  0.4

                    precision    recall  f1-score   support

Extremely Negative      0.479     0.240     0.320       146
Extremely Positive      0.591     0.340     0.431       162
          Negative      0.381     0.483     0.426       302
           Neutral      0.453     0.414     0.433       152
          Positive      0.324     0.424     0.367       238

          accuracy                          0.400      1000
         macro avg      0.446     0.380     0.396      1000
      weighted avg      0.427     0.400     0.399      1000

Naive Bayes Accuracy =  0.269

                    precision    recall  f1-score   support

Extremely Negative      0.296     0.199     0.238       146
Extremely Positive      0.272     0.136     0.181       162
          Negative      0.335     0.255     0.289       302
           Neutral      0.246     0.474     0.324       152
          Positive      0.232     0.290     0.257    