In [14]:
import numpy as np
import re
import nltk
import string
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

fp = open("../data/BL_2018_1.csv", encoding='utf-8')
lines = fp.readlines()[2:]
fp.close()

In [15]:
# Pre-processing
# Step: 1
tmp_data, string = list(), ""
for line in lines:
    line = line.strip()
    if line == ",,,,,":
        tmp_data.append(string)
        string = ""
    else:
        string+= line
tmp_data.append(string)

In [16]:
# Step: 2
data, Y = list(), list()
for idx, sample in enumerate(tmp_data):
    x = sample.split(',')
    year, month, day, text, label = int(x[0]), int(x[1]), int(x[2]), ", ".join(x[3:-1]), int(x[-1])
    if idx == 0:
        print("Actual Text:\n"+text)
    # Remove numbers
    text = text.replace('0', '').replace('1', '').replace('2', '').replace('3', '').replace('4', '').replace('5', '')
    text = text.replace('6', '').replace('7', '').replace('8', '').replace('9', '')
    # Remove punctuations
    text = text.replace('“', '').replace('”', '').replace('"', '').replace('!', '').replace('(', '').replace(')', '')
    text = text.replace('[', '').replace(']', '').replace('.', ' ').replace(', ', ' ').replace(':', ' ').replace('*', '')
    text = text.replace('#', '').replace('-', ' ').replace('%', '').replace('&', '').replace('$', '').replace('^', '')
    text = text.replace("'", "").replace('+', '').replace(';', '').replace('?', '').replace('/', '').replace('=', '')
    text = text.replace('@', '')
    # Above pre-processing might lead to multiple consecutive spaces within string
    text = re.sub(' +', ' ', text)
    if idx == 0:
        print("\n\nPredicted Text:\n"+text)
    data.append(text), Y.append(label)

Actual Text:
BusinessLine twenty years ago today: Poll schedule announced, "The Election Commission on Thursday announced that the Lok Sabha elections and polls to five State Assemblies would be held simultaneously over four days beginning February 16. Announcing the dates for holding the elections,  the Chief Election Commissioner,  Dr. M.S. Gill,  said voting would be held on February 16,  22 and 28 and on March 7. The process for holding bye-elections to fill casual vacancies in some State Legislative Assemblies would also be held during this period.I-T Dept crackdown likely on VDIS payment defaultersThe I-T Department may clamp down on declarants under the just-ended Voluntary Disclosure of Income Scheme (VDIS) who fail to pay up the obligatory 30 per cent tax by March 31. Although the VDIS came to a close on Wednesday,  declarants have been allowed to pay taxes within three months of filing the declaration after paying interest at the rate of 2 per cent per month or part of the mo

In [17]:
# Let's check out the frequency of each class label
Y.count(0), Y.count(1)
# The resulting data is highly imbalanced, i.e. 0 - 99.2367%    1 - 0.7632%

(5461, 42)

In [18]:
data, Y = np.array(data), np.array(Y)
vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words=set(stopwords.words('english')), tokenizer=nltk.word_tokenize, min_df=5)
X = vectorizer.fit_transform(data)

# We can check out the features(words) that represent a single vector
print(vectorizer.get_feature_names()[:100])

['aa', 'aaa', 'aaa ’', 'aadhaar', 'aadhaar authentication', 'aadhaar based', 'aadhaar card', 'aadhaar cards', 'aadhaar data', 'aadhaar linking', 'aadhaar mobile', 'aadhaar number', 'aadhaar numbers', 'aadhaar therefore', 'aadmi', 'aadmi party', 'aai', 'aam', 'aam aadmi', 'aam nagrik', 'aap', 'aap leader', 'aap mlas', 'aap ’', 'aashish', 'aayog', 'aayog member', 'aayog ’', 'ab', 'ab de', 'abandon', 'abandoned', 'abbas', 'abbott', 'abc', 'abdul', 'abe', 'abhay', 'abhijit', 'abhishek', 'abhiyan', 'abide', 'abiding', 'abilities', 'ability', 'able', 'able access', 'able bring', 'able come', 'able complete', 'able create', 'able cultivate', 'able find', 'able get', 'able give', 'able keep', 'able make', 'able meet', 'able sell', 'able strike', 'able take', 'able use', 'abled', 'abolish', 'abolished', 'abolition', 'abraham', 'abroad', 'abs', 'absence', 'absent', 'absolute', 'absolute decline', 'absolute terms', 'absolute value', 'absolutely', 'absorb', 'absorbed', 'absorbing', 'absorption', '

In [37]:
#Divide the data into train test
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

In [38]:
rus = RandomUnderSampler(random_state=0, ratio={0:3500})
X_train, Y_train = rus.fit_resample(X_train, Y_train)

# Perform training and prediction
model = RandomForestClassifier(random_state=0)
model.fit(X_train, Y_train)
preds = model.predict(X_test)

print("%.2f" % (accuracy_score(Y_test, preds)))
print("%.2f %.2f %.2f" % (precision_score(Y_test, preds), recall_score(Y_test, preds), f1_score(Y_test, preds)))

0.99
1.00 0.30 0.46


