In [373]:
import pandas as pd
import numpy as np
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from nltk.tokenize import word_tokenize
import string

delimiter = "\t"

# Load the tab-separated training data into a pandas dataframe. The file has
# no header row, so the two columns are named explicitly.
df = pd.read_csv("TrainData.dat", delimiter=delimiter, header=None, names=['label', 'sentence'])

# Fill any null values if present. The result is assigned back rather than
# calling fillna(inplace=True) on a column selection: that chained form acts on
# an intermediate object, raises a FutureWarning in pandas 2.x and leaves `df`
# unmodified once Copy-on-Write is active.
df = df.fillna({'label': 0, 'sentence': 'NA'})

# Separate the columns of the training data into labels and sentences
labels = df['label'].tolist()
sentences = df['sentence'].tolist()

In [374]:

# Translation table that deletes ASCII punctuation and digits in one pass
_STRIP_CHARS = str.maketrans("", "", string.punctuation + string.digits)


def preprocessor(text):
    """Normalize one sentence before it is vectorized.

    Lowercases ``text``, deletes ASCII punctuation and digits, tokenizes the
    result with NLTK and returns the tokens joined by single spaces.
    """
    cleaned = text.lower().translate(_STRIP_CHARS)
    return " ".join(nltk.word_tokenize(cleaned))

In [375]:
# Build the bag-of-words matrix with a count vectorizer. Lowercasing is done
# inside `preprocessor`: supplying a custom preprocessor overrides
# CountVectorizer's own lowercase step.
# max_df=0.75 ignores terms found in more than 75% of the sentences.
# NOTE(review): the vocabulary is fit on every sentence, including the rows
# that are held out for evaluation further down.
cv1 = CountVectorizer(
    preprocessor=preprocessor,
    max_df=0.75,
    min_df=1,
)
X = cv1.fit_transform(sentences)

In [376]:
# Hold out 20% of the vectorized rows for evaluation; the seed is fixed so the
# split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    labels,
    test_size=0.2,
    random_state=4,
)

# Logistic regression with a raised iteration cap (max_iter is set because the
# data is large), fitted on the training portion.
model = LogisticRegression(solver='lbfgs', max_iter=1000).fit(X_train, y_train)

# Predict the held-out rows and report the accuracy.
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print("Accuracy:", acc)

Accuracy: 0.8733117233927606


In [377]:
#After our model is trained on the training data, we apply it on our test data
# Reading sentences from test data file
fo = open("TestData.dat", "r")
fread = fo.readlines()
sentences=[]
count=0
for line in fread:
    count+=1
    sentences.append(line.split("\n"))

fo.close() # Closing file
df_Test = pd.DataFrame({'sentence':sentences}) #Converting into dataFrame for predictions
#print(df_Test)

NewSentences= df['sentence'].tolist()

In [378]:
X1 = cv1.fit_transform(NewSentences) 
#Applying the same model with 0.87 accuracy on our Test data
y_pred = model.predict(X1)

In [379]:
# Write one prediction per line to the output file. The context manager closes
# the file (flushing buffered writes) even if a write raises.
with open("Deep_Vora_Output.txt", "w") as file:
    file.writelines(str(item) + "\n" for item in y_pred)