In [None]:
import pandas as pd
import string
import re
import nltk
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("punkt")
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [37]:
df = pd.read_csv('data.csv')
df.dropna(inplace=True)
train = df[df['Date'] < '20150101']
test = df[df['Date'] > '20141231']

In [38]:
data = train.iloc[:,2:27]
data.replace("[^a-zA-Z]"," ",regex=True, inplace=True)

In [39]:
list1= [i for i in range(25)]
new_Index = [str(i) for i in list1]
data.columns = new_Index

In [42]:
for column in new_Index:
    data[column] = data[column].str.lstrip('b\'').str.lstrip('b"')

In [44]:
def caseLower(text) :
    text = text.lower()
    return text

def punctuationRemove(text) :
    text = "".join([char for char in text if char not in string.punctuation])
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'<[^>]+>', '', text)
    text = re.sub("[0-9]+", "", text)
    return text

def tokenization(text) :
    text = text.strip()
    text = re.split("\W+", text)
    return text

def singlesRemove(text) :
    text = [word for word in text if len(word) > 1]
    return text

def stopwordsRemove(text) :
    stopword = nltk.corpus.stopwords.words("english")
    text = [word for word in text if word not in stopword]
    return text

def lemmatizer(text) :
    text = [nltk.WordNetLemmatizer().lemmatize(word) for word in text]
    return text

def preprocess(text) :
    text = caseLower(text)
    text = punctuationRemove(text)
    text = tokenization(text)
    text = singlesRemove(text)
    text = stopwordsRemove(text)
    text = lemmatizer(text)
    text = " ".join(text)
    return text

In [46]:
for column in new_Index:
    data[column] = data[column].apply(lambda x: preprocess(x))

In [47]:
headlines = []
for row in range(0,len(data.index)):
    headlines.append(' '.join(str(x) for x in data.iloc[row,0:25]))

In [48]:
tfidfvector = TfidfVectorizer(ngram_range=(2,2))
traindataset = tfidfvector.fit_transform(headlines)

In [None]:
model = MultinomialNB()
model.fit(traindataset, train['Label'])

In [50]:
test_transform= []
for row in range(0,len(test.index)):
    test_transform.append(' '.join(str(x) for x in test.iloc[row,2:27]))
test_dataset = tfidfvector.transform(test_transform)
predictions = model.predict(test_dataset)

In [51]:
print(confusion_matrix(test["Label"], predictions))
print("---------------------------------------")
print(classification_report(test["Label"], predictions))

[[131  55]
 [  3 189]]
---------------------------------------
              precision    recall  f1-score   support

           0       0.98      0.70      0.82       186
           1       0.77      0.98      0.87       192

    accuracy                           0.85       378
   macro avg       0.88      0.84      0.84       378
weighted avg       0.87      0.85      0.84       378

