In [23]:
import pandas as pd
import string
import re
import nltk
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("punkt")
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dgdev\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dgdev\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dgdev\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [24]:
df = pd.read_csv('data.csv')
df.dropna(inplace=True)
train = df[df['Date'] < '20150101']
test = df[df['Date'] > '20141231']

In [25]:
X_train = train.iloc[:,2:27]
y_train = train["Label"]

In [26]:
for column in X_train.columns:
    X_train[column] = X_train[column].str.lstrip('b\'').str.lstrip('b"')

In [27]:
def caseLower(text) :
    text = text.lower()
    return text

def punctuationRemove(text) :
    text = "".join([char for char in text if char not in string.punctuation])
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'<[^>]+>', '', text)
    text = re.sub("[0-9]+", "", text)
    return text

def tokenization(text) :
    text = text.strip()
    text = re.split("\W+", text)
    return text

def singlesRemove(text) :
    text = [word for word in text if len(word) > 1]
    return text

def stopwordsRemove(text) :
    stopword = nltk.corpus.stopwords.words("english")
    text = [word for word in text if word not in stopword]
    return text

def lemmatizer(text) :
    text = [nltk.WordNetLemmatizer().lemmatize(word) for word in text]
    return text

def preprocess(text) :
    text = caseLower(text)
    text = punctuationRemove(text)
    text = tokenization(text)
    text = singlesRemove(text)
    text = stopwordsRemove(text)
    text = lemmatizer(text)
    text = " ".join(text)
    return text

In [28]:
for column in X_train.columns:
    X_train[column] = X_train[column].apply(lambda x: preprocess(x))

In [29]:
headlines = []
for row in range(0,len(X_train.index)):
    headlines.append(' '.join(str(x) for x in X_train.iloc[row,0:25]))

In [30]:
vectorizer = TfidfVectorizer(ngram_range=(2,2))
X_train = vectorizer.fit_transform(headlines)

In [31]:
mnb = MultinomialNB()
mnb.fit(X_train, y_train)

In [32]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

In [33]:
test_transform = []
for row in range(0,len(test.index)):
    test_transform.append(' '.join(str(x) for x in test.iloc[row,2:27]))
test_dataset = vectorizer.transform(test_transform)
predictions_mnb = mnb.predict(test_dataset)

In [34]:
predictions_knn = knn.predict(test_dataset)

In [35]:
print("-------------------Naive Bayes------------------")
print(confusion_matrix(test["Label"], predictions_mnb))
print("---------------------------------------")
print(classification_report(test["Label"], predictions_mnb))

-------------------Naive Bayes------------------
[[130  56]
 [  2 190]]
---------------------------------------
              precision    recall  f1-score   support

           0       0.98      0.70      0.82       186
           1       0.77      0.99      0.87       192

    accuracy                           0.85       378
   macro avg       0.88      0.84      0.84       378
weighted avg       0.88      0.85      0.84       378



In [36]:
print("------------------KNN------------------")
print(confusion_matrix(test["Label"], predictions_knn))
print("---------------------------------------")
print(classification_report(test["Label"], predictions_knn))

------------------KNN------------------
[[122  64]
 [ 68 124]]
---------------------------------------
              precision    recall  f1-score   support

           0       0.64      0.66      0.65       186
           1       0.66      0.65      0.65       192

    accuracy                           0.65       378
   macro avg       0.65      0.65      0.65       378
weighted avg       0.65      0.65      0.65       378

