In [1]:
import pandas as pd
import re
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [2]:
# Raw inputs: the training articles, the test articles, and the
# sample-solution file whose 'Category' column is used as the test labels.
train_data = pd.read_csv('bbc/BBC News Train.csv')
test_data = pd.read_csv('bbc/BBC News Test.csv')
solution = pd.read_csv('bbc/BBC News Sample Solution.csv')

# Category labels as plain Python lists.
y_train = train_data['Category'].tolist()
y_test = solution['Category'].tolist()

# Shared text-processing objects used by the helper functions in this notebook.
vect = CountVectorizer()
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))


In [None]:
def new_finder(char):
    """Normalize a single character.

    Parameters
    ----------
    char : str
        A one-character string (``ord`` is applied to it).

    Returns
    -------
    str
        The lowercase form of an ASCII uppercase letter; the character
        itself for an ASCII lowercase letter, a space, or an apostrophe;
        the empty string for anything else.
    """
    code = ord(char)

    # 'A'..'Z' -> shift by 32 to reach 'a'..'z'.
    if 65 <= code <= 90:
        return chr(code + 32)

    # 'a'..'z', space (32) and apostrophe (39) pass through unchanged.
    if 97 <= code <= 122 or code in (32, 39):
        return char

    return ''

In [3]:
def normalizer(text):
    """Reduce raw article text to a space-separated string of word stems.

    Steps, in order:
      1. Replace every character that is not an ASCII letter with a space
         and lowercase the result.
      2. Tokenize with ``word_tokenize``.
      3. Drop tokens that are in ``stop_words`` or have two or fewer
         characters (both checks are made on the unstemmed token).
      4. Stem the surviving tokens with ``stemmer`` and join them with
         single spaces.

    Parameters
    ----------
    text : str
        The raw document text.

    Returns
    -------
    str
        The normalized document; empty if no token survives filtering.
    """
    # The class must be a-zA-Z. A range written as A-z also spans the ASCII
    # punctuation between 'Z' and 'a' ([ \ ] ^ _ `), which would then leak
    # through the "letters only" filter.
    letters_only = re.sub(r'[^a-zA-Z]', ' ', text).lower()
    kept_stems = [
        stemmer.stem(token)
        for token in word_tokenize(letters_only)
        if token not in stop_words and len(token) > 2
    ]
    return ' '.join(kept_stems)

In [4]:
def feature(data):
    """Return the normalized text of every row in ``data``.

    Parameters
    ----------
    data : mapping-like with a 'Text' entry exposing ``.tolist()``
        In this notebook, a DataFrame read from one of the BBC CSV files.

    Returns
    -------
    list
        One ``normalizer`` result per entry of ``data['Text']``, in order.
    """
    return [normalizer(document) for document in data['Text'].tolist()]

In [5]:
def vectorize(feature_list):
    """Fit the shared ``vect`` vectorizer and return a dense count matrix.

    Note that this *fits* ``vect`` as a side effect, so any later
    ``vect.transform`` call uses the vocabulary learned here.

    Parameters
    ----------
    feature_list : iterable of str
        Documents to learn the vocabulary from and to count.

    Returns
    -------
    The document-term matrix converted to a dense array via ``toarray()``.
    """
    return vect.fit_transform(feature_list).toarray()

In [6]:
def feature_selection(k, X_dtm, y):
    """Select the ``k`` best features by chi-squared score and fit the selector.

    The fitted selector is stored in the module-level name
    ``chi2_features`` so that other cells can apply the same selection
    to new data.

    Parameters
    ----------
    k : number of features passed to ``SelectKBest``.
    X_dtm : feature matrix to fit on and reduce.
    y : target labels, one per row of ``X_dtm``.

    Returns
    -------
    ``X_dtm`` reduced to the selected columns.
    """
    global chi2_features
    # Bind the global before fitting, so the name exists even if fitting fails.
    chi2_features = SelectKBest(score_func=chi2, k=k)
    return chi2_features.fit_transform(X_dtm, y)

In [7]:
def proc(data):
    """Run new data through the already-fitted vectorizer and feature selector.

    Uses the module-level ``vect`` and ``chi2_features`` objects without
    refitting them, so both must have been fitted (see ``vectorize`` and
    ``feature_selection``) before this is called.

    Parameters
    ----------
    data : same kind of object accepted by ``feature`` (has a 'Text' entry).

    Returns
    -------
    The dense count matrix of ``data`` restricted to the selected features.
    """
    normalized_docs = feature(data)
    dense_counts = vect.transform(normalized_docs).toarray()
    return chi2_features.transform(dense_counts)


In [8]:
# Fit the vocabulary on the normalized training text, then keep the 2000
# terms with the highest chi-squared score against the training labels.
X_dtm = vectorize(feature(train_data))
X_train = feature_selection(k=2000, X_dtm=X_dtm, y=y_train)

# Must run after the two lines above: proc() reuses the vectorizer and
# selector that were fitted on the training data.
X_test = proc(test_data)

In [9]:
# Train a multinomial Naive Bayes classifier on the selected training
# features (fit() returns the estimator itself), then predict a category
# for every row of the test feature matrix.
clf = MultinomialNB().fit(X_train, y_train)
predict_val_nb = clf.predict(X_test)