In [None]:
import os
import pandas as pd
import numpy as np
import nltk
import re
#Lemmatization
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from sklearn.feature_selection import chi2,SelectKBest
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
#Linear SVM
from sklearn.svm import LinearSVC
#Cross-Validation
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics

# Fetch the NLTK resources this notebook relies on further down:
#   - 'stopwords': the English stop-word list handed to the vectorizers
#   - 'wordnet'  : the database backing WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')


In [None]:
#Setting data path
# Root directory of the corpus; the loading cells below treat each entry inside it
# as one news category. The trailing slash matters to any code that builds
# paths by plain string concatenation.
data_path = "./datasets_coursework1/bbc/"
os.listdir(data_path) #listing content inside path (displayed as the cell output)

In [None]:
#Getting category folders into a list
# Keep only real, non-hidden sub-directories: any stray file (README.TXT, an exported
# bbc.csv, .DS_Store, ...) or hidden directory (.ipynb_checkpoints) would otherwise be
# treated as a news category and break or pollute the loading loop.
# sorted() makes the order independent of the filesystem's arbitrary listing order.
folders = sorted(
    entry for entry in os.listdir(data_path)
    if not entry.startswith(".") and os.path.isdir(os.path.join(data_path, entry))
)
folders

In [None]:
#Reading every article, folder by folder, and recording the folder name as its label
news = []      # raw article texts, one string per file
category = []  # category label (folder name) aligned with `news`
for folder in folders:
    internal_path = os.path.join(data_path, folder)
    # sorted() fixes the row order of the resulting data set, so the (unshuffled)
    # cross-validation folds built later are the same on every machine.
    files = sorted(os.listdir(internal_path))
    for file_name in files:
        t_path = os.path.join(internal_path, file_name)
        if not os.path.isfile(t_path):
            continue  # ignore nested directories; only plain files are articles
        # Without an explicit encoding, open() falls back to the platform's locale
        # encoding, so the same files may decode differently (or fail) elsewhere.
        # latin-1 maps every byte to a character and therefore never raises
        # UnicodeDecodeError.
        # NOTE(review): confirm latin-1 matches the corpus' actual encoding.
        with open(t_path, 'r', encoding='latin-1') as f:
            # Lines keep their trailing newline; they are joined with a space as before.
            content = ' '.join(f.readlines())
        news.append(content)
        category.append(folder)

In [None]:
# Column-name -> values mapping: article texts and their aligned category labels.
tempdict = dict(News=news, Category=category)
# One row per article, columns 'News' and 'Category'.
df = pd.DataFrame(data=tempdict)
# Persist the frame as CSV (the row index is written as the first column).
df.to_csv(path_or_buf="./datasets_coursework1/bbc.csv")

## Preprocessing:

In [None]:
# Single shared lemmatizer instance; the preprocessing cell calls lem.lemmatize()
# on every token of every article.
lem = WordNetLemmatizer() #initializing WordNetLemmatizer

In [None]:
#Preprocessing

def _clean_text(text):
    """Normalise one raw article into lower-case, space-separated word tokens.

    Steps: replace non-word characters with spaces, lower-case the text, drop
    stand-alone single letters, then collapse runs of whitespace to one space.

    Parameters
    ----------
    text : object
        Raw article; converted with ``str()`` before processing.

    Returns
    -------
    str
        The cleaned text (it may still start or end with a single space).
    """
    text = re.sub(r"\W", " ", str(text)) #Replacing non-word characters with spaces
    text = text.lower() #Converting corpus to lower-case
    # Word boundaries instead of r"\s+[a-zA-Z]\s+": the old pattern consumed the
    # surrounding whitespace, so of two consecutive single letters ("u s economy")
    # only the first was removed, and letters at the very start/end were never matched.
    text = re.sub(r"\b[a-zA-Z]\b", " ", text) #Replacing single characters with spaces
    text = re.sub(r"\s+", " ", text) #Removing extra spaces
    return text

# Iterate over the column's values directly rather than df.News[n], which is a
# label-based lookup and only works while the frame has a default 0..n-1 index.
# split()/join also removes any leading or trailing space left by the cleaning step.
processed_text = [
    ' '.join(lem.lemmatize(word) for word in _clean_text(article).split())
    for article in df.News
]

In [None]:
# English stop-word list from NLTK; passed as `stop_words` to the Count/TF-IDF
# vectorizers below so these words are excluded from the vocabularies.
stopwords = nltk.corpus.stopwords.words("english") #Getting English Stopwords

## Bag of Words Model:

In [None]:
#Bag of Words

# Raw term counts; terms appearing in fewer than 5 documents or in more than
# 60% of the documents are dropped, as are the English stop words.
count = CountVectorizer(min_df = 5, max_df=0.6, stop_words=stopwords)
bow_counts = count.fit_transform(processed_text) #sparse document-term count matrix
# chi2 and SelectKBest accept sparse input, so the selection runs on the sparse matrix
# and only the 1500 retained columns are converted to a dense ndarray. Densifying the
# full vocabulary first allocates a much larger array for no benefit.
edit_text_1 = SelectKBest(chi2, k=1500).fit_transform(bow_counts, df.Category).toarray() #1500 relevant features are selected
edit_text_1.shape

## Unigram TF-IDF Features:

In [None]:
#TF-IDF model

# L2-normalised TF-IDF weights over single words (unigrams); terms appearing in
# fewer than 3 documents and the English stop words are dropped.
tfidf = TfidfVectorizer(min_df=3, stop_words=stopwords, norm='l2', ngram_range=(1,1))
unigram_tfidf = tfidf.fit_transform(processed_text) #sparse TF-IDF unigram matrix
# chi2 and SelectKBest accept sparse input, so the selection runs on the sparse matrix
# and only the 1500 retained columns are converted to a dense ndarray. Densifying the
# full vocabulary first allocates a much larger array for no benefit.
edit_text_2 = SelectKBest(chi2, k=1500).fit_transform(unigram_tfidf, df.Category).toarray() #1500 relevant features are selected
edit_text_2.shape

## Bi-gram TF-IDF Features:

In [None]:
#Bi-gram model

# L2-normalised TF-IDF weights over word pairs (bigrams only); bigrams appearing in
# fewer than 3 documents are dropped. This rebinds `tfidf`, so the unigram vectorizer
# created in the previous cell is no longer reachable under that name.
tfidf = TfidfVectorizer(min_df=3, stop_words=stopwords, norm='l2', ngram_range=(2,2))
bigram_tfidf = tfidf.fit_transform(processed_text) #sparse TF-IDF bigram matrix
# chi2 and SelectKBest accept sparse input, so the selection runs on the sparse matrix
# and only the 1500 retained columns are converted to a dense ndarray. Densifying the
# full vocabulary first allocates a much larger array for no benefit.
edit_text_3 = SelectKBest(chi2, k=1500).fit_transform(bigram_tfidf, df.Category).toarray() #1500 relevant features are selected
edit_text_3.shape

## Feature Stacking:

In [None]:
#Joining the three selected feature blocks column-wise (same rows, features side by side)
feature_blocks = (edit_text_1, edit_text_2, edit_text_3) #bag-of-words, unigram TF-IDF, bigram TF-IDF
edit_text = np.concatenate(feature_blocks, axis=1)
edit_text.shape

## Classifier:

In [None]:
#Fit a model on the training split and score its predictions on the test split
def get_scores_of(model, X_tr, X_te, y_tr, y_te):
    """Train `model` and evaluate it on held-out data.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
    X_tr, y_tr : training features and labels passed to ``model.fit``
    X_te, y_te : test features passed to ``model.predict`` and the true test labels

    Returns
    -------
    tuple
        ``(accuracy, f1, precision, recall)``; the last three are macro-averaged
        over the classes.
    """
    model.fit(X_tr, y_tr)
    predicted = model.predict(X_te)
    accuracy = metrics.accuracy_score(y_te, predicted)
    # Same call order as before: F1, then precision, then recall, all macro-averaged.
    macro_f1, macro_precision, macro_recall = [
        scorer(y_te, predicted, average='macro')
        for scorer in (metrics.f1_score, metrics.precision_score, metrics.recall_score)
    ]
    return accuracy, macro_f1, macro_precision, macro_recall

In [None]:
# NOTE(review): these two classifiers are not used by any cell visible here; the imports
# are kept in case later cells rely on them. Consider moving them to the top import cell.
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

SEED = 42 #fixed seed so LinearSVC's internal random number generation is repeatable across runs

K = StratifiedKFold(n_splits=10) #Stratified K-Fold cross-validation (default: no shuffling)
# One positional label array used both for building the folds and for slicing them.
# Previously the folds were split on the `category` list while labels were sliced with
# df.Category[idx], a label-based lookup that is only correct for a default 0..n-1 index.
labels = np.asarray(df.Category)
ModelScoresAcc, ModelScoresF1, ModelScoresPre, ModelScoresRec = [],[],[],[]

# NOTE(review): `edit_text` was produced by vectorizers and chi2 feature selection that
# were fitted on the complete data set (labels included) before this split, so the
# held-out folds have influenced the features and the scores below are likely optimistic.
# Fitting those steps inside each fold (e.g. via a Pipeline) would remove the leakage.
for train_i, test_i in K.split(edit_text, labels):
    X_train, X_test = edit_text[train_i], edit_text[test_i]
    y_train, y_test = labels[train_i], labels[test_i]
    # A fresh classifier per fold so nothing carries over between folds.
    classifier = LinearSVC(max_iter=6000, multi_class='ovr', random_state=SEED)
    acc, f1, pre, rec = get_scores_of(classifier, X_train, X_test, y_train, y_test)
    ModelScoresAcc.append(acc)
    ModelScoresF1.append(f1)
    ModelScoresPre.append(pre)
    ModelScoresRec.append(rec)

# Report the mean of each metric over the 10 folds, as a percentage.
print("With LinearSVC:")
print("Accuracy of the model: {:.2f}".format(float(np.mean(ModelScoresAcc)*100)), "%")
print("Macro averaged F1 score of the model: {:.2f}".format(float(np.mean(ModelScoresF1)*100)), "%")
print("Macro averaged precision of the model: {:.2f}".format(float(np.mean(ModelScoresPre)*100)), "%")
print("Macro averaged Recall of the model: {:.2f}".format(float(np.mean(ModelScoresRec)*100)), "%")
