#Preprocessing

In [None]:
import json
import validators
import urllib.parse
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer


# reduces the link to the website name, for example "https://www.google.com/search?q=fri" returns just "google"
def find_source(link):
    """Return the site name taken from the host part of a link.

    The host is split on dots and the second-to-last label is returned, so
    "https://www.google.com/search?q=fri" gives "google". A host with a
    single label (or an empty host) is returned as it is.
    """
    labels = urllib.parse.urlparse(link).netloc.split('.')
    # Of the (at most) two trailing labels, the first one is the site name.
    return labels[-2:][0]

# adds all json entries to a single array in the form of a dictionary (except those with no headline or description)
articles = []
# The encoding is passed explicitly so that decoding does not depend on the
# platform default (JSON text is expected to be UTF-8).
with open("News_Category_Dataset_IS_course.json", "r", encoding="utf-8") as file:
    for line in file:
        # a blank line (for example a trailing newline) is not a JSON document
        if not line.strip():
            continue
        j = json.loads(line)
        # .get() makes a missing key count as "no headline / no description"
        # instead of raising KeyError
        if j.get('headline') is None or j.get('short_description') is None:
            continue
        articles.append(j)

# Cleans up the metadata of every article: fills in a placeholder author,
# repairs invalid links and replaces the link with the name of its source.
for article in articles:
    # an empty authors field gets a placeholder name
    if len(article['authors']) == 0:
        article['authors'] = "Jane Doe"

    # a link that fails validation keeps only what follows its first 30 characters
    # NOTE(review): presumably this drops a duplicated 30-character site prefix - confirm against the dataset
    link = article['link']
    if not validators.url(link):
        link = link[30:]
        article['link'] = link

    # "huffpost" and "huffingtonpost" are stored under one common name
    source = find_source(link)
    article['source'] = "huffingtonpost" if source == "huffpost" else source

    # the link itself is no longer needed once the source is known
    del article['link']

# reduces the headlines and descriptions into base forms of words and removes stopwords

# None of these helpers depend on the article being processed, so they are
# built once here instead of once per loop iteration.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))


def _normalize_text(text):
    """Return `text` lower-cased, tokenized, filtered and reduced to word stems.

    Only purely alphabetic tokens that are not English stopwords are kept;
    each kept token is lemmatized and then stemmed, and the results are
    joined with single spaces.
    """
    tokens = word_tokenize(text.lower())
    # No punctuation stripping is needed after the isalpha() filter: purely
    # alphabetic tokens cannot contain punctuation characters.
    words = [token for token in tokens if token.isalpha() and token not in stop_words]
    return ' '.join(stemmer.stem(lemmatizer.lemmatize(word)) for word in words)


for article in articles:
    article['headline'] = _normalize_text(article['headline'])
    article['short_description'] = _normalize_text(article['short_description'])
    # authors are only lower-cased, not tokenized or stemmed
    article['authors'] = article['authors'].lower()

# saves the edited data, one JSON object per line
# The file is opened in "w" mode so that running the preprocessing again
# replaces the previous output instead of appending a second copy of every
# article, and the with-block closes (and flushes) the file before anything
# reads it back.
with open("preprocessed.json", "w") as file:
    for article in articles:
        file.write(json.dumps(article))
        file.write("\n")

#Decision tree

In [None]:
import json
import sys
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

# loads the preprocessed data (one JSON object per line)
articles = []
with open("preprocessed.json", "r") as file:
    for line in file:
        articles.append(json.loads(line))

# joins the separate values into a single string for every article
X = []
Y = []
for article in articles:
    X.append(article['source'] + ' ' + article['headline'] + ' ' + article['authors'] + ' ' + article['short_description'] + ' ' + str(article['date']))
    Y.append(article['category'])

# splits the raw texts for training and testing
X_train_text, X_test_text, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# vectorizes the data; the vectorizer is fitted on the training texts only so
# that the vocabulary and IDF weights are not influenced by the test set
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

# trains the model
decision_tree_classifier = DecisionTreeClassifier()
decision_tree_classifier.fit(X_train, Y_train)
print(decision_tree_classifier.get_depth())

# tests the model
Y_predictions = decision_tree_classifier.predict(X_test)

# prints the report to a file
# print(..., file=...) writes to the file directly. Swapping sys.stdout and
# restoring it from sys.__stdout__ would discard any stream the host
# environment (for example a notebook kernel) had installed, and would leave
# the file open if building the report failed.
report = classification_report(Y_test, Y_predictions)
with open("./results/tree", "w") as file:
    print(report, file=file)

#k-nearest neighbors

In [None]:
import json
import sys
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

# loads the preprocessed data (one JSON object per line)
articles = []
with open("preprocessed.json", "r") as file:
    for line in file:
        articles.append(json.loads(line))

# joins the separate values into a single string for every article
X = []
Y = []
for article in articles:
    X.append(article['source'] + ' ' + article['headline'] + ' ' + article['authors'] + ' ' + article['short_description'] + ' ' + str(article['date']))
    Y.append(article['category'])

# splits the raw texts for training and testing
X_train_text, X_test_text, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# vectorizes the data; the vectorizer is fitted on the training texts only so
# that the vocabulary and IDF weights are not influenced by the test set
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

# trains different knn models, changing the hyperparameter k each time
for i in [3, 4, 5, 6, 7]:
    knn_classifier = KNeighborsClassifier(n_neighbors=i, n_jobs=-1)
    knn_classifier.fit(X_train, Y_train)

    Y_predictions = knn_classifier.predict(X_test)
    report = classification_report(Y_test, Y_predictions)

    # writes the report of each iteration on a separate file
    # print(..., file=...) writes to the file directly. Swapping sys.stdout
    # and restoring it from sys.__stdout__ would discard any stream the host
    # environment (for example a notebook kernel) had installed, and would
    # leave the file open if an iteration failed.
    with open("./results/knn_result" + "_" + str(i), "a") as file:
        print(report, file=file)

#Random forest

In [None]:
import json
import sys
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report

# loads the preprocessed data (one JSON object per line)
articles = []
with open("preprocessed.json", "r") as file:
    for line in file:
        articles.append(json.loads(line))

# joins the separate values into a single string for every article
X = []
Y = []
for article in articles:
    X.append(article['source'] + ' ' + article['headline'] + ' ' + article['authors'] + ' ' + article['short_description'] + ' ' + str(article['date']))
    Y.append(article['category'])

# splits the data for training and testing
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# sets the parameters for grid search
# (the "randomforestclassifier__" prefix addresses the forest step of the pipeline)
grid = {
    'randomforestclassifier__max_features': ['sqrt', 'log2', None],
    'randomforestclassifier__n_estimators': [50, 75, 100],
    'randomforestclassifier__max_depth': [200, 400, None]
}

# creates the pipeline and trains the model; the vectorizer is part of the
# pipeline, so it is fitted on training texts only
pipeline = make_pipeline(TfidfVectorizer(), RandomForestClassifier())
grid_search = GridSearchCV(pipeline, grid, cv=3, n_jobs=-1)

grid_search.fit(X_train, Y_train)
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

Y_predictions = best_model.predict(X_test)

# prints the report to a separate file
# print(..., file=...) writes to the file directly. Swapping sys.stdout and
# restoring it from sys.__stdout__ would discard any stream the host
# environment (for example a notebook kernel) had installed, and would leave
# the file open if building the report failed.
report = classification_report(Y_test, Y_predictions)
with open("./results/forest", "a") as file:
    print(report, file=file)

# prints the parameters of the best model to a separate file
with open("./results/forest_params", "w") as file:
    print(best_params, file=file)

#XGBoost

In [None]:
import json
import xgboost as xbg
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import sys

# loads the preprocessed data (one JSON object per line)
articles = []
with open("preprocessed.json", "r") as file:
    for line in file:
        articles.append(json.loads(line))

# joins the separate values into a single string for every article
X = []
Y = []
for article in articles:
    X.append(article['source'] + ' ' + article['headline'] + ' ' + article['authors'] + ' ' + article['short_description'] + ' ' + str(article['date']))
    Y.append(article['category'])

vectorizer = TfidfVectorizer()
encoder = LabelEncoder()

# encodes the array of categories and splits the raw texts into training and testing
Y_encoded = encoder.fit_transform(Y)
X_train_text, X_test_text, Y_train, Y_test = train_test_split(X, Y_encoded, test_size=0.2, random_state=42)

# vectorizes the data; the vectorizer is fitted on the training texts only so
# that the vocabulary and IDF weights are not influenced by the test set
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

# trains the model
xgboost_classifier = xbg.XGBClassifier(n_jobs=-1)
xgboost_classifier.fit(X_train, Y_train)

# decodes the predictions (and the test labels) back to category names
Y_predictions = xgboost_classifier.predict(X_test)
Y_predictions = encoder.inverse_transform(Y_predictions)
Y_test = encoder.inverse_transform(Y_test)

# prints the results to a separate file
# print(..., file=...) writes to the file directly. Swapping sys.stdout and
# restoring it from sys.__stdout__ would discard any stream the host
# environment (for example a notebook kernel) had installed, and would leave
# the file open if building the report failed.
report = classification_report(Y_test, Y_predictions)
with open("./results/xgboost", "a") as file:
    print(report, file=file)