In [1]:
import os
import re

import pandas as pd
import seaborn as sns
import numpy as np
from matplotlib import pyplot as plt
from nltk.corpus import wordnet

SRC_PATH="src"

# Change the value to True if you want to see the heatmap for correlations
display_accuracy_measures = False

In [2]:
def load_housing_data(src=SRC_PATH):
    """Read ``src.csv`` from the ``src`` directory into a DataFrame.

    Args:
        src: Directory that contains ``src.csv``; defaults to ``SRC_PATH``.

    Returns:
        The ``pandas.DataFrame`` produced by ``pd.read_csv`` for that file.
    """
    return pd.read_csv(os.path.join(src, "src.csv"))

In [3]:
df = load_housing_data()
df.head()

Unnamed: 0,Title,Links,Categories,Images,References,Length,Quality,Theme
0,Hammurabi,1103,45,44,97,15484,4.0,People
1,Hatshepsut,755,49,18,97,33266,3.0,People
2,Ramesses II,662,43,25,119,32507,2.0,People
3,Cyrus the Great,515,61,32,213,47616,2.0,People
4,Alexander the Great,1910,71,75,401,89053,4.0,People


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 998 entries, 0 to 997
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Title       998 non-null    object 
 1   Links       998 non-null    int64  
 2   Categories  998 non-null    int64  
 3   Images      998 non-null    int64  
 4   References  998 non-null    int64  
 5   Length      998 non-null    int64  
 6   Quality     997 non-null    float64
 7   Theme       998 non-null    object 
dtypes: float64(1), int64(5), object(2)
memory usage: 62.5+ KB


In [5]:
df["Theme"].value_counts()

Science                         210
Society and social sciences     146
People                          112
Geography                       106
Technology                       98
History                          83
Everyday life                    57
Philosophy and religion          55
Arts                             45
Mathematics                      45
Health, medicine and disease     41
Name: Theme, dtype: int64

In [6]:
df.describe()

Unnamed: 0,Links,Categories,Images,References,Length,Quality
count,998.0,998.0,998.0,998.0,998.0,997.0
mean,859.871743,38.109218,41.362725,218.013026,50315.901804,2.749248
std,513.129578,25.571013,37.425723,181.221618,25837.048931,0.959685
min,99.0,7.0,2.0,6.0,7047.0,1.0
25%,477.0,21.0,21.0,86.0,30272.0,2.0
50%,733.0,30.0,31.0,157.0,46976.5,3.0
75%,1140.75,44.0,48.75,297.75,63421.5,3.0
max,3662.0,153.0,339.0,1310.0,165877.0,5.0


In [7]:
# Drop missing values
df = df.dropna()

In [8]:
df[df.isna().any(axis=1)]

Unnamed: 0,Title,Links,Categories,Images,References,Length,Quality,Theme


In [9]:
df_all_numerical = df.copy()
df_all_numerical.head()

Unnamed: 0,Title,Links,Categories,Images,References,Length,Quality,Theme
0,Hammurabi,1103,45,44,97,15484,4.0,People
1,Hatshepsut,755,49,18,97,33266,3.0,People
2,Ramesses II,662,43,25,119,32507,2.0,People
3,Cyrus the Great,515,61,32,213,47616,2.0,People
4,Alexander the Great,1910,71,75,401,89053,4.0,People


In [10]:
categories = sorted(set(df['Theme']))

In [11]:
# Give every theme an integer code equal to its position in `categories`
# (which was built as a sorted list of the distinct themes).
mapping = {category: code for code, category in enumerate(categories)}

df_all_numerical["T.Numerical"] = df_all_numerical["Theme"].map(mapping)
df_all_numerical.head()

Unnamed: 0,Title,Links,Categories,Images,References,Length,Quality,Theme,T.Numerical
0,Hammurabi,1103,45,44,97,15484,4.0,People,6
1,Hatshepsut,755,49,18,97,33266,3.0,People,6
2,Ramesses II,662,43,25,119,32507,2.0,People,6
3,Cyrus the Great,515,61,32,213,47616,2.0,People,6
4,Alexander the Great,1910,71,75,401,89053,4.0,People,6


# Word Vector Creation

In [12]:
# Importing the necessary functions
from nltk.tokenize import word_tokenize
import nltk
from nltk.corpus import wordnet, stopwords
from nltk.stem import WordNetLemmatizer
import re, string


def lemmatise(text):
    """Return the lemmatised, stop-word-free tokens of ``text``.

    Based on our lab3_preprocessing.ipynb code.

    The text is lower-cased and stripped, ``<...>`` spans are removed,
    punctuation is replaced by spaces, whitespace is collapsed, English stop
    words are dropped, and every remaining token is lemmatised according to
    its part-of-speech tag.

    Args:
        text: The raw string to process.

    Returns:
        A list of lemmatised tokens in their original order.
    """
    text = text.lower().strip()
    text = re.sub(r'<.*?>', '', text)
    # string.punctuation only contains ASCII punctuation, so characters such
    # as typographic quotes or dashes are left in the text.
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    # Raw string: '\s' inside a plain literal is an invalid escape sequence.
    text = re.sub(r'\s+', ' ', text)

    # Stop word lists can be adjusted for your problem.
    # A set makes the membership test below constant-time per token.
    stop_words = set(nltk.corpus.stopwords.words('english'))

    # Tokenize the sentence and drop the stop words
    filtered_sentence = [w for w in word_tokenize(text) if w not in stop_words]
    text = " ".join(filtered_sentence)

    # Initialize the lemmatizer
    wl = WordNetLemmatizer()

    # This is a helper function to map NLTK position tags
    # Full list is available here: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
    def get_wordnet_pos(tag):
        if tag.startswith('J'):
            return wordnet.ADJ
        if tag.startswith('V'):
            return wordnet.VERB
        if tag.startswith('R'):
            return wordnet.ADV
        # Nouns and every unrecognised tag are treated as nouns.
        return wordnet.NOUN

    # The filtered text is tokenized again (as in the original pipeline) so
    # that the resulting tokens stay exactly the same as before.
    words = word_tokenize(text)
    # Get position tags: a list of (token, tag) pairs
    word_pos_tags = nltk.pos_tag(words)
    # Map the position tag and lemmatize the word/token
    lemmatized_sentence = [
        wl.lemmatize(token, get_wordnet_pos(pos_tag))
        for token, pos_tag in word_pos_tags
    ]

    return lemmatized_sentence

In [13]:
# Index every row by its first value (the title); the remaining values of the
# row are kept, in column order, as the dictionary value.
list_of_df_all_numerical = df_all_numerical.values.tolist()
dict_of_df_all_numerical = {row[0]: row[1:] for row in list_of_df_all_numerical}
dict_of_df_all_numerical

{'Hammurabi': [1103, 45, 44, 97, 15484, 4.0, 'People', 6],
 'Hatshepsut': [755, 49, 18, 97, 33266, 3.0, 'People', 6],
 'Ramesses II': [662, 43, 25, 119, 32507, 2.0, 'People', 6],
 'Cyrus the Great': [515, 61, 32, 213, 47616, 2.0, 'People', 6],
 'Alexander the Great': [1910, 71, 75, 401, 89053, 4.0, 'People', 6],
 'Ashoka': [780, 46, 57, 149, 84837, 3.0, 'People', 6],
 'Qin Shi Huang': [400, 57, 23, 129, 39510, 3.0, 'People', 6],
 'Julius Caesar': [840, 93, 41, 358, 53213, 3.0, 'People', 6],
 'Augustus': [867, 65, 48, 145, 84168, 5.0, 'People', 6],
 'Charlemagne': [1099, 108, 58, 183, 83252, 2.0, 'People', 6],
 'Genghis Khan': [611, 82, 38, 173, 56848, 3.0, 'People', 6],
 'Mansa Musa': [133, 33, 10, 67, 17536, 2.0, 'People', 6],
 'Joan of Arc': [643, 84, 32, 937, 45538, 5.0, 'People', 6],
 'Suleiman the Magnificent': [403, 48, 31, 91, 37090, 3.0, 'People', 6],
 'Akbar': [646, 61, 32, 167, 91274, 3.0, 'People', 6],
 'Elizabeth I': [615, 81, 42, 138, 57770, 5.0, 'People', 6],
 'Catherine 

Create a dictionary of articles: the title is the key, and the value holds the article's category and the lemmatised words of its summary.

In [14]:
from wikipedia_page_data_fetcher import get_article_data_from_file
import os

# assign directory
directory = 'articles'

# Position of the theme inside each dict_of_df_all_numerical value
# (a data row without its leading title).
THEME_INDEX = 6

LIST_OF_ARTICLES = [x.removesuffix(".txt") for x in os.listdir(directory)]
all_article_data = {name: get_article_data_from_file(name, use_lemmatised_version = False) for name in LIST_OF_ARTICLES}

word_list = []
article_list = {}
# Articles found on disk but absent from the table; kept for inspection
# instead of being dropped silently.
skipped_articles = []

for filename in LIST_OF_ARTICLES:
    # Look the theme up first so that articles missing from the table are
    # skipped before any lemmatisation work is done for them.
    try:
        category = dict_of_df_all_numerical[filename][THEME_INDEX]
    except KeyError:
        # Files skipped (names as recorded by the original author):
            # Acid-base reaction
            # NiccolĽ Machiavelli
            # SĂo Paulo
            # Wind
        skipped_articles.append(filename)
        continue
    summary = lemmatise(all_article_data[filename]["Summary"])
    article_list[filename] = [category, summary]
    word_list.append(summary)


[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Peter\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Peter\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Peter\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Peter\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [15]:
article_list

{'0': ['Mathematics',
  ['0',
   'zero',
   'number',
   'numerical',
   'digit',
   'use',
   'represent',
   'number',
   'numeral',
   'fulfill',
   'central',
   'role',
   'mathematics',
   'additive',
   'identity',
   'integer',
   'real',
   'number',
   'many',
   'algebraic',
   'structure',
   'digit',
   '0',
   'use',
   'placeholder',
   'place',
   'value',
   'system',
   'name',
   'number',
   '0',
   'english',
   'include',
   'zero',
   'nought',
   'uk',
   'naught',
   'u',
   'nil',
   'or—in',
   'context',
   'least',
   'one',
   'adjacent',
   'digit',
   'distinguishes',
   'letter',
   '—oh',
   'informal',
   'slang',
   'term',
   'zero',
   'include',
   'zilch',
   'zip',
   'ought',
   'aught',
   'well',
   'cipher',
   'also',
   'use',
   'historically']],
 'Abiogenesis': ['Science',
  ['biology',
   'abiogenesis',
   '‘',
   '’',
   'greek',
   'bios',
   '‘',
   'life',
   '’',
   'genesis',
   'origin',
   'origin',
   'life',
   'natural',
   '

In [16]:
# Flatten the word list
import functools
import operator
# A nested comprehension flattens in linear time and also works when
# word_list is empty. functools.reduce(operator.concat, word_list) builds a
# new, longer list at every step and raises TypeError on an empty list.
word_list = [word for article_words in word_list for word in article_words]

In [17]:
from collections import Counter

# Rank every distinct word by its number of occurrences in word_list, most
# frequent first; words with equal counts keep their first-seen order.
word_counts = Counter(word_list)
freq = [word for word, _ in word_counts.most_common()]
freq

['use',
 'include',
 'also',
 'world',
 'century',
 'one',
 'form',
 'many',
 'state',
 'large',
 'system',
 'human',
 'may',
 'first',
 'country',
 'know',
 'call',
 'time',
 'number',
 'early',
 'language',
 'year',
 'city',
 'modern',
 'often',
 'empire',
 'term',
 'become',
 'two',
 'make',
 'area',
 'million',
 'war',
 'people',
 'part',
 'new',
 'work',
 'well',
 'study',
 'earth',
 'age',
 'power',
 'period',
 'history',
 'high',
 'culture',
 'lead',
 'great',
 'life',
 'social',
 'begin',
 '1',
 'develop',
 'science',
 'development',
 'different',
 'region',
 'process',
 'europe',
 'group',
 'population',
 'natural',
 'water',
 'east',
 'since',
 'energy',
 'consider',
 'political',
 'asia',
 'around',
 'ancient',
 'western',
 '000',
 'european',
 'theory',
 'art',
 'cause',
 'south',
 'type',
 'write',
 'major',
 'change',
 'specie',
 'animal',
 'body',
 'result',
 'base',
 'united',
 'various',
 'chemical',
 'within',
 'example',
 'greek',
 'follow',
 'force',
 'field',
 'com

In [18]:
# Vocabulary of the top 5000 words.
# freq is ordered most-frequent-first, so the slice keeps the most common
# words (or all of them when fewer distinct words exist).
VOCABULARY_SIZE = 5000
vocabulary = freq[:VOCABULARY_SIZE]
vocabulary

['use',
 'include',
 'also',
 'world',
 'century',
 'one',
 'form',
 'many',
 'state',
 'large',
 'system',
 'human',
 'may',
 'first',
 'country',
 'know',
 'call',
 'time',
 'number',
 'early',
 'language',
 'year',
 'city',
 'modern',
 'often',
 'empire',
 'term',
 'become',
 'two',
 'make',
 'area',
 'million',
 'war',
 'people',
 'part',
 'new',
 'work',
 'well',
 'study',
 'earth',
 'age',
 'power',
 'period',
 'history',
 'high',
 'culture',
 'lead',
 'great',
 'life',
 'social',
 'begin',
 '1',
 'develop',
 'science',
 'development',
 'different',
 'region',
 'process',
 'europe',
 'group',
 'population',
 'natural',
 'water',
 'east',
 'since',
 'energy',
 'consider',
 'political',
 'asia',
 'around',
 'ancient',
 'western',
 '000',
 'european',
 'theory',
 'art',
 'cause',
 'south',
 'type',
 'write',
 'major',
 'change',
 'specie',
 'animal',
 'body',
 'result',
 'base',
 'united',
 'various',
 'chemical',
 'within',
 'example',
 'greek',
 'follow',
 'force',
 'field',
 'com

In [19]:
import pickle
# Serialise the vocabulary list to 'vocabulary.pkl' (path is relative to the
# current working directory); the file is opened in binary mode as pickle requires.
with open('vocabulary.pkl', 'wb') as f:
    pickle.dump(vocabulary, f)

In [20]:
# Importing numpy to store the vectors as numpy array
import numpy

def get_feature_vector_for_article(words_in_article, vocab=None):
    """Return the relative frequency of every vocabulary word in an article.

    Args:
        words_in_article: List of tokens belonging to one article.
        vocab: Optional sequence of words that defines the vector positions.
            When omitted, the notebook-level ``vocabulary`` is used.

    Returns:
        A list with one float per vocabulary word: the number of occurrences
        of that word divided by the number of tokens in the article. An
        article without tokens yields an all-zero vector instead of raising
        ``ZeroDivisionError``.
    """
    # Imported locally so the function does not rely on another cell's import.
    from collections import Counter

    if vocab is None:
        vocab = vocabulary

    total_words = len(words_in_article)
    if total_words == 0:
        return [0.0] * len(vocab)

    # Count every token once instead of scanning the article per vocabulary word.
    occurrences = Counter(words_in_article)
    return [occurrences[word] / total_words for word in vocab]

for article in article_list:
    entry = article_list[article]
    # The word list is always the last element: entries are [category, words]
    # before this cell has run and [category, feature_vector, words] after.
    words = entry[-1]
    feature_vector = get_feature_vector_for_article(words)
    feature_vector = numpy.array(feature_vector)
    # Replace everything after the category in place, so re-running the cell
    # refreshes the vector instead of inserting another one.
    entry[1:] = [feature_vector, words]

Print the name of every article whose feature vector does not have 5000 entries (no names should be printed).

In [21]:
# Report every article whose feature vector (second element of its entry)
# does not contain exactly 5000 values.
for title, entry in article_list.items():
    if len(entry[1]) == 5000:
        continue
    print(title)

In [22]:
# Each article_list value is [theme, feature vector, word list] at this point,
# which matches the column names below in order; the dictionary keys
# (article titles) become the index because of orient='index'.
df_vector = pd.DataFrame.from_dict(article_list, orient='index', columns=['Theme', 'FeatureVector', 'WordVector'])
df_vector.head()

Unnamed: 0,Theme,FeatureVector,WordVector
0,Mathematics,"[0.04838709677419355, 0.03225806451612903, 0.0...","[0, zero, number, numerical, digit, use, repre..."
Abiogenesis,Science,"[0.003003003003003003, 0.003003003003003003, 0...","[biology, abiogenesis, ‘, ’, greek, bios, ‘, l..."
Abortion,Society and social sciences,"[0.01282051282051282, 0.003205128205128205, 0....","[abortion, termination, pregnancy, removal, ex..."
Abraham Lincoln,People,"[0.0, 0.0030581039755351682, 0.006116207951070...","[abraham, lincoln, link, ən, february, 12, 180..."
Abraham,People,"[0.0, 0.005681818181818182, 0.0056818181818181...","[abraham, originally, abram, common, hebrew, p..."


In [23]:
# Pickle the frame to "df_vector.pkl" (path is relative to the current
# working directory); file_name is reused by the read-back cell below.
file_name = "df_vector.pkl"
df_vector.to_pickle(file_name)

In [24]:
test_df_vector = pd.read_pickle(file_name)

In [25]:
test_df_vector.head()

Unnamed: 0,Theme,FeatureVector,WordVector
0,Mathematics,"[0.04838709677419355, 0.03225806451612903, 0.0...","[0, zero, number, numerical, digit, use, repre..."
Abiogenesis,Science,"[0.003003003003003003, 0.003003003003003003, 0...","[biology, abiogenesis, ‘, ’, greek, bios, ‘, l..."
Abortion,Society and social sciences,"[0.01282051282051282, 0.003205128205128205, 0....","[abortion, termination, pregnancy, removal, ex..."
Abraham Lincoln,People,"[0.0, 0.0030581039755351682, 0.006116207951070...","[abraham, lincoln, link, ən, february, 12, 180..."
Abraham,People,"[0.0, 0.005681818181818182, 0.0056818181818181...","[abraham, originally, abram, common, hebrew, p..."
