In [13]:

#NLP libraries
import nltk
from gensim.models import Doc2Vec
import gensim
from gensim.models.doc2vec import TaggedDocument
from gensim import corpora
import textstat

#Machine learning libraries
from sklearn import utils
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

#Helper libraries
import multiprocessing
import numpy as np
import pandas as pd
import math
from bs4 import BeautifulSoup
import re
import os


np.random.seed(2023)


nltk.download('punkt', download_dir="./data_model/")
nltk.download("stopwords", download_dir="./data_model/")
nltk.data.path.append(os.path.abspath("./data_model/"))

[nltk_data] Downloading package punkt to ./data_model/...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to ./data_model/...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
_stopwords = nltk.corpus.stopwords.words("english")
print(_stopwords)

def clean(text):  #Removing unecessary punctuation and all lower case.
    text = BeautifulSoup(text, "lxml").text
    text = re.sub(r'\|\|\|', r' ', text) 
    text = text.replace('„','')
    text = text.replace('“','')
    text = text.replace('"','')
    text = text.replace('\'','')
    text = text.replace('-','')
    text = text.lower()
    return text

def remove_stopwords(content):
    for word in _stopwords:
        content = content.replace(' '+word+' ',' ')
    return content


def tokenize_text(text):
    tokens = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            if len(word) < 3:
                continue
            tokens.append(word.lower())
    return tokens

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [26]:
text_file_dir="./data/wikispeedia_articles_plaintext/plaintext_articles/"
file_data=[]

for filename in os.listdir(text_file_dir):
    with open(os.path.join(text_file_dir, filename), "r") as file:
        content = file.read()
    
        # Split the content into lines to remove the header
    lines = content.split('\n')

    #Removing the header (line 0)
    if lines:
        lines.pop(0)

            

    # Making it an array and removing all \n
    content = '\n'.join(lines)
    content = content.replace("\n", " ")


    new_file_content= {"text_content": content}
    
    file_data.append(new_file_content)

project_data = pd.DataFrame(file_data)

project_data['text_content'] = project_data['text_content'].apply(clean)
project_data['text_content'] = project_data['text_content'].apply(remove_stopwords)

project_tagged = project_data.apply(
   lambda r: TaggedDocument(words=tokenize_text(r['text_content']), tags=  []), axis=1)

project_data

Unnamed: 0,text_content
0,second crusade 2007 schools wikipedia selecti...
1,navassa island 2007 schools wikipedia selecti...
2,evan rachel wood 2007 schools wikipedia selec...
3,tropical storm henri (2003) 2007 schools wiki...
4,final fantasy adventure 2007 schools wikipedi...
...,...
4599,réunion 2007 schools wikipedia selection. rel...
4600,flower 2007 schools wikipedia selection. rela...
4601,banknote 2007 schools wikipedia selection. re...
4602,weyto language 2007 schools wikipedia selecti...


In [10]:




dictionary = corpora.Dictionary(project_tagged [567])

# Create a document-term matrix
doc_term_matrix = [dictionary.doc2bow(doc) for doc in project_tagged[567]]

# Create an LDA model using gensim
lda_model = gensim.models.LdaMulticore(doc_term_matrix, num_topics=10, id2word=dictionary, passes=2, workers=2)

# Print topics
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))


Topic: 0 
Words: 0.025*"national" + 0.012*"park" + 0.007*"historic" + 0.006*"monument" + 0.005*"site" + 0.005*"memorial" + 0.005*"area" + 0.004*"new" + 0.003*"affiliated" + 0.003*"virginia"
Topic: 1 
Words: 0.059*"national" + 0.019*"park" + 0.012*"site" + 0.010*"historic" + 0.009*"area" + 0.009*"monument" + 0.008*"new" + 0.008*"historical" + 0.007*"washington" + 0.006*"memorial"
Topic: 2 
Words: 0.014*"national" + 0.006*"park" + 0.006*"historic" + 0.005*"site" + 0.004*"monument" + 0.003*"area" + 0.003*"memorial" + 0.003*"new" + 0.002*"washington" + 0.002*"historical"
Topic: 3 
Words: 0.015*"national" + 0.004*"park" + 0.004*"monument" + 0.004*"site" + 0.004*"historic" + 0.003*"area" + 0.003*"memorial" + 0.003*"new" + 0.002*"historical" + 0.002*"affiliated"
Topic: 4 
Words: 0.021*"national" + 0.008*"park" + 0.007*"historic" + 0.005*"monument" + 0.005*"site" + 0.004*"new" + 0.004*"memorial" + 0.004*"area" + 0.003*"washington" + 0.003*"recreation"
Topic: 5 
Words: 0.100*"national" + 0.029*

In [36]:
import os
import pandas as pd

text_file_dir = "./data/wikispeedia_articles_plaintext/plaintext_articles/"
file_data = []

for filename in os.listdir(text_file_dir):
    if filename.endswith(".txt"):  # Ensure you're only reading .txt files
        with open(os.path.join(text_file_dir, filename), 'r', encoding='utf-8') as file:
            content = file.read()
        
            # Split the content into lines to remove the header
            lines = content.split('\n')

            # Removing the header (line 0) and join back the content
            if lines:
                title = lines.pop(2)  # Assuming the first line is the title
            
            content = ' '.join(lines).replace("\n", " ")
            
            # Create a dictionary with title and content
            file_data.append({"title": title, "text_content": content})

# Create a DataFrame
project_data = pd.DataFrame(file_data)

# Display the DataFrame structure
print(project_data.head())


                         title  \
0               Second Crusade   
1               Navassa Island   
2             Evan Rachel Wood   
3  Tropical Storm Henri (2003)   
4      Final Fantasy Adventure   

                                        text_content  
0     #copyright   2007 Schools Wikipedia Selecti...  
1     #copyright   2007 Schools Wikipedia Selecti...  
2     #copyright   2007 Schools Wikipedia Selecti...  
3     #copyright   2007 Schools Wikipedia Selecti...  
4     #copyright   2007 Schools Wikipedia Selecti...  


In [32]:
#Measure the complexity of an article 


score = textstat.flesch_reading_ease(project_data['text_content'][1])
print(f"The Flesch Reading-Ease score for the text is: {score}")

gunning_fog_score = textstat.gunning_fog(project_data['text_content'][1])
print(f"The Gunning Fog Index for the text is: {gunning_fog_score}")




The Flesch Reading-Ease score for the text is: 54.73
The Gunning Fog Index for the text is: 10.4


' Navassa Island  2007 Schools Wikipedia Selection. Related subjects: North American Geography     Navassa Island map from The World Factbook    Enlarge    Navassa Island map from The World Factbook     Navassa Island (French: La Navase, Haitian Kreyòl: Lanavaz or Lavash)    is a small, uninhabited island in the Caribbean Sea. The government of    the United States claims the island as an unorganized unincorporated    territory, part of the United States Minor Outlying Islands, where it    is administered by the U.S. Fish and Wildlife Service. However, the    island is also claimed by Haiti.  Geography and Topography     Navassa Island is about two square miles (5.2 km²). It is found at a    strategic location 160 km (90 nautical miles) south of the U.S. naval    base at Guantanamo Bay, Cuba, about one-fourth of the way from Haiti to    Jamaica in the Jamaica Channel. It reaches an elevation of 77 m at an    unnamed peak 100 m south of the lighthouse, Navassa Island Light. This    loca

In [46]:
project_data['flesch_reading_ease'] = project_data['text_content'].apply(textstat.flesch_reading_ease)
project_data['gunning_fog_index'] = project_data['text_content'].apply(textstat.gunning_fog)

print(project_data.head())


                         title  \
0               Second Crusade   
1               Navassa Island   
2             Evan Rachel Wood   
3  Tropical Storm Henri (2003)   
4      Final Fantasy Adventure   

                                        text_content  flesch_reading_ease  \
0     #copyright   2007 Schools Wikipedia Selecti...                63.83   
1     #copyright   2007 Schools Wikipedia Selecti...                54.73   
2     #copyright   2007 Schools Wikipedia Selecti...                52.43   
3     #copyright   2007 Schools Wikipedia Selecti...                55.24   
4     #copyright   2007 Schools Wikipedia Selecti...                58.92   

   gunning_fog_index  
0              11.82  
1              10.40  
2              13.67  
3               9.67  
4              11.33  


In [52]:



min_gunning_fog_index = project_data['gunning_fog_index'].min()
title_with_min_gunning_fog_index = project_data.loc[project_data['gunning_fog_index'] == min_gunning_fog_index, 'title'].iloc[0]
print(title_with_min_gunning_fog_index, min_gunning_fog_index)

min_flesch_reading_ease_index = project_data['flesch_reading_ease'].min() 
title_with_min_flesch_reading_ease_index = project_data.loc[project_data['flesch_reading_ease'] == min_flesch_reading_ease_index, 'title'].iloc[0]
print(title_with_min_flesch_reading_ease_index, min_flesch_reading_ease_index)

max_gunning_fog_index = project_data['gunning_fog_index'].max()
title_with_max_gunning_fog_index = project_data.loc[project_data['gunning_fog_index'] == max_gunning_fog_index, 'title'].iloc[0]
print(title_with_max_gunning_fog_index, max_gunning_fog_index)

max_flesch_reading_ease_index = project_data['flesch_reading_ease'].max() 
title_with_max_flesch_reading_ease_index = project_data.loc[project_data['flesch_reading_ease'] == max_flesch_reading_ease_index, 'title'].iloc[0]
print(title_with_max_flesch_reading_ease_index, max_flesch_reading_ease_index)



List of Canadian provinces and territories by area 5.09
List of sovereign states -81.73
List of sovereign states 59.84
Babe Ruth 83.25
