# Reading articles from Wikipedia 
based on the list of names
https://pypi.org/project/wikipedia/

In [1]:
import wikipedia
import pandas as pd
import glob
import csv
import requests

## Scrape articles for Eastern European Female Leaders

In [125]:
# Read the list of Women EE names
df = pd.read_csv('EE_name_country.csv', names = ['name', 'country'])
df.head()

Unnamed: 0,name,country
0,Sviatlana Tsikhanouskaya,Belarus
1,Olga Abramova (politician),Belarus
2,Alena Anisim,Belarus
3,Natallia Eismant,Belarus
4,Maria Kalesnikava,Belarus


In [127]:
#Save list of countries
country_list = df.country.unique()

# Number of articles in initial list by country
df.groupby('country').count()

Unnamed: 0_level_0,name
country,Unnamed: 1_level_1
Belarus,15
Bulgaria,22
CZ,15
Hungary,57
Moldova,59
Poland,142
RU,173
Romania,54
Ukraine,131


In [188]:
country = country_list[5]
df['name'].iloc[239:][df.country == country]

239                 Ewa Janik
240               Bogna Janke
241        Elżbieta Jankowska
242       Małgorzata Janowska
243    Izabela Jaruga-Nowacka
                ...          
305         Alicja Olechowska
306        Małgorzata Olejnik
307           Halina Olendzka
308      Daria Gosek-Popiołek
309               Dorota Olko
Name: name, Length: 71, dtype: object

In [192]:
# Save content of Wiki article pages from the list
articles = []

# Change country 
country = country_list[-1]

for n in df['name'].iloc[239:][df.country == country]:
    try:
        page = wikipedia.page(n)
    except wikipedia.exceptions.DisambiguationError as e:
        print(e.options)
    except wikipedia.exceptions.PageError as e:
        print(e)
    except requests.exceptions.ConnectionError as e:
        e.status_code = "Connection refused"
    else:
        text = page.content
        # Save only articles that are over 800 symbols
        if len(text) > 10:
            articles.append(text)
print('Articles downloaded:', len(articles))

Page id "viktoria kinzburska" does not match any pages. Try another id!
Page id "anna romanov" does not match any pages. Try another id!
Page id "kirk rudi" does not match any pages. Try another id!
Articles downloaded: 128


In [193]:
# How many articles has been downloaded 
print('Articles downloaded:', len(articles))
# what is the first article
print('First article:', articles[0][:100])
# what is the last article
print('Last article:', articles[-1][:100])

Articles downloaded: 128
First article: Olha Pavlivna Aivazovska (Ukrainian: Ольга Павлівна Айвазовська; born 9 February 1981 in Zalishchyky
Last article: Yaryna Bohdanivna Turchyn (Ukrainian: Турчин Ярина Богданівна, born (1975-10-22)October 22, 1975 in 


In [194]:
# Save downloaded acticles into CSV file
dic = {'articles': articles}
articles_df = pd.DataFrame(dic)
filename = country +'_articles.csv'
articles_df.to_csv('EE_articles/' + filename)

## Importing saved articles

In [12]:
# Read the list of Women EE names
df = pd.read_csv('EE_name_country.csv', names = ['name', 'country'])
#Save list of countries
country_list = df.country.unique()

# Import all CSV files from all Wiki articles and save them to one list
articles_df = pd.DataFrame({'articles': [], 'country': []})

for country in country_list:
    filename = country +'_*.csv'
    files = glob.glob('EE_articles/' + filename)
    
    all_articles = []
    for file in files:
        read_handle = open(file, "r")
        text = list(csv.reader(read_handle, delimiter=","))        
        for article in text[1:]:
            all_articles.append(article[1])     
    
    country_df = pd.DataFrame({'articles': all_articles, 'country': country})
    articles_df = pd.concat([articles_df, country_df], ignore_index=True)

# How many articles has been read
print('Number of articles:', len(articles_df))

# Drop duplicates
articles_df = articles_df.drop_duplicates()

# Count the number of rows after
lenAfter = len(articles_df)
print("After de-dupe: " + str(lenAfter))

# Number of articles in initial list by country
articles_df.groupby('country').count()

Number of articles: 655
After de-dupe: 653


Unnamed: 0_level_0,articles
country,Unnamed: 1_level_1
Belarus,14
Bulgaria,21
CZ,15
Hungary,55
Moldova,59
Poland,141
RU,169
Romania,51
Ukraine,128


## Cleaning and tokenisation
- Removing headers - https://www.w3schools.com/python/python_regex.asp
- Removing stop words and numbers
- Stemming and lemmitisation
- Tokenisation

In [3]:
import re
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [13]:
# Based on https://www.geeksforgeeks.org/nlp-gensim-tutorial-complete-guide-for-beginners/

all_tokens = []

# import stopwords
stop_words = stopwords.words('english')
wnl = WordNetLemmatizer()

# Check for latin alphabet 
# https://stackoverflow.com/questions/27084617/detect-strings-with-non-english-characters-in-python
def isEnglish(s):
    try:
        s.encode(encoding='utf-8').decode('ascii')
    except UnicodeDecodeError:
        return False
    else:
        return True


for article in articles_df['articles']:
    # Removing section headers and new line breaks
    text = re.sub("==.*==",'', article)
    text = re.sub("\n",'', text)
    
    # Convert a document into a list of tokens 
    # This lowercases, tokenizes, removes numerical values
    tokens = simple_preprocess(text)
    
    doc_out = []
    for word in tokens:    
        if word not in stop_words:  # to remove stopwords
            if isEnglish(word):
                Lemmatized_Word = wnl.lemmatize(word)  # lemmatize
                doc_out.append(Lemmatized_Word)
    
    all_tokens.append(doc_out)

# Print out infromation about articles and number of tokens for top 15
print('Tokens groups:', len(all_tokens),'\n')
print("{0:7}{1:10}".format("-No-","--Tokens--"))
for x, tokens in enumerate(all_tokens[:15]):
    print("{0:3}{1:10}".format(x + 1, len(tokens)))

Tokens groups: 653 

-No-   --Tokens--
  1      1468
  2       976
  3       140
  4       320
  5      1289
  6       343
  7        31
  8       579
  9       268
 10       197
 11       138
 12       273
 13       280
 14       248
 15       341


# Dictionary - Corpus - LDA

In [14]:
from gensim.corpora.dictionary import Dictionary
from gensim.corpora import MmCorpus
from gensim.models import LdaModel, LdaMulticore, CoherenceModel
import matplotlib.pyplot as plt

In [100]:
# create dictionary - a map of unique tokens
dictionary = Dictionary(all_tokens)
dictionary.filter_extremes(no_below = 15, no_above = 0.7)
print('Dictionary length:', len(dictionary.keys()))

# 100 tokens by frequency for cleaned up dictionary
t_most_freq = dictionary.most_common(100)
print('Top 20 tokens by frequency\n')

num = 1
for t, f in t_most_freq[:20]:
    print(str(num) + '.', t, '-', f)
    num = num + 1


# Create a MmCorpus: corpus
corpus = [dictionary.doc2bow(token) for token in all_tokens]
print('\nCorpus created. \nCorpus length:', len(corpus))

Dictionary length: 1308
Top 20 tokens by frequency

1. ukraine - 1395
2. party - 1343
3. russian - 1337
4. state - 1142
5. ukrainian - 1108
6. election - 1088
7. deputy - 1003
8. minister - 817
9. european - 797
10. president - 736
11. russia - 721
12. also - 720
13. national - 720
14. committee - 718
15. elected - 716
16. parliament - 700
17. parliamentary - 679
18. university - 660
19. became - 562
20. year - 556

Corpus created. 
Corpus length: 653


In [102]:
# Writing top 250 tokens for all articles by frequency to a CSV file
top_df = pd.DataFrame(dictionary.most_common(250))
top_df.columns = ['token', 'frequency']
top_df.to_csv('output/EE_frequency_top_250.csv', index = False, header = True)

In [296]:
# save dictionary to disk
dictionary.save('EE_Wiki_dictionary.dict')
# save corpus to disk
MmCorpus.serialize("EE_Wiki_corpus.mm", corpus)

## TF-IDF

In [17]:
from gensim.models.tfidfmodel import TfidfModel
from gensim.models.phrases import Phrases, ENGLISH_CONNECTOR_WORDS

In [202]:
# Term frequency - inverse document frequency model
tfidf = TfidfModel(corpus)

In [203]:
article = 87
print('\nTop-10 tokens for article No', article, '\n')
tfidf_int = tfidf[corpus[(article - 1)]]

# Sort the weights from highest to lowest: sorted_tfidf_weights
sorted_tfidf_weights = sorted(tfidf_int, key=lambda w: w[1], reverse=True)

# Print the top 10 weighted words
for term_id, weight in sorted_tfidf_weights[:10]:
    print("{0:15}{1:10}".format(dictionary.get(term_id), weight))


Top-10 tokens for article No 87 

moldova        0.6224351539558823
judge          0.3767640762765586
lawyer         0.3346266050167051
court          0.24184508916144154
jurist         0.1821387024914272
republic       0.16931240645894055
parliamentary  0.15275787175852834
constitutional 0.14905253723405457
appointment    0.1461174799751714
period         0.1394567227916222


### Bigrams

In [18]:
# Train a toy phrase model on our training corpus.
phrase_model = Phrases(all_tokens, min_count = 10, threshold = 50, connector_words = ENGLISH_CONNECTOR_WORDS)

print(phrase_model)

# Finding bigrams in the interview 
bigrams = phrase_model.find_phrases(all_tokens)

Phrases<129526 vocab, min_count=10, threshold=50, max_vocab_size=40000000>


In [19]:
#Saving Bigrams as a dictionary
bigrams_dic = {'bigram': [], 'score': []}
for i in bigrams:
    bigrams_dic['bigram'].append(i)
    bigrams_dic['score'].append(bigrams[i])
bigrams_df = pd.DataFrame(bigrams_dic)
bigrams_df = bigrams_df.sort_values( by = ['score'], ascending = False)
bigrams_df = bigrams_df.reset_index(drop=True)

# Write bigrams to a CSV
bigrams_df.to_csv('output/ee_full_wiki_bigrams.csv')
# Showing top 15 bigrams for all articles
bigrams_df.head(15)

Unnamed: 0,bigram,score
0,kluzik_rostkowska,3029.847953
1,katser_buchkovska,2698.458333
2,gronkiewicz_waltz,2502.917874
3,ivano_frankivsk,2299.278107
4,donald_tusk,2055.968254
5,nizhny_novgorod,1962.515152
6,liliia_hrynevych,1644.774603
7,sasha_bezuhanova,1555.867868
8,record_transcript,1530.223323
9,dmitry_medvedev,1488.804598
