# INF554 Team ==Baseline== Language Detection and Text Stemming
### Francisco, Alex and Aksel

In [1]:
import random
import numpy as np
import pandas as pd
import csv
from tqdm import tqdm
import seaborn as sns
import os
import networkx as nx
import pdb
import pickle
from collections import Counter
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.feature_extraction import text as fe
from sklearn.decomposition import NMF, LatentDirichletAllocation
import matplotlib.pyplot as plt
import datetime
import time
import keras
import lightgbm
import spacy
from nltk import wordpunct_tokenize
from nltk.corpus import stopwords
import nltk
from hyperopt import STATUS_OK, Trials, hp, space_eval, tpe, fmin

Using TensorFlow backend.


In [4]:
'''
Load the corpus one file at a time, dropping the empty tokens produced by
repeated spaces and converting the text to lower case.
The result is cached in two pickles so later runs can skip the directory scan.
'''
NODE_INFO_DIRECTORY = r"node_information/text/"

corpus_path = r"pickles/simple_corpus.PICKLE"
ids_path = r"pickles/ids.PICKLE"
# Both pickles are written together and must stay aligned index-for-index, so
# the cache is only reused when BOTH files exist; otherwise it is rebuilt.
if os.path.exists(corpus_path) and os.path.exists(ids_path):
    # NOTE(review): pickle.load can execute arbitrary code; only load pickles
    # that were produced by this notebook.
    with open(corpus_path, 'rb') as f:
        corpus = pickle.load(f)
    with open(ids_path, 'rb') as f:
        ids = pickle.load(f)
else:
    corpus = []
    ids = []
    for filename in tqdm(os.listdir(NODE_INFO_DIRECTORY), position=0, leave=True):
        doc_path = os.path.join(NODE_INFO_DIRECTORY, filename)
        # undecodable bytes are silently skipped (errors='ignore')
        with open(doc_path, 'r', encoding='UTF-8', errors='ignore') as f:
            doc_tokens = []
            for line in f:
                doc_tokens.extend(
                    token.strip()
                    for token in line.lower().strip().split(" ")
                    if token != ""
                )
        corpus.append(' '.join(doc_tokens))
        # drop the last four characters of the file name
        # (presumably a ".txt"-style extension -- TODO confirm)
        ids.append(filename[:-4])
    # the `with` statement closes the file; no explicit close() is needed
    with open(corpus_path, 'wb') as f:
        pickle.dump(corpus, f)
    with open(ids_path, 'wb') as f:
        pickle.dump(ids, f)

Training node info shape: (33226, 2)


In [146]:
from nltk import wordpunct_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk import word_tokenize

def calculate_languages_ratios(text):
    """Estimate the language mix of a document from its stopwords.

    For every language in the NLTK stopwords corpus, count how many distinct
    lower-cased tokens of ``text`` are stopwords of that language, then
    normalise the counts so that they sum to 1.

    Args:
        text: the raw document string; it is tokenized with
            ``wordpunct_tokenize``.

    Returns:
        A 1-D numpy array with one entry per language, in the order of
        ``stopwords.fileids()``. The array is all zeros when no stopword of
        any language is found.
    """
    # The document vocabulary does not depend on the language, so build the
    # set once instead of once per language.
    words_set = {word.lower() for word in wordpunct_tokenize(text)}
    # set.intersection accepts any iterable, so the stopword list can be
    # passed directly without first copying it into a set.
    languages_ratios = [
        len(words_set.intersection(stopwords.words(language)))
        for language in stopwords.fileids()
    ]
    total = sum(languages_ratios)
    if total == 0:
        return np.zeros(len(languages_ratios))
    return np.array(languages_ratios) / total

def calculate_languages_ratios_from_tokens(tokens):
    """Estimate the language mix of an already tokenized document.

    Works like a stopword count per language, but only over the languages
    that have both an NLTK stopword list and a Snowball stemmer. The result
    is used to choose the stemmer(s) for the document, so it covers fewer
    languages than a count over every stopword list would.

    Args:
        tokens: iterable of string tokens of one document.

    Returns:
        A 1-D numpy array with one entry per supported language, normalised
        to sum to 1, or all zeros when no stopword is found.

        NOTE: the entries follow the iteration order of the set
        ``set(stopwords.fileids()) & set(SnowballStemmer.languages)``.
        Callers that map indices back to language names must build their
        language list from the same expression in the same process.
    """
    # The document vocabulary does not depend on the language, so build the
    # set once instead of once per language.
    words_set = {word.lower() for word in tokens}
    supported_languages = set(stopwords.fileids()) & set(SnowballStemmer.languages)
    # set.intersection accepts any iterable, so the stopword list can be
    # passed directly without first copying it into a set.
    languages_ratios = [
        len(words_set.intersection(stopwords.words(language)))
        for language in supported_languages
    ]
    total = sum(languages_ratios)
    if total == 0:
        return np.zeros(len(languages_ratios))
    return np.array(languages_ratios) / total

In [5]:
'''
For each document compute the language ratios.
Counting the stopwords is deterministic, so the saved language ratios will be
the same on every run and can safely be cached in a pickle.
'''
languages_nltk_path = r"pickles/languages_nltk.PICKLE"
if os.path.exists(languages_nltk_path):
    # NOTE(review): pickle.load can execute arbitrary code; only load pickles
    # that were produced by this notebook.
    with open(languages_nltk_path, 'rb') as f:
        languages_nltk = pickle.load(f)
else:
    # maps document ID -> language-ratio array (or None when it failed)
    languages_nltk = {}
    loop_size = len(node_info['Corpus'])
    # iterate IDs and texts in lockstep (positional, independent of the index)
    documents = zip(node_info['ID'], node_info['Corpus'])
    # some documents have no text and therefore will raise an error
    for i, (doc_id, text) in tqdm(enumerate(documents), total=loop_size, leave=True, position=0):
        try:
            languages_nltk[doc_id] = calculate_languages_ratios(text)
        except Exception:
            # Best effort: report the failing row and record it as None.
            # (languages_nltk is a dict, so the entry is assigned by key;
            # the previous .append() call would itself have raised.)
            print('Error: {}'.format(i))
            languages_nltk[doc_id] = None

    # the `with` statement closes the file; no explicit close() is needed
    with open(languages_nltk_path, 'wb') as f:
        pickle.dump(languages_nltk, f)

100%|████████████████████████████████████████████████████████████████████████████| 33226/33226 [17:04<00:00, 32.44it/s]


In [183]:
'''
This cell stems each document according to its most likely language(s).
If one or more languages are predominant (at least 25% of all detected
stopwords belong to that language), use those languages.
Else use every language with more than 10% stopword representation
(good for multi-lingual texts).
Finally, if none of the above hold, use the single language with the largest
amount of stopwords in the document.
'''
PREDOMINANT_RATIO = 0.25  # share of stopwords that makes a language predominant
MINOR_RATIO = 0.10        # share of stopwords for the multi-lingual fallback

# Must be built from the same set expression that
# calculate_languages_ratios_from_tokens iterates over, so that index j of the
# ratio array corresponds to supported_languages[j].
supported_languages = list(set(stopwords.fileids()) & set(SnowballStemmer.languages))

# Build the per-language resources once instead of once per document:
# a set gives O(1) stopword membership tests and the stemmers are reusable.
stopword_sets = {lang: set(stopwords.words(lang)) for lang in supported_languages}
stemmers = {lang: SnowballStemmer(lang) for lang in supported_languages}

stemmed_corpus = []
for text in tqdm(node_info['Corpus'].values, position=0, leave=True):
    tokens = word_tokenize(text)
    ratio = calculate_languages_ratios_from_tokens(tokens)
    if np.any(ratio >= PREDOMINANT_RATIO):
        indices = np.where(ratio >= PREDOMINANT_RATIO)[0]
        langs = [supported_languages[j] for j in indices]
    elif np.any(ratio > MINOR_RATIO):
        indices = np.where(ratio > MINOR_RATIO)[0]
        langs = [supported_languages[j] for j in indices]
    else:
        # NOTE(review): when no stopword was detected at all, ratio is all
        # zeros and argmax returns index 0, i.e. the first supported language.
        langs = [supported_languages[np.argmax(ratio)]]

    # The languages are applied one after another: the output of one
    # stopword-removal + stemming pass is the input of the next.
    for lang in langs:
        lang_stopwords = stopword_sets[lang]
        stemmer = stemmers[lang]
        tokens = [stemmer.stem(word) for word in tokens if (word not in lang_stopwords) and word.isalpha()]
    stemmed_corpus.append(' '.join(tokens))

100%|██████████████████████████████████████████████████████████████████████████| 33226/33226 [1:40:17<00:00,  5.52it/s]


In [184]:
# Persist the stemmed corpus so the slow stemming cell need not be re-run.
# The `with` statement closes the file; no explicit close() is needed.
with open('pickles/stemmed_corpus.PICKLE', 'wb') as f:
    pickle.dump(stemmed_corpus, f)